pmap.c revision 204041
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: head/sys/i386/xen/pmap.c 204041 2010-02-18 14:28:38Z ed $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * In addition to hardware address maps, this 84 * module is called upon to provide software-use-only 85 * maps which may or may not be stored in the same 86 * form as hardware maps. These pseudo-maps are 87 * used to store intermediate results from copy 88 * operations to and from address spaces. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106#define PMAP_DIAGNOSTIC 107 108#include "opt_cpu.h" 109#include "opt_pmap.h" 110#include "opt_msgbuf.h" 111#include "opt_smp.h" 112#include "opt_xbox.h" 113 114#include <sys/param.h> 115#include <sys/systm.h> 116#include <sys/kernel.h> 117#include <sys/ktr.h> 118#include <sys/lock.h> 119#include <sys/malloc.h> 120#include <sys/mman.h> 121#include <sys/msgbuf.h> 122#include <sys/mutex.h> 123#include <sys/proc.h> 124#include <sys/sf_buf.h> 125#include <sys/sx.h> 126#include <sys/vmmeter.h> 127#include <sys/sched.h> 128#include <sys/sysctl.h> 129#ifdef SMP 130#include <sys/smp.h> 131#endif 132 133#include <vm/vm.h> 134#include <vm/vm_param.h> 135#include <vm/vm_kern.h> 136#include <vm/vm_page.h> 137#include <vm/vm_map.h> 138#include <vm/vm_object.h> 139#include <vm/vm_extern.h> 140#include <vm/vm_pageout.h> 141#include <vm/vm_pager.h> 142#include <vm/uma.h> 143 144#include <machine/cpu.h> 145#include <machine/cputypes.h> 146#include <machine/md_var.h> 147#include <machine/pcb.h> 148#include <machine/specialreg.h> 149#ifdef SMP 150#include <machine/smp.h> 151#endif 152 153#ifdef XBOX 154#include <machine/xbox.h> 155#endif 156 157#include <xen/interface/xen.h> 158#include <xen/hypervisor.h> 159#include <machine/xen/hypercall.h> 160#include <machine/xen/xenvar.h> 161#include <machine/xen/xenfunc.h> 162 163#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 164#define CPU_ENABLE_SSE 165#endif 166 167#ifndef PMAP_SHPGPERPROC 168#define PMAP_SHPGPERPROC 200 169#endif 170 171#if defined(DIAGNOSTIC) 172#define PMAP_DIAGNOSTIC 173#endif 174 175#if !defined(PMAP_DIAGNOSTIC) 176#ifdef __GNUC_GNU_INLINE__ 177#define PMAP_INLINE inline 178#else 179#define PMAP_INLINE extern inline 180#endif 181#else 182#define PMAP_INLINE 183#endif 184 185#define PV_STATS 186#ifdef PV_STATS 187#define PV_STAT(x) do { x ; } while (0) 
188#else 189#define PV_STAT(x) do { } while (0) 190#endif 191 192#define pa_index(pa) ((pa) >> PDRSHIFT) 193#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 194 195/* 196 * Get PDEs and PTEs for user/kernel address space 197 */ 198#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 199#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 200 201#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 202#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 203#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 204#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 205#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 206 207#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 208 209struct pmap kernel_pmap_store; 210LIST_HEAD(pmaplist, pmap); 211static struct pmaplist allpmaps; 212static struct mtx allpmaps_lock; 213 214vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 215vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 216int pgeflag = 0; /* PG_G or-in */ 217int pseflag = 0; /* PG_PS or-in */ 218 219int nkpt; 220vm_offset_t kernel_vm_end; 221extern u_int32_t KERNend; 222 223#ifdef PAE 224pt_entry_t pg_nx; 225#if !defined(XEN) 226static uma_zone_t pdptzone; 227#endif 228#endif 229 230static int pat_works; /* Is page attribute table sane? */ 231 232/* 233 * Data for the pv entry allocation mechanism 234 */ 235static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 236static struct md_page *pv_table; 237static int shpgperproc = PMAP_SHPGPERPROC; 238 239struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 240int pv_maxchunks; /* How many chunks we have KVA for */ 241vm_offset_t pv_vafree; /* freelist stored in the PTE */ 242 243/* 244 * All those kernel PT submaps that BSD is so fond of 245 */ 246struct sysmaps { 247 struct mtx lock; 248 pt_entry_t *CMAP1; 249 pt_entry_t *CMAP2; 250 caddr_t CADDR1; 251 caddr_t CADDR2; 252}; 253static struct sysmaps sysmaps_pcpu[MAXCPU]; 254pt_entry_t *CMAP1 = 0; 255static pt_entry_t *CMAP3; 256caddr_t CADDR1 = 0, ptvmmap = 0; 257static caddr_t CADDR3; 258struct msgbuf *msgbufp = 0; 259 260/* 261 * Crashdump maps. 
262 */ 263static caddr_t crashdumpmap; 264 265static pt_entry_t *PMAP1 = 0, *PMAP2; 266static pt_entry_t *PADDR1 = 0, *PADDR2; 267#ifdef SMP 268static int PMAP1cpu; 269static int PMAP1changedcpu; 270SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 271 &PMAP1changedcpu, 0, 272 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 273#endif 274static int PMAP1changed; 275SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 276 &PMAP1changed, 0, 277 "Number of times pmap_pte_quick changed PMAP1"); 278static int PMAP1unchanged; 279SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 280 &PMAP1unchanged, 0, 281 "Number of times pmap_pte_quick didn't change PMAP1"); 282static struct mtx PMAP2mutex; 283 284SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 285static int pg_ps_enabled; 286SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 287 "Are large page mappings enabled?"); 288 289SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 290 "Max number of PV entries"); 291SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 292 "Page share factor per proc"); 293 294static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 295static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); 296 297static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va, 298 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 299static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 300 vm_page_t *free); 301static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 302 vm_page_t *free); 303static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 304 vm_offset_t va); 305static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 306static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 307 vm_page_t m); 308 309static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); 310 311static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); 312static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); 313static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 314static void pmap_pte_release(pt_entry_t *pte); 315static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); 316static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 317static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr); 318static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 319 320static __inline void pagezero(void *page); 321 322#if defined(PAE) && !defined(XEN) 323static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); 324#endif 325#ifndef XEN 326static void pmap_set_pg(void); 327#endif 328 329CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 330CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 331 332/* 333 * If you get an error here, then you set KVA_PAGES wrong! See the 334 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 335 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 
336 */ 337CTASSERT(KERNBASE % (1 << 24) == 0); 338 339 340 341void 342pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) 343{ 344 vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); 345 346 switch (type) { 347 case SH_PD_SET_VA: 348#if 0 349 xen_queue_pt_update(shadow_pdir_ma, 350 xpmap_ptom(val & ~(PG_RW))); 351#endif 352 xen_queue_pt_update(pdir_ma, 353 xpmap_ptom(val)); 354 break; 355 case SH_PD_SET_VA_MA: 356#if 0 357 xen_queue_pt_update(shadow_pdir_ma, 358 val & ~(PG_RW)); 359#endif 360 xen_queue_pt_update(pdir_ma, val); 361 break; 362 case SH_PD_SET_VA_CLEAR: 363#if 0 364 xen_queue_pt_update(shadow_pdir_ma, 0); 365#endif 366 xen_queue_pt_update(pdir_ma, 0); 367 break; 368 } 369} 370 371/* 372 * Move the kernel virtual free pointer to the next 373 * 4MB. This is used to help improve performance 374 * by using a large (4MB) page for much of the kernel 375 * (.text, .data, .bss) 376 */ 377static vm_offset_t 378pmap_kmem_choose(vm_offset_t addr) 379{ 380 vm_offset_t newaddr = addr; 381 382#ifndef DISABLE_PSE 383 if (cpu_feature & CPUID_PSE) 384 newaddr = (addr + PDRMASK) & ~PDRMASK; 385#endif 386 return newaddr; 387} 388 389/* 390 * Bootstrap the system enough to run with virtual memory. 391 * 392 * On the i386 this is called after mapping has already been enabled 393 * and just syncs the pmap module with what has already been done. 394 * [We can't call it easily with mapping off since the kernel is not 395 * mapped with PA == VA, hence we would have to relocate every address 396 * from the linked base (virtual) address "KERNBASE" to the actual 397 * (physical) address starting relative to 0] 398 */ 399void 400pmap_bootstrap(vm_paddr_t firstaddr) 401{ 402 vm_offset_t va; 403 pt_entry_t *pte, *unused; 404 struct sysmaps *sysmaps; 405 int i; 406 407 /* 408 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too 409 * large. It should instead be correctly calculated in locore.s and 410 * not based on 'first' (which is a physical address, not a virtual 411 * address, for the start of unused physical memory). The kernel 412 * page tables are NOT double mapped and thus should not be included 413 * in this calculation. 414 */ 415 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 416 virtual_avail = pmap_kmem_choose(virtual_avail); 417 418 virtual_end = VM_MAX_KERNEL_ADDRESS; 419 420 /* 421 * Initialize the kernel pmap (which is statically allocated). 422 */ 423 PMAP_LOCK_INIT(kernel_pmap); 424 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 425#ifdef PAE 426 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 427#endif 428 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 429 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 430 LIST_INIT(&allpmaps); 431 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 432 mtx_lock_spin(&allpmaps_lock); 433 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 434 mtx_unlock_spin(&allpmaps_lock); 435 if (nkpt == 0) 436 nkpt = NKPT; 437 438 /* 439 * Reserve some special page table entries/VA space for temporary 440 * mapping of pages. 441 */ 442#define SYSMAP(c, p, v, n) \ 443 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 444 445 va = virtual_avail; 446 pte = vtopte(va); 447 448 /* 449 * CMAP1/CMAP2 are used for zeroing and copying pages. 450 * CMAP3 is used for the idle process page zeroing. 
451 */ 452 for (i = 0; i < MAXCPU; i++) { 453 sysmaps = &sysmaps_pcpu[i]; 454 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); 455 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) 456 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) 457 } 458 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 459 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 460 PT_SET_MA(CADDR3, 0); 461 462 /* 463 * Crashdump maps. 464 */ 465 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 466 467 /* 468 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 469 */ 470 SYSMAP(caddr_t, unused, ptvmmap, 1) 471 472 /* 473 * msgbufp is used to map the system message buffer. 474 */ 475 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) 476 477 /* 478 * ptemap is used for pmap_pte_quick 479 */ 480 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); 481 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); 482 483 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 484 485 virtual_avail = va; 486 PT_SET_MA(CADDR1, 0); 487 488 /* 489 * Leave in place an identity mapping (virt == phys) for the low 1 MB 490 * physical memory region that is used by the ACPI wakeup code. This 491 * mapping must not have PG_G set. 492 */ 493#ifndef XEN 494 /* 495 * leave here deliberately to show that this is not supported 496 */ 497#ifdef XBOX 498 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 499 * an early stadium, we cannot yet neatly map video memory ... :-( 500 * Better fixes are very welcome! */ 501 if (!arch_i386_is_xbox) 502#endif 503 for (i = 1; i < NKPT; i++) 504 PTD[i] = 0; 505 506 /* Initialize the PAT MSR if present. */ 507 pmap_init_pat(); 508 509 /* Turn on PG_G on kernel page(s) */ 510 pmap_set_pg(); 511#endif 512} 513 514/* 515 * Setup the PAT MSR. 516 */ 517void 518pmap_init_pat(void) 519{ 520 uint64_t pat_msr; 521 522 /* Bail if this CPU doesn't implement PAT. */ 523 if (!(cpu_feature & CPUID_PAT)) 524 return; 525 526 if (cpu_vendor_id != CPU_VENDOR_INTEL || 527 (CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) { 528 /* 529 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. 530 * Program 4 and 5 as WP and WC. 531 * Leave 6 and 7 as UC and UC-. 532 */ 533 pat_msr = rdmsr(MSR_PAT); 534 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); 535 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | 536 PAT_VALUE(5, PAT_WRITE_COMBINING); 537 pat_works = 1; 538 } else { 539 /* 540 * Due to some Intel errata, we can only safely use the lower 4 541 * PAT entries. Thus, just replace PAT Index 2 with WC instead 542 * of UC-. 543 * 544 * Intel Pentium III Processor Specification Update 545 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 546 * or Mode C Paging) 547 * 548 * Intel Pentium IV Processor Specification Update 549 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 550 */ 551 pat_msr = rdmsr(MSR_PAT); 552 pat_msr &= ~PAT_MASK(2); 553 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 554 pat_works = 0; 555 } 556 wrmsr(MSR_PAT, pat_msr); 557} 558 559#ifndef XEN 560/* 561 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 
562 */ 563static void 564pmap_set_pg(void) 565{ 566 pd_entry_t pdir; 567 pt_entry_t *pte; 568 vm_offset_t va, endva; 569 int i; 570 571 if (pgeflag == 0) 572 return; 573 574 i = KERNLOAD/NBPDR; 575 endva = KERNBASE + KERNend; 576 577 if (pseflag) { 578 va = KERNBASE + KERNLOAD; 579 while (va < endva) { 580 pdir = kernel_pmap->pm_pdir[KPTDI+i]; 581 pdir |= pgeflag; 582 kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; 583 invltlb(); /* Play it safe, invltlb() every time */ 584 i++; 585 va += NBPDR; 586 } 587 } else { 588 va = (vm_offset_t)btext; 589 while (va < endva) { 590 pte = vtopte(va); 591 if (*pte & PG_V) 592 *pte |= pgeflag; 593 invltlb(); /* Play it safe, invltlb() every time */ 594 va += PAGE_SIZE; 595 } 596 } 597} 598#endif 599 600/* 601 * Initialize a vm_page's machine-dependent fields. 602 */ 603void 604pmap_page_init(vm_page_t m) 605{ 606 607 TAILQ_INIT(&m->md.pv_list); 608 m->md.pat_mode = PAT_WRITE_BACK; 609} 610 611#if defined(PAE) && !defined(XEN) 612static void * 613pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 614{ 615 616 /* Inform UMA that this allocator uses kernel_map/object. */ 617 *flags = UMA_SLAB_KERNEL; 618 return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL, 619 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 620} 621#endif 622 623/* 624 * ABuse the pte nodes for unmapped kva to thread a kva freelist through. 625 * Requirements: 626 * - Must deal with pages in order to ensure that none of the PG_* bits 627 * are ever set, PG_V in particular. 628 * - Assumes we can write to ptes without pte_store() atomic ops, even 629 * on PAE systems. This should be ok. 630 * - Assumes nothing will ever test these addresses for 0 to indicate 631 * no mapping instead of correctly checking PG_V. 632 * - Assumes a vm_offset_t will fit in a pte (true for i386). 633 * Because PG_V is never set, there can be no mappings to invalidate. 634 */ 635static int ptelist_count = 0; 636static vm_offset_t 637pmap_ptelist_alloc(vm_offset_t *head) 638{ 639 vm_offset_t va; 640 vm_offset_t *phead = (vm_offset_t *)*head; 641 642 if (ptelist_count == 0) { 643 printf("out of memory!!!!!!\n"); 644 return (0); /* Out of memory */ 645 } 646 ptelist_count--; 647 va = phead[ptelist_count]; 648 return (va); 649} 650 651static void 652pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 653{ 654 vm_offset_t *phead = (vm_offset_t *)*head; 655 656 phead[ptelist_count++] = va; 657} 658 659static void 660pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 661{ 662 int i, nstackpages; 663 vm_offset_t va; 664 vm_page_t m; 665 666 nstackpages = (npages + PAGE_SIZE/sizeof(vm_offset_t) - 1)/ (PAGE_SIZE/sizeof(vm_offset_t)); 667 for (i = 0; i < nstackpages; i++) { 668 va = (vm_offset_t)base + i * PAGE_SIZE; 669 m = vm_page_alloc(NULL, i, 670 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 671 VM_ALLOC_ZERO); 672 pmap_qenter(va, &m, 1); 673 } 674 675 *head = (vm_offset_t)base; 676 for (i = npages - 1; i >= nstackpages; i--) { 677 va = (vm_offset_t)base + i * PAGE_SIZE; 678 pmap_ptelist_free(head, va); 679 } 680} 681 682 683/* 684 * Initialize the pmap module. 685 * Called by vm_init, to initialize any structures that the pmap 686 * system needs to map virtual memory. 687 */ 688void 689pmap_init(void) 690{ 691 vm_page_t mpte; 692 vm_size_t s; 693 int i, pv_npg; 694 695 /* 696 * Initialize the vm page array entries for the kernel pmap's 697 * page table pages. 
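	 * Under Xen the kernel page directory holds machine addresses, so each
	 * entry is converted with xpmap_mtop() below before the corresponding
	 * vm_page is looked up.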
	 */
	for (i = 0; i < nkpt; i++) {
		mpte = PHYS_TO_VM_PAGE(xpmap_mtop(PTD[i + KPTDI] & PG_FRAME));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = xpmap_mtop(PTD[i + KPTDI] & PG_FRAME);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
	    PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#if defined(PAE) && !defined(XEN)
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int pat_flag, pat_index, cache_bits;

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* If we don't support PAT, map extended modes to older ones. */
	if (!(cpu_feature & CPUID_PAT)) {
		switch (mode) {
		case PAT_UNCACHEABLE:
		case PAT_WRITE_THROUGH:
		case PAT_WRITE_BACK:
			break;
		case PAT_UNCACHED:
		case PAT_WRITE_COMBINING:
		case PAT_WRITE_PROTECTED:
			mode = PAT_UNCACHEABLE;
			break;
		}
	}

	/* Map the caching mode to a PAT index.
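	 * For example, with a working PAT, PAT_WRITE_COMBINING selects
	 * index 5 (binary 101), which sets the PAT bit and PG_NC_PWT below;
	 * without a working PAT it falls back to index 2, i.e. PG_NC_PCD only.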
	 */
	if (pat_works) {
		switch (mode) {
		case PAT_UNCACHEABLE:
			pat_index = 3;
			break;
		case PAT_WRITE_THROUGH:
			pat_index = 1;
			break;
		case PAT_WRITE_BACK:
			pat_index = 0;
			break;
		case PAT_UNCACHED:
			pat_index = 2;
			break;
		case PAT_WRITE_COMBINING:
			pat_index = 5;
			break;
		case PAT_WRITE_PROTECTED:
			pat_index = 4;
			break;
		default:
			panic("Unknown caching mode %d\n", mode);
		}
	} else {
		switch (mode) {
		case PAT_UNCACHED:
		case PAT_UNCACHEABLE:
		case PAT_WRITE_PROTECTED:
			pat_index = 3;
			break;
		case PAT_WRITE_THROUGH:
			pat_index = 1;
			break;
		case PAT_WRITE_BACK:
			pat_index = 0;
			break;
		case PAT_WRITE_COMBINING:
			pat_index = 2;
			break;
		default:
			panic("Unknown caching mode %d\n", mode);
		}
	}

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_index & 0x4)
		cache_bits |= pat_flag;
	if (pat_index & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_index & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
859 */ 860void 861pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 862{ 863 u_int cpumask; 864 u_int other_cpus; 865 866 CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", 867 pmap, va); 868 869 sched_pin(); 870 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 871 invlpg(va); 872 smp_invlpg(va); 873 } else { 874 cpumask = PCPU_GET(cpumask); 875 other_cpus = PCPU_GET(other_cpus); 876 if (pmap->pm_active & cpumask) 877 invlpg(va); 878 if (pmap->pm_active & other_cpus) 879 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 880 } 881 sched_unpin(); 882 PT_UPDATES_FLUSH(); 883} 884 885void 886pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 887{ 888 u_int cpumask; 889 u_int other_cpus; 890 vm_offset_t addr; 891 892 CTR3(KTR_PMAP, "pmap_invalidate_page: pmap=%p eva=0x%x sva=0x%x", 893 pmap, sva, eva); 894 895 sched_pin(); 896 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 897 for (addr = sva; addr < eva; addr += PAGE_SIZE) 898 invlpg(addr); 899 smp_invlpg_range(sva, eva); 900 } else { 901 cpumask = PCPU_GET(cpumask); 902 other_cpus = PCPU_GET(other_cpus); 903 if (pmap->pm_active & cpumask) 904 for (addr = sva; addr < eva; addr += PAGE_SIZE) 905 invlpg(addr); 906 if (pmap->pm_active & other_cpus) 907 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 908 sva, eva); 909 } 910 sched_unpin(); 911 PT_UPDATES_FLUSH(); 912} 913 914void 915pmap_invalidate_all(pmap_t pmap) 916{ 917 u_int cpumask; 918 u_int other_cpus; 919 920 CTR1(KTR_PMAP, "pmap_invalidate_page: pmap=%p", pmap); 921 922 sched_pin(); 923 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 924 invltlb(); 925 smp_invltlb(); 926 } else { 927 cpumask = PCPU_GET(cpumask); 928 other_cpus = PCPU_GET(other_cpus); 929 if (pmap->pm_active & cpumask) 930 invltlb(); 931 if (pmap->pm_active & other_cpus) 932 smp_masked_invltlb(pmap->pm_active & other_cpus); 933 } 934 sched_unpin(); 935} 936 937void 938pmap_invalidate_cache(void) 939{ 940 941 sched_pin(); 942 wbinvd(); 943 smp_cache_flush(); 944 sched_unpin(); 945} 946#else /* !SMP */ 947/* 948 * Normal, non-SMP, 486+ invalidation functions. 949 * We inline these within pmap.c for speed. 950 */ 951PMAP_INLINE void 952pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 953{ 954 CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", 955 pmap, va); 956 957 if (pmap == kernel_pmap || pmap->pm_active) 958 invlpg(va); 959 PT_UPDATES_FLUSH(); 960} 961 962PMAP_INLINE void 963pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 964{ 965 vm_offset_t addr; 966 967 if (eva - sva > PAGE_SIZE) 968 CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", 969 pmap, sva, eva); 970 971 if (pmap == kernel_pmap || pmap->pm_active) 972 for (addr = sva; addr < eva; addr += PAGE_SIZE) 973 invlpg(addr); 974 PT_UPDATES_FLUSH(); 975} 976 977PMAP_INLINE void 978pmap_invalidate_all(pmap_t pmap) 979{ 980 981 CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); 982 983 if (pmap == kernel_pmap || pmap->pm_active) 984 invltlb(); 985} 986 987PMAP_INLINE void 988pmap_invalidate_cache(void) 989{ 990 991 wbinvd(); 992} 993#endif /* !SMP */ 994 995void 996pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 997{ 998 999 KASSERT((sva & PAGE_MASK) == 0, 1000 ("pmap_invalidate_cache_range: sva not page-aligned")); 1001 KASSERT((eva & PAGE_MASK) == 0, 1002 ("pmap_invalidate_cache_range: eva not page-aligned")); 1003 1004 if (cpu_feature & CPUID_SS) 1005 ; /* If "Self Snoop" is supported, do nothing. 
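		 * A self-snooping CPU keeps its caches coherent across
		 * these attribute changes on its own, so no explicit
		 * flush is required.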
*/ 1006 else if (cpu_feature & CPUID_CLFSH) { 1007 1008 /* 1009 * Otherwise, do per-cache line flush. Use the mfence 1010 * instruction to insure that previous stores are 1011 * included in the write-back. The processor 1012 * propagates flush to other processors in the cache 1013 * coherence domain. 1014 */ 1015 mfence(); 1016 for (; sva < eva; sva += cpu_clflush_line_size) 1017 clflush(sva); 1018 mfence(); 1019 } else { 1020 1021 /* 1022 * No targeted cache flush methods are supported by CPU, 1023 * globally invalidate cache as a last resort. 1024 */ 1025 pmap_invalidate_cache(); 1026 } 1027} 1028 1029/* 1030 * Are we current address space or kernel? N.B. We return FALSE when 1031 * a pmap's page table is in use because a kernel thread is borrowing 1032 * it. The borrowed page table can change spontaneously, making any 1033 * dependence on its continued use subject to a race condition. 1034 */ 1035static __inline int 1036pmap_is_current(pmap_t pmap) 1037{ 1038 1039 return (pmap == kernel_pmap || 1040 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && 1041 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); 1042} 1043 1044/* 1045 * If the given pmap is not the current or kernel pmap, the returned pte must 1046 * be released by passing it to pmap_pte_release(). 1047 */ 1048pt_entry_t * 1049pmap_pte(pmap_t pmap, vm_offset_t va) 1050{ 1051 pd_entry_t newpf; 1052 pd_entry_t *pde; 1053 1054 pde = pmap_pde(pmap, va); 1055 if (*pde & PG_PS) 1056 return (pde); 1057 if (*pde != 0) { 1058 /* are we current address space or kernel? */ 1059 if (pmap_is_current(pmap)) 1060 return (vtopte(va)); 1061 mtx_lock(&PMAP2mutex); 1062 newpf = *pde & PG_FRAME; 1063 if ((*PMAP2 & PG_FRAME) != newpf) { 1064 PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); 1065 CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", 1066 pmap, va, (*PMAP2 & 0xffffffff)); 1067 } 1068 1069 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1070 } 1071 return (0); 1072} 1073 1074/* 1075 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1076 * being NULL. 1077 */ 1078static __inline void 1079pmap_pte_release(pt_entry_t *pte) 1080{ 1081 1082 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { 1083 CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", 1084 *PMAP2); 1085 PT_SET_VA(PMAP2, 0, TRUE); 1086 mtx_unlock(&PMAP2mutex); 1087 } 1088} 1089 1090static __inline void 1091invlcaddr(void *caddr) 1092{ 1093 1094 invlpg((u_int)caddr); 1095 PT_UPDATES_FLUSH(); 1096} 1097 1098/* 1099 * Super fast pmap_pte routine best used when scanning 1100 * the pv lists. This eliminates many coarse-grained 1101 * invltlb calls. Note that many of the pv list 1102 * scans are across different pmaps. It is very wasteful 1103 * to do an entire invltlb for checking a single mapping. 1104 * 1105 * If the given pmap is not the current pmap, vm_page_queue_mtx 1106 * must be held and curthread pinned to a CPU. 1107 */ 1108static pt_entry_t * 1109pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1110{ 1111 pd_entry_t newpf; 1112 pd_entry_t *pde; 1113 1114 pde = pmap_pde(pmap, va); 1115 if (*pde & PG_PS) 1116 return (pde); 1117 if (*pde != 0) { 1118 /* are we current address space or kernel? 
		 */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M);
			CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x",
			    pmap, va, (u_long)*PMAP1);

#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;
	pt_entry_t pteval;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0) {
			rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK);
			PMAP_UNLOCK(pmap);
			return rtval;
		}
		pte = pmap_pte(pmap, va);
		pteval = *pte ? xpmap_mtop(*pte) : 0;
		rtval = (pteval & PG_FRAME) | (va & PAGE_MASK);
		pmap_pte_release(pte);
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 * Routine:	pmap_extract_ma
 * Function:
 *	Like pmap_extract, but returns machine address
 */
vm_paddr_t
pmap_extract_ma(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0) {
			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
			PMAP_UNLOCK(pmap);
			return rtval;
		}
		pte = pmap_pte(pmap, va);
		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
		pmap_pte_release(pte);
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte;
	vm_page_t m;

	m = NULL;
	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	pde = PT_GET(pmap_pde(pmap, va));
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			sched_pin();
			pte = PT_GET(pmap_pte_quick(pmap, va));
			if (*PMAP1)
				PT_SET_MA(PADDR1, 0);
			if ((pte & PG_V) &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
			sched_unpin();
		}
	}
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
	return (m);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
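 * Under Xen the PTE is not written directly: the update goes through
 * PT_SET_MA() using the machine address from xpmap_ptom(pa) rather than
 * a plain store of the physical address.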
1258 */ 1259void 1260pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1261{ 1262 PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag); 1263} 1264 1265void 1266pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma) 1267{ 1268 pt_entry_t *pte; 1269 1270 pte = vtopte(va); 1271 pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag); 1272} 1273 1274 1275static __inline void 1276pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1277{ 1278 PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1279} 1280 1281/* 1282 * Remove a page from the kernel pagetables. 1283 * Note: not SMP coherent. 1284 */ 1285PMAP_INLINE void 1286pmap_kremove(vm_offset_t va) 1287{ 1288 pt_entry_t *pte; 1289 1290 pte = vtopte(va); 1291 PT_CLEAR_VA(pte, FALSE); 1292} 1293 1294/* 1295 * Used to map a range of physical addresses into kernel 1296 * virtual address space. 1297 * 1298 * The value passed in '*virt' is a suggested virtual address for 1299 * the mapping. Architectures which can support a direct-mapped 1300 * physical to virtual region can return the appropriate address 1301 * within that region, leaving '*virt' unchanged. Other 1302 * architectures should map the pages starting at '*virt' and 1303 * update '*virt' with the first usable address after the mapped 1304 * region. 1305 */ 1306vm_offset_t 1307pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1308{ 1309 vm_offset_t va, sva; 1310 1311 va = sva = *virt; 1312 CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x", 1313 va, start, end, prot); 1314 while (start < end) { 1315 pmap_kenter(va, start); 1316 va += PAGE_SIZE; 1317 start += PAGE_SIZE; 1318 } 1319 pmap_invalidate_range(kernel_pmap, sva, va); 1320 *virt = va; 1321 return (sva); 1322} 1323 1324 1325/* 1326 * Add a list of wired pages to the kva 1327 * this routine is only used for temporary 1328 * kernel mappings that do not need to have 1329 * page modification or references recorded. 1330 * Note that old mappings are simply written 1331 * over. The page *must* be wired. 1332 * Note: SMP coherent. Uses a ranged shootdown IPI. 1333 */ 1334void 1335pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1336{ 1337 pt_entry_t *endpte, *pte; 1338 vm_paddr_t pa; 1339 vm_offset_t va = sva; 1340 int mclcount = 0; 1341 multicall_entry_t mcl[16]; 1342 multicall_entry_t *mclp = mcl; 1343 int error; 1344 1345 CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count); 1346 pte = vtopte(sva); 1347 endpte = pte + count; 1348 while (pte < endpte) { 1349 pa = xpmap_ptom(VM_PAGE_TO_PHYS(*ma)) | pgeflag | PG_RW | PG_V | PG_M | PG_A; 1350 1351 mclp->op = __HYPERVISOR_update_va_mapping; 1352 mclp->args[0] = va; 1353 mclp->args[1] = (uint32_t)(pa & 0xffffffff); 1354 mclp->args[2] = (uint32_t)(pa >> 32); 1355 mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0; 1356 1357 va += PAGE_SIZE; 1358 pte++; 1359 ma++; 1360 mclp++; 1361 mclcount++; 1362 if (mclcount == 16) { 1363 error = HYPERVISOR_multicall(mcl, mclcount); 1364 mclp = mcl; 1365 mclcount = 0; 1366 KASSERT(error == 0, ("bad multicall %d", error)); 1367 } 1368 } 1369 if (mclcount) { 1370 error = HYPERVISOR_multicall(mcl, mclcount); 1371 KASSERT(error == 0, ("bad multicall %d", error)); 1372 } 1373 1374#ifdef INVARIANTS 1375 for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++) 1376 KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE)); 1377#endif 1378} 1379 1380 1381/* 1382 * This routine tears out page mappings from the 1383 * kernel -- it is meant only for temporary mappings. 
1384 * Note: SMP coherent. Uses a ranged shootdown IPI. 1385 */ 1386void 1387pmap_qremove(vm_offset_t sva, int count) 1388{ 1389 vm_offset_t va; 1390 1391 CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count); 1392 va = sva; 1393 vm_page_lock_queues(); 1394 critical_enter(); 1395 while (count-- > 0) { 1396 pmap_kremove(va); 1397 va += PAGE_SIZE; 1398 } 1399 pmap_invalidate_range(kernel_pmap, sva, va); 1400 critical_exit(); 1401 vm_page_unlock_queues(); 1402} 1403 1404/*************************************************** 1405 * Page table page management routines..... 1406 ***************************************************/ 1407static __inline void 1408pmap_free_zero_pages(vm_page_t free) 1409{ 1410 vm_page_t m; 1411 1412 while (free != NULL) { 1413 m = free; 1414 free = m->right; 1415 vm_page_free_zero(m); 1416 } 1417} 1418 1419/* 1420 * This routine unholds page table pages, and if the hold count 1421 * drops to zero, then it decrements the wire count. 1422 */ 1423static __inline int 1424pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) 1425{ 1426 1427 --m->wire_count; 1428 if (m->wire_count == 0) 1429 return _pmap_unwire_pte_hold(pmap, m, free); 1430 else 1431 return 0; 1432} 1433 1434static int 1435_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) 1436{ 1437 vm_offset_t pteva; 1438 1439 PT_UPDATES_FLUSH(); 1440 /* 1441 * unmap the page table page 1442 */ 1443 xen_pt_unpin(pmap->pm_pdir[m->pindex]); 1444 /* 1445 * page *might* contain residual mapping :-/ 1446 */ 1447 PD_CLEAR_VA(pmap, m->pindex, TRUE); 1448 pmap_zero_page(m); 1449 --pmap->pm_stats.resident_count; 1450 1451 /* 1452 * This is a release store so that the ordinary store unmapping 1453 * the page table page is globally performed before TLB shoot- 1454 * down is begun. 1455 */ 1456 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1457 1458 /* 1459 * Do an invltlb to make the invalidated mapping 1460 * take effect immediately. 1461 */ 1462 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1463 pmap_invalidate_page(pmap, pteva); 1464 1465 /* 1466 * Put page on a list so that it is released after 1467 * *ALL* TLB shootdown is done 1468 */ 1469 m->right = *free; 1470 *free = m; 1471 1472 return 1; 1473} 1474 1475/* 1476 * After removing a page table entry, this routine is used to 1477 * conditionally free the page, and manage the hold/wire counts. 1478 */ 1479static int 1480pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) 1481{ 1482 pd_entry_t ptepde; 1483 vm_page_t mpte; 1484 1485 if (va >= VM_MAXUSER_ADDRESS) 1486 return 0; 1487 ptepde = PT_GET(pmap_pde(pmap, va)); 1488 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1489 return pmap_unwire_pte_hold(pmap, mpte, free); 1490} 1491 1492void 1493pmap_pinit0(pmap_t pmap) 1494{ 1495 1496 PMAP_LOCK_INIT(pmap); 1497 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1498#ifdef PAE 1499 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1500#endif 1501 pmap->pm_active = 0; 1502 PCPU_SET(curpmap, pmap); 1503 TAILQ_INIT(&pmap->pm_pvchunk); 1504 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1505 mtx_lock_spin(&allpmaps_lock); 1506 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1507 mtx_unlock_spin(&allpmaps_lock); 1508} 1509 1510/* 1511 * Initialize a preallocated and zeroed pmap structure, 1512 * such as one in a vmspace structure. 
1513 */ 1514int 1515pmap_pinit(pmap_t pmap) 1516{ 1517 vm_page_t m, ptdpg[NPGPTD + 1]; 1518 int npgptd = NPGPTD + 1; 1519 static int color; 1520 int i; 1521 1522 PMAP_LOCK_INIT(pmap); 1523 1524 /* 1525 * No need to allocate page table space yet but we do need a valid 1526 * page directory table. 1527 */ 1528 if (pmap->pm_pdir == NULL) { 1529 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1530 NBPTD); 1531 if (pmap->pm_pdir == NULL) { 1532 PMAP_LOCK_DESTROY(pmap); 1533 return (0); 1534 } 1535#if defined(XEN) && defined(PAE) 1536 pmap->pm_pdpt = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1); 1537#endif 1538 1539#if defined(PAE) && !defined(XEN) 1540 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1541 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1542 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1543 ("pmap_pinit: pdpt misaligned")); 1544 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1545 ("pmap_pinit: pdpt above 4g")); 1546#endif 1547 } 1548 1549 /* 1550 * allocate the page directory page(s) 1551 */ 1552 for (i = 0; i < npgptd;) { 1553 m = vm_page_alloc(NULL, color++, 1554 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1555 VM_ALLOC_ZERO); 1556 if (m == NULL) 1557 VM_WAIT; 1558 else { 1559 ptdpg[i++] = m; 1560 } 1561 } 1562 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1563 for (i = 0; i < NPGPTD; i++) { 1564 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1565 pagezero(&pmap->pm_pdir[i*NPTEPG]); 1566 } 1567 1568 mtx_lock_spin(&allpmaps_lock); 1569 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1570 mtx_unlock_spin(&allpmaps_lock); 1571 /* Wire in kernel global address entries. */ 1572 1573 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1574#ifdef PAE 1575#ifdef XEN 1576 pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1); 1577 if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0) 1578 bzero(pmap->pm_pdpt, PAGE_SIZE); 1579#endif 1580 for (i = 0; i < NPGPTD; i++) { 1581 vm_paddr_t ma; 1582 1583 ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); 1584 pmap->pm_pdpt[i] = ma | PG_V; 1585 1586 } 1587#endif 1588#ifdef XEN 1589 for (i = 0; i < NPGPTD; i++) { 1590 pt_entry_t *pd; 1591 vm_paddr_t ma; 1592 1593 ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); 1594 pd = pmap->pm_pdir + (i * NPDEPG); 1595 PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW)); 1596#if 0 1597 xen_pgd_pin(ma); 1598#endif 1599 } 1600 1601#ifdef PAE 1602 PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW); 1603#endif 1604 vm_page_lock_queues(); 1605 xen_flush_queue(); 1606 xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[NPGPTD]))); 1607 for (i = 0; i < NPGPTD; i++) { 1608 vm_paddr_t ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); 1609 PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE); 1610 } 1611 xen_flush_queue(); 1612 vm_page_unlock_queues(); 1613#endif 1614 pmap->pm_active = 0; 1615 TAILQ_INIT(&pmap->pm_pvchunk); 1616 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1617 1618 return (1); 1619} 1620 1621/* 1622 * this routine is called if the page table page is not 1623 * mapped correctly. 1624 */ 1625static vm_page_t 1626_pmap_allocpte(pmap_t pmap, unsigned int ptepindex, int flags) 1627{ 1628 vm_paddr_t ptema; 1629 vm_page_t m; 1630 1631 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1632 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1633 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1634 1635 /* 1636 * Allocate a page table page. 
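	 * The page is allocated with VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
	 * VM_ALLOC_ZERO; if it was not pre-zeroed (PG_ZERO clear), it is
	 * zeroed explicitly before being wired into the page directory.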
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (flags & M_WAITOK) {
			PMAP_UNLOCK(pmap);
			vm_page_unlock_queues();
			VM_WAIT;
			vm_page_lock_queues();
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */
	pmap->pm_stats.resident_count++;

	ptema = xpmap_ptom(VM_PAGE_TO_PHYS(m));
	xen_pt_pin(ptema);
	PT_SET_VA_MA(&pmap->pm_pdir[ptepindex],
	    (ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE);

	KASSERT(pmap->pm_pdir[ptepindex],
	    ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex));
	return (m);
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
{
	unsigned ptepindex;
	pd_entry_t ptema;
	vm_page_t m;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;
retry:
	/*
	 * Get the page directory entry
	 */
	ptema = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptema & PG_PS) {
		/*
		 * XXX
		 */
		pmap->pm_pdir[ptepindex] = 0;
		ptema = 0;
		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
		pmap_invalidate_all(kernel_pmap);
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptema & PG_V) {
		m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x",
		    pmap, va, flags);
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & M_WAITOK))
			goto retry;

		KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex));
	}
	return (m);
}


/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

#ifdef SMP
/*
 * Deal with a SMP shootdown of other users of the pmap that we are
 * trying to dispose of.  This can be a bit hairy.
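 * pmap_lazyfix() below walks pm_active and, for any CPU whose %cr3 still
 * points at this pmap's page directory, forces a reload of that CPU's
 * current pcb_cr3 (via IPI_LAZYPMAP when it is a remote CPU) before the
 * page tables are released.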
1740 */ 1741static cpumask_t *lazymask; 1742static u_int lazyptd; 1743static volatile u_int lazywait; 1744 1745void pmap_lazyfix_action(void); 1746 1747void 1748pmap_lazyfix_action(void) 1749{ 1750 cpumask_t mymask = PCPU_GET(cpumask); 1751 1752#ifdef COUNT_IPIS 1753 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1754#endif 1755 if (rcr3() == lazyptd) 1756 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1757 atomic_clear_int(lazymask, mymask); 1758 atomic_store_rel_int(&lazywait, 1); 1759} 1760 1761static void 1762pmap_lazyfix_self(cpumask_t mymask) 1763{ 1764 1765 if (rcr3() == lazyptd) 1766 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1767 atomic_clear_int(lazymask, mymask); 1768} 1769 1770 1771static void 1772pmap_lazyfix(pmap_t pmap) 1773{ 1774 cpumask_t mymask, mask; 1775 u_int spins; 1776 1777 while ((mask = pmap->pm_active) != 0) { 1778 spins = 50000000; 1779 mask = mask & -mask; /* Find least significant set bit */ 1780 mtx_lock_spin(&smp_ipi_mtx); 1781#ifdef PAE 1782 lazyptd = vtophys(pmap->pm_pdpt); 1783#else 1784 lazyptd = vtophys(pmap->pm_pdir); 1785#endif 1786 mymask = PCPU_GET(cpumask); 1787 if (mask == mymask) { 1788 lazymask = &pmap->pm_active; 1789 pmap_lazyfix_self(mymask); 1790 } else { 1791 atomic_store_rel_int((u_int *)&lazymask, 1792 (u_int)&pmap->pm_active); 1793 atomic_store_rel_int(&lazywait, 0); 1794 ipi_selected(mask, IPI_LAZYPMAP); 1795 while (lazywait == 0) { 1796 ia32_pause(); 1797 if (--spins == 0) 1798 break; 1799 } 1800 } 1801 mtx_unlock_spin(&smp_ipi_mtx); 1802 if (spins == 0) 1803 printf("pmap_lazyfix: spun for 50000000\n"); 1804 } 1805} 1806 1807#else /* SMP */ 1808 1809/* 1810 * Cleaning up on uniprocessor is easy. For various reasons, we're 1811 * unlikely to have to even execute this code, including the fact 1812 * that the cleanup is deferred until the parent does a wait(2), which 1813 * means that another userland process has run. 1814 */ 1815static void 1816pmap_lazyfix(pmap_t pmap) 1817{ 1818 u_int cr3; 1819 1820 cr3 = vtophys(pmap->pm_pdir); 1821 if (cr3 == rcr3()) { 1822 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1823 pmap->pm_active &= ~(PCPU_GET(cpumask)); 1824 } 1825} 1826#endif /* SMP */ 1827 1828/* 1829 * Release any resources held by the given physical map. 1830 * Called when a pmap initialized by pmap_pinit is being released. 1831 * Should only be called if the map contains no valid mappings. 
1832 */ 1833void 1834pmap_release(pmap_t pmap) 1835{ 1836 vm_page_t m, ptdpg[2*NPGPTD+1]; 1837 vm_paddr_t ma; 1838 int i; 1839#ifdef XEN 1840#ifdef PAE 1841 int npgptd = NPGPTD + 1; 1842#else 1843 int npgptd = NPGPTD; 1844#endif 1845#else 1846 int npgptd = NPGPTD; 1847#endif 1848 KASSERT(pmap->pm_stats.resident_count == 0, 1849 ("pmap_release: pmap resident count %ld != 0", 1850 pmap->pm_stats.resident_count)); 1851 PT_UPDATES_FLUSH(); 1852 1853 pmap_lazyfix(pmap); 1854 mtx_lock_spin(&allpmaps_lock); 1855 LIST_REMOVE(pmap, pm_list); 1856 mtx_unlock_spin(&allpmaps_lock); 1857 1858 for (i = 0; i < NPGPTD; i++) 1859 ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME); 1860 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 1861#if defined(PAE) && defined(XEN) 1862 ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt)); 1863#endif 1864 1865 for (i = 0; i < npgptd; i++) { 1866 m = ptdpg[i]; 1867 ma = xpmap_ptom(VM_PAGE_TO_PHYS(m)); 1868 /* unpinning L1 and L2 treated the same */ 1869 xen_pgd_unpin(ma); 1870#ifdef PAE 1871 KASSERT(xpmap_ptom(VM_PAGE_TO_PHYS(m)) == (pmap->pm_pdpt[i] & PG_FRAME), 1872 ("pmap_release: got wrong ptd page")); 1873#endif 1874 m->wire_count--; 1875 atomic_subtract_int(&cnt.v_wire_count, 1); 1876 vm_page_free(m); 1877 } 1878 PMAP_LOCK_DESTROY(pmap); 1879} 1880 1881static int 1882kvm_size(SYSCTL_HANDLER_ARGS) 1883{ 1884 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1885 1886 return sysctl_handle_long(oidp, &ksize, 0, req); 1887} 1888SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1889 0, 0, kvm_size, "IU", "Size of KVM"); 1890 1891static int 1892kvm_free(SYSCTL_HANDLER_ARGS) 1893{ 1894 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1895 1896 return sysctl_handle_long(oidp, &kfree, 0, req); 1897} 1898SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1899 0, 0, kvm_free, "IU", "Amount of KVM free"); 1900 1901/* 1902 * grow the number of kernel page table entries, if needed 1903 */ 1904void 1905pmap_growkernel(vm_offset_t addr) 1906{ 1907 struct pmap *pmap; 1908 vm_paddr_t ptppaddr; 1909 vm_page_t nkpg; 1910 pd_entry_t newpdir; 1911 1912 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1913 if (kernel_vm_end == 0) { 1914 kernel_vm_end = KERNBASE; 1915 nkpt = 0; 1916 while (pdir_pde(PTD, kernel_vm_end)) { 1917 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1918 nkpt++; 1919 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1920 kernel_vm_end = kernel_map->max_offset; 1921 break; 1922 } 1923 } 1924 } 1925 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1926 if (addr - 1 >= kernel_map->max_offset) 1927 addr = kernel_map->max_offset; 1928 while (kernel_vm_end < addr) { 1929 if (pdir_pde(PTD, kernel_vm_end)) { 1930 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1931 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1932 kernel_vm_end = kernel_map->max_offset; 1933 break; 1934 } 1935 continue; 1936 } 1937 1938 /* 1939 * This index is bogus, but out of the way 1940 */ 1941 nkpg = vm_page_alloc(NULL, nkpt, 1942 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1943 if (!nkpg) 1944 panic("pmap_growkernel: no memory to grow kernel"); 1945 1946 nkpt++; 1947 1948 pmap_zero_page(nkpg); 1949 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1950 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1951 vm_page_lock_queues(); 1952 PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); 1953 mtx_lock_spin(&allpmaps_lock); 1954 LIST_FOREACH(pmap, 
&allpmaps, pm_list) 1955 PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); 1956 1957 mtx_unlock_spin(&allpmaps_lock); 1958 vm_page_unlock_queues(); 1959 1960 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1961 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1962 kernel_vm_end = kernel_map->max_offset; 1963 break; 1964 } 1965 } 1966} 1967 1968 1969/*************************************************** 1970 * page management routines. 1971 ***************************************************/ 1972 1973CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1974CTASSERT(_NPCM == 11); 1975 1976static __inline struct pv_chunk * 1977pv_to_chunk(pv_entry_t pv) 1978{ 1979 1980 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); 1981} 1982 1983#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1984 1985#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 1986#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 1987 1988static uint32_t pc_freemask[11] = { 1989 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1990 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1991 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1992 PC_FREE0_9, PC_FREE10 1993}; 1994 1995SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1996 "Current number of pv entries"); 1997 1998#ifdef PV_STATS 1999static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2000 2001SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2002 "Current number of pv entry chunks"); 2003SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2004 "Current number of pv entry chunks allocated"); 2005SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2006 "Current number of pv entry chunks frees"); 2007SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2008 "Number of times tried to get a chunk page but failed."); 2009 2010static long pv_entry_frees, pv_entry_allocs; 2011static int pv_entry_spare; 2012 2013SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2014 "Current number of pv entry frees"); 2015SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2016 "Current number of pv entry allocs"); 2017SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2018 "Current number of spare pv entries"); 2019 2020static int pmap_collect_inactive, pmap_collect_active; 2021 2022SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 2023 "Current number times pmap_collect called on inactive queue"); 2024SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 2025 "Current number times pmap_collect called on active queue"); 2026#endif 2027 2028/* 2029 * We are in a serious low memory condition. Resort to 2030 * drastic measures to free some pages so we can allocate 2031 * another pv entry chunk. This is normally called to 2032 * unmap inactive pages, and if necessary, active pages. 
2033 */ 2034static void 2035pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 2036{ 2037 pmap_t pmap; 2038 pt_entry_t *pte, tpte; 2039 pv_entry_t next_pv, pv; 2040 vm_offset_t va; 2041 vm_page_t m, free; 2042 2043 sched_pin(); 2044 TAILQ_FOREACH(m, &vpq->pl, pageq) { 2045 if (m->hold_count || m->busy) 2046 continue; 2047 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 2048 va = pv->pv_va; 2049 pmap = PV_PMAP(pv); 2050 /* Avoid deadlock and lock recursion. */ 2051 if (pmap > locked_pmap) 2052 PMAP_LOCK(pmap); 2053 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 2054 continue; 2055 pmap->pm_stats.resident_count--; 2056 pte = pmap_pte_quick(pmap, va); 2057 tpte = pte_load_clear(pte); 2058 KASSERT((tpte & PG_W) == 0, 2059 ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); 2060 if (tpte & PG_A) 2061 vm_page_flag_set(m, PG_REFERENCED); 2062 if (tpte & PG_M) { 2063 KASSERT((tpte & PG_RW), 2064 ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", 2065 va, (uintmax_t)tpte)); 2066 vm_page_dirty(m); 2067 } 2068 free = NULL; 2069 pmap_unuse_pt(pmap, va, &free); 2070 pmap_invalidate_page(pmap, va); 2071 pmap_free_zero_pages(free); 2072 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2073 if (TAILQ_EMPTY(&m->md.pv_list)) 2074 vm_page_flag_clear(m, PG_WRITEABLE); 2075 free_pv_entry(pmap, pv); 2076 if (pmap != locked_pmap) 2077 PMAP_UNLOCK(pmap); 2078 } 2079 } 2080 sched_unpin(); 2081} 2082 2083 2084/* 2085 * free the pv_entry back to the free list 2086 */ 2087static void 2088free_pv_entry(pmap_t pmap, pv_entry_t pv) 2089{ 2090 vm_page_t m; 2091 struct pv_chunk *pc; 2092 int idx, field, bit; 2093 2094 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2095 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2096 PV_STAT(pv_entry_frees++); 2097 PV_STAT(pv_entry_spare++); 2098 pv_entry_count--; 2099 pc = pv_to_chunk(pv); 2100 idx = pv - &pc->pc_pventry[0]; 2101 field = idx / 32; 2102 bit = idx % 32; 2103 pc->pc_map[field] |= 1ul << bit; 2104 /* move to head of list */ 2105 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2106 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2107 for (idx = 0; idx < _NPCM; idx++) 2108 if (pc->pc_map[idx] != pc_freemask[idx]) 2109 return; 2110 PV_STAT(pv_entry_spare -= _NPCPV); 2111 PV_STAT(pc_chunk_count--); 2112 PV_STAT(pc_chunk_frees++); 2113 /* entire chunk is free, return it */ 2114 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2115 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2116 pmap_qremove((vm_offset_t)pc, 1); 2117 vm_page_unwire(m, 0); 2118 vm_page_free(m); 2119 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2120} 2121 2122/* 2123 * get a new pv_entry, allocating a block from the system 2124 * when needed. 
2125 */ 2126static pv_entry_t 2127get_pv_entry(pmap_t pmap, int try) 2128{ 2129 static const struct timeval printinterval = { 60, 0 }; 2130 static struct timeval lastprint; 2131 static vm_pindex_t colour; 2132 struct vpgqueues *pq; 2133 int bit, field; 2134 pv_entry_t pv; 2135 struct pv_chunk *pc; 2136 vm_page_t m; 2137 2138 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2139 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2140 PV_STAT(pv_entry_allocs++); 2141 pv_entry_count++; 2142 if (pv_entry_count > pv_entry_high_water) 2143 if (ratecheck(&lastprint, &printinterval)) 2144 printf("Approaching the limit on PV entries, consider " 2145 "increasing either the vm.pmap.shpgperproc or the " 2146 "vm.pmap.pv_entry_max tunable.\n"); 2147 pq = NULL; 2148retry: 2149 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2150 if (pc != NULL) { 2151 for (field = 0; field < _NPCM; field++) { 2152 if (pc->pc_map[field]) { 2153 bit = bsfl(pc->pc_map[field]); 2154 break; 2155 } 2156 } 2157 if (field < _NPCM) { 2158 pv = &pc->pc_pventry[field * 32 + bit]; 2159 pc->pc_map[field] &= ~(1ul << bit); 2160 /* If this was the last item, move it to tail */ 2161 for (field = 0; field < _NPCM; field++) 2162 if (pc->pc_map[field] != 0) { 2163 PV_STAT(pv_entry_spare--); 2164 return (pv); /* not full, return */ 2165 } 2166 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2167 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2168 PV_STAT(pv_entry_spare--); 2169 return (pv); 2170 } 2171 } 2172 /* 2173 * Access to the ptelist "pv_vafree" is synchronized by the page 2174 * queues lock. If "pv_vafree" is currently non-empty, it will 2175 * remain non-empty until pmap_ptelist_alloc() completes. 2176 */ 2177 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq == 2178 &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | 2179 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2180 if (try) { 2181 pv_entry_count--; 2182 PV_STAT(pc_chunk_tryfail++); 2183 return (NULL); 2184 } 2185 /* 2186 * Reclaim pv entries: At first, destroy mappings to 2187 * inactive pages. After that, if a pv chunk entry 2188 * is still needed, destroy mappings to active pages. 
2189 */ 2190 if (pq == NULL) { 2191 PV_STAT(pmap_collect_inactive++); 2192 pq = &vm_page_queues[PQ_INACTIVE]; 2193 } else if (pq == &vm_page_queues[PQ_INACTIVE]) { 2194 PV_STAT(pmap_collect_active++); 2195 pq = &vm_page_queues[PQ_ACTIVE]; 2196 } else 2197 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 2198 pmap_collect(pmap, pq); 2199 goto retry; 2200 } 2201 PV_STAT(pc_chunk_count++); 2202 PV_STAT(pc_chunk_allocs++); 2203 colour++; 2204 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2205 pmap_qenter((vm_offset_t)pc, &m, 1); 2206 if ((m->flags & PG_ZERO) == 0) 2207 pagezero(pc); 2208 pc->pc_pmap = pmap; 2209 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2210 for (field = 1; field < _NPCM; field++) 2211 pc->pc_map[field] = pc_freemask[field]; 2212 pv = &pc->pc_pventry[0]; 2213 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2214 PV_STAT(pv_entry_spare += _NPCPV - 1); 2215 return (pv); 2216} 2217 2218static void 2219pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2220{ 2221 pv_entry_t pv; 2222 2223 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2224 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2225 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2226 if (pmap == PV_PMAP(pv) && va == pv->pv_va) 2227 break; 2228 } 2229 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); 2230 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2231 if (TAILQ_EMPTY(&m->md.pv_list)) 2232 vm_page_flag_clear(m, PG_WRITEABLE); 2233 free_pv_entry(pmap, pv); 2234} 2235 2236/* 2237 * Create a pv entry for page at pa for 2238 * (pmap, va). 2239 */ 2240static void 2241pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2242{ 2243 pv_entry_t pv; 2244 2245 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2246 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2247 pv = get_pv_entry(pmap, FALSE); 2248 pv->pv_va = va; 2249 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2250} 2251 2252/* 2253 * Conditionally create a pv entry. 2254 */ 2255static boolean_t 2256pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2257{ 2258 pv_entry_t pv; 2259 2260 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2261 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2262 if (pv_entry_count < pv_entry_high_water && 2263 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2264 pv->pv_va = va; 2265 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2266 return (TRUE); 2267 } else 2268 return (FALSE); 2269} 2270 2271/* 2272 * pmap_remove_pte: unmap a single page in a process address space 2273 */ 2274static int 2275pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) 2276{ 2277 pt_entry_t oldpte; 2278 vm_page_t m; 2279 2280 CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x", 2281 pmap, (u_long)*ptq, va); 2282 2283 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2284 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2285 oldpte = *ptq; 2286 PT_SET_VA_MA(ptq, 0, TRUE); 2287 if (oldpte & PG_W) 2288 pmap->pm_stats.wired_count -= 1; 2289 /* 2290 * Machines that don't support invlpg also don't support 2291 * PG_G. 2292 */ 2293 if (oldpte & PG_G) 2294 pmap_invalidate_page(kernel_pmap, va); 2295 pmap->pm_stats.resident_count -= 1; 2296 /* 2297 * XXX This is not strictly correct, but somewhere along the line 2298 * we are losing the managed bit on some pages. It is unclear to me 2299 * why, but I think the most likely explanation is that xen's writable 2300 * page table implementation doesn't respect the unused bits.
2301 */ 2302 if ((oldpte & PG_MANAGED) || ((oldpte & PG_V) && (va < VM_MAXUSER_ADDRESS)) 2303 ) { 2304 m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME); 2305 2306 if (!(oldpte & PG_MANAGED)) 2307 printf("va=0x%x is unmanaged :-( pte=0x%llx\n", va, oldpte); 2308 2309 if (oldpte & PG_M) { 2310 KASSERT((oldpte & PG_RW), 2311 ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", 2312 va, (uintmax_t)oldpte)); 2313 vm_page_dirty(m); 2314 } 2315 if (oldpte & PG_A) 2316 vm_page_flag_set(m, PG_REFERENCED); 2317 pmap_remove_entry(pmap, m, va); 2318 } else if ((va < VM_MAXUSER_ADDRESS) && (oldpte & PG_V)) 2319 printf("va=0x%x is unmanaged :-( pte=0x%llx\n", va, oldpte); 2320 2321 return (pmap_unuse_pt(pmap, va, free)); 2322} 2323 2324/* 2325 * Remove a single page from a process address space 2326 */ 2327static void 2328pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) 2329{ 2330 pt_entry_t *pte; 2331 2332 CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x", 2333 pmap, va); 2334 2335 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2336 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2337 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2338 if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0) 2339 return; 2340 pmap_remove_pte(pmap, pte, va, free); 2341 pmap_invalidate_page(pmap, va); 2342 if (*PMAP1) 2343 PT_SET_MA(PADDR1, 0); 2344 2345} 2346 2347/* 2348 * Remove the given range of addresses from the specified map. 2349 * 2350 * It is assumed that the start and end are properly 2351 * rounded to the page size. 2352 */ 2353void 2354pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2355{ 2356 vm_offset_t pdnxt; 2357 pd_entry_t ptpaddr; 2358 pt_entry_t *pte; 2359 vm_page_t free = NULL; 2360 int anyvalid; 2361 2362 CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x", 2363 pmap, sva, eva); 2364 2365 /* 2366 * Perform an unsynchronized read. This is, however, safe. 2367 */ 2368 if (pmap->pm_stats.resident_count == 0) 2369 return; 2370 2371 anyvalid = 0; 2372 2373 vm_page_lock_queues(); 2374 sched_pin(); 2375 PMAP_LOCK(pmap); 2376 2377 /* 2378 * special handling of removing one page. a very 2379 * common operation and easy to short circuit some 2380 * code. 2381 */ 2382 if ((sva + PAGE_SIZE == eva) && 2383 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2384 pmap_remove_page(pmap, sva, &free); 2385 goto out; 2386 } 2387 2388 for (; sva < eva; sva = pdnxt) { 2389 unsigned pdirindex; 2390 2391 /* 2392 * Calculate index for next page table. 2393 */ 2394 pdnxt = (sva + NBPDR) & ~PDRMASK; 2395 if (pmap->pm_stats.resident_count == 0) 2396 break; 2397 2398 pdirindex = sva >> PDRSHIFT; 2399 ptpaddr = pmap->pm_pdir[pdirindex]; 2400 2401 /* 2402 * Weed out invalid mappings. Note: we assume that the page 2403 * directory table is always allocated, and in kernel virtual. 2404 */ 2405 if (ptpaddr == 0) 2406 continue; 2407 2408 /* 2409 * Check for large page. 2410 */ 2411 if ((ptpaddr & PG_PS) != 0) { 2412 PD_CLEAR_VA(pmap, pdirindex, TRUE); 2413 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2414 anyvalid = 1; 2415 continue; 2416 } 2417 2418 /* 2419 * Limit our scan to either the end of the va represented 2420 * by the current page table page, or to the end of the 2421 * range being removed. 
2422 */ 2423 if (pdnxt > eva) 2424 pdnxt = eva; 2425 2426 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 2427 sva += PAGE_SIZE) { 2428 if ((*pte & PG_V) == 0) 2429 continue; 2430 2431 /* 2432 * The TLB entry for a PG_G mapping is invalidated 2433 * by pmap_remove_pte(). 2434 */ 2435 if ((*pte & PG_G) == 0) 2436 anyvalid = 1; 2437 if (pmap_remove_pte(pmap, pte, sva, &free)) 2438 break; 2439 } 2440 } 2441 PT_UPDATES_FLUSH(); 2442 if (*PMAP1) 2443 PT_SET_VA_MA(PMAP1, 0, TRUE); 2444out: 2445 if (anyvalid) 2446 pmap_invalidate_all(pmap); 2447 sched_unpin(); 2448 vm_page_unlock_queues(); 2449 PMAP_UNLOCK(pmap); 2450 pmap_free_zero_pages(free); 2451} 2452 2453/* 2454 * Routine: pmap_remove_all 2455 * Function: 2456 * Removes this physical page from 2457 * all physical maps in which it resides. 2458 * Reflects back modify bits to the pager. 2459 * 2460 * Notes: 2461 * Original versions of this routine were very 2462 * inefficient because they iteratively called 2463 * pmap_remove (slow...) 2464 */ 2465 2466void 2467pmap_remove_all(vm_page_t m) 2468{ 2469 pv_entry_t pv; 2470 pmap_t pmap; 2471 pt_entry_t *pte, tpte; 2472 vm_page_t free; 2473 2474#if defined(PMAP_DIAGNOSTIC) 2475 /* 2476 * XXX This makes pmap_remove_all() illegal for non-managed pages! 2477 */ 2478 if (m->flags & PG_FICTITIOUS) { 2479 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%jx", 2480 VM_PAGE_TO_PHYS(m) & 0xffffffff); 2481 } 2482#endif 2483 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2484 sched_pin(); 2485 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2486 pmap = PV_PMAP(pv); 2487 PMAP_LOCK(pmap); 2488 pmap->pm_stats.resident_count--; 2489 pte = pmap_pte_quick(pmap, pv->pv_va); 2490 2491 tpte = *pte; 2492 PT_SET_VA_MA(pte, 0, TRUE); 2493 if (tpte & PG_W) 2494 pmap->pm_stats.wired_count--; 2495 if (tpte & PG_A) 2496 vm_page_flag_set(m, PG_REFERENCED); 2497 2498 /* 2499 * Update the vm_page_t clean and reference bits. 2500 */ 2501 if (tpte & PG_M) { 2502 KASSERT((tpte & PG_RW), 2503 ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", 2504 pv->pv_va, (uintmax_t)tpte)); 2505 vm_page_dirty(m); 2506 } 2507 free = NULL; 2508 pmap_unuse_pt(pmap, pv->pv_va, &free); 2509 pmap_invalidate_page(pmap, pv->pv_va); 2510 pmap_free_zero_pages(free); 2511 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2512 free_pv_entry(pmap, pv); 2513 PMAP_UNLOCK(pmap); 2514 } 2515 vm_page_flag_clear(m, PG_WRITEABLE); 2516 PT_UPDATES_FLUSH(); 2517 if (*PMAP1) 2518 PT_SET_MA(PADDR1, 0); 2519 sched_unpin(); 2520} 2521 2522/* 2523 * Set the physical protection on the 2524 * specified range of this map as requested. 
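 *
 * Below, removing all read access degenerates to pmap_remove().  For
 * write removal the PG_RW and PG_M bits are cleared (dirtying managed
 * pages first if PG_M was set); with PAE, removing execute access
 * sets the pg_nx bit.  Under Xen the PTE is rewritten through
 * PT_SET_VA_MA() and re-read, retrying if the update did not stick.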
2525 */ 2526void 2527pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2528{ 2529 vm_offset_t pdnxt; 2530 pd_entry_t ptpaddr; 2531 pt_entry_t *pte; 2532 int anychanged; 2533 2534 CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x", 2535 pmap, sva, eva, prot); 2536 2537 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2538 pmap_remove(pmap, sva, eva); 2539 return; 2540 } 2541 2542#ifdef PAE 2543 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 2544 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 2545 return; 2546#else 2547 if (prot & VM_PROT_WRITE) 2548 return; 2549#endif 2550 2551 anychanged = 0; 2552 2553 vm_page_lock_queues(); 2554 sched_pin(); 2555 PMAP_LOCK(pmap); 2556 for (; sva < eva; sva = pdnxt) { 2557 pt_entry_t obits, pbits; 2558 unsigned pdirindex; 2559 2560 pdnxt = (sva + NBPDR) & ~PDRMASK; 2561 2562 pdirindex = sva >> PDRSHIFT; 2563 ptpaddr = pmap->pm_pdir[pdirindex]; 2564 2565 /* 2566 * Weed out invalid mappings. Note: we assume that the page 2567 * directory table is always allocated, and in kernel virtual. 2568 */ 2569 if (ptpaddr == 0) 2570 continue; 2571 2572 /* 2573 * Check for large page. 2574 */ 2575 if ((ptpaddr & PG_PS) != 0) { 2576 if ((prot & VM_PROT_WRITE) == 0) 2577 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); 2578#ifdef PAE 2579 if ((prot & VM_PROT_EXECUTE) == 0) 2580 pmap->pm_pdir[pdirindex] |= pg_nx; 2581#endif 2582 anychanged = 1; 2583 continue; 2584 } 2585 2586 if (pdnxt > eva) 2587 pdnxt = eva; 2588 2589 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 2590 sva += PAGE_SIZE) { 2591 vm_page_t m; 2592 2593retry: 2594 /* 2595 * Regardless of whether a pte is 32 or 64 bits in 2596 * size, PG_RW, PG_A, and PG_M are among the least 2597 * significant 32 bits. 2598 */ 2599 obits = pbits = *pte; 2600 if ((pbits & PG_V) == 0) 2601 continue; 2602 if (pbits & PG_MANAGED) { 2603 m = NULL; 2604 if (pbits & PG_A) { 2605 m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & PG_FRAME); 2606 vm_page_flag_set(m, PG_REFERENCED); 2607 pbits &= ~PG_A; 2608 } 2609 if ((pbits & PG_M) != 0) { 2610 if (m == NULL) 2611 m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & PG_FRAME); 2612 vm_page_dirty(m); 2613 } 2614 } 2615 2616 if ((prot & VM_PROT_WRITE) == 0) 2617 pbits &= ~(PG_RW | PG_M); 2618#ifdef PAE 2619 if ((prot & VM_PROT_EXECUTE) == 0) 2620 pbits |= pg_nx; 2621#endif 2622 2623 if (pbits != obits) { 2624#ifdef XEN 2625 obits = *pte; 2626 PT_SET_VA_MA(pte, pbits, TRUE); 2627 if (*pte != pbits) 2628 goto retry; 2629#else 2630#ifdef PAE 2631 if (!atomic_cmpset_64(pte, obits, pbits)) 2632 goto retry; 2633#else 2634 if (!atomic_cmpset_int((u_int *)pte, obits, 2635 pbits)) 2636 goto retry; 2637#endif 2638#endif 2639 if (obits & PG_G) 2640 pmap_invalidate_page(pmap, sva); 2641 else 2642 anychanged = 1; 2643 } 2644 } 2645 } 2646 PT_UPDATES_FLUSH(); 2647 if (*PMAP1) 2648 PT_SET_VA_MA(PMAP1, 0, TRUE); 2649 if (anychanged) 2650 pmap_invalidate_all(pmap); 2651 sched_unpin(); 2652 vm_page_unlock_queues(); 2653 PMAP_UNLOCK(pmap); 2654} 2655 2656/* 2657 * Insert the given physical page (p) at 2658 * the specified virtual address (v) in the 2659 * target physical map with the protection requested. 2660 * 2661 * If specified, the page will be wired down, meaning 2662 * that the related pte can not be reclaimed. 2663 * 2664 * NB: This is the only routine which MAY NOT lazy-evaluate 2665 * or lose information. That is, this routine must actually 2666 * insert this page into the given map NOW. 
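 *
 * The flow below: for user addresses the page-table page is allocated
 * (or found and wired) first; if the new physical address matches the
 * old mapping, only protection and wiring are updated; otherwise the
 * old mapping is torn down and a pv entry is installed for managed
 * pages.  Finally the new PTE is written with PG_A preset and any
 * queued Xen page-table updates are flushed.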
2667 */ 2668void 2669pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 2670 vm_prot_t prot, boolean_t wired) 2671{ 2672 vm_paddr_t pa; 2673 pd_entry_t *pde; 2674 pt_entry_t *pte; 2675 vm_paddr_t opa; 2676 pt_entry_t origpte, newpte; 2677 vm_page_t mpte, om; 2678 boolean_t invlva; 2679 2680 CTR6(KTR_PMAP, "pmap_enter: pmap=%08p va=0x%08x access=0x%x ma=0x%08x prot=0x%x wired=%d", 2681 pmap, va, access, xpmap_ptom(VM_PAGE_TO_PHYS(m)), prot, wired); 2682 va = trunc_page(va); 2683#ifdef PMAP_DIAGNOSTIC 2684 if (va > VM_MAX_KERNEL_ADDRESS) 2685 panic("pmap_enter: toobig"); 2686 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2687 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); 2688#endif 2689 2690 mpte = NULL; 2691 2692 vm_page_lock_queues(); 2693 PMAP_LOCK(pmap); 2694 sched_pin(); 2695 2696 /* 2697 * In the case that a page table page is not 2698 * resident, we are creating it here. 2699 */ 2700 if (va < VM_MAXUSER_ADDRESS) { 2701 mpte = pmap_allocpte(pmap, va, M_WAITOK); 2702 } 2703#if 0 && defined(PMAP_DIAGNOSTIC) 2704 else { 2705 pd_entry_t *pdeaddr = pmap_pde(pmap, va); 2706 origpte = *pdeaddr; 2707 if ((origpte & PG_V) == 0) { 2708 panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", 2709 pmap->pm_pdir[PTDPTDI], origpte, va); 2710 } 2711 } 2712#endif 2713 2714 pde = pmap_pde(pmap, va); 2715 if ((*pde & PG_PS) != 0) 2716 panic("pmap_enter: attempted pmap_enter on 4MB page"); 2717 pte = pmap_pte_quick(pmap, va); 2718 2719 /* 2720 * Page Directory table entry not valid, we need a new PT page 2721 */ 2722 if (pte == NULL) { 2723 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", 2724 (uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va); 2725 } 2726 2727 pa = VM_PAGE_TO_PHYS(m); 2728 om = NULL; 2729 opa = origpte = 0; 2730 2731#if 0 2732 KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx", 2733 pte, *pte)); 2734#endif 2735 origpte = *pte; 2736 if (origpte) 2737 origpte = xpmap_mtop(origpte); 2738 opa = origpte & PG_FRAME; 2739 2740 /* 2741 * Mapping has not changed, must be protection or wiring change. 2742 */ 2743 if (origpte && (opa == pa)) { 2744 /* 2745 * Wiring change, just update stats. We don't worry about 2746 * wiring PT pages as they remain resident as long as there 2747 * are valid mappings in them. Hence, if a user page is wired, 2748 * the PT page will be also. 2749 */ 2750 if (wired && ((origpte & PG_W) == 0)) 2751 pmap->pm_stats.wired_count++; 2752 else if (!wired && (origpte & PG_W)) 2753 pmap->pm_stats.wired_count--; 2754 2755 /* 2756 * Remove extra pte reference 2757 */ 2758 if (mpte) 2759 mpte->wire_count--; 2760 2761 /* 2762 * We might be turning off write access to the page, 2763 * so we go ahead and sense modify status. 2764 */ 2765 if (origpte & PG_MANAGED) { 2766 om = m; 2767 pa |= PG_MANAGED; 2768 } 2769 goto validate; 2770 } 2771 /* 2772 * Mapping has changed, invalidate old range and fall through to 2773 * handle validating new mapping. 
2774 */ 2775 if (opa) { 2776 if (origpte & PG_W) 2777 pmap->pm_stats.wired_count--; 2778 if (origpte & PG_MANAGED) { 2779 om = PHYS_TO_VM_PAGE(opa); 2780 pmap_remove_entry(pmap, om, va); 2781 } else if (va < VM_MAXUSER_ADDRESS) 2782 printf("va=0x%x is unmanaged :-( \n", va); 2783 2784 if (mpte != NULL) { 2785 mpte->wire_count--; 2786 KASSERT(mpte->wire_count > 0, 2787 ("pmap_enter: missing reference to page table page," 2788 " va: 0x%x", va)); 2789 } 2790 } else 2791 pmap->pm_stats.resident_count++; 2792 2793 /* 2794 * Enter on the PV list if part of our managed memory. 2795 */ 2796 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 2797 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 2798 ("pmap_enter: managed mapping within the clean submap")); 2799 pmap_insert_entry(pmap, va, m); 2800 pa |= PG_MANAGED; 2801 } 2802 2803 /* 2804 * Increment counters 2805 */ 2806 if (wired) 2807 pmap->pm_stats.wired_count++; 2808 2809validate: 2810 /* 2811 * Now validate mapping with desired protection/wiring. 2812 */ 2813 newpte = (pt_entry_t)(pa | PG_V); 2814 if ((prot & VM_PROT_WRITE) != 0) { 2815 newpte |= PG_RW; 2816 vm_page_flag_set(m, PG_WRITEABLE); 2817 } 2818#ifdef PAE 2819 if ((prot & VM_PROT_EXECUTE) == 0) 2820 newpte |= pg_nx; 2821#endif 2822 if (wired) 2823 newpte |= PG_W; 2824 if (va < VM_MAXUSER_ADDRESS) 2825 newpte |= PG_U; 2826 if (pmap == kernel_pmap) 2827 newpte |= pgeflag; 2828 2829 critical_enter(); 2830 /* 2831 * if the mapping or permission bits are different, we need 2832 * to update the pte. 2833 */ 2834 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2835 if (origpte) { 2836 invlva = FALSE; 2837 origpte = *pte; 2838 PT_SET_VA(pte, newpte | PG_A, FALSE); 2839 if (origpte & PG_A) { 2840 if (origpte & PG_MANAGED) 2841 vm_page_flag_set(om, PG_REFERENCED); 2842 if (opa != VM_PAGE_TO_PHYS(m)) 2843 invlva = TRUE; 2844#ifdef PAE 2845 if ((origpte & PG_NX) == 0 && 2846 (newpte & PG_NX) != 0) 2847 invlva = TRUE; 2848#endif 2849 } 2850 if (origpte & PG_M) { 2851 KASSERT((origpte & PG_RW), 2852 ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", 2853 va, (uintmax_t)origpte)); 2854 if ((origpte & PG_MANAGED) != 0) 2855 vm_page_dirty(om); 2856 if ((prot & VM_PROT_WRITE) == 0) 2857 invlva = TRUE; 2858 } 2859 if (invlva) 2860 pmap_invalidate_page(pmap, va); 2861 } else{ 2862 PT_SET_VA(pte, newpte | PG_A, FALSE); 2863 } 2864 2865 } 2866 PT_UPDATES_FLUSH(); 2867 critical_exit(); 2868 if (*PMAP1) 2869 PT_SET_VA_MA(PMAP1, 0, TRUE); 2870 sched_unpin(); 2871 vm_page_unlock_queues(); 2872 PMAP_UNLOCK(pmap); 2873} 2874 2875/* 2876 * Maps a sequence of resident pages belonging to the same object. 2877 * The sequence begins with the given page m_start. This page is 2878 * mapped at the given virtual address start. Each subsequent page is 2879 * mapped at a virtual address that is offset from start by the same 2880 * amount as the page is offset from m_start within the object. The 2881 * last page in the sequence is the page with the largest offset from 2882 * m_start that can be mapped at a virtual address less than the given 2883 * virtual address end. Not every virtual page between start and end 2884 * is mapped; only those for which a resident page exists with the 2885 * corresponding offset from m_start are mapped. 
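 *
 * In this Xen pmap the individual PTE writes are not issued one at a
 * time: pmap_enter_quick_locked() appends an update_va_mapping
 * request to a multicall_entry_t array, and the batch is handed to
 * HYPERVISOR_multicall() every 16 entries (and once more at the end).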
2886 */ 2887void 2888pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 2889 vm_page_t m_start, vm_prot_t prot) 2890{ 2891 vm_page_t m, mpte; 2892 vm_pindex_t diff, psize; 2893 multicall_entry_t mcl[16]; 2894 multicall_entry_t *mclp = mcl; 2895 int error, count = 0; 2896 2897 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 2898 psize = atop(end - start); 2899 2900 mpte = NULL; 2901 m = m_start; 2902 PMAP_LOCK(pmap); 2903 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 2904 mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m, 2905 prot, mpte); 2906 m = TAILQ_NEXT(m, listq); 2907 if (count == 16) { 2908 error = HYPERVISOR_multicall(mcl, count); 2909 KASSERT(error == 0, ("bad multicall %d", error)); 2910 mclp = mcl; 2911 count = 0; 2912 } 2913 } 2914 if (count) { 2915 error = HYPERVISOR_multicall(mcl, count); 2916 KASSERT(error == 0, ("bad multicall %d", error)); 2917 } 2918 2919 PMAP_UNLOCK(pmap); 2920} 2921 2922/* 2923 * this code makes some *MAJOR* assumptions: 2924 * 1. Current pmap & pmap exists. 2925 * 2. Not wired. 2926 * 3. Read access. 2927 * 4. No page table pages. 2928 * but is *MUCH* faster than pmap_enter... 2929 */ 2930 2931void 2932pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 2933{ 2934 multicall_entry_t mcl, *mclp; 2935 int count = 0; 2936 mclp = &mcl; 2937 2938 CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x", 2939 pmap, va, m, prot); 2940 2941 PMAP_LOCK(pmap); 2942 (void) pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL); 2943 if (count) 2944 HYPERVISOR_multicall(&mcl, count); 2945 PMAP_UNLOCK(pmap); 2946} 2947 2948#ifdef notyet 2949void 2950pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count) 2951{ 2952 int i, error, index = 0; 2953 multicall_entry_t mcl[16]; 2954 multicall_entry_t *mclp = mcl; 2955 2956 PMAP_LOCK(pmap); 2957 for (i = 0; i < count; i++, addrs++, pages++, prots++) { 2958 if (!pmap_is_prefaultable_locked(pmap, *addrs)) 2959 continue; 2960 2961 (void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL); 2962 if (index == 16) { 2963 error = HYPERVISOR_multicall(mcl, index); 2964 mclp = mcl; 2965 index = 0; 2966 KASSERT(error == 0, ("bad multicall %d", error)); 2967 } 2968 } 2969 if (index) { 2970 error = HYPERVISOR_multicall(mcl, index); 2971 KASSERT(error == 0, ("bad multicall %d", error)); 2972 } 2973 2974 PMAP_UNLOCK(pmap); 2975} 2976#endif 2977 2978static vm_page_t 2979pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m, 2980 vm_prot_t prot, vm_page_t mpte) 2981{ 2982 pt_entry_t *pte; 2983 vm_paddr_t pa; 2984 vm_page_t free; 2985 multicall_entry_t *mcl = *mclpp; 2986 2987 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 2988 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, 2989 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 2990 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2991 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2992 2993 /* 2994 * In the case that a page table page is not 2995 * resident, we are creating it here. 
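 * For user addresses the page directory entry at (va >> PDRSHIFT) is
 * consulted: an existing page-table page just has its wire_count
 * bumped, otherwise one is allocated with M_NOWAIT so this quick path
 * can fail and simply return NULL to the caller.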
2996 */ 2997 if (va < VM_MAXUSER_ADDRESS) { 2998 unsigned ptepindex; 2999 pd_entry_t ptema; 3000 3001 /* 3002 * Calculate pagetable page index 3003 */ 3004 ptepindex = va >> PDRSHIFT; 3005 if (mpte && (mpte->pindex == ptepindex)) { 3006 mpte->wire_count++; 3007 } else { 3008 /* 3009 * Get the page directory entry 3010 */ 3011 ptema = pmap->pm_pdir[ptepindex]; 3012 3013 /* 3014 * If the page table page is mapped, we just increment 3015 * the hold count, and activate it. 3016 */ 3017 if (ptema & PG_V) { 3018 if (ptema & PG_PS) 3019 panic("pmap_enter_quick: unexpected mapping into 4MB page"); 3020 mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); 3021 mpte->wire_count++; 3022 } else { 3023 mpte = _pmap_allocpte(pmap, ptepindex, 3024 M_NOWAIT); 3025 if (mpte == NULL) 3026 return (mpte); 3027 } 3028 } 3029 } else { 3030 mpte = NULL; 3031 } 3032 3033 /* 3034 * This call to vtopte makes the assumption that we are 3035 * entering the page into the current pmap. In order to support 3036 * quick entry into any pmap, one would likely use pmap_pte_quick. 3037 * But that isn't as quick as vtopte. 3038 */ 3039 KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap")); 3040 pte = vtopte(va); 3041 if (*pte & PG_V) { 3042 if (mpte != NULL) { 3043 mpte->wire_count--; 3044 mpte = NULL; 3045 } 3046 return (mpte); 3047 } 3048 3049 /* 3050 * Enter on the PV list if part of our managed memory. 3051 */ 3052 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && 3053 !pmap_try_insert_pv_entry(pmap, va, m)) { 3054 if (mpte != NULL) { 3055 free = NULL; 3056 if (pmap_unwire_pte_hold(pmap, mpte, &free)) { 3057 pmap_invalidate_page(pmap, va); 3058 pmap_free_zero_pages(free); 3059 } 3060 3061 mpte = NULL; 3062 } 3063 return (mpte); 3064 } 3065 3066 /* 3067 * Increment counters 3068 */ 3069 pmap->pm_stats.resident_count++; 3070 3071 pa = VM_PAGE_TO_PHYS(m); 3072#ifdef PAE 3073 if ((prot & VM_PROT_EXECUTE) == 0) 3074 pa |= pg_nx; 3075#endif 3076 3077#if 0 3078 /* 3079 * Now validate mapping with RO protection 3080 */ 3081 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 3082 pte_store(pte, pa | PG_V | PG_U); 3083 else 3084 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3085#else 3086 /* 3087 * Now validate mapping with RO protection 3088 */ 3089 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 3090 pa = xpmap_ptom(pa | PG_V | PG_U); 3091 else 3092 pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED); 3093 3094 mcl->op = __HYPERVISOR_update_va_mapping; 3095 mcl->args[0] = va; 3096 mcl->args[1] = (uint32_t)(pa & 0xffffffff); 3097 mcl->args[2] = (uint32_t)(pa >> 32); 3098 mcl->args[3] = 0; 3099 *mclpp = mcl + 1; 3100 *count = *count + 1; 3101#endif 3102 return mpte; 3103} 3104 3105/* 3106 * Make a temporary mapping for a physical address. This is only intended 3107 * to be used for panic dumps. 3108 */ 3109void * 3110pmap_kenter_temporary(vm_paddr_t pa, int i) 3111{ 3112 vm_offset_t va; 3113 vm_paddr_t ma = xpmap_ptom(pa); 3114 3115 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3116 PT_SET_MA(va, (ma & ~PAGE_MASK) | PG_V | pgeflag); 3117 invlpg(va); 3118 return ((void *)crashdumpmap); 3119} 3120 3121/* 3122 * This code maps large physical mmap regions into the 3123 * processor address space. Note that some shortcuts 3124 * are taken, but the code works. 
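 *
 * Only device and SG objects are handled, and only when PSE is
 * available and both "addr" and "size" are 4MB (NBPDR) aligned; in
 * that case the region is mapped directly with PG_PS page-directory
 * entries rather than individual 4K pages.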
3125 */ 3126void 3127pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, 3128 vm_object_t object, vm_pindex_t pindex, 3129 vm_size_t size) 3130{ 3131 vm_page_t p; 3132 3133 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3134 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3135 ("pmap_object_init_pt: non-device object")); 3136 if (pseflag && 3137 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 3138 int i; 3139 vm_page_t m[1]; 3140 unsigned int ptepindex; 3141 int npdes; 3142 pd_entry_t ptepa; 3143 3144 PMAP_LOCK(pmap); 3145 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) 3146 goto out; 3147 PMAP_UNLOCK(pmap); 3148retry: 3149 p = vm_page_lookup(object, pindex); 3150 if (p != NULL) { 3151 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 3152 goto retry; 3153 } else { 3154 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 3155 if (p == NULL) 3156 return; 3157 m[0] = p; 3158 3159 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 3160 vm_page_lock_queues(); 3161 vm_page_free(p); 3162 vm_page_unlock_queues(); 3163 return; 3164 } 3165 3166 p = vm_page_lookup(object, pindex); 3167 vm_page_wakeup(p); 3168 } 3169 3170 ptepa = VM_PAGE_TO_PHYS(p); 3171 if (ptepa & (NBPDR - 1)) 3172 return; 3173 3174 p->valid = VM_PAGE_BITS_ALL; 3175 3176 PMAP_LOCK(pmap); 3177 pmap->pm_stats.resident_count += size >> PAGE_SHIFT; 3178 npdes = size >> PDRSHIFT; 3179 critical_enter(); 3180 for(i = 0; i < npdes; i++) { 3181 PD_SET_VA(pmap, ptepindex, 3182 ptepa | PG_U | PG_M | PG_RW | PG_V | PG_PS, FALSE); 3183 ptepa += NBPDR; 3184 ptepindex += 1; 3185 } 3186 pmap_invalidate_all(pmap); 3187 critical_exit(); 3188out: 3189 PMAP_UNLOCK(pmap); 3190 } 3191} 3192 3193/* 3194 * Routine: pmap_change_wiring 3195 * Function: Change the wiring attribute for a map/virtual-address 3196 * pair. 3197 * In/out conditions: 3198 * The mapping must already exist in the pmap. 3199 */ 3200void 3201pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3202{ 3203 pt_entry_t *pte; 3204 3205 vm_page_lock_queues(); 3206 PMAP_LOCK(pmap); 3207 pte = pmap_pte(pmap, va); 3208 3209 if (wired && !pmap_pte_w(pte)) { 3210 PT_SET_VA_MA((pte), *(pte) | PG_W, TRUE); 3211 pmap->pm_stats.wired_count++; 3212 } else if (!wired && pmap_pte_w(pte)) { 3213 PT_SET_VA_MA((pte), *(pte) & ~PG_W, TRUE); 3214 pmap->pm_stats.wired_count--; 3215 } 3216 3217 /* 3218 * Wiring is not a hardware characteristic so there is no need to 3219 * invalidate TLB. 3220 */ 3221 pmap_pte_release(pte); 3222 PMAP_UNLOCK(pmap); 3223 vm_page_unlock_queues(); 3224} 3225 3226 3227 3228/* 3229 * Copy the range specified by src_addr/len 3230 * from the source map to the range dst_addr/len 3231 * in the destination map. 3232 * 3233 * This routine is only advisory and need not do anything. 
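 *
 * This implementation bails out unless dst_addr equals src_addr and
 * the source pmap is current (it walks the source PTEs via vtopte()).
 * Managed 4K mappings are then copied with the wired, modified, and
 * accessed bits cleared; 4MB mappings are copied wholesale when the
 * destination slot is empty.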
3234 */ 3235 3236void 3237pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3238 vm_offset_t src_addr) 3239{ 3240 vm_page_t free; 3241 vm_offset_t addr; 3242 vm_offset_t end_addr = src_addr + len; 3243 vm_offset_t pdnxt; 3244 3245 if (dst_addr != src_addr) 3246 return; 3247 3248 if (!pmap_is_current(src_pmap)) { 3249 CTR2(KTR_PMAP, 3250 "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx", 3251 (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME)); 3252 3253 return; 3254 } 3255 CTR5(KTR_PMAP, "pmap_copy: dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x", 3256 dst_pmap, src_pmap, dst_addr, len, src_addr); 3257 3258 vm_page_lock_queues(); 3259 if (dst_pmap < src_pmap) { 3260 PMAP_LOCK(dst_pmap); 3261 PMAP_LOCK(src_pmap); 3262 } else { 3263 PMAP_LOCK(src_pmap); 3264 PMAP_LOCK(dst_pmap); 3265 } 3266 sched_pin(); 3267 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 3268 pt_entry_t *src_pte, *dst_pte; 3269 vm_page_t dstmpte, srcmpte; 3270 pd_entry_t srcptepaddr; 3271 unsigned ptepindex; 3272 3273 if (addr >= UPT_MIN_ADDRESS) 3274 panic("pmap_copy: invalid to pmap_copy page tables"); 3275 3276 pdnxt = (addr + NBPDR) & ~PDRMASK; 3277 ptepindex = addr >> PDRSHIFT; 3278 3279 srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]); 3280 if (srcptepaddr == 0) 3281 continue; 3282 3283 if (srcptepaddr & PG_PS) { 3284 if (dst_pmap->pm_pdir[ptepindex] == 0) { 3285 PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE); 3286 dst_pmap->pm_stats.resident_count += 3287 NBPDR / PAGE_SIZE; 3288 } 3289 continue; 3290 } 3291 3292 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 3293 if (srcmpte->wire_count == 0) 3294 panic("pmap_copy: source page table page is unused"); 3295 3296 if (pdnxt > end_addr) 3297 pdnxt = end_addr; 3298 3299 src_pte = vtopte(addr); 3300 while (addr < pdnxt) { 3301 pt_entry_t ptetemp; 3302 ptetemp = *src_pte; 3303 /* 3304 * we only virtual copy managed pages 3305 */ 3306 if ((ptetemp & PG_MANAGED) != 0) { 3307 dstmpte = pmap_allocpte(dst_pmap, addr, 3308 M_NOWAIT); 3309 if (dstmpte == NULL) 3310 break; 3311 dst_pte = pmap_pte_quick(dst_pmap, addr); 3312 if (*dst_pte == 0 && 3313 pmap_try_insert_pv_entry(dst_pmap, addr, 3314 PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) { 3315 /* 3316 * Clear the wired, modified, and 3317 * accessed (referenced) bits 3318 * during the copy. 
3319 */ 3320 KASSERT(ptetemp != 0, ("src_pte not set")); 3321 PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */); 3322 KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)), 3323 ("no pmap copy expected: 0x%jx saw: 0x%jx", 3324 ptetemp & ~(PG_W | PG_M | PG_A), *dst_pte)); 3325 dst_pmap->pm_stats.resident_count++; 3326 } else { 3327 free = NULL; 3328 if (pmap_unwire_pte_hold(dst_pmap, 3329 dstmpte, &free)) { 3330 pmap_invalidate_page(dst_pmap, 3331 addr); 3332 pmap_free_zero_pages(free); 3333 } 3334 } 3335 if (dstmpte->wire_count >= srcmpte->wire_count) 3336 break; 3337 } 3338 addr += PAGE_SIZE; 3339 src_pte++; 3340 } 3341 } 3342 PT_UPDATES_FLUSH(); 3343 sched_unpin(); 3344 vm_page_unlock_queues(); 3345 PMAP_UNLOCK(src_pmap); 3346 PMAP_UNLOCK(dst_pmap); 3347} 3348 3349static __inline void 3350pagezero(void *page) 3351{ 3352#if defined(I686_CPU) 3353 if (cpu_class == CPUCLASS_686) { 3354#if defined(CPU_ENABLE_SSE) 3355 if (cpu_feature & CPUID_SSE2) 3356 sse2_pagezero(page); 3357 else 3358#endif 3359 i686_pagezero(page); 3360 } else 3361#endif 3362 bzero(page, PAGE_SIZE); 3363} 3364 3365/* 3366 * pmap_zero_page zeros the specified hardware page by mapping 3367 * the page into KVM and using bzero to clear its contents. 3368 */ 3369void 3370pmap_zero_page(vm_page_t m) 3371{ 3372 struct sysmaps *sysmaps; 3373 3374 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 3375 mtx_lock(&sysmaps->lock); 3376 if (*sysmaps->CMAP2) 3377 panic("pmap_zero_page: CMAP2 busy"); 3378 sched_pin(); 3379 PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); 3380 pagezero(sysmaps->CADDR2); 3381 PT_SET_MA(sysmaps->CADDR2, 0); 3382 sched_unpin(); 3383 mtx_unlock(&sysmaps->lock); 3384} 3385 3386/* 3387 * pmap_zero_page_area zeros the specified hardware page by mapping 3388 * the page into KVM and using bzero to clear its contents. 3389 * 3390 * off and size may not cover an area beyond a single hardware page. 3391 */ 3392void 3393pmap_zero_page_area(vm_page_t m, int off, int size) 3394{ 3395 struct sysmaps *sysmaps; 3396 3397 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 3398 mtx_lock(&sysmaps->lock); 3399 if (*sysmaps->CMAP2) 3400 panic("pmap_zero_page: CMAP2 busy"); 3401 sched_pin(); 3402 PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); 3403 3404 if (off == 0 && size == PAGE_SIZE) 3405 pagezero(sysmaps->CADDR2); 3406 else 3407 bzero((char *)sysmaps->CADDR2 + off, size); 3408 PT_SET_MA(sysmaps->CADDR2, 0); 3409 sched_unpin(); 3410 mtx_unlock(&sysmaps->lock); 3411} 3412 3413/* 3414 * pmap_zero_page_idle zeros the specified hardware page by mapping 3415 * the page into KVM and using bzero to clear its contents. This 3416 * is intended to be called from the vm_pagezero process only and 3417 * outside of Giant. 3418 */ 3419void 3420pmap_zero_page_idle(vm_page_t m) 3421{ 3422 3423 if (*CMAP3) 3424 panic("pmap_zero_page: CMAP3 busy"); 3425 sched_pin(); 3426 PT_SET_MA(CADDR3, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M); 3427 pagezero(CADDR3); 3428 PT_SET_MA(CADDR3, 0); 3429 sched_unpin(); 3430} 3431 3432/* 3433 * pmap_copy_page copies the specified (machine independent) 3434 * page by mapping the page into virtual memory and using 3435 * bcopy to copy the page, one machine dependent page at a 3436 * time. 
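 *
 * The per-CPU sysmaps windows are used below: the source page is
 * mapped read-only at CADDR1 and the destination read-write at CADDR2
 * (both via their Xen machine addresses), and bcopy() runs with the
 * thread pinned so the windows stay on this CPU.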
3437 */ 3438void 3439pmap_copy_page(vm_page_t src, vm_page_t dst) 3440{ 3441 struct sysmaps *sysmaps; 3442 3443 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 3444 mtx_lock(&sysmaps->lock); 3445 if (*sysmaps->CMAP1) 3446 panic("pmap_copy_page: CMAP1 busy"); 3447 if (*sysmaps->CMAP2) 3448 panic("pmap_copy_page: CMAP2 busy"); 3449 sched_pin(); 3450 PT_SET_MA(sysmaps->CADDR1, PG_V | xpmap_ptom(VM_PAGE_TO_PHYS(src)) | PG_A); 3451 PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | xpmap_ptom(VM_PAGE_TO_PHYS(dst)) | PG_A | PG_M); 3452 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 3453 PT_SET_MA(sysmaps->CADDR1, 0); 3454 PT_SET_MA(sysmaps->CADDR2, 0); 3455 sched_unpin(); 3456 mtx_unlock(&sysmaps->lock); 3457} 3458 3459/* 3460 * Returns true if the pmap's pv is one of the first 3461 * 16 pvs linked to from this page. This count may 3462 * be changed upwards or downwards in the future; it 3463 * is only necessary that true be returned for a small 3464 * subset of pmaps for proper page aging. 3465 */ 3466boolean_t 3467pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3468{ 3469 pv_entry_t pv; 3470 int loops = 0; 3471 3472 if (m->flags & PG_FICTITIOUS) 3473 return (FALSE); 3474 3475 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3476 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3477 if (PV_PMAP(pv) == pmap) { 3478 return TRUE; 3479 } 3480 loops++; 3481 if (loops >= 16) 3482 break; 3483 } 3484 return (FALSE); 3485} 3486 3487/* 3488 * pmap_page_wired_mappings: 3489 * 3490 * Return the number of managed mappings to the given physical page 3491 * that are wired. 3492 */ 3493int 3494pmap_page_wired_mappings(vm_page_t m) 3495{ 3496 pv_entry_t pv; 3497 pt_entry_t *pte; 3498 pmap_t pmap; 3499 int count; 3500 3501 count = 0; 3502 if ((m->flags & PG_FICTITIOUS) != 0) 3503 return (count); 3504 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3505 sched_pin(); 3506 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3507 pmap = PV_PMAP(pv); 3508 PMAP_LOCK(pmap); 3509 pte = pmap_pte_quick(pmap, pv->pv_va); 3510 if ((*pte & PG_W) != 0) 3511 count++; 3512 PMAP_UNLOCK(pmap); 3513 } 3514 sched_unpin(); 3515 return (count); 3516} 3517 3518/* 3519 * Returns TRUE if the given page is mapped individually or as part of 3520 * a 4mpage. Otherwise, returns FALSE. 3521 */ 3522boolean_t 3523pmap_page_is_mapped(vm_page_t m) 3524{ 3525 struct md_page *pvh; 3526 3527 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) 3528 return (FALSE); 3529 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3530 if (TAILQ_EMPTY(&m->md.pv_list)) { 3531 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3532 return (!TAILQ_EMPTY(&pvh->pv_list)); 3533 } else 3534 return (TRUE); 3535} 3536 3537/* 3538 * Remove all pages from specified address space 3539 * this aids process exit speeds. Also, this code 3540 * is special cased for current process only, but 3541 * can have the more generic (and slightly slower) 3542 * mode enabled. This is much faster than pmap_remove 3543 * in the case of running down an entire address space. 
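 *
 * Rather than scanning the page tables, the loop below walks this
 * pmap's pv chunks and frees every non-wired mapping it finds; any
 * chunk whose bitmap becomes entirely free is unmapped and its page
 * returned to the VM system.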
3544 */ 3545void 3546pmap_remove_pages(pmap_t pmap) 3547{ 3548 pt_entry_t *pte, tpte; 3549 vm_page_t m, free = NULL; 3550 pv_entry_t pv; 3551 struct pv_chunk *pc, *npc; 3552 int field, idx; 3553 int32_t bit; 3554 uint32_t inuse, bitmask; 3555 int allfree; 3556 3557 CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); 3558 3559 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { 3560 printf("warning: pmap_remove_pages called with non-current pmap\n"); 3561 return; 3562 } 3563 vm_page_lock_queues(); 3564 KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); 3565 PMAP_LOCK(pmap); 3566 sched_pin(); 3567 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3568 allfree = 1; 3569 for (field = 0; field < _NPCM; field++) { 3570 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 3571 while (inuse != 0) { 3572 bit = bsfl(inuse); 3573 bitmask = 1UL << bit; 3574 idx = field * 32 + bit; 3575 pv = &pc->pc_pventry[idx]; 3576 inuse &= ~bitmask; 3577 3578 pte = vtopte(pv->pv_va); 3579 tpte = *pte ? xpmap_mtop(*pte) : 0; 3580 3581 if (tpte == 0) { 3582 printf( 3583 "TPTE at %p IS ZERO @ VA %08x\n", 3584 pte, pv->pv_va); 3585 panic("bad pte"); 3586 } 3587 3588/* 3589 * We cannot remove wired pages from a process' mapping at this time 3590 */ 3591 if (tpte & PG_W) { 3592 allfree = 0; 3593 continue; 3594 } 3595 3596 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3597 KASSERT(m->phys_addr == (tpte & PG_FRAME), 3598 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 3599 m, (uintmax_t)m->phys_addr, 3600 (uintmax_t)tpte)); 3601 3602 KASSERT(m < &vm_page_array[vm_page_array_size], 3603 ("pmap_remove_pages: bad tpte %#jx", 3604 (uintmax_t)tpte)); 3605 3606 3607 PT_CLEAR_VA(pte, FALSE); 3608 3609 /* 3610 * Update the vm_page_t clean/reference bits. 3611 */ 3612 if (tpte & PG_M) 3613 vm_page_dirty(m); 3614 3615 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3616 if (TAILQ_EMPTY(&m->md.pv_list)) 3617 vm_page_flag_clear(m, PG_WRITEABLE); 3618 3619 pmap_unuse_pt(pmap, pv->pv_va, &free); 3620 3621 /* Mark free */ 3622 PV_STAT(pv_entry_frees++); 3623 PV_STAT(pv_entry_spare++); 3624 pv_entry_count--; 3625 pc->pc_map[field] |= bitmask; 3626 pmap->pm_stats.resident_count--; 3627 } 3628 } 3629 PT_UPDATES_FLUSH(); 3630 if (allfree) { 3631 PV_STAT(pv_entry_spare -= _NPCPV); 3632 PV_STAT(pc_chunk_count--); 3633 PV_STAT(pc_chunk_frees++); 3634 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3635 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 3636 pmap_qremove((vm_offset_t)pc, 1); 3637 vm_page_unwire(m, 0); 3638 vm_page_free(m); 3639 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 3640 } 3641 } 3642 PT_UPDATES_FLUSH(); 3643 if (*PMAP1) 3644 PT_SET_MA(PADDR1, 0); 3645 3646 sched_unpin(); 3647 pmap_invalidate_all(pmap); 3648 vm_page_unlock_queues(); 3649 PMAP_UNLOCK(pmap); 3650 pmap_free_zero_pages(free); 3651} 3652 3653/* 3654 * pmap_is_modified: 3655 * 3656 * Return whether or not the specified physical page was modified 3657 * in any physical maps. 
3658 */ 3659boolean_t 3660pmap_is_modified(vm_page_t m) 3661{ 3662 pv_entry_t pv; 3663 pt_entry_t *pte; 3664 pmap_t pmap; 3665 boolean_t rv; 3666 3667 rv = FALSE; 3668 if (m->flags & PG_FICTITIOUS) 3669 return (rv); 3670 3671 sched_pin(); 3672 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3673 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3674 pmap = PV_PMAP(pv); 3675 PMAP_LOCK(pmap); 3676 pte = pmap_pte_quick(pmap, pv->pv_va); 3677 rv = (*pte & PG_M) != 0; 3678 PMAP_UNLOCK(pmap); 3679 if (rv) 3680 break; 3681 } 3682 if (*PMAP1) 3683 PT_SET_MA(PADDR1, 0); 3684 sched_unpin(); 3685 return (rv); 3686} 3687 3688/* 3689 * pmap_is_prefaultable: 3690 * 3691 * Return whether or not the specified virtual address is eligible 3692 * for prefault. 3693 */ 3694static boolean_t 3695pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr) 3696{ 3697 pt_entry_t *pte; 3698 boolean_t rv = FALSE; 3699 /* XXX prefaulting appears to be disabled: the early return below makes the remaining check unreachable. */ 3700 return (rv); 3701 3702 if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) { 3703 pte = vtopte(addr); 3704 rv = (*pte == 0); 3705 } 3706 return (rv); 3707} 3708 3709boolean_t 3710pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3711{ 3712 boolean_t rv; 3713 3714 PMAP_LOCK(pmap); 3715 rv = pmap_is_prefaultable_locked(pmap, addr); 3716 PMAP_UNLOCK(pmap); 3717 return (rv); 3718} 3719 3720void 3721pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) 3722{ 3723 int i, npages = round_page(len) >> PAGE_SHIFT; 3724 for (i = 0; i < npages; i++) { 3725 pt_entry_t *pte; 3726 pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); 3727 pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M))); 3728 PMAP_MARK_PRIV(xpmap_mtop(*pte)); 3729 pmap_pte_release(pte); 3730 } 3731} 3732 3733void 3734pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) 3735{ 3736 int i, npages = round_page(len) >> PAGE_SHIFT; 3737 for (i = 0; i < npages; i++) { 3738 pt_entry_t *pte; 3739 pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); 3740 PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); 3741 pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M)); 3742 pmap_pte_release(pte); 3743 } 3744} 3745 3746/* 3747 * Clear the write and modified bits in each of the given page's mappings. 3748 */ 3749void 3750pmap_remove_write(vm_page_t m) 3751{ 3752 pv_entry_t pv; 3753 pmap_t pmap; 3754 pt_entry_t oldpte, *pte; 3755 3756 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3757 if ((m->flags & PG_FICTITIOUS) != 0 || 3758 (m->flags & PG_WRITEABLE) == 0) 3759 return; 3760 sched_pin(); 3761 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3762 pmap = PV_PMAP(pv); 3763 PMAP_LOCK(pmap); 3764 pte = pmap_pte_quick(pmap, pv->pv_va); 3765retry: 3766 oldpte = *pte; 3767 if ((oldpte & PG_RW) != 0) { 3768 vm_paddr_t newpte = oldpte & ~(PG_RW | PG_M); 3769 3770 /* 3771 * Regardless of whether a pte is 32 or 64 bits 3772 * in size, PG_RW and PG_M are among the least 3773 * significant 32 bits. 3774 */ 3775 PT_SET_VA_MA(pte, newpte, TRUE); 3776 if (*pte != newpte) 3777 goto retry; 3778 3779 if ((oldpte & PG_M) != 0) 3780 vm_page_dirty(m); 3781 pmap_invalidate_page(pmap, pv->pv_va); 3782 } 3783 PMAP_UNLOCK(pmap); 3784 } 3785 vm_page_flag_clear(m, PG_WRITEABLE); 3786 PT_UPDATES_FLUSH(); 3787 if (*PMAP1) 3788 PT_SET_MA(PADDR1, 0); 3789 sched_unpin(); 3790} 3791 3792/* 3793 * pmap_ts_referenced: 3794 * 3795 * Return a count of reference bits for a page, clearing those bits. 3796 * It is not necessary for every reference bit to be cleared, but it 3797 * is necessary that 0 only be returned when there are truly no 3798 * reference bits set.
3799 * 3800 * XXX: The exact number of bits to check and clear is a matter that 3801 * should be tested and standardized at some point in the future for 3802 * optimal aging of shared pages. 3803 */ 3804int 3805pmap_ts_referenced(vm_page_t m) 3806{ 3807 pv_entry_t pv, pvf, pvn; 3808 pmap_t pmap; 3809 pt_entry_t *pte; 3810 int rtval = 0; 3811 3812 if (m->flags & PG_FICTITIOUS) 3813 return (rtval); 3814 sched_pin(); 3815 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3816 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3817 pvf = pv; 3818 do { 3819 pvn = TAILQ_NEXT(pv, pv_list); 3820 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3821 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3822 pmap = PV_PMAP(pv); 3823 PMAP_LOCK(pmap); 3824 pte = pmap_pte_quick(pmap, pv->pv_va); 3825 if ((*pte & PG_A) != 0) { 3826 PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); 3827 pmap_invalidate_page(pmap, pv->pv_va); 3828 rtval++; 3829 if (rtval > 4) 3830 pvn = NULL; 3831 } 3832 PMAP_UNLOCK(pmap); 3833 } while ((pv = pvn) != NULL && pv != pvf); 3834 } 3835 PT_UPDATES_FLUSH(); 3836 if (*PMAP1) 3837 PT_SET_MA(PADDR1, 0); 3838 3839 sched_unpin(); 3840 return (rtval); 3841} 3842 3843/* 3844 * Clear the modify bits on the specified physical page. 3845 */ 3846void 3847pmap_clear_modify(vm_page_t m) 3848{ 3849 pv_entry_t pv; 3850 pmap_t pmap; 3851 pt_entry_t *pte; 3852 3853 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3854 if ((m->flags & PG_FICTITIOUS) != 0) 3855 return; 3856 sched_pin(); 3857 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3858 pmap = PV_PMAP(pv); 3859 PMAP_LOCK(pmap); 3860 pte = pmap_pte_quick(pmap, pv->pv_va); 3861 if ((*pte & PG_M) != 0) { 3862 /* 3863 * Regardless of whether a pte is 32 or 64 bits 3864 * in size, PG_M is among the least significant 3865 * 32 bits. 3866 */ 3867 PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE); 3868 pmap_invalidate_page(pmap, pv->pv_va); 3869 } 3870 PMAP_UNLOCK(pmap); 3871 } 3872 sched_unpin(); 3873} 3874 3875/* 3876 * pmap_clear_reference: 3877 * 3878 * Clear the reference bit on the specified physical page. 3879 */ 3880void 3881pmap_clear_reference(vm_page_t m) 3882{ 3883 pv_entry_t pv; 3884 pmap_t pmap; 3885 pt_entry_t *pte; 3886 3887 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3888 if ((m->flags & PG_FICTITIOUS) != 0) 3889 return; 3890 sched_pin(); 3891 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3892 pmap = PV_PMAP(pv); 3893 PMAP_LOCK(pmap); 3894 pte = pmap_pte_quick(pmap, pv->pv_va); 3895 if ((*pte & PG_A) != 0) { 3896 /* 3897 * Regardless of whether a pte is 32 or 64 bits 3898 * in size, PG_A is among the least significant 3899 * 32 bits. 3900 */ 3901 PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); 3902 pmap_invalidate_page(pmap, pv->pv_va); 3903 } 3904 PMAP_UNLOCK(pmap); 3905 } 3906 sched_unpin(); 3907} 3908 3909/* 3910 * Miscellaneous support routines follow 3911 */ 3912 3913/* 3914 * Map a set of physical memory pages into the kernel virtual 3915 * address space. Return a pointer to where it is mapped. This 3916 * routine is intended to be used for mapping device memory, 3917 * NOT real memory. 
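 *
 * Physical addresses that fall below KERNLOAD are returned through
 * the existing KERNBASE mapping; anything else gets fresh KVA from
 * kmem_alloc_nofault() and is entered page by page with
 * pmap_kenter_attr(), followed by TLB and cache invalidation.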
3918 */ 3919void * 3920pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 3921{ 3922 vm_offset_t va, offset; 3923 vm_size_t tmpsize; 3924 3925 offset = pa & PAGE_MASK; 3926 size = roundup(offset + size, PAGE_SIZE); 3927 pa = pa & PG_FRAME; 3928 3929 if (pa < KERNLOAD && pa + size <= KERNLOAD) 3930 va = KERNBASE + pa; 3931 else 3932 va = kmem_alloc_nofault(kernel_map, size); 3933 if (!va) 3934 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3935 3936 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 3937 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 3938 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 3939 pmap_invalidate_cache_range(va, va + size); 3940 return ((void *)(va + offset)); 3941} 3942 3943void * 3944pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3945{ 3946 3947 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 3948} 3949 3950void * 3951pmap_mapbios(vm_paddr_t pa, vm_size_t size) 3952{ 3953 3954 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 3955} 3956 3957void 3958pmap_unmapdev(vm_offset_t va, vm_size_t size) 3959{ 3960 vm_offset_t base, offset, tmpva; 3961 3962 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 3963 return; 3964 base = trunc_page(va); 3965 offset = va & PAGE_MASK; 3966 size = roundup(offset + size, PAGE_SIZE); 3967 critical_enter(); 3968 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) 3969 pmap_kremove(tmpva); 3970 pmap_invalidate_range(kernel_pmap, va, tmpva); 3971 critical_exit(); 3972 kmem_free(kernel_map, base, size); 3973} 3974 3975/* 3976 * Sets the memory attribute for the specified page. 3977 */ 3978void 3979pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 3980{ 3981 struct sysmaps *sysmaps; 3982 vm_offset_t sva, eva; 3983 3984 m->md.pat_mode = ma; 3985 if ((m->flags & PG_FICTITIOUS) != 0) 3986 return; 3987 3988 /* 3989 * If "m" is a normal page, flush it from the cache. 3990 * See pmap_invalidate_cache_range(). 3991 * 3992 * First, try to find an existing mapping of the page by sf 3993 * buffer. sf_buf_invalidate_cache() modifies mapping and 3994 * flushes the cache. 3995 */ 3996 if (sf_buf_invalidate_cache(m)) 3997 return; 3998 3999 /* 4000 * If page is not mapped by sf buffer, but CPU does not 4001 * support self snoop, map the page transient and do 4002 * invalidation. In the worst case, whole cache is flushed by 4003 * pmap_invalidate_cache_range(). 4004 */ 4005 if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) { 4006 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4007 mtx_lock(&sysmaps->lock); 4008 if (*sysmaps->CMAP2) 4009 panic("pmap_page_set_memattr: CMAP2 busy"); 4010 sched_pin(); 4011 PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | 4012 xpmap_ptom(VM_PAGE_TO_PHYS(m)) | PG_A | PG_M | 4013 pmap_cache_bits(m->md.pat_mode, 0)); 4014 invlcaddr(sysmaps->CADDR2); 4015 sva = (vm_offset_t)sysmaps->CADDR2; 4016 eva = sva + PAGE_SIZE; 4017 } else 4018 sva = eva = 0; /* gcc */ 4019 pmap_invalidate_cache_range(sva, eva); 4020 if (sva != 0) { 4021 PT_SET_MA(sysmaps->CADDR2, 0); 4022 sched_unpin(); 4023 mtx_unlock(&sysmaps->lock); 4024 } 4025} 4026 4027int 4028pmap_change_attr(va, size, mode) 4029 vm_offset_t va; 4030 vm_size_t size; 4031 int mode; 4032{ 4033 vm_offset_t base, offset, tmpva; 4034 pt_entry_t *pte; 4035 u_int opte, npte; 4036 pd_entry_t *pde; 4037 boolean_t changed; 4038 4039 base = trunc_page(va); 4040 offset = va & PAGE_MASK; 4041 size = roundup(offset + size, PAGE_SIZE); 4042 4043 /* Only supported on kernel virtual addresses. 
*/ 4044 if (base <= VM_MAXUSER_ADDRESS) 4045 return (EINVAL); 4046 4047 /* 4MB pages and pages that aren't mapped aren't supported. */ 4048 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { 4049 pde = pmap_pde(kernel_pmap, tmpva); 4050 if (*pde & PG_PS) 4051 return (EINVAL); 4052 if ((*pde & PG_V) == 0) 4053 return (EINVAL); 4054 pte = vtopte(va); 4055 if ((*pte & PG_V) == 0) 4056 return (EINVAL); 4057 } 4058 4059 changed = FALSE; 4060 4061 /* 4062 * Ok, all the pages exist and are 4k, so run through them updating 4063 * their cache mode. 4064 */ 4065 for (tmpva = base; size > 0; ) { 4066 pte = vtopte(tmpva); 4067 4068 /* 4069 * The cache mode bits are all in the low 32-bits of the 4070 * PTE, so we can just spin on updating the low 32-bits. 4071 */ 4072 do { 4073 opte = *(u_int *)pte; 4074 npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); 4075 npte |= pmap_cache_bits(mode, 0); 4076 PT_SET_VA_MA(pte, npte, TRUE); 4077 } while (npte != opte && (*pte != npte)); 4078 if (npte != opte) 4079 changed = TRUE; 4080 tmpva += PAGE_SIZE; 4081 size -= PAGE_SIZE; 4082 } 4083 4084 /* 4085 * Flush CPU caches to make sure any data isn't cached that shouldn't 4086 * be, etc. 4087 */ 4088 if (changed) { 4089 pmap_invalidate_range(kernel_pmap, base, tmpva); 4090 pmap_invalidate_cache_range(base, tmpva); 4091 } 4092 return (0); 4093} 4094 4095/* 4096 * perform the pmap work for mincore 4097 */ 4098int 4099pmap_mincore(pmap_t pmap, vm_offset_t addr) 4100{ 4101 pt_entry_t *ptep, pte; 4102 vm_page_t m; 4103 int val = 0; 4104 4105 PMAP_LOCK(pmap); 4106 ptep = pmap_pte(pmap, addr); 4107 pte = (ptep != NULL) ? PT_GET(ptep) : 0; 4108 pmap_pte_release(ptep); 4109 PMAP_UNLOCK(pmap); 4110 4111 if (pte != 0) { 4112 vm_paddr_t pa; 4113 4114 val = MINCORE_INCORE; 4115 if ((pte & PG_MANAGED) == 0) 4116 return val; 4117 4118 pa = pte & PG_FRAME; 4119 4120 m = PHYS_TO_VM_PAGE(pa); 4121 4122 /* 4123 * Modified by us 4124 */ 4125 if (pte & PG_M) 4126 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 4127 else { 4128 /* 4129 * Modified by someone else 4130 */ 4131 vm_page_lock_queues(); 4132 if (m->dirty || pmap_is_modified(m)) 4133 val |= MINCORE_MODIFIED_OTHER; 4134 vm_page_unlock_queues(); 4135 } 4136 /* 4137 * Referenced by us 4138 */ 4139 if (pte & PG_A) 4140 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 4141 else { 4142 /* 4143 * Referenced by someone else 4144 */ 4145 vm_page_lock_queues(); 4146 if ((m->flags & PG_REFERENCED) || 4147 pmap_ts_referenced(m)) { 4148 val |= MINCORE_REFERENCED_OTHER; 4149 vm_page_flag_set(m, PG_REFERENCED); 4150 } 4151 vm_page_unlock_queues(); 4152 } 4153 } 4154 return val; 4155} 4156 4157void 4158pmap_activate(struct thread *td) 4159{ 4160 pmap_t pmap, oldpmap; 4161 u_int32_t cr3; 4162 4163 critical_enter(); 4164 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4165 oldpmap = PCPU_GET(curpmap); 4166#if defined(SMP) 4167 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 4168 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 4169#else 4170 oldpmap->pm_active &= ~1; 4171 pmap->pm_active |= 1; 4172#endif 4173#ifdef PAE 4174 cr3 = vtophys(pmap->pm_pdpt); 4175#else 4176 cr3 = vtophys(pmap->pm_pdir); 4177#endif 4178 /* 4179 * pmap_activate is for the current thread on the current cpu 4180 */ 4181 td->td_pcb->pcb_cr3 = cr3; 4182 PT_UPDATES_FLUSH(); 4183 load_cr3(cr3); 4184 PCPU_SET(curpmap, pmap); 4185 critical_exit(); 4186} 4187 4188void 4189pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 4190{ 4191} 4192 4193/* 4194 * Increase the starting virtual address of 
/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}

#ifdef XEN

void
pmap_suspend()
{
	pmap_t pmap;
	int i, pdir, offset;
	vm_paddr_t pdirma;
	mmu_update_t mu[4];

	/*
	 * We need to remove the recursive mapping structure from all
	 * our pmaps so that Xen doesn't get confused when it restores
	 * the page tables. The recursive map lives at page directory
	 * index PTDPTDI. We assume that the suspend code has stopped
	 * the other vcpus (if any).
	 */
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		for (i = 0; i < 4; i++) {
			/*
			 * Figure out which page directory (L2) page
			 * contains this bit of the recursive map and
			 * the offset within that page of the map
			 * entry
			 */
			pdir = (PTDPTDI + i) / NPDEPG;
			offset = (PTDPTDI + i) % NPDEPG;
			pdirma = pmap->pm_pdpt[pdir] & PG_FRAME;
			mu[i].ptr = pdirma + offset * sizeof(pd_entry_t);
			mu[i].val = 0;
		}
		HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF);
	}
}
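/*
 * Worked example of the index arithmetic used above and in
 * pmap_resume() below (the concrete numbers are hypothetical): each
 * page directory page holds NPDEPG entries, so recursive-map slot
 * PTDPTDI + i is found in L2 page (PTDPTDI + i) / NPDEPG at entry
 * (PTDPTDI + i) % NPDEPG.  If PTDPTDI were 1020 and NPDEPG 512, slot
 * i = 0 would land in the second L2 page (index 1) at entry 508.
 */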
void
pmap_resume()
{
	pmap_t pmap;
	int i, pdir, offset;
	vm_paddr_t pdirma;
	mmu_update_t mu[4];

	/*
	 * Restore the recursive map that we removed on suspend.
	 */
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		for (i = 0; i < 4; i++) {
			/*
			 * Figure out which page directory (L2) page
			 * contains this bit of the recursive map and
			 * the offset within that page of the map
			 * entry
			 */
			pdir = (PTDPTDI + i) / NPDEPG;
			offset = (PTDPTDI + i) % NPDEPG;
			pdirma = pmap->pm_pdpt[pdir] & PG_FRAME;
			mu[i].ptr = pdirma + offset * sizeof(pd_entry_t);
			mu[i].val = (pmap->pm_pdpt[i] & PG_FRAME) | PG_V;
		}
		HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF);
	}
}

#endif

#if defined(PMAP_DEBUG)
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_pid != pid)
			continue;

		if (p->p_vmspace) {
			int i,j;
			index = 0;
			pmap = vmspace_pmap(p->p_vmspace);
			for (i = 0; i < NPDEPTD; i++) {
				pd_entry_t *pde;
				pt_entry_t *pte;
				vm_offset_t base = i << PDRSHIFT;

				pde = &pmap->pm_pdir[i];
				if (pde && pmap_pde_v(pde)) {
					for (j = 0; j < NPTEPG; j++) {
						vm_offset_t va = base + (j << PAGE_SHIFT);
						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
							if (index) {
								index = 0;
								printf("\n");
							}
							sx_sunlock(&allproc_lock);
							return npte;
						}
						pte = pmap_pte(pmap, va);
						if (pte && pmap_pte_v(pte)) {
							pt_entry_t pa;
							vm_page_t m;
							pa = PT_GET(pte);
							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
							    va, pa, m->hold_count, m->wire_count, m->flags);
							npte++;
							index++;
							if (index >= 2) {
								index = 0;
								printf("\n");
							} else {
								printf(" ");
							}
						}
					}
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return npte;
}
#endif

#if defined(DEBUG)

static void pads(pmap_t pm);
void pmap_pvdump(vm_paddr_t pa);

/* print address space of pmap*/
static void
pads(pmap_t pm)
{
	int i, j;
	vm_paddr_t va;
	pt_entry_t *ptep;

	if (pm == kernel_pmap)
		return;
	for (i = 0; i < NPDEPTD; i++)
		if (pm->pm_pdir[i])
			for (j = 0; j < NPTEPG; j++) {
				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
				if (pm == kernel_pmap && va < KERNBASE)
					continue;
				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
					continue;
				ptep = pmap_pte(pm, va);
				if (pmap_pte_v(ptep))
					printf("%x:%x ", va, *ptep);
			};

}

void
pmap_pvdump(vm_paddr_t pa)
{
	pv_entry_t pv;
	pmap_t pmap;
	vm_page_t m;

	printf("pa %x", pa);
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
		pads(pmap);
	}
	printf(" ");
}
#endif