/*	$OpenBSD: pmapae.c,v 1.72 2024/05/30 10:56:24 mpi Exp $	*/

/*
 * Copyright (c) 2006-2008 Michael Shalayeff
 * All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * from OpenBSD: pmap.c,v 1.85 2005/11/18 17:05:04 brad Exp
 */

/*
 * pmap.c: i386 pmap module rewrite
 * Chuck Cranor <chuck@ccrc.wustl.edu>
 * 11-Aug-97
 *
 * history of this pmap module: in addition to my own input, i used
 * the following references for this rewrite of the i386 pmap:
 *
 * [1] the NetBSD i386 pmap.  this pmap appears to be based on the
 *     BSD hp300 pmap done by Mike Hibler at University of Utah.
 *     it was then ported to the i386 by William Jolitz of UUNET
 *     Technologies, Inc.  Then Charles M. Hannum of the NetBSD
 *     project fixed some bugs and provided some speed ups.
 *
 * [2] the FreeBSD i386 pmap.  this pmap seems to be the
 *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
 *     and David Greenman.
 *
 * [3] the Mach pmap.  this pmap, from CMU, seems to have migrated
 *     between several processors.  the VAX version was done by
 *     Avadis Tevanian, Jr., and Michael Wayne Young.  the i386
 *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
 *     David Golub, and Richard Draves.  the alpha version was
 *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
 *     (NetBSD/alpha).
 */
/*
 * PAE support
 * Michael Shalayeff <mickey@lucifier.net>
 *
 * This module implements PAE mode for i386.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/pool.h>
#include <sys/user.h>
#include <sys/mutex.h>

#include <uvm/uvm.h>

#include <machine/specialreg.h>

#include <dev/isa/isareg.h>
#include <i386/isa/isa_machdep.h>

#include "ksyms.h"

/* #define PMAPAE_DEBUG */

#ifdef PMAPAE_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif	/* PMAPAE_DEBUG */

/*
 * this file contains the code for the "pmap module."  the module's
 * job is to manage the hardware's virtual to physical address mappings.
 * note that there are two levels of mapping in the VM system:
 *
 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
 *     to map ranges of virtual address space to objects/files.  for
 *     example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
 *     to the file /bin/ls starting at offset zero."  note that
 *     the upper layer mapping is not concerned with how individual
 *     vm_pages are mapped.
 *
 * [2] the lower layer of the VM system (the pmap) maintains the mappings
 *     from virtual addresses.  it is concerned with which vm_page is
 *     mapped where.  for example, when you run /bin/ls and start
 *     at page 0x1000 the fault routine may lookup the correct page
 *     of the /bin/ls file and then ask the pmap layer to establish
 *     a mapping for it.
 *
 * note that information in the lower layer of the VM system can be
 * thrown away since it can easily be reconstructed from the info
 * in the upper layer.
 *
 * data structures we use include:
 *
 * - struct pmap: describes the address space of one thread
 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 * - struct pv_head: there is one pv_head per managed page of
 *     physical memory.  the pv_head points to a list of pv_entry
 *     structures which describe all the <PMAP,VA> pairs that this
 *     page is mapped in.  this is critical for page based operations
 *     such as pmap_page_protect() [change protection on _all_ mappings
 *     of a page]
 */
/*
 * i386 PAE hardware Page Tables structure:
 *
 * the i386 PAE Page Table is a three-level PT which maps 4GB of VA.
 * the pagesize is 4K (4096 [0x1000] bytes) or 2MB.
 *
 * the first level table is called the "page directory pointer table"
 * (PDPT) and consists of 4 page directory pointer entries (PDPTEs),
 * each 64 bits in size.
 *
 * the second level table is called a "page directory" and it contains
 * 512 page directory entries (PDEs).  each PDE is
 * 8 bytes (a long long), so a PD fits in a single 4K page.  this page is
 * the page directory page (PDP).  each PDE in a PDP maps 1GB of space
 * (512 * 2MB = 1GB).  a PDE contains the physical address of the
 * second level table: the page table.  or, if 2MB pages are being used,
 * then the PDE contains the PA of the 2MB page being mapped.
 *
 * a page table consists of 512 page table entries (PTEs).  each PTE is
 * 8 bytes (a long long), so a page table also fits in a single 4K page.
 * a 4K page being used as a page table is called a page table page (PTP).
 * each PTE in a PTP maps one 4K page (512 * 4K = 2MB).  a PTE contains
 * the physical address of the page it maps and some flag bits (described
 * below).
 *
 * the processor has a special register, "cr3", which points to the
 * PDPT which is currently controlling the mappings of the virtual
 * address space.
 *
 * the following picture shows the translation process for a 4K page:
 *
 * %cr3 register [PA of PDPT]
 *  |
 *  |  bits <31-30> of VA
 *  |  index the PDPTE (0-3)
 *  |        |
 *  v        v
 *  +-----------+
 *  |  PDP Ptr  |
 *  | 4 entries |
 *  +-----------+
 *       |
 *    PA of PDP
 *       |
 *       |
 *       |  bits <29-21> of VA      bits <20-12> of VA   bits <11-0>
 *       |  index the PDP (0-511)   index the PTP        are the page offset
 *       |        |                         |                  |
 *       |        v                         |                  |
 *       +-->+---------+                    |                  |
 *           | PD Page |    PA of           v                  |
 *           |         |-----PTP----->+------------+           |
 *           | 512 PDE |              | page table |--PTE--+   |
 *           | entries |              | (aka PTP)  |       |   |
 *           +---------+              |  512 PTE   |       |   |
 *                                    |  entries   |       |   |
 *                                    +------------+       |   |
 *                                                         |   |
 *                                              bits <35-12>   bits <11-0>
 *                                              p h y s i c a l  a d d r
 *
 * the i386 caches PTEs in a TLB.  it is important to flush out old
 * TLB mappings when making a change to a mapping.  writing to the
 * %cr3 will flush the entire TLB.  newer processors also have an
 * instruction that will invalidate the mapping of a single page (which
 * is useful if you are changing a single mapping because it preserves
 * all the cached TLB entries).
 *
 * as shown above, bits <35-12> of the PTE contain the PA of the page
 * being mapped.  the rest of the PTE is defined as follows:
 *   bit#	name	use
 *   63		NX	no-execute bit (if set, no instruction fetches
 *			are allowed from the page), optional
 *   11		n/a	available for OS use, hardware ignores it
 *   10		n/a	available for OS use, hardware ignores it
 *   9		n/a	available for OS use, hardware ignores it
 *   8		G	global bit (see discussion below)
 *   7		PS	page size [for PDEs] (0=4k, 1=2M <if supported>)
 *   6		D	dirty (modified) page
 *   5		A	accessed (referenced) page
 *   4		PCD	cache disable
 *   3		PWT	page write-through (cache control)
 *   2		U/S	user/supervisor bit (0=supervisor only, 1=both u&s)
 *   1		R/W	read/write bit (0=read only, 1=read-write)
 *   0		P	present (valid)
 *
 * notes:
 *  - on the i386 the R/W bit is ignored if the processor is in supervisor
 *    state (bug!)
 *  - PS is only supported on newer processors
 *  - PTEs with the G bit are global in the sense that they are not
 *    flushed from the TLB when %cr3 is written (to flush, use the
 *    "flush single page" instruction).  this is only supported on
 *    newer processors.  this bit can be used to keep the kernel's
 *    TLB entries around while context switching.  since the kernel
 *    is mapped into all processes at the same place it does not make
 *    sense to flush these entries when switching from one process'
 *    pmap to another.
 */
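
/*
 * Illustrative sketch (added for exposition; not part of the original
 * code and not compiled, hence the #if 0): splitting a 32-bit VA along
 * the field boundaries shown in the picture above.  The same shifts
 * and masks reappear below as the pdei()/ptei() macros.  The function
 * name is ad hoc.
 */
#if 0
static void
pae_va_split_example(vaddr_t va)
{
	u_int pdpt_idx = (va >> 30) & 0x3;	/* bits <31-30>: PDPTE 0-3 */
	u_int pd_idx = (va >> 21) & 0x1ff;	/* bits <29-21>: PDE 0-511 */
	u_int pt_idx = (va >> 12) & 0x1ff;	/* bits <20-12>: PTE 0-511 */
	u_int offset = va & 0xfff;		/* bits <11-0>: page offset */

	printf("va 0x%lx: pdpt %u pd %u pt %u offset 0x%x\n",
	    va, pdpt_idx, pd_idx, pt_idx, offset);
}
#endif
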
251 * 1664->2044 0xd0000000-> kernel address space (constant 252 * 0xff800000 across all pmaps/processes). 253 * 2044 0xff800000-> "alternate" recursive PDP mapping 254 * <end> (for other pmaps). 255 * 256 * 257 * Note: A recursive PDP mapping provides a way to map all the PTEs for 258 * a 4GB address space into a linear chunk of virtual memory. In other 259 * words, the PTE for page 0 is the first 8b mapped into the 2MB recursive 260 * area. The PTE for page 1 is the second 8b. The very last 8b in the 261 * 2MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB 262 * address). 263 * 264 * All pmaps' PDs must have the same values in slots 1660->2043 so that 265 * the kernel is always mapped in every process. These values are loaded 266 * into the PD at pmap creation time. 267 * 268 * At any one time only one pmap can be active on a processor. This is 269 * the pmap whose PDP is pointed to by processor register %cr3. This pmap 270 * will have all its PTEs mapped into memory at the recursive mapping 271 * point (slots #1660-3 as show above). When the pmap code wants to find the 272 * PTE for a virtual address, all it has to do is the following: 273 * 274 * Address of PTE = (1660 * 2MB) + (VA / NBPG) * sizeof(pt_entry_t) 275 * = 0xcf800000 + (VA / 4096) * 8 276 * 277 * What happens if the pmap layer is asked to perform an operation 278 * on a pmap that is not the one which is currently active? In that 279 * case we take the PA of the PDP of the non-active pmap and put it in 280 * slots 2044-7 of the active pmap. This causes the non-active pmap's 281 * PTEs to get mapped in the final 4MB of the 4GB address space 282 * (e.g. starting at 0xffc00000). 283 * 284 * The following figure shows the effects of the recursive PDP mapping: 285 * 286 * PDP (%cr3->PDPTP) 287 * +----+ 288 * | 0| -> PTP#0 that maps VA 0x0 -> 0x200000 289 * | | 290 * | | 291 * |1660| -> points back to PDP (%cr3) mapping VA 0xcf800000 -> 0xd0000000 292 * |1661| (PDP is 4 pages) 293 * |1662| 294 * |1663| 295 * |1664| -> first kernel PTP (maps 0xd0000000 -> 0xe0200000) 296 * | | 297 * |2044| -> points to alternate pmap's PDP (maps 0xff800000 -> end) 298 * |2045| 299 * |2046| 300 * |2047| 301 * +----+ 302 * 303 * Note that the PDE#1660 VA (0xcf8033e0) is defined as "PTE_BASE". 304 * Note that the PDE#2044 VA (0xff803fe0) is defined as "APTE_BASE". 305 * 306 * Starting at VA 0xcf8033e0 the current active PDPs (%cr3) act as a 307 * PDPTP and reference four consecutively mapped pages: 308 * 309 * PTP#1660-3 == PDP(%cr3) => maps VA 0xcf800000 -> 0xd0000000 310 * +----+ 311 * | 0| -> maps the contents of PTP#0 at VA 0xcf800000->0xcf801000 312 * | | 313 * | | 314 * |1660| -> maps the contents of PTP#1660 (the PDP) at VA 0xcfe7c000 315 * |1661| 316 * |1662| 317 * |1663| 318 * |1664| -> maps the contents of first kernel PTP 319 * | | 320 * |2047| 321 * +----+ 322 * 323 * Note that mapping of the PDP at PTP#1660's VA (0xcfe7c000) is 324 * defined as "PDP_BASE".... within that mapping there are two 325 * defines: 326 * "PDP_PDE" (0xcfe7f3e0) is the VA of the PDE in the PDP 327 * which points back to itself. 328 * "APDP_PDE" (0xfff02fe0) is the VA of the PDE in the PDP which 329 * establishes the recursive mapping of the alternate pmap. 330 * To set the alternate PDP, one just has to put the correct 331 * PA info in *APDP_PDE. 332 * 333 * Note that in the APTE_BASE space, the APDP appears at VA 334 * "APDP_BASE" (0xffffc000). 
#define PG_FRAME	0xffffff000ULL	/* page frame mask */
#define PG_LGFRAME	0xfffe00000ULL	/* large (2M) page frame mask */

/*
 * Redefine the PDSHIFT and NBPD macros for PAE
 */
#undef PDSHIFT
#define PDSHIFT		21		/* page directory address shift */
#undef NBPD
#define NBPD		(1U << PDSHIFT)	/* # bytes mapped by a PDE (2MB) */

#define PDSHIFT86	22		/* for pmap86 transfer */

#undef PDSLOT_PTE
#define PDSLOT_PTE	(1660U)		/* 1660: for recursive PDP map */
#undef PDSLOT_KERN
#define PDSLOT_KERN	(1664U)		/* 1664: start of kernel space */
#undef PDSLOT_APTE
#define PDSLOT_APTE	(2044U)		/* 2044: alternative recursive slot */

/*
 * The following defines give the virtual addresses of various MMU
 * data structures:
 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
 */
#define PTE_BASE	((pt_entry_t *)(PDSLOT_PTE * NBPD))
#define APTE_BASE	((pt_entry_t *)(PDSLOT_APTE * NBPD))
#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
#define PDP_PDE		(PDP_BASE + PDSLOT_PTE)
#define APDP_PDE	(PDP_BASE + PDSLOT_APTE)

/*
 * pdei/ptei: generate index into PDP/PTP from a VA
 */
#define PD_MASK		0xffe00000	/* page directory address bits */
#define PT_MASK		0x001ff000	/* page table address bits */
#define pdei(VA)	(((VA) & PD_MASK) >> PDSHIFT)
#define ptei(VA)	(((VA) & PT_MASK) >> PGSHIFT)

#define PD_MASK86	0xffc00000	/* for pmap86 transfer */
#define PT_MASK86	0x003ff000	/* for pmap86 transfer */

/*
 * Mach derived conversion macros
 */
#define i386_round_pdr(x)	((((unsigned)(x)) + ~PD_MASK) & PD_MASK)

/*
 * various address macros
 *
 *  vtopte: return a pointer to the PTE mapping a VA
 */
#define vtopte(VA)	(PTE_BASE + atop((vaddr_t)VA))
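
/*
 * Worked out (added for exposition): plugging the slot numbers into the
 * macros above reproduces the constants quoted in the big comment:
 *
 *	PTE_BASE  = 1660 * 2MB             = 0xcf800000
 *	APTE_BASE = 2044 * 2MB             = 0xff800000
 *	PDP_BASE  = PTE_BASE  + 1660 * 4KB = 0xcfe7c000
 *	APDP_BASE = APTE_BASE + 2044 * 4KB = 0xffffc000
 *	PDP_PDE   = PDP_BASE  + 1660 * 8   = 0xcfe7f3e0
 *	APDP_PDE  = PDP_BASE  + 2044 * 8   = 0xcfe7ffe0
 */
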
/*
 * PTP macros:
 *   A PTP's index is the PD index of the PDE that points to it.
 *   A PTP's offset is the byte-offset in the PTE space that this PTP is at.
 *   A PTP's VA is the first VA mapped by that PTP.
 *
 * Note that NBPG == number of bytes in a PTP (4096 bytes == 512 entries)
 *           NBPD == number of bytes a PTP can map (2MB)
 */

#define ptp_i2o(I)	((I) * NBPG)	/* index => offset */
#define ptp_o2i(O)	((O) / NBPG)	/* offset => index */
#define ptp_i2v(I)	((I) * NBPD)	/* index => VA */
#define ptp_v2i(V)	((V) / NBPD)	/* VA => index (same as pdei) */

/*
 * Access PD and PT
 */
#define PDE(pm,i)	(((pd_entry_t *)(pm)->pm_pdir)[(i)])

/*
 * here we define the data types for PDEs and PTEs for PAE
 */
typedef u_int64_t pd_entry_t;		/* PDE */
typedef u_int64_t pt_entry_t;		/* PTE */

#define PG_NX	0x8000000000000000ULL	/* execute-disable */

/*
 * Number of PTEs per cache line.  8 byte pte, 64-byte cache line
 * Used to avoid false sharing of cache lines.
 */
#define NPTECL	8

/*
 * other data structures
 */

extern u_int32_t protection_codes[];	/* maps MI prot to i386 prot code */
extern int pmap_initialized;		/* pmap_init done yet? */

/* Segment boundaries */
extern vaddr_t kernel_text, etext, __rodata_start, erodata, __data_start;
extern vaddr_t edata, __bss_start, end, ssym, esym, PTmap;

/*
 * MULTIPROCESSOR: special VAs/PTEs are actually allocated inside a
 * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
 * due to false sharing.
 */

#ifdef MULTIPROCESSOR
#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
#define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
#else
#define PTESLEW(pte, id) (pte)
#define VASLEW(va,id) (va)
#endif

/*
 * special VAs and the PTEs that map them
 */

static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
extern caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;

extern int pmap_pg_g;
extern int pmap_pg_wc;
extern struct pmap_head pmaps;
extern struct mutex pmaps_lock;

extern uint32_t cpu_meltdown;

/*
 * local prototypes
 */
struct vm_page	*pmap_alloc_ptp_pae(struct pmap *, int, pt_entry_t);
struct vm_page	*pmap_get_ptp_pae(struct pmap *, int);
void		 pmap_drop_ptp_pae(struct pmap *, vaddr_t, struct vm_page *,
		    pt_entry_t *);
pt_entry_t	*pmap_map_ptes_pae(struct pmap *);
void		 pmap_unmap_ptes_pae(struct pmap *);
void		 pmap_do_remove_pae(struct pmap *, vaddr_t, vaddr_t, int);
void		 pmap_remove_ptes_pae(struct pmap *, struct vm_page *,
		    vaddr_t, vaddr_t, vaddr_t, int, struct pv_entry **);
void		 pmap_sync_flags_pte_pae(struct vm_page *, pt_entry_t);

static __inline u_int
pmap_pte2flags(pt_entry_t pte)
{
	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
}

void
pmap_sync_flags_pte_pae(struct vm_page *pg, pt_entry_t pte)
{
	if (pte & (PG_U|PG_M)) {
		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
	}
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */

pt_entry_t *
pmap_map_ptes_pae(struct pmap *pmap)
{
	pd_entry_t opde;

	/* the kernel's pmap is always accessible */
	if (pmap == pmap_kernel()) {
		return(PTE_BASE);
	}

	mtx_enter(&pmap->pm_mtx);

	/* if curpmap then we are always mapped */
	if (pmap_is_curpmap(pmap)) {
		return(PTE_BASE);
	}

	mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);

	/* need to load a new alternate pt space into curpmap? */
	opde = *APDP_PDE;
#if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
	if (pmap_valid_entry(opde))
		panic("pmap_map_ptes_pae: APTE valid");
#endif
	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdidx[0]) {
		APDP_PDE[0] = pmap->pm_pdidx[0] | PG_RW | PG_V | PG_U | PG_M;
		APDP_PDE[1] = pmap->pm_pdidx[1] | PG_RW | PG_V | PG_U | PG_M;
		APDP_PDE[2] = pmap->pm_pdidx[2] | PG_RW | PG_V | PG_U | PG_M;
		APDP_PDE[3] = pmap->pm_pdidx[3] | PG_RW | PG_V | PG_U | PG_M;
		if (pmap_valid_entry(opde))
			pmap_apte_flush();
	}
	return(APTE_BASE);
}

/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 */

void
pmap_unmap_ptes_pae(struct pmap *pmap)
{
	if (pmap == pmap_kernel())
		return;

	if (!pmap_is_curpmap(pmap)) {
#if defined(MULTIPROCESSOR)
		APDP_PDE[0] = 0;
		APDP_PDE[1] = 0;
		APDP_PDE[2] = 0;
		APDP_PDE[3] = 0;
		pmap_apte_flush();
#endif
		mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
	}

	mtx_leave(&pmap->pm_mtx);
}
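
/*
 * Illustrative sketch (added for exposition; not part of the original
 * code and not compiled): the calling discipline that
 * pmap_map_ptes_pae()/pmap_unmap_ptes_pae() impose on every operation
 * in this file; see e.g. pmap_extract_pae() below for a real instance.
 * The function name is ad hoc.
 */
#if 0
static void
pae_ptes_usage_example(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes;

	ptes = pmap_map_ptes_pae(pmap);	/* locks pmap */
	/* ... read or modify ptes[atop(va)] ... */
	pmap_unmap_ptes_pae(pmap);	/* unlocks pmap */
	pmap_tlb_shootwait();		/* wait out any shootdowns issued */
}
#endif
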
u_int32_t
pmap_pte_set_pae(vaddr_t va, paddr_t pa, u_int32_t bits)
{
	pt_entry_t pte, *ptep = vtopte(va);
	uint64_t nx;

	pa &= PMAP_PA_MASK;

	if (bits & PG_X)
		nx = 0;
	else
		nx = PG_NX;

	pte = i386_atomic_testset_uq(ptep, pa | bits | nx);	/* zap! */
	return (pte & ~PG_FRAME);
}

u_int32_t
pmap_pte_setbits_pae(vaddr_t va, u_int32_t set, u_int32_t clr)
{
	pt_entry_t *ptep = vtopte(va);
	pt_entry_t pte = *ptep;

	i386_atomic_testset_uq(ptep, (pte | set) & ~(pt_entry_t)clr);
	return (pte & ~PG_FRAME);
}

u_int32_t
pmap_pte_bits_pae(vaddr_t va)
{
	pt_entry_t *ptep = vtopte(va);

	return (*ptep & ~PG_FRAME);
}

paddr_t
pmap_pte_paddr_pae(vaddr_t va)
{
	pt_entry_t *ptep = vtopte(va);

	return (*ptep & PG_FRAME);
}

/*
 * Switch over to PAE page tables
 */
void
pmap_bootstrap_pae(void)
{
	extern int nkpde;
	struct pmap *kpm = pmap_kernel();
	struct vm_page *ptp;
	paddr_t ptaddr;
	u_int32_t bits;
	vaddr_t va, eva;
	pt_entry_t pte;

	if ((cpu_feature & CPUID_PAE) == 0 ||
	    (ecpu_feature & CPUID_NXE) == 0)
		return;

	cpu_pae = 1;

	DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n", __func__,
	    (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
	    kpm->pm_pdirsize);

	va = (vaddr_t)kpm->pm_pdir;
	kpm->pm_pdidx[0] = (va + 0*NBPG - KERNBASE) | PG_V;
	kpm->pm_pdidx[1] = (va + 1*NBPG - KERNBASE) | PG_V;
	kpm->pm_pdidx[2] = (va + 2*NBPG - KERNBASE) | PG_V;
	kpm->pm_pdidx[3] = (va + 3*NBPG - KERNBASE) | PG_V;
	/* map pde recursively into itself */
	PDE(kpm, PDSLOT_PTE+0) = kpm->pm_pdidx[0] | PG_KW | PG_M | PG_U;
	PDE(kpm, PDSLOT_PTE+1) = kpm->pm_pdidx[1] | PG_KW | PG_M | PG_U;
	PDE(kpm, PDSLOT_PTE+2) = kpm->pm_pdidx[2] | PG_KW | PG_M | PG_U;
	PDE(kpm, PDSLOT_PTE+3) = kpm->pm_pdidx[3] | PG_KW | PG_M | PG_U;

	/* transfer all kernel mappings over into pae tables */
	for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86);
	    va < eva; va += PAGE_SIZE) {
		if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
			ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL,
			    UVM_PGA_ZERO);
			if (ptp == NULL)
				panic("%s: uvm_pagealloc() failed", __func__);
			ptaddr = VM_PAGE_TO_PHYS(ptp);
			PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V |
			    PG_U | PG_M;
			pmap_pte_set_86((vaddr_t)vtopte(va),
			    ptaddr, PG_KW | PG_V | PG_U | PG_M);
			/* count PTP as resident */
			kpm->pm_stats.resident_count++;
		}
		bits = pmap_pte_bits_86(va) | pmap_pg_g;

		/*
		 * At this point, ideally only kernel text should be
		 * executable.  However, we need to leave the ISA hole
		 * executable to handle bios32, pcibios, and apmbios
		 * calls that may potentially happen later since we
		 * don't know (yet) which of those may be in use.
		 * Later (in biosattach), we will reset the permissions
		 * according to what we actually need.
		 */
		if ((va >= (vaddr_t)&kernel_text && va <= (vaddr_t)&etext) ||
		    (va >= (vaddr_t)atdevbase && va <=
		     (vaddr_t)(atdevbase + IOM_SIZE)))
			bits |= PG_X;
		else
			bits &= ~PG_X;

		if (pmap_valid_entry(bits))
			pmap_pte_set_pae(va, pmap_pte_paddr_86(va), bits);
	}

	/* Transfer special mappings */
	if (kpm->pm_pdir_intel) {
		uint32_t *pd, *ptp;
		uint32_t l1idx, l2idx;
		paddr_t npa;
		struct vm_page *ptppg;

		pd = (uint32_t *)kpm->pm_pdir_intel;
		kpm->pm_pdir_intel = kpm->pm_pdirpa_intel = 0;

		for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86); va < eva;
		    va += PAGE_SIZE) {
			l1idx = ((va & PT_MASK86) >> PGSHIFT);
			l2idx = ((va & PD_MASK86) >> PDSHIFT86);

			if (!pmap_valid_entry(pd[l2idx]))
				continue;

			npa = pd[l2idx] & PMAP_PA_MASK;
			ptppg = PHYS_TO_VM_PAGE(npa);
			mtx_enter(&ptppg->mdpage.pv_mtx);

			/* still running on pmap86 */
			ptp = (uint32_t *)pmap_tmpmap_pa_86(npa);

			if (!pmap_valid_entry(ptp[l1idx])) {
				mtx_leave(&ptppg->mdpage.pv_mtx);
				pmap_tmpunmap_pa_86();
				continue;
			}
			DPRINTF("%s: va 0x%x l2idx %u 0x%x l1idx %u 0x%x\n",
			    __func__, (uint32_t)va, l2idx, (uint32_t)pd[l2idx],
			    l1idx, (uint32_t)ptp[l1idx]);

			/* protection and cacheability */
			bits = ptp[l1idx] & (PG_PROT|PG_N|PG_WT);
			npa = ptp[l1idx] & PMAP_PA_MASK;

			/* still running on pmap86 */
			pmap_tmpunmap_pa_86();
			mtx_leave(&ptppg->mdpage.pv_mtx);

			/* enforce use of pmap86 */
			cpu_pae = 0;
			pmap_enter_special_pae(va, npa, 0, bits);
			cpu_pae = 1;

			if (--ptppg->wire_count == 1) {
				ptppg->wire_count = 0;
				uvm_pagerealloc(ptppg, NULL, 0);
				DPRINTF("%s: freeing PT page 0x%x\n", __func__,
				    (uint32_t)VM_PAGE_TO_PHYS(ptppg));
			}
		}
		km_free(pd, NBPG, &kv_any, &kp_dirty);
		DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
	}

	if (!cpu_paenable(&kpm->pm_pdidx[0])) {
		extern struct user *proc0paddr;

		proc0paddr->u_pcb.pcb_cr3 = kpm->pm_pdirpa =
		    (vaddr_t)kpm - KERNBASE;
		kpm->pm_pdirsize = 4 * NBPG;

		/* Reset cr3 for NMI task switch */
		cpu_update_nmi_cr3(kpm->pm_pdirpa);

		DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n",
		    __func__, (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
		    kpm->pm_pdirsize);

		csrc_pte = vtopte(pmap_csrcp);
		cdst_pte = vtopte(pmap_cdstp);
		zero_pte = vtopte(pmap_zerop);
		ptp_pte = vtopte(pmap_ptpp);
		flsh_pte = vtopte(pmap_flshp);

		nkpde *= 2;
		nkptp_max = 2048 - PDSLOT_KERN - 4;

		pmap_pte_set_p = pmap_pte_set_pae;
		pmap_pte_setbits_p = pmap_pte_setbits_pae;
		pmap_pte_bits_p = pmap_pte_bits_pae;
		pmap_pte_paddr_p = pmap_pte_paddr_pae;
		pmap_clear_attrs_p = pmap_clear_attrs_pae;
		pmap_enter_p = pmap_enter_pae;
		pmap_enter_special_p = pmap_enter_special_pae;
		pmap_extract_p = pmap_extract_pae;
		pmap_growkernel_p = pmap_growkernel_pae;
		pmap_page_remove_p = pmap_page_remove_pae;
		pmap_do_remove_p = pmap_do_remove_pae;
		pmap_test_attrs_p = pmap_test_attrs_pae;
		pmap_unwire_p = pmap_unwire_pae;
		pmap_write_protect_p = pmap_write_protect_pae;
		pmap_pinit_pd_p = pmap_pinit_pd_pae;
		pmap_zero_phys_p = pmap_zero_phys_pae;
		pmap_zero_page_uncached_p = pmap_zero_page_uncached_pae;
		pmap_copy_page_p = pmap_copy_page_pae;

		bzero((void *)kpm->pm_pdir + 8, (PDSLOT_PTE-1) * 8);
		/* TODO also reclaim old PDPs */
	}

	/* Set region permissions */
	for (va = (vaddr_t)&PTmap; va < KERNBASE; va += NBPD) {
		pte = PDE(kpm, pdei(va));
		PDE(kpm, pdei(va)) = pte | PG_NX;
	}

	va = (vaddr_t)APTE_BASE;
	pte = PDE(kpm, pdei(va));
	PDE(kpm, pdei(va)) = pte | PG_NX;

	pmap_write_protect(kpm, (vaddr_t)&kernel_text, (vaddr_t)&etext,
	    PROT_READ | PROT_EXEC);
	pmap_write_protect(kpm, (vaddr_t)&__rodata_start,
	    (vaddr_t)&erodata, PROT_READ);
	pmap_write_protect(kpm, (vaddr_t)&__data_start, (vaddr_t)&edata,
	    PROT_READ | PROT_WRITE);
	pmap_write_protect(kpm, (vaddr_t)&__bss_start, (vaddr_t)&end,
	    PROT_READ | PROT_WRITE);

#if defined(DDB) || NKSYMS > 0
	pmap_write_protect(kpm, ssym, esym, PROT_READ);
#endif
}

/*
 * p t p   f u n c t i o n s
 */

/*
 * pmap_alloc_ptp: allocate a PTP for a PMAP
 *
 * => pmap should already be locked by caller
 * => we use the ptp's wire_count to count the number of active mappings
 *	in the PTP (we start it at one to prevent any chance this PTP
 *	will ever leak onto the active/inactive queues)
 * => we should not be holding any pv_head locks (in case we are forced
 *	to call pmap_steal_ptp())
 * => we may need to lock pv_head's if we have to steal a PTP
 */

struct vm_page *
pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
{
	struct vm_page *ptp;
	pd_entry_t *pva_intel;

	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
	    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
	if (ptp == NULL)
		return (NULL);

	/* got one! */
	atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
	ptp->wire_count = 1;	/* no mappings yet */
	PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
	    PG_RW | PG_V | PG_M | PG_U | pde_flags);

	/*
	 * Meltdown special case - if we are adding a new PDE for
	 * usermode addresses, just copy the PDE to the U-K
	 * table.
	 */
	if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
		pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
		pva_intel[pde_index] = PDE(pmap, pde_index);
		DPRINTF("%s: copying usermode PDE (content=0x%llx) pde_index "
		    "%d from 0x%llx -> 0x%llx\n", __func__,
		    PDE(pmap, pde_index), pde_index,
		    (uint64_t)&PDE(pmap, pde_index),
		    (uint64_t)&(pva_intel[pde_index]));
	}

	pmap->pm_stats.resident_count++;	/* count PTP as resident */
	pmap->pm_ptphint = ptp;
	return(ptp);
}
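
/*
 * Worked example (added for exposition): wire_count bookkeeping.  A PTP
 * starts out at wire_count 1 (see above), and every valid PTE entered
 * into it adds one, so a PTP carrying N mappings has wire_count == N + 1.
 * When the count drops back to 1 the PTP is empty and may be reclaimed
 * via pmap_drop_ptp_pae(), as pmap_do_remove_pae() does below.
 */
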
/*
 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 */

struct vm_page *
pmap_get_ptp_pae(struct pmap *pmap, int pde_index)
{
	struct vm_page *ptp;

	if (pmap_valid_entry(PDE(pmap, pde_index))) {
		/* valid... check hint (saves us a PA->PG lookup) */
		if (pmap->pm_ptphint &&
		    (PDE(pmap, pde_index) & PG_FRAME) ==
		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
			return(pmap->pm_ptphint);

		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
#ifdef DIAGNOSTIC
		if (ptp == NULL)
			panic("pmap_get_ptp_pae: unmanaged user PTP");
#endif
		pmap->pm_ptphint = ptp;
		return(ptp);
	}

	/* allocate a new PTP (updates ptphint) */
	return (pmap_alloc_ptp_pae(pmap, pde_index, PG_u));
}

void
pmap_drop_ptp_pae(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
    pt_entry_t *ptes)
{
	pd_entry_t *pva_intel;

	i386_atomic_testset_uq(&PDE(pm, pdei(va)), 0);
	pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
#ifdef MULTIPROCESSOR
	/*
	 * Always shoot down the other pmap's
	 * self-mapping of the PTP.
	 */
	pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
#endif
	pm->pm_stats.resident_count--;
	/* update hint */
	if (pm->pm_ptphint == ptp)
		pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
	ptp->wire_count = 0;
	/* Postpone free to after shootdown. */
	uvm_pagerealloc(ptp, NULL, 0);

	if (pm->pm_pdir_intel) {
		KASSERT(va < VM_MAXUSER_ADDRESS);
		/* Zap special meltdown PDE */
		pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
		i386_atomic_testset_uq(&pva_intel[pdei(va)], 0);
		DPRINTF("%s: cleared meltdown PDE @ index %lu "
		    "(va range start 0x%x)\n", __func__, pdei(va),
		    (uint32_t)va);
	}
}

/*
 * pmap_pinit_pd: given a freshly allocated pmap structure, give it a PD
 */
void
pmap_pinit_pd_pae(struct pmap *pmap)
{
	extern int nkpde;
	vaddr_t va;
	paddr_t pdidx[4];

	/* allocate PDP */
	pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_dirty,
	    &kd_waitok);
	if (pmap->pm_pdir == 0)
		panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
	/* page index is in the pmap! */
	pmap_extract(pmap_kernel(), (vaddr_t)pmap, &pmap->pm_pdirpa);
	va = (vaddr_t)pmap->pm_pdir;
	pmap_extract(pmap_kernel(), va + 0*NBPG, &pdidx[0]);
	pmap_extract(pmap_kernel(), va + 1*NBPG, &pdidx[1]);
	pmap_extract(pmap_kernel(), va + 2*NBPG, &pdidx[2]);
	pmap_extract(pmap_kernel(), va + 3*NBPG, &pdidx[3]);
	pmap->pm_pdidx[0] = (uint64_t)pdidx[0];
	pmap->pm_pdidx[1] = (uint64_t)pdidx[1];
	pmap->pm_pdidx[2] = (uint64_t)pdidx[2];
	pmap->pm_pdidx[3] = (uint64_t)pdidx[3];
	pmap->pm_pdidx[0] |= PG_V;
	pmap->pm_pdidx[1] |= PG_V;
	pmap->pm_pdidx[2] |= PG_V;
	pmap->pm_pdidx[3] |= PG_V;
	pmap->pm_pdirsize = 4 * NBPG;

	/* init PDP */
	/* zero init area */
	bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
	/* put in recursive PDE to map the PTEs */
	PDE(pmap, PDSLOT_PTE+0) = pmap->pm_pdidx[0] | PG_KW | PG_U |
	    PG_M | PG_V | PG_NX;
	PDE(pmap, PDSLOT_PTE+1) = pmap->pm_pdidx[1] | PG_KW | PG_U |
	    PG_M | PG_V | PG_NX;
	PDE(pmap, PDSLOT_PTE+2) = pmap->pm_pdidx[2] | PG_KW | PG_U |
	    PG_M | PG_V | PG_NX;
	PDE(pmap, PDSLOT_PTE+3) = pmap->pm_pdidx[3] | PG_KW | PG_U |
	    PG_M | PG_V | PG_NX;

	/*
	 * we need to lock pmaps_lock to prevent nkpde from changing on
	 * us.  note that there is no need to splvm to protect us from
	 * malloc since malloc allocates out of a submap and we should have
	 * already allocated kernel PTPs to cover the range...
	 */
	/* put in kernel VM PDEs */
	bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
	    nkpde * sizeof(pd_entry_t));
	/* zero the rest */
	bzero(&PDE(pmap, PDSLOT_KERN + nkpde), pmap->pm_pdirsize -
	    ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));

	/*
	 * Intel CPUs need a special page table to be used during usermode
	 * execution, one that lacks all kernel mappings.
	 */
	if (cpu_meltdown) {
		int i;

		va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_zero, &kd_waitok);
		if (va == 0)
			panic("%s: kernel_map out of virtual space!", __func__);
		if (!pmap_extract(pmap_kernel(),
		    (vaddr_t)&pmap->pm_pdidx_intel, &pmap->pm_pdirpa_intel))
			panic("%s: can't locate PDPT", __func__);
		pmap->pm_pdir_intel = va;

		for (i = 0; i < 4; i++) {
			pmap->pm_pdidx_intel[i] = 0;
			if (!pmap_extract(pmap, va + i * NBPG,
			    (paddr_t *)&pmap->pm_pdidx_intel[i]))
				panic("%s: can't locate PD page", __func__);
			pmap->pm_pdidx_intel[i] |= PG_V;
			DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
			    i, pmap->pm_pdidx_intel[i]);
		}

		/* Copy PDEs from pmap_kernel's U-K view */
		bcopy((void *)pmap_kernel()->pm_pdir_intel,
		    (void *)pmap->pm_pdir_intel, 4 * NBPG);

		DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
		    "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
		    __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
		    pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
	} else {
		pmap->pm_pdir_intel = 0;
		pmap->pm_pdirpa_intel = 0;
	}

	mtx_enter(&pmaps_lock);
	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
	mtx_leave(&pmaps_lock);
}

/*
 * some misc. functions
 */

/*
 * pmap_extract: extract a PA for the given VA
 */

int
pmap_extract_pae(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
	pt_entry_t *ptes, pte;

	ptes = pmap_map_ptes_pae(pmap);
	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
		pte = ptes[atop(va)];
		pmap_unmap_ptes_pae(pmap);
		if (!pmap_valid_entry(pte))
			return 0;
		if (pap != NULL)
			*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
		return 1;
	}
	pmap_unmap_ptes_pae(pmap);
	return 0;
}

extern void (*pagezero)(void *, size_t);

/*
 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
 *	initialized.
 */
void
pmap_zero_phys_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	caddr_t zerova = VASLEW(pmap_zerop, id);

#ifdef DIAGNOSTIC
	if (*zpte)
		panic("pmap_zero_phys_pae: lock botch");
#endif

	*zpte = (pa & PG_FRAME) | PG_V | PG_RW;	/* map in */
	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
	pagezero(zerova, PAGE_SIZE);		/* zero */
	*zpte = 0;
}

/*
 * pmap_zero_page_uncached: the same, except uncached.
 */
int
pmap_zero_page_uncached_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	caddr_t zerova = VASLEW(pmap_zerop, id);

#ifdef DIAGNOSTIC
	if (*zpte)
		panic("pmap_zero_page_uncached_pae: lock botch");
#endif

	*zpte = (pa & PG_FRAME) | PG_V | PG_RW | PG_N;	/* map in */
	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
	pagezero(zerova, PAGE_SIZE);			/* zero */
	*zpte = 0;

	return 1;
}

/*
 * pmap_copy_page: copy a page
 */

void
pmap_copy_page_pae(struct vm_page *srcpg, struct vm_page *dstpg)
{
	paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
	paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *spte = PTESLEW(csrc_pte, id);
	pt_entry_t *dpte = PTESLEW(cdst_pte, id);
	caddr_t csrcva = VASLEW(pmap_csrcp, id);
	caddr_t cdstva = VASLEW(pmap_cdstp, id);

#ifdef DIAGNOSTIC
	if (*spte || *dpte)
		panic("pmap_copy_page_pae: lock botch");
#endif

	*spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
	*dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
	bcopy(csrcva, cdstva, PAGE_SIZE);
	*spte = *dpte = 0;
	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
}

/*
 * p m a p   r e m o v e   f u n c t i o n s
 *
 * functions that remove mappings
 */

/*
 * pmap_remove_ptes: remove PTEs from a PTP
 *
 * => caller must hold pmap's lock
 * => PTP must be mapped into KVA
 * => PTP should be null if pmap == pmap_kernel()
 */

void
pmap_remove_ptes_pae(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
    vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
{
	struct pv_entry *pve;
	pt_entry_t *pte = (pt_entry_t *)ptpva;
	struct vm_page *pg;
	pt_entry_t opte;

	/*
	 * note that ptpva points to the PTE that maps startva.  this may
	 * or may not be the first PTE in the PTP.
	 *
	 * we loop through the PTP while there are still PTEs to look at
	 * and the wire_count is greater than 1 (because we use the wire_count
	 * to keep track of the number of real PTEs in the PTP).
	 */

	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
	    ; pte++, startva += NBPG) {
		if (!pmap_valid_entry(*pte))
			continue;			/* VA not mapped */

		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
			continue;

		/* atomically save the old PTE and zero it */
		opte = i386_atomic_testset_uq(pte, 0);

		if (opte & PG_W)
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		if (ptp)
			ptp->wire_count--;		/* dropping a PTE */

		/*
		 * Unnecessary work if not PG_PVLIST.
		 */
		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);

		/*
		 * if we are not on a pv list we are done.
		 */
		if ((opte & PG_PVLIST) == 0) {
#ifdef DIAGNOSTIC
			if (pg != NULL)
				panic("pmap_remove_ptes_pae: managed page "
				    "without PG_PVLIST for 0x%lx", startva);
#endif
			continue;
		}

#ifdef DIAGNOSTIC
		if (pg == NULL)
			panic("pmap_remove_ptes_pae: unmanaged page marked "
			    "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
			    startva, (u_long)(opte & PG_FRAME));
#endif

		/* sync R/M bits */
		pmap_sync_flags_pte_pae(pg, opte);
		pve = pmap_remove_pv(pg, pmap, startva);
		if (pve) {
			pve->pv_next = *free_pvs;
			*free_pvs = pve;
		}

		/* end of "for" loop: time for next pte */
	}
}

/*
 * pmap_remove: top level mapping removal function
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_do_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
{
	pt_entry_t *ptes;
	paddr_t ptppa;
	vaddr_t blkendva;
	struct vm_page *ptp;
	struct pv_entry *pve;
	struct pv_entry *free_pvs = NULL;
	TAILQ_HEAD(, vm_page) empty_ptps;
	int shootall;
	vaddr_t va;

	TAILQ_INIT(&empty_ptps);

	ptes = pmap_map_ptes_pae(pmap);	/* locks pmap */

	/*
	 * Decide if we want to shoot the whole tlb or just the range.
	 * Right now, we simply shoot everything when we remove more
	 * than 32 pages, but never in the kernel pmap.  XXX - tune.
	 */
	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
		shootall = 1;
	else
		shootall = 0;

	for (va = sva ; va < eva ; va = blkendva) {
		/* determine range of block */
		blkendva = i386_round_pdr(va + 1);
		if (blkendva > eva)
			blkendva = eva;

		/*
		 * XXXCDC: our PTE mappings should never be removed
		 * with pmap_remove!  if we allow this (and why would
		 * we?) then we end up freeing the pmap's page
		 * directory page (PDP) before we are finished using
		 * it when we hit it in the recursive mapping.  this
		 * is BAD.
		 *
		 * long term solution is to move the PTEs out of user
		 * address space.  and into kernel address space (up
		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
		 * be VM_MAX_ADDRESS.
		 */

		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
			/* XXXCDC: ugly hack to avoid freeing PDP here */
			continue;

		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
			/* valid block? */
			continue;

		/* PA of the PTP */
		ptppa = PDE(pmap, pdei(va)) & PG_FRAME;

		/* get PTP if non-kernel mapping */
		if (pmap == pmap_kernel()) {
			/* we never free kernel PTPs */
			ptp = NULL;
		} else {
			if (pmap->pm_ptphint &&
			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
				ptp = pmap->pm_ptphint;
			} else {
				ptp = PHYS_TO_VM_PAGE(ptppa);
#ifdef DIAGNOSTIC
				if (ptp == NULL)
					panic("pmap_do_remove_pae: unmanaged "
					    "PTP detected");
#endif
			}
		}

		pmap_remove_ptes_pae(pmap, ptp, (vaddr_t)&ptes[atop(va)],
		    va, blkendva, flags, &free_pvs);

		/* If PTP is no longer being used, free it. */
		if (ptp && ptp->wire_count <= 1) {
			pmap_drop_ptp_pae(pmap, va, ptp, ptes);
			TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
		}

		if (!shootall)
			pmap_tlb_shootrange(pmap, va, blkendva);
	}

	if (shootall)
		pmap_tlb_shoottlb();

	pmap_unmap_ptes_pae(pmap);
	pmap_tlb_shootwait();

	while ((pve = free_pvs) != NULL) {
		free_pvs = pve->pv_next;
		pool_put(&pmap_pv_pool, pve);
	}

	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
		uvm_pagefree(ptp);
	}
}

/*
 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
 *
 * => R/M bits are sync'd back to attrs
 */

void
pmap_page_remove_pae(struct vm_page *pg)
{
	struct pv_entry *pve;
	struct pmap *pm;
	pt_entry_t *ptes, opte;
	TAILQ_HEAD(, vm_page) empty_ptps;
	struct vm_page *ptp;

	if (pg->mdpage.pv_list == NULL)
		return;

	TAILQ_INIT(&empty_ptps);

	mtx_enter(&pg->mdpage.pv_mtx);
	while ((pve = pg->mdpage.pv_list) != NULL) {
		pmap_reference(pve->pv_pmap);
		pm = pve->pv_pmap;
		mtx_leave(&pg->mdpage.pv_mtx);

		ptes = pmap_map_ptes_pae(pve->pv_pmap);	/* locks pmap */

		/*
		 * We dropped the pvlist lock before grabbing the pmap
		 * lock to avoid lock ordering problems.  This means
		 * we have to check the pvlist again since somebody
		 * else might have modified it.  All we care about is
		 * that the pvlist entry matches the pmap we just
		 * locked.  If it doesn't, unlock the pmap and try
		 * again.
		 */
		mtx_enter(&pg->mdpage.pv_mtx);
		if ((pve = pg->mdpage.pv_list) == NULL ||
		    pve->pv_pmap != pm) {
			mtx_leave(&pg->mdpage.pv_mtx);
			pmap_unmap_ptes_pae(pm);	/* unlocks pmap */
			pmap_destroy(pm);
			mtx_enter(&pg->mdpage.pv_mtx);
			continue;
		}

		pg->mdpage.pv_list = pve->pv_next;
		mtx_leave(&pg->mdpage.pv_mtx);

#ifdef DIAGNOSTIC
		if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
		    PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
			printf("pmap_page_remove_pae: pg=%p: va=%lx, "
			    "pv_ptp=%p\n",
			    pg, pve->pv_va, pve->pv_ptp);
			printf("pmap_page_remove_pae: PTP's phys addr: "
			    "actual=%llx, recorded=%lx\n",
			    (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
			    PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
			panic("pmap_page_remove_pae: mapped managed page has "
			    "invalid pv_ptp field");
		}
#endif
		opte = i386_atomic_testset_uq(&ptes[atop(pve->pv_va)], 0);

		if (opte & PG_W)
			pve->pv_pmap->pm_stats.wired_count--;
		pve->pv_pmap->pm_stats.resident_count--;

		/* sync R/M bits */
		pmap_sync_flags_pte_pae(pg, opte);

		/* update the PTP reference count.  free if last reference. */
		if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
			pmap_drop_ptp_pae(pve->pv_pmap, pve->pv_va,
			    pve->pv_ptp, ptes);
			TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
		}

		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);

		pmap_unmap_ptes_pae(pve->pv_pmap);	/* unlocks pmap */
		pmap_destroy(pve->pv_pmap);
		pool_put(&pmap_pv_pool, pve);
		mtx_enter(&pg->mdpage.pv_mtx);
	}
	mtx_leave(&pg->mdpage.pv_mtx);

	pmap_tlb_shootwait();

	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
		uvm_pagefree(ptp);
	}
}

/*
 * p m a p   a t t r i b u t e   f u n c t i o n s
 * functions that test/change managed page's attributes
 * since a page can be mapped multiple times we must check each PTE that
 * maps it by going down the pv lists.
 */

/*
 * pmap_test_attrs: test a page's attributes
 *
 * => we set pv_head => pmap locking
 */

int
pmap_test_attrs_pae(struct vm_page *pg, int testbits)
{
	struct pv_entry *pve;
	pt_entry_t *ptes, pte;
	u_long mybits, testflags;
	paddr_t ptppa;

	testflags = pmap_pte2flags(testbits);

	if (pg->pg_flags & testflags)
		return 1;

	mybits = 0;
	mtx_enter(&pg->mdpage.pv_mtx);
	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
	    pve = pve->pv_next) {
		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
		pte = ptes[ptei(pve->pv_va)];
		pmap_tmpunmap_pa();
		mybits |= (pte & testbits);
	}
	mtx_leave(&pg->mdpage.pv_mtx);

	if (mybits == 0)
		return 0;

	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));

	return 1;
}

/*
 * pmap_clear_attrs: change a page's attributes
 *
 * => we return 1 if we cleared one of the bits we were asked to
 */
int
pmap_clear_attrs_pae(struct vm_page *pg, int clearbits)
{
	struct pv_entry *pve;
	pt_entry_t *ptes, npte, opte;
	u_long clearflags;
	paddr_t ptppa;
	int result;

	clearflags = pmap_pte2flags(clearbits);

	result = pg->pg_flags & clearflags;
	if (result)
		atomic_clearbits_int(&pg->pg_flags, clearflags);

	mtx_enter(&pg->mdpage.pv_mtx);
	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
#ifdef DIAGNOSTIC
		if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
			panic("pmap_clear_attrs_pae: mapping without PTP "
			    "detected");
#endif

		opte = ptes[ptei(pve->pv_va)];
		if (opte & clearbits) {
			result = 1;
			npte = opte & ~clearbits;
			opte = i386_atomic_testset_uq(
			    &ptes[ptei(pve->pv_va)], npte);
			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
		}
		pmap_tmpunmap_pa();
	}
	mtx_leave(&pg->mdpage.pv_mtx);

	pmap_tlb_shootwait();

	return (result != 0);
}


/*
 * p m a p   p r o t e c t i o n   f u n c t i o n s
 */

/*
 * pmap_page_protect: change the protection of all recorded mappings
 *	of a managed page
 *
 * => NOTE: this is an inline function in pmap.h
 */

/* see pmap.h */

/*
 * pmap_protect: set the protection of the pages in a pmap
 *
 * => NOTE: this is an inline function in pmap.h
 */
/* see pmap.h */

/*
 * pmap_write_protect: write-protect pages in a pmap
 */

void
pmap_write_protect_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
    vm_prot_t prot)
{
	pt_entry_t *ptes, *spte, *epte, npte, opte;
	vaddr_t blockend;
	u_int64_t md_prot;
	vaddr_t va;
	int shootall = 0;

	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */

	/* should be ok, but just in case ... */
	sva &= PG_FRAME;
	eva &= PG_FRAME;

	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
		shootall = 1;

	for (va = sva; va < eva; va = blockend) {
		blockend = (va & PD_MASK) + NBPD;
		if (blockend > eva)
			blockend = eva;

		/*
		 * XXXCDC: our PTE mappings should never be write-protected!
		 *
		 * long term solution is to move the PTEs out of user
		 * address space.  and into kernel address space (up
		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
		 * be VM_MAX_ADDRESS.
		 */

		/* XXXCDC: ugly hack to avoid freeing PDP here */
		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
			continue;

		/* empty block? */
		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
			continue;

		md_prot = protection_codes[prot];
		if (!(prot & PROT_EXEC))
			md_prot |= PG_NX;
		if (va < VM_MAXUSER_ADDRESS)
			md_prot |= PG_u;
		else if (va < VM_MAX_ADDRESS)
			/* XXX: write-prot our PTES? never! */
			md_prot |= PG_RW;

		spte = &ptes[atop(va)];
		epte = &ptes[atop(blockend)];

		for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {

			if (!pmap_valid_entry(*spte))	/* no mapping? */
				continue;

			opte = *spte;
			npte = (opte & ~(pt_entry_t)PG_PROT) | md_prot;

			if (npte != opte) {
				pmap_exec_account(pmap, va, *spte, npte);
				i386_atomic_testset_uq(spte, npte);
			}
		}
	}
	if (shootall)
		pmap_tlb_shoottlb();
	else
		pmap_tlb_shootrange(pmap, sva, eva);

	pmap_unmap_ptes_pae(pmap);		/* unlocks pmap */
	pmap_tlb_shootwait();
}

/*
 * end of protection functions
 */

/*
 * pmap_unwire: clear the wired bit in the PTE
 *
 * => mapping should already be in map
 */

void
pmap_unwire_pae(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes;

	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
		ptes = pmap_map_ptes_pae(pmap);	/* locks pmap */

#ifdef DIAGNOSTIC
		if (!pmap_valid_entry(ptes[atop(va)]))
			panic("pmap_unwire_pae: invalid (unmapped) va "
			    "0x%lx", va);
#endif
		if ((ptes[atop(va)] & PG_W) != 0) {
			i386_atomic_testset_uq(&ptes[atop(va)],
			    ptes[atop(va)] & ~PG_W);
			pmap->pm_stats.wired_count--;
		}
#ifdef DIAGNOSTIC
		else {
			printf("pmap_unwire_pae: wiring for pmap %p va 0x%lx "
			    "didn't change!\n", pmap, va);
		}
#endif
		pmap_unmap_ptes_pae(pmap);	/* unlocks map */
	}
#ifdef DIAGNOSTIC
	else {
		panic("pmap_unwire_pae: invalid PDE");
	}
#endif
}

/*
 * pmap_enter: enter a mapping into a pmap
 *
 * => must be done "now" ... no lazy-evaluation
 */
int
pmap_enter_pae(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
    int flags)
{
	pt_entry_t *ptes, opte, npte;
	struct vm_page *ptp;
	struct pv_entry *pve, *opve = NULL;
	int wired = (flags & PMAP_WIRED) != 0;
	int nocache = (pa & PMAP_NOCACHE) != 0;
	int wc = (pa & PMAP_WC) != 0;
	struct vm_page *pg = NULL;
	int error, wired_count, resident_count, ptp_count;

	KASSERT(!(wc && nocache));
	pa &= PMAP_PA_MASK;	/* nuke flags from pa */

#ifdef DIAGNOSTIC
	/* sanity check: totally out of range? */
	if (va >= VM_MAX_KERNEL_ADDRESS)
		panic("pmap_enter_pae: too big");

	if (va == (vaddr_t)PDP_BASE || va == (vaddr_t)APDP_BASE)
		panic("pmap_enter_pae: trying to map over PDP/APDP!");

	/* sanity check: kernel PTPs should already have been pre-allocated */
	if (va >= VM_MIN_KERNEL_ADDRESS &&
	    !pmap_valid_entry(PDE(pmap, pdei(va))))
		panic("pmap_enter_pae: missing kernel PTP!");
#endif

	if (pmap_initialized)
		pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
	else
		pve = NULL;
	wired_count = resident_count = ptp_count = 0;

	/*
	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
	 */

	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
	if (pmap == pmap_kernel()) {
		ptp = NULL;
	} else {
		ptp = pmap_get_ptp_pae(pmap, pdei(va));
		if (ptp == NULL) {
			if (flags & PMAP_CANFAIL) {
				error = ENOMEM;
				pmap_unmap_ptes_pae(pmap);
				goto out;
			}
			panic("pmap_enter_pae: get ptp failed");
		}
	}
	/*
	 * not allowed to sleep after here!
	 */
	opte = ptes[atop(va)];			/* old PTE */

	/*
	 * is there currently a valid mapping at our VA?
	 */

	if (pmap_valid_entry(opte)) {

		/*
		 * first, calculate pm_stats updates.  resident count will not
		 * change since we are replacing/changing a valid
		 * mapping.  wired count might change...
		 */

		if (wired && (opte & PG_W) == 0)
			wired_count++;
		else if (!wired && (opte & PG_W) != 0)
			wired_count--;

		/*
		 * is the currently mapped PA the same as the one we
		 * want to map?
		 */
		if ((opte & PG_FRAME) == pa) {

			/* if this is on the PVLIST, sync R/M bit */
			if (opte & PG_PVLIST) {
				pg = PHYS_TO_VM_PAGE(pa);
#ifdef DIAGNOSTIC
				if (pg == NULL)
					panic("pmap_enter_pae: same pa "
					    "PG_PVLIST mapping with "
					    "unmanaged page "
					    "pa = 0x%lx (0x%lx)", pa,
					    atop(pa));
#endif
				pmap_sync_flags_pte_pae(pg, opte);
			}
			goto enter_now;
		}

		/*
		 * changing PAs: we must remove the old one first
		 */

		/*
		 * if current mapping is on a pvlist,
		 * remove it (sync R/M bits)
		 */

		if (opte & PG_PVLIST) {
			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
			if (pg == NULL)
				panic("pmap_enter_pae: PG_PVLIST mapping with "
				    "unmanaged page "
				    "pa = 0x%lx (0x%lx)", pa, atop(pa));
#endif
			pmap_sync_flags_pte_pae(pg, opte);
			opve = pmap_remove_pv(pg, pmap, va);
			pg = NULL; /* This is not the page we are looking for */
		}
	} else {	/* opte not valid */
		resident_count++;
		if (wired)
			wired_count++;
		if (ptp)
			ptp_count++;	/* count # of valid entries */
	}

	/*
	 * pve is either NULL or points to a now-free pv_entry structure
	 * (the latter case is if we called pmap_remove_pv above).
	 *
	 * if this entry is to be on a pvlist, enter it now.
	 */

	if (pmap_initialized && pg == NULL)
		pg = PHYS_TO_VM_PAGE(pa);

	if (pg != NULL) {
		if (pve == NULL) {
			pve = opve;
			opve = NULL;
		}
		if (pve == NULL) {
			if (flags & PMAP_CANFAIL) {
				pmap_unmap_ptes_pae(pmap);
				error = ENOMEM;
				goto out;
			}
			panic("pmap_enter_pae: no pv entries available");
		}
		/* lock pg when adding */
		pmap_enter_pv(pg, pve, pmap, va, ptp);
		pve = NULL;
	}

enter_now:
	/*
	 * at this point pg is !NULL if we want the PG_PVLIST bit set
	 */

	npte = pa | protection_codes[prot] | PG_V;
	if (!(prot & PROT_EXEC))
		npte |= PG_NX;
	pmap_exec_account(pmap, va, opte, npte);
	if (wired)
		npte |= PG_W;
	if (nocache)
		npte |= PG_N;
	if (va < VM_MAXUSER_ADDRESS)
		npte |= PG_u;
	else if (va < VM_MAX_ADDRESS)
		npte |= PG_RW;	/* XXXCDC: no longer needed? */
	if (pmap == pmap_kernel())
		npte |= pmap_pg_g;
	if (flags & PROT_READ)
		npte |= PG_U;
	if (flags & PROT_WRITE)
		npte |= PG_M;
	if (pg) {
		npte |= PG_PVLIST;
		if (pg->pg_flags & PG_PMAP_WC) {
			KASSERT(nocache == 0);
			wc = 1;
		}
		pmap_sync_flags_pte_pae(pg, npte);
	}
	if (wc)
		npte |= pmap_pg_wc;

	opte = i386_atomic_testset_uq(&ptes[atop(va)], npte);
	if (ptp)
		ptp->wire_count += ptp_count;
	pmap->pm_stats.resident_count += resident_count;
	pmap->pm_stats.wired_count += wired_count;

	if (pmap_valid_entry(opte)) {
		if (nocache && (opte & PG_N) == 0)
			wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
		pmap_tlb_shootpage(pmap, va);
	}

	pmap_unmap_ptes_pae(pmap);
	pmap_tlb_shootwait();

	error = 0;

out:
	if (pve)
		pool_put(&pmap_pv_pool, pve);
	if (opve)
		pool_put(&pmap_pv_pool, opve);

	return error;
}

/*
 * Allocate an extra PDPT and PT pages as needed to map kernel pages
 * used for the U-K mappings.  These special mappings are set up
 * during bootstrap, never get removed, and are part of pmap_kernel.
/*
 * Allocate an extra PDPT and PT pages as needed to map kernel pages
 * used for the U-K mappings.  These special mappings are set up
 * during bootstrap, are never removed and are part of pmap_kernel.
 *
 * New pmaps inherit the kernel portion of pmap_kernel including
 * the special mappings (see pmap_pinit_pd_pae()).
 */
void
pmap_enter_special_pae(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
{
	struct pmap *pmap = pmap_kernel();
	struct vm_page *ptppg = NULL, *pdppg;
	pd_entry_t *pd, *ptp;
	pt_entry_t *ptes;
	uint32_t l2idx, l1idx;
	vaddr_t vapd;
	paddr_t npa;
	int i;

	/* If CPU is secure, no need to do anything */
	if (!cpu_meltdown)
		return;

	/* Must be kernel VA */
	if (va < VM_MIN_KERNEL_ADDRESS)
		panic("%s: invalid special mapping va 0x%lx requested",
		    __func__, va);

	if (!pmap->pm_pdir_intel) {
		if ((vapd = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
			panic("%s: kernel_map out of virtual space!", __func__);
		pmap->pm_pdir_intel = vapd;
		if (!pmap_extract(pmap, (vaddr_t)&pmap->pm_pdidx_intel,
		    &pmap->pm_pdirpa_intel))
			panic("%s: can't locate PDPT", __func__);

		for (i = 0; i < 4; i++) {
			pmap->pm_pdidx_intel[i] = 0;
			if (!pmap_extract(pmap, vapd + i*NBPG,
			    (paddr_t *)&pmap->pm_pdidx_intel[i]))
				panic("%s: can't locate PD page", __func__);

			/* ensure PDPs are wired down XXX hshoexer why? */
			pdppg = PHYS_TO_VM_PAGE(pmap->pm_pdidx_intel[i]);
			if (pdppg == NULL)
				panic("%s: no vm_page for pdidx %d", __func__, i);
			atomic_clearbits_int(&pdppg->pg_flags, PG_BUSY);
			pdppg->wire_count = 1;	/* no mappings yet */

			pmap->pm_pdidx_intel[i] |= PG_V;

			DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
			    i, pmap->pm_pdidx_intel[i]);
		}
	}

	DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
	    (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);

	/* These are the PAE versions of pdei() and ptei() */
	l2idx = pdei(va);
	l1idx = ptei(va);

	DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
	    "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
	    flags, l2idx, l1idx);

	if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == 0)
		panic("%s: PD not initialized for pmap @ %p", __func__, pmap);

	/* npa = physaddr of PT page */
	npa = pd[l2idx] & PMAP_PA_MASK;

	/* Valid PDE for the 2MB region containing va? */
	if (!npa) {
		/*
		 * No valid PDE - allocate PT page and set PDE.  We
		 * get it from pm_obj, which is used for PT pages.
		 * We calculate the offset from l2idx+2048, so we are
		 * beyond the regular PT pages, whose l2idx satisfies
		 * 0 <= l2idx < 2048.
		 */
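		/*
		 * (Worked instance, for illustration only: with 4 PDs
		 * of 512 entries each, l2idx ranges over 0..2047.  A
		 * va of 0xd0400000 gives l2idx = 0xd0400000 >> 21 =
		 * 1666, so this alternate PT page is kept at pm_obj
		 * offset ptp_i2o(1666 + 2048).)
		 */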
		ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 2048),
		    NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
		if (ptppg == NULL)
			panic("%s: failed to allocate PT page", __func__);

		atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
		ptppg->wire_count = 1;	/* no mappings yet */

		npa = VM_PAGE_TO_PHYS(ptppg);
		pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);

		DPRINTF("%s: allocated new PT page at phys 0x%x, "
		    "setting PDE[%d] = 0x%llx\n", __func__, (uint32_t)npa,
		    l2idx, pd[l2idx]);
	}

	/* temporarily map PT page and set PTE for U-K mapping */
	if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
		panic("%s: no vm_page for PT page", __func__);
	mtx_enter(&ptppg->mdpage.pv_mtx);
	ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
	ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
	DPRINTF("%s: setting PTE[%d] = 0x%llx\n", __func__, l1idx, ptp[l1idx]);
	pmap_tmpunmap_pa();
	mtx_leave(&ptppg->mdpage.pv_mtx);

	/* if supported, set the PG_G flag on the corresponding U+K entry */
	if (!(cpu_feature & CPUID_PGE))
		return;
	ptes = pmap_map_ptes_pae(pmap);		/* pmap_kernel -> PTE_BASE */
	if (pmap_valid_entry(ptes[atop(va)]))
		ptes[atop(va)] |= PG_G;
	else
		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
	pmap_unmap_ptes_pae(pmap);		/* pmap_kernel -> nothing */
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *	the pmaps on the system.
 */

vaddr_t
pmap_growkernel_pae(vaddr_t maxkvaddr)
{
	extern int nkpde;
	struct pmap *kpm = pmap_kernel(), *pm;
	int needed_kpde;	/* needed number of kernel PTPs */
	int s;
	paddr_t ptaddr;

	needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
	    / NBPD;
	if (needed_kpde <= nkpde)
		goto out;	/* we are OK */

	/*
	 * whoops!   we need to add kernel PTPs
	 */

	s = splhigh();	/* to be safe */

	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {

		if (uvm.page_init_done == 0) {

			/*
			 * we're growing the kernel pmap early (from
			 * uvm_pageboot_alloc()).  this case must be
			 * handled a little differently.
			 */

			if (uvm_page_physget(&ptaddr) == 0)
				panic("pmap_growkernel: out of memory");
			pmap_zero_phys_pae(ptaddr);

			PDE(kpm, PDSLOT_KERN + nkpde) =
			    ptaddr | PG_RW | PG_V | PG_U | PG_M;

			/* count PTP as resident */
			kpm->pm_stats.resident_count++;
			continue;
		}

		/*
		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
		 * pmap_initialized == 0 CASE!  WE MAY BE
		 * INVOKED WHILE pmap_init() IS RUNNING!
		 */

		while (!pmap_alloc_ptp_pae(kpm, PDSLOT_KERN + nkpde, 0))
			uvm_wait("pmap_growkernel");

		/* distribute new kernel PTP to all active pmaps */
		mtx_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			PDE(pm, PDSLOT_KERN + nkpde) =
			    PDE(kpm, PDSLOT_KERN + nkpde);
		}
		mtx_leave(&pmaps_lock);
	}

	splx(s);

out:
	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
}
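/*
 * (Example, for illustration only: with PAE's 2MB NBPD, growing to
 * maxkvaddr = VM_MIN_KERNEL_ADDRESS + 5MB yields needed_kpde =
 * (5MB + 2MB - 1) / 2MB = 3, i.e. three kernel PTPs; adding NBPD-1
 * before the division rounds the partial last PTP up.)
 */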
/*
 * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
 * trampoline code can be entered.
 */
void
pmap_prealloc_lowmem_ptp_pae(void)
{
	pt_entry_t *pte, npte;
	vaddr_t ptpva = (vaddr_t)vtopte(0);

	/* enter pa for pte 0 into recursive map */
	pte = vtopte(ptpva);
	npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;

	i386_atomic_testset_uq(pte, npte);

	/* make sure it is clean before using */
	memset((void *)ptpva, 0, NBPG);
}

/*
 * pmap_tmpmap_pa_pae: map a page in for tmp usage
 */

vaddr_t
pmap_tmpmap_pa_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
	caddr_t ptpva = VASLEW(pmap_ptpp, id);
#if defined(DIAGNOSTIC)
	if (*ptpte)
		panic("pmap_tmpmap_pa_pae: ptp_pte in use?");
#endif
	*ptpte = PG_V | PG_RW | pa;	/* always a new mapping */
	return((vaddr_t)ptpva);
}

/*
 * pmap_tmpunmap_pa_pae: unmap a tmp use page (undoes pmap_tmpmap_pa_pae)
 */

void
pmap_tmpunmap_pa_pae(void)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
	caddr_t ptpva = VASLEW(pmap_ptpp, id);
#if defined(DIAGNOSTIC)
	if (!pmap_valid_entry(*ptpte))
		panic("pmap_tmpunmap_pa_pae: our pte invalid?");
#endif
	*ptpte = 0;
	pmap_update_pg((vaddr_t)ptpva);
#ifdef MULTIPROCESSOR
	/*
	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
	 */
#endif
}

paddr_t
vtophys_pae(vaddr_t va)
{
	return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
}

void
pmap_flush_page_pae(paddr_t pa)
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *pte = PTESLEW(flsh_pte, id);
	caddr_t va = VASLEW(pmap_flshp, id);

	KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
#ifdef DIAGNOSTIC
	if (*pte)
		panic("pmap_flush_page_pae: lock botch");
#endif

	*pte = (pa & PG_FRAME) | PG_V | PG_RW;
	pmap_update_pg(va);
	pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
	*pte = 0;
	pmap_update_pg(va);
}
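/*
 * (Usage sketch, for illustration only: the tmpmap pair brackets a
 * short access to a physical page, e.g.
 *
 *	vaddr_t va = pmap_tmpmap_pa_pae(pa);
 *	memset((void *)va, 0, NBPG);
 *	pmap_tmpunmap_pa_pae();
 *
 * The window is per-CPU, so it must not be held across a sleep or
 * anything else that could migrate to another CPU.)
 */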