1/*	$OpenBSD: pmapae.c,v 1.72 2024/05/30 10:56:24 mpi Exp $	*/
2
3/*
4 * Copyright (c) 2006-2008 Michael Shalayeff
5 * All rights reserved.
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN
16 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
17 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19/*
20 * Copyright (c) 1997 Charles D. Cranor and Washington University.
21 * All rights reserved.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the above copyright
27 *    notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 *    notice, this list of conditions and the following disclaimer in the
30 *    documentation and/or other materials provided with the distribution.
31 *
32 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
33 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
36 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 *
43 *	from OpenBSD: pmap.c,v 1.85 2005/11/18 17:05:04 brad Exp
44 */
45
46/*
47 * pmap.c: i386 pmap module rewrite
48 * Chuck Cranor <chuck@ccrc.wustl.edu>
49 * 11-Aug-97
50 *
51 * history of this pmap module: in addition to my own input, i used
52 *    the following references for this rewrite of the i386 pmap:
53 *
54 * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
55 *     BSD hp300 pmap done by Mike Hibler at University of Utah.
56 *     it was then ported to the i386 by William Jolitz of UUNET
57 *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
58 *     project fixed some bugs and provided some speed ups.
59 *
60 * [2] the FreeBSD i386 pmap.   this pmap seems to be the
61 *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
62 *     and David Greenman.
63 *
64 * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
65 *     between several processors.   the VAX version was done by
66 *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
67 *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
68 *     David Golub, and Richard Draves.    the alpha version was
69 *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
70 *     (NetBSD/alpha).
71 */
72/*
73 * PAE support
74 * Michael Shalayeff <mickey@lucifier.net>
75 *
76 * This module implements PAE mode for i386.
77 *
78 */
79
80#include <sys/param.h>
81#include <sys/systm.h>
82#include <sys/atomic.h>
83#include <sys/pool.h>
84#include <sys/user.h>
85#include <sys/mutex.h>
86
87#include <uvm/uvm.h>
88
89#include <machine/specialreg.h>
90
91#include <dev/isa/isareg.h>
92#include <i386/isa/isa_machdep.h>
93
94#include "ksyms.h"
95
96/* #define PMAPAE_DEBUG */
97
98#ifdef PMAPAE_DEBUG
99#define DPRINTF(x...)	do { printf(x); } while(0)
100#else
101#define DPRINTF(x...)
102#endif	/* PMAPAE_DEBUG */
103
104/*
105 * this file contains the code for the "pmap module."   the module's
106 * job is to manage the hardware's virtual to physical address mappings.
107 * note that there are two levels of mapping in the VM system:
108 *
109 *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
110 *      to map ranges of virtual address space to objects/files.  for
111 *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
112 *      to the file /bin/ls starting at offset zero."   note that
113 *      the upper layer mapping is not concerned with how individual
114 *      vm_pages are mapped.
115 *
116 *  [2] the lower layer of the VM system (the pmap) maintains the mappings
 *      from virtual addresses to physical pages.   it is concerned with
 *      which vm_page is mapped where.   for example, when you run /bin/ls
 *      and start
119 *      at page 0x1000 the fault routine may lookup the correct page
120 *      of the /bin/ls file and then ask the pmap layer to establish
121 *      a mapping for it.
122 *
123 * note that information in the lower layer of the VM system can be
124 * thrown away since it can easily be reconstructed from the info
125 * in the upper layer.
126 *
127 * data structures we use include:
128 *
129 *  - struct pmap: describes the address space of one thread
130 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
131 *  - struct pv_head: there is one pv_head per managed page of
132 *	physical memory.   the pv_head points to a list of pv_entry
133 *	structures which describe all the <PMAP,VA> pairs that this
134 *      page is mapped in.    this is critical for page based operations
135 *      such as pmap_page_protect() [change protection on _all_ mappings
136 *      of a page]
137 */
138/*
139 * i386 PAE hardware Page Tables structure:
140 *
141 * the i386 PAE Page Table is a three-level PT which maps 4GB of VA.
142 * the pagesize is 4K (4096 [0x1000] bytes) or 2MB.
143 *
 * the first level table is called the "page directory pointer table"
 * (PDPT) and consists of 4 page directory pointer table entries (PDPTEs),
 * each 64 bits in size.
146 *
147 * the second level table is called a "page directory" and it contains
148 * 512 page directory entries (PDEs).   each PDE is
149 * 8 bytes (a long long), so a PD fits in a single 4K page.   this page is
 * the page directory page (PDP).  each PDE in a PDP maps 2MB of space,
 * so a whole PDP maps 1GB (512 * 2MB).   a PDE contains the physical
 * address of the next level table: the page table.   or, if 2MB pages
 * are being used, then the PDE contains the PA of the 2MB page being
 * mapped.
154 *
155 * a page table consists of 512 page table entries (PTEs).  each PTE is
156 * 8 bytes (a long long), so a page table also fits in a single 4K page.
157 * a 4K page being used as a page table is called a page table page (PTP).
158 * each PTE in a PTP maps one 4K page (512 * 4K = 2MB).   a PTE contains
159 * the physical address of the page it maps and some flag bits (described
160 * below).
161 *
 * the processor has a special register, "cr3", which points to the
 * PDPT which is currently controlling the mappings of the virtual
164 * address space.
165 *
166 * the following picture shows the translation process for a 4K page:
167 *
168 * %cr3 register [PA of PDPT]
169 *  |
170 *  |  bits <31-30> of VA
 *  |  index the PDPTE (0-3)
172 *  |        |
173 *  v        v
174 *  +-----------+
175 *  |  PDP Ptr  |
176 *  | 4 entries |
177 *  +-----------+
178 *       |
179 *    PA of PDP
180 *       |
181 *       |
182 *       |  bits <29-21> of VA       bits <20-12> of VA   bits <11-0>
 *       |  index the PDP (0-511)    index the PTP        are the page offset
184 *       |        |                         |                    |
185 *       |        v                         |                    |
186 *       +-->+---------+                    |                    |
187 *           | PD Page |    PA of           v                    |
188 *           |         |-----PTP----->+------------+             |
189 *           | 512 PDE |              | page table |--PTE--+     |
190 *           | entries |              | (aka PTP)  |       |     |
191 *           +---------+              |  512 PTE   |       |     |
192 *                                    |  entries   |       |     |
193 *                                    +------------+       |     |
194 *                                                         |     |
195 *                                              bits <35-12>   bits <11-0>
196 *                                               p h y s i c a l  a d d r
197 *
198 * the i386 caches PTEs in a TLB.   it is important to flush out old
199 * TLB mappings when making a change to a mapping.   writing to the
200 * %cr3 will flush the entire TLB.    newer processors also have an
201 * instruction that will invalidate the mapping of a single page (which
202 * is useful if you are changing a single mapping because it preserves
203 * all the cached TLB entries).
204 *
 * as shown above, bits <35-12> of the PTE contain the PA of the page
 * being mapped.   the rest of the PTE is defined as follows:
207 *   bit#	name	use
208 *   63		NX	no-execute bit (0=ITLB, 1=DTLB), optional
209 *   11		n/a	available for OS use, hardware ignores it
210 *   10		n/a	available for OS use, hardware ignores it
211 *   9		n/a	available for OS use, hardware ignores it
212 *   8		G	global bit (see discussion below)
 *   7		PS	page size [for PDEs] (0=4k, 1=2M <if supported>)
214 *   6		D	dirty (modified) page
215 *   5		A	accessed (referenced) page
216 *   4		PCD	cache disable
 *   3		PWT	page write-through (cache)
218 *   2		U/S	user/supervisor bit (0=supervisor only, 1=both u&s)
219 *   1		R/W	read/write bit (0=read only, 1=read-write)
220 *   0		P	present (valid)
221 *
222 * notes:
223 *  - on the i386 the R/W bit is ignored if processor is in supervisor
224 *    state (bug!)
225 *  - PS is only supported on newer processors
226 *  - PTEs with the G bit are global in the sense that they are not
227 *    flushed from the TLB when %cr3 is written (to flush, use the
228 *    "flush single page" instruction).   this is only supported on
229 *    newer processors.    this bit can be used to keep the kernel's
230 *    TLB entries around while context switching.   since the kernel
231 *    is mapped into all processes at the same place it does not make
232 *    sense to flush these entries when switching from one process'
233 *    pmap to another.
234 */
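/*
 * To make the bit layout above concrete, here is one worked example
 * (the value is made up, for illustration only): the 64-bit PTE
 *
 *	0x8000000123456067
 *
 * decodes as NX (bit 63) set, page frame = PA 0x123456000 (bits <35-12>,
 * note this is above 4GB and thus only reachable with PAE), and low flag
 * bits 0x067 = P | R/W | U/S | A | D, i.e. a present, writable,
 * user-accessible page that has been referenced and modified but must
 * not be executed.
 */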
235/*
236 * A pmap describes a process' 4GB virtual address space.  This
237 * virtual address space can be broken up into 2048 2MB regions which
238 * are described by PDEs in the PDP.  The PDEs are defined as follows:
239 *
240 * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
241 * The following assumes that KERNBASE is 0xd0000000.
242 *
243 * PDE#s	VA range		Usage
244 * 0->1660	0x0 -> 0xcf800000	user address space, note that the
245 *					max user address is 0xcfbfe000
246 *					the final two pages in the last 4MB
247 *					used to be reserved for the UAREA
248 *					but now are no longer used.
 * 1660->1664	0xcf800000->		recursive mapping of PDP (used for
250 *			0xd0000000	linear mapping of PTPs).
251 * 1664->2044	0xd0000000->		kernel address space (constant
252 *			0xff800000	across all pmaps/processes).
 * 2044->2048	0xff800000->		"alternate" recursive PDP mapping
254 *			<end>		(for other pmaps).
255 *
256 *
257 * Note: A recursive PDP mapping provides a way to map all the PTEs for
258 * a 4GB address space into a linear chunk of virtual memory.  In other
 * words, the PTE for page 0 is the first 8 bytes mapped into the 8MB
 * recursive area.  The PTE for page 1 is the second 8 bytes.  The very
 * last 8 bytes in the 8MB range is the PTE that maps VA 0xffffe000 (the
 * last page of the 4GB address space).
263 *
 * All pmaps' PDs must have the same values in slots 1664->2043 so that
265 * the kernel is always mapped in every process.  These values are loaded
266 * into the PD at pmap creation time.
267 *
268 * At any one time only one pmap can be active on a processor.  This is
269 * the pmap whose PDP is pointed to by processor register %cr3.  This pmap
270 * will have all its PTEs mapped into memory at the recursive mapping
 * point (slots #1660-3 as shown above).  When the pmap code wants to find the
272 * PTE for a virtual address, all it has to do is the following:
273 *
274 * Address of PTE = (1660 * 2MB) + (VA / NBPG) * sizeof(pt_entry_t)
275 *                = 0xcf800000 + (VA / 4096) * 8
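 *
 * As a worked example (illustrative only): for VA 0xd0001000 this gives
 * 0xcf800000 + 0xd0001 * 8 = 0xcfe80008, which is exactly what the
 * vtopte() macro below computes.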
276 *
277 * What happens if the pmap layer is asked to perform an operation
278 * on a pmap that is not the one which is currently active?  In that
279 * case we take the PA of the PDP of the non-active pmap and put it in
280 * slots 2044-7 of the active pmap.  This causes the non-active pmap's
 * PTEs to get mapped in the final 8MB of the 4GB address space
 * (i.e. starting at 0xff800000).
283 *
284 * The following figure shows the effects of the recursive PDP mapping:
285 *
286 *   PDP (%cr3->PDPTP)
287 *   +----+
288 *   |   0| -> PTP#0 that maps VA 0x0 -> 0x200000
289 *   |    |
290 *   |    |
291 *   |1660| -> points back to PDP (%cr3) mapping VA 0xcf800000 -> 0xd0000000
292 *   |1661|    (PDP is 4 pages)
293 *   |1662|
294 *   |1663|
 *   |1664| -> first kernel PTP (maps 0xd0000000 -> 0xd0200000)
296 *   |    |
297 *   |2044| -> points to alternate pmap's PDP (maps 0xff800000 -> end)
298 *   |2045|
299 *   |2046|
300 *   |2047|
301 *   +----+
302 *
 * Note that the VA mapped by PDE#1660 (0xcf800000) is defined as "PTE_BASE".
 * Note that the VA mapped by PDE#2044 (0xff800000) is defined as "APTE_BASE".
 *
 * Starting at VA 0xcf800000 (PTE_BASE) the four pages of the currently
 * active PDP (%cr3) themselves act as the PTPs that map the linear PTE
 * area:
308 *
309 * PTP#1660-3 == PDP(%cr3) => maps VA 0xcf800000 -> 0xd0000000
310 *   +----+
311 *   |   0| -> maps the contents of PTP#0 at VA 0xcf800000->0xcf801000
312 *   |    |
313 *   |    |
314 *   |1660| -> maps the contents of PTP#1660 (the PDP) at VA 0xcfe7c000
315 *   |1661|
316 *   |1662|
317 *   |1663|
318 *   |1664| -> maps the contents of first kernel PTP
319 *   |    |
320 *   |2047|
321 *   +----+
322 *
 * Note that the mapping of the PDP at PTP#1660's VA (0xcfe7c000) is
324 * defined as "PDP_BASE".... within that mapping there are two
325 * defines:
326 *   "PDP_PDE" (0xcfe7f3e0) is the VA of the PDE in the PDP
327 *      which points back to itself.
 *   "APDP_PDE" (0xcfe7ffe0) is the VA of the PDE in the PDP which
329 *      establishes the recursive mapping of the alternate pmap.
 *      To set the alternate PDP, one just has to put the correct
 *	PA info in the four entries starting at APDP_PDE.
332 *
333 * Note that in the APTE_BASE space, the APDP appears at VA
334 * "APDP_BASE" (0xffffc000).
335 *
 * unfortunately, we cannot use a recursive mapping for the PDPT itself
 * from within the page tables because %cr3 is only 32 bits wide.
338 *
339 */
340#define PG_FRAME	0xffffff000ULL	/* page frame mask */
341#define PG_LGFRAME	0xfffe00000ULL	/* large (2M) page frame mask */
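
/*
 * For reference: PG_FRAME covers bits <35-12>, so a PAE PTE can name
 * physical pages anywhere in the first 64GB (2^36 bytes) of physical
 * address space, even though a single virtual address space is still
 * limited to 4GB.
 */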
342
343/*
344 * Redefine the PDSHIFT and NBPD macros for PAE
345 */
346#undef PDSHIFT
347#define PDSHIFT		21		/* page directory address shift */
348#undef NBPD
349#define NBPD		(1U << PDSHIFT)	/* # bytes mapped by PD (2MB) */
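
/*
 * Sanity check of the geometry (for reference): 4 PDP pages, each with
 * 512 PDEs mapping 2MB apiece, cover 4 * 512 * 2MB = 4GB, i.e. the four
 * page directory pages exactly cover the 32-bit virtual address space.
 */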
350
351#define PDSHIFT86	22		/* for pmap86 transfer */
352
353#undef PDSLOT_PTE
354#define PDSLOT_PTE	(1660U)	/* 1660: for recursive PDP map */
355#undef PDSLOT_KERN
356#define PDSLOT_KERN	(1664U)	/* 1664: start of kernel space */
357#undef PDSLOT_APTE
358#define PDSLOT_APTE	(2044U)	/* 2044: alternative recursive slot */
359
360/*
361 * The following defines give the virtual addresses of various MMU
362 * data structures:
363 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
364 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
365 */
366#define PTE_BASE	((pt_entry_t *) (PDSLOT_PTE * NBPD))
367#define APTE_BASE	((pt_entry_t *) (PDSLOT_APTE * NBPD))
368#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
369#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
370#define PDP_PDE		(PDP_BASE + PDSLOT_PTE)
371#define APDP_PDE	(PDP_BASE + PDSLOT_APTE)
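
/*
 * For the slot numbers above, these macros evaluate to the following
 * constants (derived by hand here for cross-checking against the big
 * comment at the top of this file):
 *
 *	PTE_BASE  = 1660 * 2MB              = 0xcf800000
 *	APTE_BASE = 2044 * 2MB              = 0xff800000
 *	PDP_BASE  = PTE_BASE  + 1660 * 4096 = 0xcfe7c000
 *	APDP_BASE = APTE_BASE + 2044 * 4096 = 0xffffc000
 *	PDP_PDE   = PDP_BASE  + 1660 * 8    = 0xcfe7f3e0
 *	APDP_PDE  = PDP_BASE  + 2044 * 8    = 0xcfe7ffe0
 */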
372
373/*
374 * pdei/ptei: generate index into PDP/PTP from a VA
375 */
376#define PD_MASK		0xffe00000	/* page directory address bits */
377#define PT_MASK		0x001ff000	/* page table address bits */
378#define pdei(VA)	(((VA) & PD_MASK) >> PDSHIFT)
379#define ptei(VA)	(((VA) & PT_MASK) >> PGSHIFT)
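
/*
 * Example (illustrative only): for VA 0xd0123456,
 *	pdei(va) = (0xd0000000 >> 21) = 1664	(== PDSLOT_KERN, a kernel slot)
 *	ptei(va) = (0x00123000 >> 12) = 0x123	(PTE #291 within that PTP)
 * and the remaining bits <11-0> (0x456) are the offset within the page.
 */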
380
381#define PD_MASK86	0xffc00000	/* for pmap86 transfer */
382#define PT_MASK86	0x003ff000	/* for pmap86 transfer */
383
384/*
385 * Mach derived conversion macros
386 */
387#define i386_round_pdr(x)	((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
388
389/*
390 * various address macros
391 *
392 *  vtopte: return a pointer to the PTE mapping a VA
393 */
#define vtopte(VA)	(PTE_BASE + atop((vaddr_t)(VA)))
395
396/*
397 * PTP macros:
398 *   A PTP's index is the PD index of the PDE that points to it.
399 *   A PTP's offset is the byte-offset in the PTE space that this PTP is at.
400 *   A PTP's VA is the first VA mapped by that PTP.
401 *
402 * Note that NBPG == number of bytes in a PTP (4096 bytes == 512 entries)
403 *           NBPD == number of bytes a PTP can map (2MB)
404 */
405
406#define ptp_i2o(I)	((I) * NBPG)	/* index => offset */
407#define ptp_o2i(O)	((O) / NBPG)	/* offset => index */
408#define ptp_i2v(I)	((I) * NBPD)	/* index => VA */
409#define ptp_v2i(V)	((V) / NBPD)	/* VA => index (same as pdei) */
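
/*
 * Example (illustrative only): the kernel PTP covering KERNBASE
 * (0xd0000000) has index ptp_v2i(0xd0000000) = 1664 and lives at offset
 * ptp_i2o(1664) = 1664 * 4096 = 0x680000 within the pmap's pm_obj.
 */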
410
411/*
412 * Access PD and PT
413 */
414#define PDE(pm,i)	(((pd_entry_t *)(pm)->pm_pdir)[(i)])
415
416/*
417 * here we define the data types for PDEs and PTEs for PAE
418 */
419typedef u_int64_t pd_entry_t;		/* PDE */
420typedef u_int64_t pt_entry_t;		/* PTE */
421
422#define PG_NX	0x8000000000000000ULL	/* execute-disable */
423
424/*
425 * Number of PTEs per cache line. 8 byte pte, 64-byte cache line
426 * Used to avoid false sharing of cache lines.
427 */
428#define NPTECL			8
429
430/*
431 * other data structures
432 */
433
434extern u_int32_t protection_codes[];	/* maps MI prot to i386 prot code */
435extern int pmap_initialized;	/* pmap_init done yet? */
436
437/* Segment boundaries */
438extern vaddr_t kernel_text, etext, __rodata_start, erodata, __data_start;
439extern vaddr_t edata, __bss_start, end, ssym, esym, PTmap;
440
441/*
 * MULTIPROCESSOR: special VAs/PTEs are actually allocated inside a
443 * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing
444 * due to false sharing.
445 */
446
447#ifdef MULTIPROCESSOR
448#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
449#define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
450#else
451#define PTESLEW(pte, id) (pte)
452#define VASLEW(va,id) (va)
453#endif
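
/*
 * To illustrate (assuming the constants above): for CPU id 1, PTESLEW()
 * advances a special PTE pointer by NPTECL (8) 8-byte entries, i.e. by
 * exactly one 64-byte cache line, and VASLEW() advances the matching VA
 * by 8 pages (32KB), so each CPU touches its special PTEs in its own
 * cache line and maps through its own private window.
 */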
454
455/*
456 * special VAs and the PTEs that map them
457 */
458
459static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte;
460extern caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp;
461
462extern int pmap_pg_g;
463extern int pmap_pg_wc;
464extern struct pmap_head pmaps;
465extern struct mutex pmaps_lock;
466
467extern uint32_t	cpu_meltdown;
468
469/*
470 * local prototypes
471 */
472struct vm_page	*pmap_alloc_ptp_pae(struct pmap *, int, pt_entry_t);
473struct vm_page	*pmap_get_ptp_pae(struct pmap *, int);
474void		 pmap_drop_ptp_pae(struct pmap *, vaddr_t, struct vm_page *,
475    pt_entry_t *);
476pt_entry_t	*pmap_map_ptes_pae(struct pmap *);
477void		 pmap_unmap_ptes_pae(struct pmap *);
478void		 pmap_do_remove_pae(struct pmap *, vaddr_t, vaddr_t, int);
479void		 pmap_remove_ptes_pae(struct pmap *, struct vm_page *,
480		     vaddr_t, vaddr_t, vaddr_t, int, struct pv_entry **);
481void		 pmap_sync_flags_pte_pae(struct vm_page *, pt_entry_t);
482
483static __inline u_int
484pmap_pte2flags(pt_entry_t pte)
485{
486	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
487	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
488}
489
490void
491pmap_sync_flags_pte_pae(struct vm_page *pg, pt_entry_t pte)
492{
493	if (pte & (PG_U|PG_M)) {
494		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
495	}
496}
497
498/*
499 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
500 *
501 * => we lock enough pmaps to keep things locked in
502 * => must be undone with pmap_unmap_ptes before returning
503 */
504
505pt_entry_t *
506pmap_map_ptes_pae(struct pmap *pmap)
507{
508	pd_entry_t opde;
509
510	/* the kernel's pmap is always accessible */
511	if (pmap == pmap_kernel()) {
512		return(PTE_BASE);
513	}
514
515	mtx_enter(&pmap->pm_mtx);
516
517	/* if curpmap then we are always mapped */
518	if (pmap_is_curpmap(pmap)) {
519		return(PTE_BASE);
520	}
521
522	mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx);
523
524	/* need to load a new alternate pt space into curpmap? */
525	opde = *APDP_PDE;
526#if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC)
527	if (pmap_valid_entry(opde))
528		panic("pmap_map_ptes_pae: APTE valid");
529#endif
530	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdidx[0]) {
531		APDP_PDE[0] = pmap->pm_pdidx[0] | PG_RW | PG_V | PG_U | PG_M;
532		APDP_PDE[1] = pmap->pm_pdidx[1] | PG_RW | PG_V | PG_U | PG_M;
533		APDP_PDE[2] = pmap->pm_pdidx[2] | PG_RW | PG_V | PG_U | PG_M;
534		APDP_PDE[3] = pmap->pm_pdidx[3] | PG_RW | PG_V | PG_U | PG_M;
535		if (pmap_valid_entry(opde))
536			pmap_apte_flush();
537	}
538	return(APTE_BASE);
539}
540
541/*
542 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
543 */
544
545void
546pmap_unmap_ptes_pae(struct pmap *pmap)
547{
548	if (pmap == pmap_kernel())
549		return;
550
551	if (!pmap_is_curpmap(pmap)) {
552#if defined(MULTIPROCESSOR)
553		APDP_PDE[0] = 0;
554		APDP_PDE[1] = 0;
555		APDP_PDE[2] = 0;
556		APDP_PDE[3] = 0;
557		pmap_apte_flush();
558#endif
559		mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx);
560	}
561
562	mtx_leave(&pmap->pm_mtx);
563}
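
/*
 * A minimal sketch of the intended map/access/unmap pattern (see
 * pmap_extract_pae() below for a real caller; "pm" and "va" are
 * placeholders):
 *
 *	pt_entry_t *ptes, pte;
 *
 *	ptes = pmap_map_ptes_pae(pm);			locks pm
 *	if (pmap_valid_entry(PDE(pm, pdei(va))))
 *		pte = ptes[atop(va)];			linear PTE access
 *	pmap_unmap_ptes_pae(pm);			unlocks pm
 */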
564
565u_int32_t
566pmap_pte_set_pae(vaddr_t va, paddr_t pa, u_int32_t bits)
567{
568	pt_entry_t pte, *ptep = vtopte(va);
569	uint64_t nx;
570
571	pa &= PMAP_PA_MASK;
572
573	if (bits & PG_X)
574		nx = 0;
575	else
576		nx = PG_NX;
577
578	pte = i386_atomic_testset_uq(ptep, pa | bits | nx);  /* zap! */
579	return (pte & ~PG_FRAME);
580}
581
582u_int32_t
583pmap_pte_setbits_pae(vaddr_t va, u_int32_t set, u_int32_t clr)
584{
585	pt_entry_t *ptep = vtopte(va);
586	pt_entry_t pte = *ptep;
587
588	i386_atomic_testset_uq(ptep, (pte | set) & ~(pt_entry_t)clr);
589	return (pte & ~PG_FRAME);
590}
591
592u_int32_t
593pmap_pte_bits_pae(vaddr_t va)
594{
595	pt_entry_t *ptep = vtopte(va);
596
597	return (*ptep & ~PG_FRAME);
598}
599
600paddr_t
601pmap_pte_paddr_pae(vaddr_t va)
602{
603	pt_entry_t *ptep = vtopte(va);
604
605	return (*ptep & PG_FRAME);
606}
607
608/*
609 * Switch over to PAE page tables
610 */
611void
612pmap_bootstrap_pae(void)
613{
614	extern int nkpde;
615	struct pmap *kpm = pmap_kernel();
616	struct vm_page *ptp;
617	paddr_t ptaddr;
618	u_int32_t bits;
619	vaddr_t va, eva;
620	pt_entry_t pte;
621
622	if ((cpu_feature & CPUID_PAE) == 0 ||
623	    (ecpu_feature & CPUID_NXE) == 0)
624		return;
625
626	cpu_pae = 1;
627
628	DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n", __func__,
629	    (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
630	    kpm->pm_pdirsize);
631
632	va = (vaddr_t)kpm->pm_pdir;
633	kpm->pm_pdidx[0] = (va + 0*NBPG - KERNBASE) | PG_V;
634	kpm->pm_pdidx[1] = (va + 1*NBPG - KERNBASE) | PG_V;
635	kpm->pm_pdidx[2] = (va + 2*NBPG - KERNBASE) | PG_V;
636	kpm->pm_pdidx[3] = (va + 3*NBPG - KERNBASE) | PG_V;
637	/* map pde recursively into itself */
638	PDE(kpm, PDSLOT_PTE+0) = kpm->pm_pdidx[0] | PG_KW | PG_M | PG_U;
639	PDE(kpm, PDSLOT_PTE+1) = kpm->pm_pdidx[1] | PG_KW | PG_M | PG_U;
640	PDE(kpm, PDSLOT_PTE+2) = kpm->pm_pdidx[2] | PG_KW | PG_M | PG_U;
641	PDE(kpm, PDSLOT_PTE+3) = kpm->pm_pdidx[3] | PG_KW | PG_M | PG_U;
642
643	/* transfer all kernel mappings over into pae tables */
644	for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86);
645	    va < eva; va += PAGE_SIZE) {
646		if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
647			ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL,
648			    UVM_PGA_ZERO);
649			if (ptp == NULL)
650				panic("%s: uvm_pagealloc() failed", __func__);
651			ptaddr = VM_PAGE_TO_PHYS(ptp);
652			PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V |
653			    PG_U | PG_M;
654			pmap_pte_set_86((vaddr_t)vtopte(va),
655			    ptaddr, PG_KW | PG_V | PG_U | PG_M);
656
657			/* count PTP as resident */
658			kpm->pm_stats.resident_count++;
659		}
660		bits = pmap_pte_bits_86(va) | pmap_pg_g;
661
662		/*
663		 * At this point, ideally only kernel text should be executable.
664		 * However, we need to leave the ISA hole executable to handle
665		 * bios32, pcibios, and apmbios calls that may potentially
666		 * happen later since we don't know (yet) which of those may be
667		 * in use. Later (in biosattach), we will reset the permissions
668		 * according to what we actually need.
669		 */
670		if ((va >= (vaddr_t)&kernel_text && va <= (vaddr_t)&etext) ||
671		    (va >= (vaddr_t)atdevbase && va <=
672		     (vaddr_t)(atdevbase + IOM_SIZE)))
673			bits |= PG_X;
674		else
675			bits &= ~PG_X;
676
677		if (pmap_valid_entry(bits))
678			pmap_pte_set_pae(va, pmap_pte_paddr_86(va), bits);
679	}
680
681	/* Transfer special mappings */
682	if (kpm->pm_pdir_intel) {
683		uint32_t	*pd, *ptp;
684		uint32_t	 l1idx, l2idx;
685		paddr_t		 npa;
686		struct vm_page	*ptppg;
687
688		pd = (uint32_t *)kpm->pm_pdir_intel;
689		kpm->pm_pdir_intel = kpm->pm_pdirpa_intel = 0;
690
691		for (va = KERNBASE, eva = va + (nkpde << PDSHIFT86); va < eva;
692		    va += PAGE_SIZE) {
693			l1idx = ((va & PT_MASK86) >> PGSHIFT);
694			l2idx = ((va & PD_MASK86) >> PDSHIFT86);
695
696			if (!pmap_valid_entry(pd[l2idx]))
697				continue;
698
699			npa = pd[l2idx]	& PMAP_PA_MASK;
700			ptppg = PHYS_TO_VM_PAGE(npa);
701			mtx_enter(&ptppg->mdpage.pv_mtx);
702
703			/* still running on pmap86 */
704			ptp = (uint32_t *)pmap_tmpmap_pa_86(npa);
705
706			if (!pmap_valid_entry(ptp[l1idx])) {
707				mtx_leave(&ptppg->mdpage.pv_mtx);
708				pmap_tmpunmap_pa_86();
709				continue;
710			}
711			DPRINTF("%s: va 0x%x l2idx %u 0x%x lx1idx %u 0x%x\n",
712			    __func__, (uint32_t)va, l2idx, (uint32_t)pd[l2idx],
713			    l1idx, (uint32_t)ptp[l1idx]);
714
715			/* protection and cacheability */
716			bits = ptp[l1idx] & (PG_PROT|PG_N|PG_WT);
717			npa = ptp[l1idx] & PMAP_PA_MASK;
718
719			/* still running on pmap86 */
720			pmap_tmpunmap_pa_86();
721			mtx_leave(&ptppg->mdpage.pv_mtx);
722
723			/* enforce use of pmap86 */
724			cpu_pae = 0;
725			pmap_enter_special_pae(va, npa, 0, bits);
726			cpu_pae = 1;
727
728			if (--ptppg->wire_count == 1) {
729				ptppg->wire_count = 0;
730				uvm_pagerealloc(ptppg, NULL, 0);
731				DPRINTF("%s: freeing PT page 0x%x\n", __func__,
732				    (uint32_t)VM_PAGE_TO_PHYS(ptppg));
733			}
734		}
735		km_free(pd, NBPG, &kv_any, &kp_dirty);
736		DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
737	}
738
739	if (!cpu_paenable(&kpm->pm_pdidx[0])) {
740		extern struct user *proc0paddr;
741
742		proc0paddr->u_pcb.pcb_cr3 = kpm->pm_pdirpa =
743		    (vaddr_t)kpm - KERNBASE;
744		kpm->pm_pdirsize = 4 * NBPG;
745
746		/* Reset cr3 for NMI task switch */
747		cpu_update_nmi_cr3(kpm->pm_pdirpa);
748
749		DPRINTF("%s: pm_pdir 0x%x pm_pdirpa 0x%x pm_pdirsize %d\n",
750		    __func__, (uint32_t)kpm->pm_pdir, (uint32_t)kpm->pm_pdirpa,
751		    kpm->pm_pdirsize);
752
753		csrc_pte = vtopte(pmap_csrcp);
754		cdst_pte = vtopte(pmap_cdstp);
755		zero_pte = vtopte(pmap_zerop);
756		ptp_pte = vtopte(pmap_ptpp);
757		flsh_pte = vtopte(pmap_flshp);
758
759		nkpde *= 2;
760		nkptp_max = 2048 - PDSLOT_KERN - 4;
761
762		pmap_pte_set_p = pmap_pte_set_pae;
763		pmap_pte_setbits_p = pmap_pte_setbits_pae;
764		pmap_pte_bits_p = pmap_pte_bits_pae;
765		pmap_pte_paddr_p = pmap_pte_paddr_pae;
766		pmap_clear_attrs_p = pmap_clear_attrs_pae;
767		pmap_enter_p = pmap_enter_pae;
768		pmap_enter_special_p = pmap_enter_special_pae;
769		pmap_extract_p = pmap_extract_pae;
770		pmap_growkernel_p = pmap_growkernel_pae;
771		pmap_page_remove_p = pmap_page_remove_pae;
772		pmap_do_remove_p = pmap_do_remove_pae;
773		pmap_test_attrs_p = pmap_test_attrs_pae;
774		pmap_unwire_p = pmap_unwire_pae;
775		pmap_write_protect_p = pmap_write_protect_pae;
776		pmap_pinit_pd_p = pmap_pinit_pd_pae;
777		pmap_zero_phys_p = pmap_zero_phys_pae;
778		pmap_zero_page_uncached_p = pmap_zero_page_uncached_pae;
779		pmap_copy_page_p = pmap_copy_page_pae;
780
781		bzero((void *)kpm->pm_pdir + 8, (PDSLOT_PTE-1) * 8);
782		/* TODO also reclaim old PDPs */
783	}
784
785	/* Set region permissions */
786	for (va = (vaddr_t)&PTmap; va < KERNBASE; va += NBPD) {
787		pte = PDE(kpm, pdei(va));
788		PDE(kpm, pdei(va)) = pte | PG_NX;
789	}
790
791	va = (vaddr_t)APTE_BASE;
792	pte = PDE(kpm, pdei(va));
793	PDE(kpm, pdei(va)) = pte | PG_NX;
794
795	pmap_write_protect(kpm, (vaddr_t)&kernel_text, (vaddr_t)&etext,
796	    PROT_READ | PROT_EXEC);
797	pmap_write_protect(kpm, (vaddr_t)&__rodata_start,
798	    (vaddr_t)&erodata, PROT_READ);
799	pmap_write_protect(kpm, (vaddr_t)&__data_start, (vaddr_t)&edata,
800	    PROT_READ | PROT_WRITE);
801	pmap_write_protect(kpm, (vaddr_t)&__bss_start, (vaddr_t)&end,
802	    PROT_READ | PROT_WRITE);
803
804#if defined(DDB) || NKSYMS > 0
805	pmap_write_protect(kpm, ssym, esym, PROT_READ);
806#endif
807}
808
809/*
810 * p t p   f u n c t i o n s
811 */
812
813/*
814 * pmap_alloc_ptp: allocate a PTP for a PMAP
815 *
816 * => pmap should already be locked by caller
817 * => we use the ptp's wire_count to count the number of active mappings
818 *	in the PTP (we start it at one to prevent any chance this PTP
819 *	will ever leak onto the active/inactive queues)
820 * => we should not be holding any pv_head locks (in case we are forced
821 *	to call pmap_steal_ptp())
822 * => we may need to lock pv_head's if we have to steal a PTP
823 */
824
825struct vm_page *
826pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, pt_entry_t pde_flags)
827{
828	struct vm_page *ptp;
829	pd_entry_t *pva_intel;
830
831	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
832			    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
833	if (ptp == NULL)
834		return (NULL);
835
836	/* got one! */
837	atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
838	ptp->wire_count = 1;	/* no mappings yet */
839	PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) |
840	    PG_RW | PG_V | PG_M | PG_U | pde_flags);
841
842	/*
843	 * Meltdown special case - if we are adding a new PDE for
844	 * usermode addresses, just copy the PDE to the U-K
845	 * table.
846	 */
847	if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) {
848		pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
849		pva_intel[pde_index] = PDE(pmap, pde_index);
850		DPRINTF("%s: copying usermode PDE (content=0x%llx) pde_index "
851		    "%d from 0x%llx -> 0x%llx\n", __func__,
852		    PDE(pmap, pde_index), pde_index,
853		    (uint64_t)&PDE(pmap, pde_index),
854		    (uint64_t)&(pva_intel[pde_index]));
855	}
856
857	pmap->pm_stats.resident_count++;	/* count PTP as resident */
858	pmap->pm_ptphint = ptp;
859	return(ptp);
860}
861
862/*
863 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
864 *
865 * => pmap should NOT be pmap_kernel()
866 * => pmap should be locked
867 */
868
869struct vm_page *
870pmap_get_ptp_pae(struct pmap *pmap, int pde_index)
871{
872	struct vm_page *ptp;
873
874	if (pmap_valid_entry(PDE(pmap, pde_index))) {
875		/* valid... check hint (saves us a PA->PG lookup) */
876		if (pmap->pm_ptphint &&
877		    (PDE(pmap, pde_index) & PG_FRAME) ==
878		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
879			return(pmap->pm_ptphint);
880
881		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
882#ifdef DIAGNOSTIC
883		if (ptp == NULL)
884			panic("pmap_get_ptp_pae: unmanaged user PTP");
885#endif
886		pmap->pm_ptphint = ptp;
887		return(ptp);
888	}
889
890	/* allocate a new PTP (updates ptphint) */
891	return (pmap_alloc_ptp_pae(pmap, pde_index, PG_u));
892}
893
894void
895pmap_drop_ptp_pae(struct pmap *pm, vaddr_t va, struct vm_page *ptp,
896    pt_entry_t *ptes)
897{
898	pd_entry_t *pva_intel;
899
900	i386_atomic_testset_uq(&PDE(pm, pdei(va)), 0);
901	pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset);
902#ifdef MULTIPROCESSOR
903	/*
904	 * Always shoot down the other pmap's
905	 * self-mapping of the PTP.
906	 */
907	pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset);
908#endif
909	pm->pm_stats.resident_count--;
910	/* update hint */
911	if (pm->pm_ptphint == ptp)
912		pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt);
913	ptp->wire_count = 0;
914	/* Postpone free to after shootdown. */
915	uvm_pagerealloc(ptp, NULL, 0);
916
917	if (pm->pm_pdir_intel) {
918		KASSERT(va < VM_MAXUSER_ADDRESS);
919		/* Zap special meltdown PDE */
920		pva_intel = (pd_entry_t *)pm->pm_pdir_intel;
921		i386_atomic_testset_uq(&pva_intel[pdei(va)], 0);
922		DPRINTF("%s: cleared meltdown PDE @ index %lu "
923		    "(va range start 0x%x)\n", __func__, pdei(va),
924		    (uint32_t)va);
925	}
926}
927
928/*
929 * pmap_pinit_pd: given a freshly allocated pmap structure, give it a PD
930 */
931void
932pmap_pinit_pd_pae(struct pmap *pmap)
933{
934	extern int nkpde;
935	vaddr_t va;
936	paddr_t pdidx[4];
937
938	/* allocate PDP */
939	pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_dirty,
940	    &kd_waitok);
941	if (pmap->pm_pdir == 0)
942		panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
943	/* page index is in the pmap! */
944	pmap_extract(pmap_kernel(), (vaddr_t)pmap, &pmap->pm_pdirpa);
945	va = (vaddr_t)pmap->pm_pdir;
946	pmap_extract(pmap_kernel(), va + 0*NBPG, &pdidx[0]);
947	pmap_extract(pmap_kernel(), va + 1*NBPG, &pdidx[1]);
948	pmap_extract(pmap_kernel(), va + 2*NBPG, &pdidx[2]);
949	pmap_extract(pmap_kernel(), va + 3*NBPG, &pdidx[3]);
950	pmap->pm_pdidx[0] = (uint64_t)pdidx[0];
951	pmap->pm_pdidx[1] = (uint64_t)pdidx[1];
952	pmap->pm_pdidx[2] = (uint64_t)pdidx[2];
953	pmap->pm_pdidx[3] = (uint64_t)pdidx[3];
954	pmap->pm_pdidx[0] |= PG_V;
955	pmap->pm_pdidx[1] |= PG_V;
956	pmap->pm_pdidx[2] |= PG_V;
957	pmap->pm_pdidx[3] |= PG_V;
958	pmap->pm_pdirsize = 4 * NBPG;
959
960	/* init PDP */
961	/* zero init area */
962	bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
963	/* put in recursive PDE to map the PTEs */
964	PDE(pmap, PDSLOT_PTE+0) = pmap->pm_pdidx[0] | PG_KW | PG_U |
965	    PG_M | PG_V | PG_NX;
966	PDE(pmap, PDSLOT_PTE+1) = pmap->pm_pdidx[1] | PG_KW | PG_U |
967	    PG_M | PG_V | PG_NX;
968	PDE(pmap, PDSLOT_PTE+2) = pmap->pm_pdidx[2] | PG_KW | PG_U |
969	    PG_M | PG_V | PG_NX;
970	PDE(pmap, PDSLOT_PTE+3) = pmap->pm_pdidx[3] | PG_KW | PG_U |
971	    PG_M | PG_V | PG_NX;
972
973	/*
974	 * we need to lock pmaps_lock to prevent nkpde from changing on
975	 * us.   note that there is no need to splvm to protect us from
976	 * malloc since malloc allocates out of a submap and we should have
977	 * already allocated kernel PTPs to cover the range...
978	 */
979	/* put in kernel VM PDEs */
980	bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
981	       nkpde * sizeof(pd_entry_t));
982	/* zero the rest */
983	bzero(&PDE(pmap, PDSLOT_KERN + nkpde), pmap->pm_pdirsize -
984	    ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
985
986	/*
987	 * Intel CPUs need a special page table to be used during usermode
988	 * execution, one that lacks all kernel mappings.
989	 */
990	if (cpu_meltdown) {
991		int i;
992
993		va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_zero, &kd_waitok);
994		if (va == 0)
995			panic("%s: kernel_map out of virtual space!", __func__);
996		if (!pmap_extract(pmap_kernel(),
997		    (vaddr_t)&pmap->pm_pdidx_intel, &pmap->pm_pdirpa_intel))
998			panic("%s: can't locate PDPT", __func__);
999		pmap->pm_pdir_intel = va;
1000
1001		for (i = 0; i < 4; i++) {
1002			pmap->pm_pdidx_intel[i] = 0;
1003			if (!pmap_extract(pmap, va + i * NBPG,
1004			    (paddr_t *)&pmap->pm_pdidx_intel[i]))
1005				panic("%s: can't locate PD page", __func__);
1006			pmap->pm_pdidx_intel[i] |= PG_V;
1007			DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
1008			    i, pmap->pm_pdidx_intel[i]);
1009		}
1010
1011		/* Copy PDEs from pmap_kernel's U-K view */
1012		bcopy((void *)pmap_kernel()->pm_pdir_intel,
1013		    (void *)pmap->pm_pdir_intel, 4 * NBPG);
1014
1015		DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx "
1016		    "pdir_intel 0x%lx pdirpa_intel 0x%lx\n",
1017		    __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa,
1018		    pmap->pm_pdir_intel, pmap->pm_pdirpa_intel);
1019	} else {
1020		pmap->pm_pdir_intel = 0;
1021		pmap->pm_pdirpa_intel = 0;
1022	}
1023
1024	mtx_enter(&pmaps_lock);
1025	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1026	mtx_leave(&pmaps_lock);
1027}
1028
1029/*
1030 * some misc. functions
1031 */
1032
1033/*
1034 * pmap_extract: extract a PA for the given VA
1035 */
1036
1037int
1038pmap_extract_pae(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1039{
1040	pt_entry_t *ptes, pte;
1041
1042	ptes = pmap_map_ptes_pae(pmap);
1043	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1044		pte = ptes[atop(va)];
1045		pmap_unmap_ptes_pae(pmap);
1046		if (!pmap_valid_entry(pte))
1047			return 0;
1048		if (pap != NULL)
1049			*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
1050		return 1;
1051	}
1052	pmap_unmap_ptes_pae(pmap);
1053	return 0;
1054}
1055
1056extern void (*pagezero)(void *, size_t);
1057
1058/*
1059 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1060 * initialized.
1061 */
1062void
1063pmap_zero_phys_pae(paddr_t pa)
1064{
1065#ifdef MULTIPROCESSOR
1066	int id = cpu_number();
1067#endif
1068	pt_entry_t *zpte = PTESLEW(zero_pte, id);
1069	caddr_t zerova = VASLEW(pmap_zerop, id);
1070
1071#ifdef DIAGNOSTIC
1072	if (*zpte)
1073		panic("pmap_zero_phys_pae: lock botch");
1074#endif
1075
1076	*zpte = (pa & PG_FRAME) | PG_V | PG_RW;	/* map in */
1077	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
1078	pagezero(zerova, PAGE_SIZE);		/* zero */
1079	*zpte = 0;
1080}
1081
1082/*
1083 * pmap_zero_page_uncached: the same, except uncached.
1084 */
1085
1086int
1087pmap_zero_page_uncached_pae(paddr_t pa)
1088{
1089#ifdef MULTIPROCESSOR
1090	int id = cpu_number();
1091#endif
1092	pt_entry_t *zpte = PTESLEW(zero_pte, id);
1093	caddr_t zerova = VASLEW(pmap_zerop, id);
1094
1095#ifdef DIAGNOSTIC
1096	if (*zpte)
1097		panic("pmap_zero_page_uncached_pae: lock botch");
1098#endif
1099
1100	*zpte = (pa & PG_FRAME) | PG_V | PG_RW | PG_N;	/* map in */
1101	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
1102	pagezero(zerova, PAGE_SIZE);		/* zero */
1103	*zpte = 0;
1104
1105	return 1;
1106}
1107
1108/*
1109 * pmap_copy_page: copy a page
1110 */
1111
1112void
1113pmap_copy_page_pae(struct vm_page *srcpg, struct vm_page *dstpg)
1114{
1115	paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1116	paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1117#ifdef MULTIPROCESSOR
1118	int id = cpu_number();
1119#endif
1120	pt_entry_t *spte = PTESLEW(csrc_pte, id);
1121	pt_entry_t *dpte = PTESLEW(cdst_pte, id);
1122	caddr_t csrcva = VASLEW(pmap_csrcp, id);
1123	caddr_t cdstva = VASLEW(pmap_cdstp, id);
1124
1125#ifdef DIAGNOSTIC
1126	if (*spte || *dpte)
1127		panic("pmap_copy_page_pae: lock botch");
1128#endif
1129
1130	*spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1131	*dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1132	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1133	bcopy(csrcva, cdstva, PAGE_SIZE);
1134	*spte = *dpte = 0;
1135	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1136}
1137
1138/*
1139 * p m a p   r e m o v e   f u n c t i o n s
1140 *
1141 * functions that remove mappings
1142 */
1143
1144/*
1145 * pmap_remove_ptes: remove PTEs from a PTP
1146 *
1147 * => caller must hold pmap's lock
1148 * => PTP must be mapped into KVA
1149 * => PTP should be null if pmap == pmap_kernel()
 */
1151
1152void
1153pmap_remove_ptes_pae(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1154    vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1155{
1156	struct pv_entry *pve;
1157	pt_entry_t *pte = (pt_entry_t *) ptpva;
1158	struct vm_page *pg;
1159	pt_entry_t opte;
1160
1161	/*
1162	 * note that ptpva points to the PTE that maps startva.   this may
1163	 * or may not be the first PTE in the PTP.
1164	 *
1165	 * we loop through the PTP while there are still PTEs to look at
1166	 * and the wire_count is greater than 1 (because we use the wire_count
1167	 * to keep track of the number of real PTEs in the PTP).
1168	 */
1169
1170	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1171			     ; pte++, startva += NBPG) {
1172		if (!pmap_valid_entry(*pte))
1173			continue;			/* VA not mapped */
1174
1175		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W))
1176			continue;
1177
1178		/* atomically save the old PTE and zero it */
1179		opte = i386_atomic_testset_uq(pte, 0);
1180
1181		if (opte & PG_W)
1182			pmap->pm_stats.wired_count--;
1183		pmap->pm_stats.resident_count--;
1184
1185		if (ptp)
1186			ptp->wire_count--;		/* dropping a PTE */
1187
1188		/*
1189		 * Unnecessary work if not PG_PVLIST.
1190		 */
1191		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1192
1193		/*
1194		 * if we are not on a pv list we are done.
1195		 */
1196		if ((opte & PG_PVLIST) == 0) {
1197#ifdef DIAGNOSTIC
1198			if (pg != NULL)
1199				panic("pmap_remove_ptes_pae: managed page "
1200				     "without PG_PVLIST for 0x%lx", startva);
1201#endif
1202			continue;
1203		}
1204
1205#ifdef DIAGNOSTIC
1206		if (pg == NULL)
1207			panic("pmap_remove_ptes_pae: unmanaged page marked "
1208			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1209			      startva, (u_long)(opte & PG_FRAME));
1210#endif
1211
1212		/* sync R/M bits */
1213		pmap_sync_flags_pte_pae(pg, opte);
1214		pve = pmap_remove_pv(pg, pmap, startva);
1215		if (pve) {
1216			pve->pv_next = *free_pvs;
1217			*free_pvs = pve;
1218		}
1219
1220		/* end of "for" loop: time for next pte */
1221	}
1222}
1223
1224/*
1225 * pmap_remove: top level mapping removal function
1226 *
1227 * => caller should not be holding any pmap locks
1228 */
1229
1230void
1231pmap_do_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1232{
1233	pt_entry_t *ptes;
1234	paddr_t ptppa;
1235	vaddr_t blkendva;
1236	struct vm_page *ptp;
1237	struct pv_entry *pve;
1238	struct pv_entry *free_pvs = NULL;
1239	TAILQ_HEAD(, vm_page) empty_ptps;
1240	int shootall;
1241	vaddr_t va;
1242
1243	TAILQ_INIT(&empty_ptps);
1244
1245	ptes = pmap_map_ptes_pae(pmap);	/* locks pmap */
1246
1247	/*
1248	 * Decide if we want to shoot the whole tlb or just the range.
1249	 * Right now, we simply shoot everything when we remove more
1250	 * than 32 pages, but never in the kernel pmap. XXX - tune.
1251	 */
1252	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1253		shootall = 1;
1254	else
1255		shootall = 0;
1256
1257	for (va = sva ; va < eva ; va = blkendva) {
1258		/* determine range of block */
1259		blkendva = i386_round_pdr(va + 1);
1260		if (blkendva > eva)
1261			blkendva = eva;
1262
1263		/*
1264		 * XXXCDC: our PTE mappings should never be removed
1265		 * with pmap_remove!  if we allow this (and why would
1266		 * we?) then we end up freeing the pmap's page
1267		 * directory page (PDP) before we are finished using
1268		 * it when we hit it in the recursive mapping.  this
1269		 * is BAD.
1270		 *
1271		 * long term solution is to move the PTEs out of user
1272		 * address space.  and into kernel address space (up
1273		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1274		 * be VM_MAX_ADDRESS.
1275		 */
1276
1277		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1278			/* XXXCDC: ugly hack to avoid freeing PDP here */
1279			continue;
1280
1281		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1282			/* valid block? */
1283			continue;
1284
1285		/* PA of the PTP */
1286		ptppa = PDE(pmap, pdei(va)) & PG_FRAME;
1287
1288		/* get PTP if non-kernel mapping */
1289		if (pmap == pmap_kernel()) {
1290			/* we never free kernel PTPs */
1291			ptp = NULL;
1292		} else {
1293			if (pmap->pm_ptphint &&
1294			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1295				ptp = pmap->pm_ptphint;
1296			} else {
1297				ptp = PHYS_TO_VM_PAGE(ptppa);
1298#ifdef DIAGNOSTIC
1299				if (ptp == NULL)
1300					panic("pmap_do_remove_pae: unmanaged "
1301					      "PTP detected");
1302#endif
1303			}
1304		}
1305
1306		pmap_remove_ptes_pae(pmap, ptp, (vaddr_t)&ptes[atop(va)],
1307		    va, blkendva, flags, &free_pvs);
1308
1309		/* If PTP is no longer being used, free it. */
1310		if (ptp && ptp->wire_count <= 1) {
1311			pmap_drop_ptp_pae(pmap, va, ptp, ptes);
1312			TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq);
1313		}
1314
1315		if (!shootall)
1316			pmap_tlb_shootrange(pmap, va, blkendva);
1317	}
1318
1319	if (shootall)
1320		pmap_tlb_shoottlb();
1321
1322	pmap_unmap_ptes_pae(pmap);
1323	pmap_tlb_shootwait();
1324
1325	while ((pve = free_pvs) != NULL) {
1326		free_pvs = pve->pv_next;
1327		pool_put(&pmap_pv_pool, pve);
1328	}
1329
1330	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1331		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1332		uvm_pagefree(ptp);
1333	}
1334}
1335
1336/*
1337 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1338 *
1339 * => R/M bits are sync'd back to attrs
1340 */
1341
1342void
1343pmap_page_remove_pae(struct vm_page *pg)
1344{
1345	struct pv_entry *pve;
1346	struct pmap *pm;
1347	pt_entry_t *ptes, opte;
1348	TAILQ_HEAD(, vm_page) empty_ptps;
1349	struct vm_page *ptp;
1350
1351	if (pg->mdpage.pv_list == NULL)
1352		return;
1353
1354	TAILQ_INIT(&empty_ptps);
1355
1356	mtx_enter(&pg->mdpage.pv_mtx);
1357	while ((pve = pg->mdpage.pv_list) != NULL) {
1358		pmap_reference(pve->pv_pmap);
1359		pm = pve->pv_pmap;
1360		mtx_leave(&pg->mdpage.pv_mtx);
1361
1362		ptes = pmap_map_ptes_pae(pve->pv_pmap);	/* locks pmap */
1363
1364		/*
1365		 * We dropped the pvlist lock before grabbing the pmap
1366		 * lock to avoid lock ordering problems.  This means
1367		 * we have to check the pvlist again since somebody
1368		 * else might have modified it.  All we care about is
1369		 * that the pvlist entry matches the pmap we just
1370		 * locked.  If it doesn't, unlock the pmap and try
1371		 * again.
1372		 */
1373		mtx_enter(&pg->mdpage.pv_mtx);
1374		if ((pve = pg->mdpage.pv_list) == NULL ||
1375		    pve->pv_pmap != pm) {
1376			mtx_leave(&pg->mdpage.pv_mtx);
1377			pmap_unmap_ptes_pae(pm);	/* unlocks pmap */
1378			pmap_destroy(pm);
1379			mtx_enter(&pg->mdpage.pv_mtx);
1380			continue;
1381		}
1382
1383		pg->mdpage.pv_list = pve->pv_next;
1384		mtx_leave(&pg->mdpage.pv_mtx);
1385
1386#ifdef DIAGNOSTIC
1387		if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1388				    PG_FRAME)
1389		    != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1390			printf("pmap_page_remove_pae: pg=%p: va=%lx, "
1391				"pv_ptp=%p\n",
1392				pg, pve->pv_va, pve->pv_ptp);
1393			printf("pmap_page_remove_pae: PTP's phys addr: "
1394				"actual=%llx, recorded=%lx\n",
1395				(PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1396				PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1397			panic("pmap_page_remove_pae: mapped managed page has "
1398				"invalid pv_ptp field");
		}
1400#endif
1401		opte = i386_atomic_testset_uq(&ptes[atop(pve->pv_va)], 0);
1402
1403		if (opte & PG_W)
1404			pve->pv_pmap->pm_stats.wired_count--;
1405		pve->pv_pmap->pm_stats.resident_count--;
1406
1407		/* sync R/M bits */
1408		pmap_sync_flags_pte_pae(pg, opte);
1409
1410		/* update the PTP reference count.  free if last reference. */
1411		if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) {
1412			pmap_drop_ptp_pae(pve->pv_pmap, pve->pv_va,
1413			    pve->pv_ptp, ptes);
1414			TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq);
1415		}
1416
1417		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1418
1419		pmap_unmap_ptes_pae(pve->pv_pmap);	/* unlocks pmap */
1420		pmap_destroy(pve->pv_pmap);
1421		pool_put(&pmap_pv_pool, pve);
1422		mtx_enter(&pg->mdpage.pv_mtx);
1423	}
1424	mtx_leave(&pg->mdpage.pv_mtx);
1425
1426	pmap_tlb_shootwait();
1427
1428	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1429		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1430		uvm_pagefree(ptp);
1431	}
1432}
1433
1434/*
1435 * p m a p   a t t r i b u t e  f u n c t i o n s
1436 * functions that test/change managed page's attributes
1437 * since a page can be mapped multiple times we must check each PTE that
1438 * maps it by going down the pv lists.
1439 */
1440
1441/*
1442 * pmap_test_attrs: test a page's attributes
1443 *
1444 * => we set pv_head => pmap locking
1445 */
1446
1447int
1448pmap_test_attrs_pae(struct vm_page *pg, int testbits)
1449{
1450	struct pv_entry *pve;
1451	pt_entry_t *ptes, pte;
1452	u_long mybits, testflags;
1453	paddr_t ptppa;
1454
1455	testflags = pmap_pte2flags(testbits);
1456
1457	if (pg->pg_flags & testflags)
1458		return 1;
1459
1460	mybits = 0;
1461	mtx_enter(&pg->mdpage.pv_mtx);
1462	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
1463	    pve = pve->pv_next) {
1464		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1465		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1466		pte = ptes[ptei(pve->pv_va)];
1467		pmap_tmpunmap_pa();
1468		mybits |= (pte & testbits);
1469	}
1470	mtx_leave(&pg->mdpage.pv_mtx);
1471
1472	if (mybits == 0)
1473		return 0;
1474
1475	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
1476
1477	return 1;
1478}
1479
1480/*
1481 * pmap_clear_attrs: change a page's attributes
1482 *
1483 * => we return 1 if we cleared one of the bits we were asked to
1484 */
1485int
1486pmap_clear_attrs_pae(struct vm_page *pg, int clearbits)
1487{
1488	struct pv_entry *pve;
1489	pt_entry_t *ptes, npte, opte;
1490	u_long clearflags;
1491	paddr_t ptppa;
1492	int result;
1493
1494	clearflags = pmap_pte2flags(clearbits);
1495
1496	result = pg->pg_flags & clearflags;
1497	if (result)
1498		atomic_clearbits_int(&pg->pg_flags, clearflags);
1499
1500	mtx_enter(&pg->mdpage.pv_mtx);
1501	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
1502		ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME;
1503		ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa);
1504#ifdef DIAGNOSTIC
1505		if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
1506			panic("pmap_clear_attrs_pae: mapping without PTP "
1507				"detected");
1508#endif
1509
1510		opte = ptes[ptei(pve->pv_va)];
1511		if (opte & clearbits) {
1512			result = 1;
1513			npte = opte & ~clearbits;
1514			opte = i386_atomic_testset_uq(
1515			   &ptes[ptei(pve->pv_va)], npte);
1516			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
1517		}
1518		pmap_tmpunmap_pa();
1519	}
1520	mtx_leave(&pg->mdpage.pv_mtx);
1521
1522	pmap_tlb_shootwait();
1523
1524	return (result != 0);
1525}
1526
1527
1528/*
1529 * p m a p   p r o t e c t i o n   f u n c t i o n s
1530 */
1531
1532/*
1533 * pmap_page_protect: change the protection of all recorded mappings
1534 *	of a managed page
1535 *
1536 * => NOTE: this is an inline function in pmap.h
1537 */
1538
1539/* see pmap.h */
1540
1541/*
 * pmap_protect: set the protection of the pages in a pmap
1543 *
1544 * => NOTE: this is an inline function in pmap.h
1545 */
1546
1547/* see pmap.h */
1548
1549/*
1550 * pmap_write_protect: write-protect pages in a pmap
1551 */
1552
1553void
1554pmap_write_protect_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
1555    vm_prot_t prot)
1556{
1557	pt_entry_t *ptes, *spte, *epte, npte, opte;
1558	vaddr_t blockend;
1559	u_int64_t md_prot;
1560	vaddr_t va;
1561	int shootall = 0;
1562
1563	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1564
1565	/* should be ok, but just in case ... */
1566	sva &= PG_FRAME;
1567	eva &= PG_FRAME;
1568
1569	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1570		shootall = 1;
1571
1572	for (va = sva; va < eva; va = blockend) {
1573		blockend = (va & PD_MASK) + NBPD;
1574		if (blockend > eva)
1575			blockend = eva;
1576
1577		/*
1578		 * XXXCDC: our PTE mappings should never be write-protected!
1579		 *
1580		 * long term solution is to move the PTEs out of user
1581		 * address space.  and into kernel address space (up
1582		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1583		 * be VM_MAX_ADDRESS.
1584		 */
1585
1586		/* XXXCDC: ugly hack to avoid freeing PDP here */
1587		if (pdei(va) >= PDSLOT_PTE && pdei(va) <= (PDSLOT_PTE + 3))
1588			continue;
1589
1590		/* empty block? */
1591		if (!pmap_valid_entry(PDE(pmap, pdei(va))))
1592			continue;
1593
1594		md_prot = protection_codes[prot];
1595		if (!(prot & PROT_EXEC))
1596			md_prot |= PG_NX;
1597		if (va < VM_MAXUSER_ADDRESS)
1598			md_prot |= PG_u;
1599		else if (va < VM_MAX_ADDRESS)
1600			/* XXX: write-prot our PTES? never! */
1601			md_prot |= PG_RW;
1602
1603		spte = &ptes[atop(va)];
1604		epte = &ptes[atop(blockend)];
1605
1606		for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) {
1607
1608			if (!pmap_valid_entry(*spte))	/* no mapping? */
1609				continue;
1610
1611			opte = *spte;
1612			npte = (opte & ~(pt_entry_t)PG_PROT) | md_prot;
1613
1614			if (npte != opte) {
1615				pmap_exec_account(pmap, va, *spte, npte);
1616				i386_atomic_testset_uq(spte, npte);
1617			}
1618		}
1619	}
1620	if (shootall)
1621		pmap_tlb_shoottlb();
1622	else
1623		pmap_tlb_shootrange(pmap, sva, eva);
1624
1625	pmap_unmap_ptes_pae(pmap);		/* unlocks pmap */
1626	pmap_tlb_shootwait();
1627}
1628
1629/*
1630 * end of protection functions
1631 */
1632
1633/*
1634 * pmap_unwire: clear the wired bit in the PTE
1635 *
1636 * => mapping should already be in map
1637 */
1638
1639void
1640pmap_unwire_pae(struct pmap *pmap, vaddr_t va)
1641{
1642	pt_entry_t *ptes;
1643
1644	if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1645		ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1646
1647#ifdef DIAGNOSTIC
1648		if (!pmap_valid_entry(ptes[atop(va)]))
1649			panic("pmap_unwire_pae: invalid (unmapped) va "
1650			      "0x%lx", va);
1651#endif
1652		if ((ptes[atop(va)] & PG_W) != 0) {
1653			i386_atomic_testset_uq(&ptes[atop(va)],
1654			    ptes[atop(va)] & ~PG_W);
1655			pmap->pm_stats.wired_count--;
1656		}
1657#ifdef DIAGNOSTIC
1658		else {
1659			printf("pmap_unwire_pae: wiring for pmap %p va 0x%lx "
1660			       "didn't change!\n", pmap, va);
1661		}
1662#endif
1663		pmap_unmap_ptes_pae(pmap);		/* unlocks map */
1664	}
1665#ifdef DIAGNOSTIC
1666	else {
1667		panic("pmap_unwire_pae: invalid PDE");
1668	}
1669#endif
1670}
1671
1672/*
1673 * pmap_enter: enter a mapping into a pmap
1674 *
1675 * => must be done "now" ... no lazy-evaluation
1676 */
1677
1678int
1679pmap_enter_pae(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
1680    int flags)
1681{
1682	pt_entry_t *ptes, opte, npte;
1683	struct vm_page *ptp;
1684	struct pv_entry *pve, *opve = NULL;
1685	int wired = (flags & PMAP_WIRED) != 0;
1686	int nocache = (pa & PMAP_NOCACHE) != 0;
1687	int wc = (pa & PMAP_WC) != 0;
1688	struct vm_page *pg = NULL;
1689	int error, wired_count, resident_count, ptp_count;
1690
1691	KASSERT(!(wc && nocache));
1692	pa &= PMAP_PA_MASK;	/* nuke flags from pa */
1693
1694#ifdef DIAGNOSTIC
1695	/* sanity check: totally out of range? */
1696	if (va >= VM_MAX_KERNEL_ADDRESS)
1697		panic("pmap_enter_pae: too big");
1698
1699	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
1700		panic("pmap_enter_pae: trying to map over PDP/APDP!");
1701
1702	/* sanity check: kernel PTPs should already have been pre-allocated */
1703	if (va >= VM_MIN_KERNEL_ADDRESS &&
1704	    !pmap_valid_entry(PDE(pmap, pdei(va))))
1705		panic("pmap_enter_pae: missing kernel PTP!");
1706#endif
1707
1708	if (pmap_initialized)
1709		pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
1710	else
1711		pve = NULL;
1712	wired_count = resident_count = ptp_count = 0;
1713
1714	/*
1715	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
1716	 */
1717
1718	ptes = pmap_map_ptes_pae(pmap);		/* locks pmap */
1719	if (pmap == pmap_kernel()) {
1720		ptp = NULL;
1721	} else {
1722		ptp = pmap_get_ptp_pae(pmap, pdei(va));
1723		if (ptp == NULL) {
1724			if (flags & PMAP_CANFAIL) {
1725				error = ENOMEM;
1726				pmap_unmap_ptes_pae(pmap);
1727				goto out;
1728			}
1729			panic("pmap_enter_pae: get ptp failed");
1730		}
1731	}
1732	/*
1733	 * not allowed to sleep after here!
1734	 */
1735	opte = ptes[atop(va)];			/* old PTE */
1736
1737	/*
1738	 * is there currently a valid mapping at our VA?
1739	 */
1740
1741	if (pmap_valid_entry(opte)) {
1742
1743		/*
1744		 * first, calculate pm_stats updates.  resident count will not
1745		 * change since we are replacing/changing a valid
1746		 * mapping.  wired count might change...
1747		 */
1748
1749		if (wired && (opte & PG_W) == 0)
1750			wired_count++;
1751		else if (!wired && (opte & PG_W) != 0)
1752			wired_count--;
1753
1754		/*
1755		 * is the currently mapped PA the same as the one we
1756		 * want to map?
1757		 */
1758
1759		if ((opte & PG_FRAME) == pa) {
1760
1761			/* if this is on the PVLIST, sync R/M bit */
1762			if (opte & PG_PVLIST) {
1763				pg = PHYS_TO_VM_PAGE(pa);
1764#ifdef DIAGNOSTIC
1765				if (pg == NULL)
1766					panic("pmap_enter_pae: same pa "
1767					     "PG_PVLIST mapping with "
1768					     "unmanaged page "
1769					     "pa = 0x%lx (0x%lx)", pa,
1770					     atop(pa));
1771#endif
1772				pmap_sync_flags_pte_pae(pg, opte);
1773			}
1774			goto enter_now;
1775		}
1776
1777		/*
1778		 * changing PAs: we must remove the old one first
1779		 */
1780
1781		/*
1782		 * if current mapping is on a pvlist,
1783		 * remove it (sync R/M bits)
1784		 */
1785
1786		if (opte & PG_PVLIST) {
1787			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1788#ifdef DIAGNOSTIC
1789			if (pg == NULL)
1790				panic("pmap_enter_pae: PG_PVLIST mapping with "
1791				      "unmanaged page "
1792				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
1793#endif
1794			pmap_sync_flags_pte_pae(pg, opte);
1795			opve = pmap_remove_pv(pg, pmap, va);
1796			pg = NULL; /* This is not the page we are looking for */
1797		}
1798	} else {	/* opte not valid */
1799		resident_count++;
1800		if (wired)
1801			wired_count++;
1802		if (ptp)
1803			ptp_count++;	/* count # of valid entries */
1804	}
1805
1806	/*
	 * pve is either NULL or a freshly allocated pv_entry.  If we
	 * removed an old managed mapping above, opve holds its now-free
	 * pv_entry, which can be reused below.
1809	 *
1810	 * if this entry is to be on a pvlist, enter it now.
1811	 */
1812
1813	if (pmap_initialized && pg == NULL)
1814		pg = PHYS_TO_VM_PAGE(pa);
1815
1816	if (pg != NULL) {
1817		if (pve == NULL) {
1818			pve = opve;
1819			opve = NULL;
1820		}
1821		if (pve == NULL) {
1822			if (flags & PMAP_CANFAIL) {
1823				pmap_unmap_ptes_pae(pmap);
1824				error = ENOMEM;
1825				goto out;
1826			}
1827			panic("pmap_enter_pae: no pv entries available");
1828		}
1829		/* lock pg when adding */
1830		pmap_enter_pv(pg, pve, pmap, va, ptp);
1831		pve = NULL;
1832	}
1833
1834enter_now:
1835	/*
1836	 * at this point pg is !NULL if we want the PG_PVLIST bit set
1837	 */
1838
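	/*
	 * Illustrative example (not from the original code): a wired,
	 * writable, non-executable user mapping would typically end up
	 * as
	 *
	 *	npte = pa | PG_RW | PG_V | PG_u | PG_W | PG_NX
	 *
	 * with PG_U/PG_M pre-set below according to the faulting access
	 * and PG_PVLIST added for managed pages.
	 */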
1839	npte = pa | protection_codes[prot] | PG_V;
1840	if (!(prot & PROT_EXEC))
1841		npte |= PG_NX;
1842	pmap_exec_account(pmap, va, opte, npte);
1843	if (wired)
1844		npte |= PG_W;
1845	if (nocache)
1846		npte |= PG_N;
1847	if (va < VM_MAXUSER_ADDRESS)
1848		npte |= PG_u;
1849	else if (va < VM_MAX_ADDRESS)
1850		npte |= PG_RW;	/* XXXCDC: no longer needed? */
1851	if (pmap == pmap_kernel())
1852		npte |= pmap_pg_g;
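	/*
	 * Pre-set the referenced/modified bits according to the access
	 * type that caused this entry; for managed pages they are
	 * synced into the vm_page flags below.
	 */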
1853	if (flags & PROT_READ)
1854		npte |= PG_U;
1855	if (flags & PROT_WRITE)
1856		npte |= PG_M;
1857	if (pg) {
1858		npte |= PG_PVLIST;
1859		if (pg->pg_flags & PG_PMAP_WC) {
1860			KASSERT(nocache == 0);
1861			wc = 1;
1862		}
1863		pmap_sync_flags_pte_pae(pg, npte);
1864	}
1865	if (wc)
1866		npte |= pmap_pg_wc;
1867
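	/*
	 * Swap in the new PTE atomically; the old PTE returned in opte
	 * tells us whether a TLB shootdown (and possibly a cache flush)
	 * is needed.
	 */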
1868	opte = i386_atomic_testset_uq(&ptes[atop(va)], npte);
1869	if (ptp)
1870		ptp->wire_count += ptp_count;
1871	pmap->pm_stats.resident_count += resident_count;
1872	pmap->pm_stats.wired_count += wired_count;
1873
1874	if (pmap_valid_entry(opte)) {
1875		if (nocache && (opte & PG_N) == 0)
1876			wbinvd_on_all_cpus(); /* XXX clflush before we enter? */
1877		pmap_tlb_shootpage(pmap, va);
1878	}
1879
1880	pmap_unmap_ptes_pae(pmap);
1881	pmap_tlb_shootwait();
1882
1883	error = 0;
1884
1885out:
1886	if (pve)
1887		pool_put(&pmap_pv_pool, pve);
1888	if (opve)
1889		pool_put(&pmap_pv_pool, opve);
1890
1891	return error;
1892}
1893
1894/*
1895 * Allocate an extra PDPT and PT pages as needed to map kernel pages
1896 * used for the U-K mappings.  These special mappings are set up
 * during bootstrap, are never removed and are part of pmap_kernel.
1898 *
1899 * New pmaps inherit the kernel portion of pmap_kernel including
1900 * the special mappings (see pmap_pinit_pd_pae()).
1901 */
1902void
1903pmap_enter_special_pae(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags)
1904{
1905	struct pmap 	*pmap = pmap_kernel();
1906	struct vm_page	*ptppg = NULL, *pdppg;
1907	pd_entry_t	*pd, *ptp;
1908	pt_entry_t	*ptes;
1909	uint32_t	 l2idx, l1idx;
1910	vaddr_t		 vapd;
1911	paddr_t		 npa;
1912	int		 i;
1913
1914	/* If CPU is secure, no need to do anything */
1915	if (!cpu_meltdown)
1916		return;
1917
1918	/* Must be kernel VA */
1919	if (va < VM_MIN_KERNEL_ADDRESS)
1920		panic("%s: invalid special mapping va 0x%lx requested",
1921		    __func__, va);
1922
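	/*
	 * Lazily set up the special (U-K) page directory the first time
	 * a special mapping is entered: four PD pages plus the
	 * four-entry PDPT (pm_pdidx_intel) that points at them.
	 */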
1923	if (!pmap->pm_pdir_intel) {
1924		if ((vapd = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
1925			panic("%s: kernel_map out of virtual space!", __func__);
1926		pmap->pm_pdir_intel = vapd;
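		/*
		 * The pm_pdidx_intel[] array itself serves as the
		 * special PDPT; pm_pdirpa_intel is its physical
		 * address, used as the %cr3 value for the U-K
		 * address space on Meltdown-affected CPUs.
		 */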
1927		if (!pmap_extract(pmap, (vaddr_t)&pmap->pm_pdidx_intel,
1928		    &pmap->pm_pdirpa_intel))
1929			panic("%s: can't locate PDPT", __func__);
1930
1931		for (i = 0; i < 4; i++) {
1932			pmap->pm_pdidx_intel[i] = 0;
1933			if (!pmap_extract(pmap, vapd + i*NBPG,
1934			    (paddr_t *)&pmap->pm_pdidx_intel[i]))
1935				panic("%s: can't locate PD page", __func__);
1936
1937			/* ensure PDPs are wired down XXX hshoexer why? */
1938			pdppg = PHYS_TO_VM_PAGE(pmap->pm_pdidx_intel[i]);
1939			if (pdppg == NULL)
1940				panic("%s: no vm_page for pdidx %d", __func__, i);
1941			atomic_clearbits_int(&pdppg->pg_flags, PG_BUSY);
1942			pdppg->wire_count = 1;	/* no mappings yet */
1943
1944			pmap->pm_pdidx_intel[i] |= PG_V;
1945
1946			DPRINTF("%s: pm_pdidx_intel[%d] = 0x%llx\n", __func__,
1947			    i, pmap->pm_pdidx_intel[i]);
1948		}
1949	}
1950
1951	DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__,
1952	    (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel);
1953
1954	/* These are the PAE versions of pdei() and ptei() */
1955	l2idx = pdei(va);
1956	l1idx = ptei(va);
1957
1958	DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x "
1959	    "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot,
1960	    flags, l2idx, l1idx);
1961
1962	if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == 0)
1963		panic("%s: PD not initialized for pmap @ %p", __func__, pmap);
1964
1965	/* npa = physaddr of PT page */
1966	npa = pd[l2idx] & PMAP_PA_MASK;
1967
1968	/* Valid PDE for the 2MB region containing va? */
1969	if (!npa) {
1970		/*
1971		 * No valid PDE - allocate PT page and set PDE.  We
1972		 * get it from pm_obj, which is used for PT pages.
		 * We calculate the offset from l2idx+2048, so we are
		 * beyond the regular PT pages, which use
		 * 0 <= l2idx < 2048.
1976		 */
1977		ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 2048),
1978		    NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1979		if (ptppg == NULL)
1980			panic("%s: failed to allocate PT page", __func__);
1981
1982		atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY);
1983		ptppg->wire_count = 1;	/* no mappings yet */
1984
1985		npa = VM_PAGE_TO_PHYS(ptppg);
1986		pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U);
1987
1988		DPRINTF("%s: allocated new PT page at phys 0x%x, "
1989		    "setting PDE[%d] = 0x%llx\n", __func__, (uint32_t)npa,
1990		    l2idx, pd[l2idx]);
1991	}
1992
1993	/* temporarily map PT page and set PTE for U-K mapping */
1994	if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL)
1995		panic("%s: no vm_page for PT page", __func__);
1996	mtx_enter(&ptppg->mdpage.pv_mtx);
1997	ptp = (pd_entry_t *)pmap_tmpmap_pa(npa);
1998	ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags);
1999	DPRINTF("%s: setting PTE[%d] = 0x%llx\n", __func__, l1idx, ptp[l1idx]);
2000	pmap_tmpunmap_pa();
2001	mtx_leave(&ptppg->mdpage.pv_mtx);
2002
2003	/* if supported, set the PG_G flag on the corresponding U+K entry */
2004	if (!(cpu_feature & CPUID_PGE))
2005		return;
2006	ptes = pmap_map_ptes_pae(pmap);	/* pmap_kernel -> PTE_BASE */
2007	if (pmap_valid_entry(ptes[atop(va)]))
2008		ptes[atop(va)] |= PG_G;
2009	else
2010		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2011	pmap_unmap_ptes_pae(pmap);	/* pmap_kernel -> nothing */
2012}
2013
2014/*
2015 * pmap_growkernel: increase usage of KVM space
2016 *
2017 * => we allocate new PTPs for the kernel and install them in all
2018 *	the pmaps on the system.
2019 */
2020
2021vaddr_t
2022pmap_growkernel_pae(vaddr_t maxkvaddr)
2023{
2024	extern int nkpde;
2025	struct pmap *kpm = pmap_kernel(), *pm;
2026	int needed_kpde;   /* needed number of kernel PTPs */
2027	int s;
2028	paddr_t ptaddr;
2029
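	/*
	 * Each PAE PDE maps NBPD (2MB) of KVA, so round maxkvaddr up
	 * to a whole number of kernel PTPs.  For example (illustrative
	 * only), growing 5MB past VM_MIN_KERNEL_ADDRESS needs 3 PTPs.
	 */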
2030	needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
2031		/ NBPD;
2032	if (needed_kpde <= nkpde)
2033		goto out;		/* we are OK */
2034
2035	/*
2036	 * whoops!   we need to add kernel PTPs
2037	 */
2038
2039	s = splhigh();	/* to be safe */
2040
2041	for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
2042
2043		if (uvm.page_init_done == 0) {
2044
2045			/*
2046			 * we're growing the kernel pmap early (from
2047			 * uvm_pageboot_alloc()).  this case must be
2048			 * handled a little differently.
2049			 */
2050
2051			if (uvm_page_physget(&ptaddr) == 0)
2052				panic("pmap_growkernel: out of memory");
2053			pmap_zero_phys_pae(ptaddr);
2054
2055			PDE(kpm, PDSLOT_KERN + nkpde) =
2056				ptaddr | PG_RW | PG_V | PG_U | PG_M;
2057
2058			/* count PTP as resident */
2059			kpm->pm_stats.resident_count++;
2060			continue;
2061		}
2062
2063		/*
2064		 * THIS *MUST* BE CODED SO AS TO WORK IN THE
2065		 * pmap_initialized == 0 CASE!  WE MAY BE
2066		 * INVOKED WHILE pmap_init() IS RUNNING!
2067		 */
2068
2069		while (!pmap_alloc_ptp_pae(kpm, PDSLOT_KERN + nkpde, 0))
2070			uvm_wait("pmap_growkernel");
2071
2072		/* distribute new kernel PTP to all active pmaps */
2073		mtx_enter(&pmaps_lock);
2074		LIST_FOREACH(pm, &pmaps, pm_list) {
2075			PDE(pm, PDSLOT_KERN + nkpde) =
2076				PDE(kpm, PDSLOT_KERN + nkpde);
2077		}
2078		mtx_leave(&pmaps_lock);
2079	}
2080
2081	splx(s);
2082
2083out:
2084	return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
2085}
2086
2087/*
2088 * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various
2089 * trampoline code can be entered.
2090 */
2091void
2092pmap_prealloc_lowmem_ptp_pae(void)
2093{
2094	pt_entry_t *pte, npte;
2095	vaddr_t ptpva = (vaddr_t)vtopte(0);
2096
2097	/* enter pa for pte 0 into recursive map */
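	/*
	 * ptpva is where PTP 0's PTEs appear inside the recursive
	 * mapping; taking vtopte() of that address again yields the
	 * recursive-map slot (in effect, the PDE covering VA 0), so
	 * writing PTP0_PA there hooks PTP 0 into the kernel page
	 * tables and makes its contents visible at ptpva.
	 */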
2098	pte = vtopte(ptpva);
2099	npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M;
2100
2101	i386_atomic_testset_uq(pte, npte);
2102
2103	/* make sure it is clean before using */
2104	memset((void *)ptpva, 0, NBPG);
2105}
2106
2107/*
2108 * pmap_tmpmap_pa_pae: map a page in for tmp usage
2109 */
2110
2111vaddr_t
2112pmap_tmpmap_pa_pae(paddr_t pa)
2113{
2114#ifdef MULTIPROCESSOR
2115	int id = cpu_number();
2116#endif
2117	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
2118	caddr_t ptpva = VASLEW(pmap_ptpp, id);
2119#if defined(DIAGNOSTIC)
2120	if (*ptpte)
2121		panic("pmap_tmpmap_pa_pae: ptp_pte in use?");
2122#endif
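	/*
	 * ptp_pte/pmap_ptpp are reserved for this CPU, so no locking or
	 * remote TLB shootdown is needed; pmap_tmpunmap_pa_pae()
	 * flushes the local TLB entry when the slot is released.
	 */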
2123	*ptpte = PG_V | PG_RW | pa;	/* always a new mapping */
2124	return((vaddr_t)ptpva);
2125}
2126
2127/*
2128 * pmap_tmpunmap_pa_pae: unmap a tmp use page (undoes pmap_tmpmap_pa_pae)
2129 */
2130
2131void
2132pmap_tmpunmap_pa_pae(void)
2133{
2134#ifdef MULTIPROCESSOR
2135	int id = cpu_number();
2136#endif
2137	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
2138	caddr_t ptpva = VASLEW(pmap_ptpp, id);
2139#if defined(DIAGNOSTIC)
2140	if (!pmap_valid_entry(*ptpte))
2141		panic("pmap_tmpunmap_pa_pae: our pte invalid?");
2142#endif
2143	*ptpte = 0;
2144	pmap_update_pg((vaddr_t)ptpva);
2145#ifdef MULTIPROCESSOR
2146	/*
2147	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
2148	 */
2149#endif
2150}
2151
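/*
 * vtophys_pae: translate a virtual address to a physical address using
 * the recursive PTE mapping; the VA must be mapped in the current page
 * tables.
 */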
2152paddr_t
2153vtophys_pae(vaddr_t va)
2154{
2155	return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME));
2156}
2157
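/*
 * pmap_flush_page_pae: flush the cache for one physical page by mapping
 * it at a per-CPU temporary VA and flushing that range.
 */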
2158void
2159pmap_flush_page_pae(paddr_t pa)
2160{
2161#ifdef MULTIPROCESSOR
2162	int id = cpu_number();
2163#endif
2164	pt_entry_t *pte = PTESLEW(flsh_pte, id);
2165	caddr_t va = VASLEW(pmap_flshp, id);
2166
2167	KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL);
2168#ifdef DIAGNOSTIC
2169	if (*pte)
2170		panic("pmap_flush_page_pae: lock botch");
2171#endif
2172
2173	*pte = (pa & PG_FRAME) | PG_V | PG_RW;
2174	pmap_update_pg(va);
2175	pmap_flush_cache((vaddr_t)va, PAGE_SIZE);
2176	*pte = 0;
2177	pmap_update_pg(va);
2178}
2179