pmap.c revision 177851
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 177851 2008-04-02 04:39:47Z alc $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	In addition to hardware address maps, this
86 *	module is called upon to provide software-use-only
87 *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or protection-reduction
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and when physical maps must be made correct.
106 */
107
108#include "opt_msgbuf.h"
109#include "opt_pmap.h"
110#include "opt_vm.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/sx.h>
123#include <sys/vmmeter.h>
124#include <sys/sched.h>
125#include <sys/sysctl.h>
126#ifdef SMP
127#include <sys/smp.h>
128#endif
129
130#include <vm/vm.h>
131#include <vm/vm_param.h>
132#include <vm/vm_kern.h>
133#include <vm/vm_page.h>
134#include <vm/vm_map.h>
135#include <vm/vm_object.h>
136#include <vm/vm_extern.h>
137#include <vm/vm_pageout.h>
138#include <vm/vm_pager.h>
139#include <vm/vm_reserv.h>
140#include <vm/uma.h>
141
142#include <machine/cpu.h>
143#include <machine/cputypes.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146#include <machine/specialreg.h>
147#ifdef SMP
148#include <machine/smp.h>
149#endif
150
151#ifndef PMAP_SHPGPERPROC
152#define PMAP_SHPGPERPROC 200
153#endif
154
155#if !defined(DIAGNOSTIC)
156#define PMAP_INLINE	__gnu89_inline
157#else
158#define PMAP_INLINE
159#endif
160
161#define PV_STATS
162#ifdef PV_STATS
163#define PV_STAT(x)	do { x ; } while (0)
164#else
165#define PV_STAT(x)	do { } while (0)
166#endif
167
168#define	pa_index(pa)	((pa) >> PDRSHIFT)
169#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
170
171struct pmap kernel_pmap_store;
172
173vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
174vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
175
176static int nkpt;
177static int ndmpdp;
178static vm_paddr_t dmaplimit;
179vm_offset_t kernel_vm_end;
180pt_entry_t pg_nx;
181
182SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
183
184static int pg_ps_enabled;
185SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0,
186    "Are large page mappings enabled?");
187
188static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
189static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
190u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
191u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
192
193static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
194static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
195
196/*
197 * Data for the pv entry allocation mechanism
198 */
199static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
200static struct md_page *pv_table;
201static int shpgperproc = PMAP_SHPGPERPROC;
202
203/*
204 * All those kernel PT submaps that BSD is so fond of
205 */
206pt_entry_t *CMAP1 = 0;
207caddr_t CADDR1 = 0;
208struct msgbuf *msgbufp = 0;
209
210/*
211 * Crashdump maps.
212 */
213static caddr_t crashdumpmap;
214
215static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
216static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
217static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
218static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m);
219static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
220static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
221static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
222		    vm_offset_t va);
223
224static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
225static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
226    vm_prot_t prot);
227static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
228    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
229static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
230static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
231static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
232static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
233static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
234    vm_prot_t prot);
235static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
236		vm_page_t *free);
237static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
238		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
239static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
240static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
241    vm_page_t *free);
242static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
243		vm_offset_t va);
244static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
245static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
246    vm_page_t m);
247
248static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
249static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
250
251static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
252static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
253                vm_page_t* free);
254static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
255static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
256
257CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
258CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
259
260/*
261 * Move the kernel virtual free pointer to the next
262 * 2MB.  This is used to help improve performance
263 * by using a large (2MB) page for much of the kernel
264 * (.text, .data, .bss)
265 */
266static vm_offset_t
267pmap_kmem_choose(vm_offset_t addr)
268{
269	vm_offset_t newaddr = addr;
270
271	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
272	return newaddr;
273}
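/*
 * For example, with NBPDR == 2MB (0x200000), an address such as
 * 0xffffffff80345678 rounds up to 0xffffffff80400000:
 * (addr + 0x1fffff) & ~0x1fffff.
 */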
274
275/********************/
276/* Inline functions */
277/********************/
278
279/* Return a non-clipped PD index for a given VA */
280static __inline vm_pindex_t
281pmap_pde_pindex(vm_offset_t va)
282{
283	return va >> PDRSHIFT;
284}
285
286
287/* Return various clipped indexes for a given VA */
288static __inline vm_pindex_t
289pmap_pte_index(vm_offset_t va)
290{
291
292	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
293}
294
295static __inline vm_pindex_t
296pmap_pde_index(vm_offset_t va)
297{
298
299	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
300}
301
302static __inline vm_pindex_t
303pmap_pdpe_index(vm_offset_t va)
304{
305
306	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
307}
308
309static __inline vm_pindex_t
310pmap_pml4e_index(vm_offset_t va)
311{
312
313	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
314}
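/*
 * Illustration: a 48-bit virtual address splits into four 9-bit table
 * indices and a 12-bit page offset (bits 47-39, 38-30, 29-21, 20-12,
 * 11-0).  For example, va == 0x40201000 yields PML4 index 0, PDPE
 * index 1, PDE index 1, PTE index 1, and offset 0.
 */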
315
316/* Return a pointer to the PML4 slot that corresponds to a VA */
317static __inline pml4_entry_t *
318pmap_pml4e(pmap_t pmap, vm_offset_t va)
319{
320
321	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
322}
323
324/* Return a pointer to the PDP slot that corresponds to a VA */
325static __inline pdp_entry_t *
326pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
327{
328	pdp_entry_t *pdpe;
329
330	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
331	return (&pdpe[pmap_pdpe_index(va)]);
332}
333
334/* Return a pointer to the PDP slot that corresponds to a VA */
335static __inline pdp_entry_t *
336pmap_pdpe(pmap_t pmap, vm_offset_t va)
337{
338	pml4_entry_t *pml4e;
339
340	pml4e = pmap_pml4e(pmap, va);
341	if ((*pml4e & PG_V) == 0)
342		return NULL;
343	return (pmap_pml4e_to_pdpe(pml4e, va));
344}
345
346/* Return a pointer to the PD slot that corresponds to a VA */
347static __inline pd_entry_t *
348pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
349{
350	pd_entry_t *pde;
351
352	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
353	return (&pde[pmap_pde_index(va)]);
354}
355
356/* Return a pointer to the PD slot that corresponds to a VA */
357static __inline pd_entry_t *
358pmap_pde(pmap_t pmap, vm_offset_t va)
359{
360	pdp_entry_t *pdpe;
361
362	pdpe = pmap_pdpe(pmap, va);
363	if (pdpe == NULL || (*pdpe & PG_V) == 0)
364		 return NULL;
365	return (pmap_pdpe_to_pde(pdpe, va));
366}
367
368/* Return a pointer to the PT slot that corresponds to a VA */
369static __inline pt_entry_t *
370pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
371{
372	pt_entry_t *pte;
373
374	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
375	return (&pte[pmap_pte_index(va)]);
376}
377
378/* Return a pointer to the PT slot that corresponds to a VA */
379static __inline pt_entry_t *
380pmap_pte(pmap_t pmap, vm_offset_t va)
381{
382	pd_entry_t *pde;
383
384	pde = pmap_pde(pmap, va);
385	if (pde == NULL || (*pde & PG_V) == 0)
386		return NULL;
387	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
388		return ((pt_entry_t *)pde);
389	return (pmap_pde_to_pte(pde, va));
390}
391
392
393PMAP_INLINE pt_entry_t *
394vtopte(vm_offset_t va)
395{
396	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
397
398	return (PTmap + ((va >> PAGE_SHIFT) & mask));
399}
400
401static __inline pd_entry_t *
402vtopde(vm_offset_t va)
403{
404	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
405
406	return (PDmap + ((va >> PDRSHIFT) & mask));
407}
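/*
 * vtopte() and vtopde() depend on the recursive mapping installed at
 * PML4 slot PML4PML4I: because the PML4 maps itself as if it were a
 * page table page, every PTE of the current address space appears at a
 * fixed virtual address within PTmap, and every PDE within PDmap.  The
 * mask merely strips the sign-extension bits of the canonical address.
 */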
408
409static u_int64_t
410allocpages(vm_paddr_t *firstaddr, int n)
411{
412	u_int64_t ret;
413
414	ret = *firstaddr;
415	bzero((void *)ret, n * PAGE_SIZE);
416	*firstaddr += n * PAGE_SIZE;
417	return (ret);
418}
419
420static void
421create_pagetables(vm_paddr_t *firstaddr)
422{
423	int i;
424
425	/* Allocate pages */
426	KPTphys = allocpages(firstaddr, NKPT);
427	KPML4phys = allocpages(firstaddr, 1);
428	KPDPphys = allocpages(firstaddr, NKPML4E);
429	KPDphys = allocpages(firstaddr, NKPDPE);
430
431	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
432	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
433		ndmpdp = 4;
434	DMPDPphys = allocpages(firstaddr, NDMPML4E);
435	if ((amd_feature & AMDID_PAGE1GB) == 0)
436		DMPDphys = allocpages(firstaddr, ndmpdp);
437	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
438
439	/* Fill in the underlying page table pages */
440	/* Read-only from zero to physfree */
441	/* XXX not fully used, underneath 2M pages */
442	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
443		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
444		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
445	}
446
447	/* Now map the page tables at their location within PTmap */
448	for (i = 0; i < NKPT; i++) {
449		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
450		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
451	}
452
453	/* Map from zero to end of allocations under 2M pages */
454	/* This replaces some of the KPTphys entries above */
455	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
456		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
457		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
458	}
459
460	/* And connect up the PD to the PDP */
461	for (i = 0; i < NKPDPE; i++) {
462		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
463		    (i << PAGE_SHIFT);
464		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
465	}
466
467	/* Now set up the direct map space using either 2MB or 1GB pages */
468	if ((amd_feature & AMDID_PAGE1GB) == 0) {
469		for (i = 0; i < NPDEPG * ndmpdp; i++) {
470			((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
471			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
472			    PG_G;
473		}
474		/* And the direct map space's PDP */
475		for (i = 0; i < ndmpdp; i++) {
476			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
477			    (i << PAGE_SHIFT);
478			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
479		}
480	} else {
481		for (i = 0; i < ndmpdp; i++) {
482			((pdp_entry_t *)DMPDPphys)[i] =
483			    (vm_paddr_t)i << PDPSHIFT;
484			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
485			    PG_G;
486		}
487	}
488
489	/* And recursively map PML4 to itself in order to get PTmap */
490	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
491	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
492
493	/* Connect the Direct Map slot up to the PML4 */
494	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
495	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
496
497	/* Connect the KVA slot up to the PML4 */
498	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
499	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
500}
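/*
 * The bootstrap tables built above give the kernel three views of
 * memory: a recursive mapping of the PML4 at slot PML4PML4I (which
 * makes PTmap and PDmap work), a direct map at slot DMPML4I covering
 * at least 4GB of physical memory with 2MB or 1GB pages, and the
 * kernel map proper at slot KPML4I.
 */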
501
502/*
503 *	Bootstrap the system enough to run with virtual memory.
504 *
505 *	On amd64 this is called after mapping has already been enabled
506 *	and just syncs the pmap module with what has already been done.
507 *	[We can't call it easily with mapping off since the kernel is not
508 *	mapped with PA == VA, hence we would have to relocate every address
509 *	from the linked base (virtual) address "KERNBASE" to the actual
510 *	(physical) address starting relative to 0]
511 */
512void
513pmap_bootstrap(vm_paddr_t *firstaddr)
514{
515	vm_offset_t va;
516	pt_entry_t *pte, *unused;
517
518	/*
519	 * Create an initial set of page tables to run the kernel in.
520	 */
521	create_pagetables(firstaddr);
522
523	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
524	virtual_avail = pmap_kmem_choose(virtual_avail);
525
526	virtual_end = VM_MAX_KERNEL_ADDRESS;
527
528
529	/* XXX do %cr0 as well */
530	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
531	load_cr3(KPML4phys);
532
533	/*
534	 * Initialize the kernel pmap (which is statically allocated).
535	 */
536	PMAP_LOCK_INIT(kernel_pmap);
537	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
538	kernel_pmap->pm_root = NULL;
539	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
540	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
541	nkpt = NKPT;
542
543	/*
544	 * Reserve some special page table entries/VA space for temporary
545	 * mapping of pages.
546	 */
547#define	SYSMAP(c, p, v, n)	\
548	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
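	/*
	 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to:
	 * CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
	 */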
549
550	va = virtual_avail;
551	pte = vtopte(va);
552
553	/*
554	 * CMAP1 is only used for the memory test.
555	 */
556	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
557
558	/*
559	 * Crashdump maps.
560	 */
561	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
562
563	/*
564	 * msgbufp is used to map the system message buffer.
565	 */
566	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
567
568	virtual_avail = va;
569
570	*CMAP1 = 0;
571
572	invltlb();
573
574	/* Initialize the PAT MSR. */
575	pmap_init_pat();
576}
577
578/*
579 * Setup the PAT MSR.
580 */
581void
582pmap_init_pat(void)
583{
584	uint64_t pat_msr;
585
586	/* Bail if this CPU doesn't implement PAT. */
587	if (!(cpu_feature & CPUID_PAT))
588		panic("no PAT??");
589
590#ifdef PAT_WORKS
591	/*
592	 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
593	 * Program 4 and 5 as WP and WC.
594	 * Leave 6 and 7 as UC and UC-.
595	 */
596	pat_msr = rdmsr(MSR_PAT);
597	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
598	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
599	    PAT_VALUE(5, PAT_WRITE_COMBINING);
600#else
601	/*
602	 * Due to some Intel errata, we can only safely use the lower 4
603	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
604	 * of UC-.
605	 *
606	 *   Intel Pentium III Processor Specification Update
607	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
608	 * or Mode C Paging)
609	 *
610	 *   Intel Pentium IV  Processor Specification Update
611	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
612	 */
613	pat_msr = rdmsr(MSR_PAT);
614	pat_msr &= ~PAT_MASK(2);
615	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
616#endif
617	wrmsr(MSR_PAT, pat_msr);
618}
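/*
 * With the default (non-PAT_WORKS) setting above, the effective PAT
 * layout is: index 0 = WB, 1 = WT, 2 = WC (replacing the power-on
 * default of UC-), and 3 = UC; the upper four entries are left alone
 * and unused.
 */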
619
620/*
621 *	Initialize a vm_page's machine-dependent fields.
622 */
623void
624pmap_page_init(vm_page_t m)
625{
626
627	TAILQ_INIT(&m->md.pv_list);
628}
629
630/*
631 *	Initialize the pmap module.
632 *	Called by vm_init, to initialize any structures that the pmap
633 *	system needs to map virtual memory.
634 */
635void
636pmap_init(void)
637{
638	pd_entry_t *pd;
639	vm_page_t mpte;
640	vm_size_t s;
641	int i, pv_npg;
642
643	/*
644	 * Initialize the vm page array entries for the kernel pmap's
645	 * page table pages.
646	 */
647	pd = pmap_pde(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
648	for (i = 0; i < nkpt; i++) {
649		if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V))
650			continue;
651		mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME);
652		KASSERT(mpte >= vm_page_array &&
653		    mpte < &vm_page_array[vm_page_array_size],
654		    ("pmap_init: page table page is out of range"));
655		mpte->pindex = pmap_pde_pindex(VM_MIN_KERNEL_ADDRESS) + i;
656		mpte->phys_addr = pd[i] & PG_FRAME;
657	}
658
659	/*
660	 * Initialize the address space (zone) for the pv entries.  Set a
661	 * high water mark so that the system can recover from excessive
662	 * numbers of pv entries.
663	 */
664	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
665	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
666	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
667	pv_entry_high_water = 9 * (pv_entry_max / 10);
668
669	/*
670	 * Are large page mappings enabled?
671	 */
672	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
673
674	/*
675	 * Calculate the size of the pv head table for superpages.
676	 */
677	for (i = 0; phys_avail[i + 1]; i += 2);
678	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
679
680	/*
681	 * Allocate memory for the pv head table for superpages.
682	 */
683	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
684	s = round_page(s);
685	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
686	for (i = 0; i < pv_npg; i++)
687		TAILQ_INIT(&pv_table[i].pv_list);
688}
689
690static int
691pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
692{
693	int error;
694
695	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
696	if (error == 0 && req->newptr) {
697		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
698		pv_entry_high_water = 9 * (pv_entry_max / 10);
699	}
700	return (error);
701}
702SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
703    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
704
705static int
706pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
707{
708	int error;
709
710	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
711	if (error == 0 && req->newptr) {
712		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
713		pv_entry_high_water = 9 * (pv_entry_max / 10);
714	}
715	return (error);
716}
717SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
718    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
719
720SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
721    "2MB page mapping counters");
722
723static u_long pmap_pde_demotions;
724SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
725    &pmap_pde_demotions, 0, "2MB page demotions");
726
727static u_long pmap_pde_mappings;
728SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
729    &pmap_pde_mappings, 0, "2MB page mappings");
730
731static u_long pmap_pde_p_failures;
732SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
733    &pmap_pde_p_failures, 0, "2MB page promotion failures");
734
735static u_long pmap_pde_promotions;
736SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
737    &pmap_pde_promotions, 0, "2MB page promotions");
738
739
740/***************************************************
741 * Low level helper routines.....
742 ***************************************************/
743
744/*
745 * Determine the appropriate bits to set in a PTE or PDE for a specified
746 * caching mode.
747 */
748static int
749pmap_cache_bits(int mode, boolean_t is_pde)
750{
751	int pat_flag, pat_index, cache_bits;
752
753	/* The PAT bit is different for PTE's and PDE's. */
754	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
755
756	/* If we don't support PAT, map extended modes to older ones. */
757	if (!(cpu_feature & CPUID_PAT)) {
758		switch (mode) {
759		case PAT_UNCACHEABLE:
760		case PAT_WRITE_THROUGH:
761		case PAT_WRITE_BACK:
762			break;
763		case PAT_UNCACHED:
764		case PAT_WRITE_COMBINING:
765		case PAT_WRITE_PROTECTED:
766			mode = PAT_UNCACHEABLE;
767			break;
768		}
769	}
770
771	/* Map the caching mode to a PAT index. */
772	switch (mode) {
773#ifdef PAT_WORKS
774	case PAT_UNCACHEABLE:
775		pat_index = 3;
776		break;
777	case PAT_WRITE_THROUGH:
778		pat_index = 1;
779		break;
780	case PAT_WRITE_BACK:
781		pat_index = 0;
782		break;
783	case PAT_UNCACHED:
784		pat_index = 2;
785		break;
786	case PAT_WRITE_COMBINING:
787		pat_index = 5;
788		break;
789	case PAT_WRITE_PROTECTED:
790		pat_index = 4;
791		break;
792#else
793	case PAT_UNCACHED:
794	case PAT_UNCACHEABLE:
795	case PAT_WRITE_PROTECTED:
796		pat_index = 3;
797		break;
798	case PAT_WRITE_THROUGH:
799		pat_index = 1;
800		break;
801	case PAT_WRITE_BACK:
802		pat_index = 0;
803		break;
804	case PAT_WRITE_COMBINING:
805		pat_index = 2;
806		break;
807#endif
808	default:
809		panic("Unknown caching mode %d\n", mode);
810	}
811
812	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
813	cache_bits = 0;
814	if (pat_index & 0x4)
815		cache_bits |= pat_flag;
816	if (pat_index & 0x2)
817		cache_bits |= PG_NC_PCD;
818	if (pat_index & 0x1)
819		cache_bits |= PG_NC_PWT;
820	return (cache_bits);
821}
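/*
 * Example (default, non-PAT_WORKS configuration): PAT_WRITE_COMBINING
 * maps to pat_index 2 (binary 010), so only PG_NC_PCD is returned;
 * PAT_WRITE_THROUGH maps to index 1 (001) and returns only PG_NC_PWT.
 */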
822#ifdef SMP
823/*
824 * For SMP, these functions have to use the IPI mechanism for coherence.
825 *
826 * N.B.: Before calling any of the following TLB invalidation functions,
827 * the calling processor must ensure that all stores updating a non-
828 * kernel page table are globally performed.  Otherwise, another
829 * processor could cache an old, pre-update entry without being
830 * invalidated.  This can happen one of two ways: (1) The pmap becomes
831 * active on another processor after its pm_active field is checked by
832 * one of the following functions but before a store updating the page
833 * table is globally performed. (2) The pmap becomes active on another
834 * processor before its pm_active field is checked but due to
835 * speculative loads one of the following functions still reads the
836 * pmap as inactive on the other processor.
837 *
838 * The kernel page table is exempt because its pm_active field is
839 * immutable.  The kernel page table is always active on every
840 * processor.
841 */
842void
843pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
844{
845	u_int cpumask;
846	u_int other_cpus;
847
848	sched_pin();
849	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
850		invlpg(va);
851		smp_invlpg(va);
852	} else {
853		cpumask = PCPU_GET(cpumask);
854		other_cpus = PCPU_GET(other_cpus);
855		if (pmap->pm_active & cpumask)
856			invlpg(va);
857		if (pmap->pm_active & other_cpus)
858			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
859	}
860	sched_unpin();
861}
862
863void
864pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
865{
866	u_int cpumask;
867	u_int other_cpus;
868	vm_offset_t addr;
869
870	sched_pin();
871	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
872		for (addr = sva; addr < eva; addr += PAGE_SIZE)
873			invlpg(addr);
874		smp_invlpg_range(sva, eva);
875	} else {
876		cpumask = PCPU_GET(cpumask);
877		other_cpus = PCPU_GET(other_cpus);
878		if (pmap->pm_active & cpumask)
879			for (addr = sva; addr < eva; addr += PAGE_SIZE)
880				invlpg(addr);
881		if (pmap->pm_active & other_cpus)
882			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
883			    sva, eva);
884	}
885	sched_unpin();
886}
887
888void
889pmap_invalidate_all(pmap_t pmap)
890{
891	u_int cpumask;
892	u_int other_cpus;
893
894	sched_pin();
895	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
896		invltlb();
897		smp_invltlb();
898	} else {
899		cpumask = PCPU_GET(cpumask);
900		other_cpus = PCPU_GET(other_cpus);
901		if (pmap->pm_active & cpumask)
902			invltlb();
903		if (pmap->pm_active & other_cpus)
904			smp_masked_invltlb(pmap->pm_active & other_cpus);
905	}
906	sched_unpin();
907}
908
909void
910pmap_invalidate_cache(void)
911{
912
913	sched_pin();
914	wbinvd();
915	smp_cache_flush();
916	sched_unpin();
917}
918#else /* !SMP */
919/*
920 * Normal, non-SMP, invalidation functions.
921 * We inline these within pmap.c for speed.
922 */
923PMAP_INLINE void
924pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
925{
926
927	if (pmap == kernel_pmap || pmap->pm_active)
928		invlpg(va);
929}
930
931PMAP_INLINE void
932pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
933{
934	vm_offset_t addr;
935
936	if (pmap == kernel_pmap || pmap->pm_active)
937		for (addr = sva; addr < eva; addr += PAGE_SIZE)
938			invlpg(addr);
939}
940
941PMAP_INLINE void
942pmap_invalidate_all(pmap_t pmap)
943{
944
945	if (pmap == kernel_pmap || pmap->pm_active)
946		invltlb();
947}
948
949PMAP_INLINE void
950pmap_invalidate_cache(void)
951{
952
953	wbinvd();
954}
955#endif /* !SMP */
956
957/*
958 * Are we the current address space or the kernel?
959 */
960static __inline int
961pmap_is_current(pmap_t pmap)
962{
963	return (pmap == kernel_pmap ||
964	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
965}
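/*
 * The comparison above works because PML4pml4e refers, through the
 * recursive mapping, to the recursive slot of the currently loaded
 * PML4; its frame therefore equals the physical address of whatever
 * PML4 %cr3 points at, so matching it against pmap's own recursive
 * entry tells us whether pmap is the current address space.
 */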
966
967/*
968 *	Routine:	pmap_extract
969 *	Function:
970 *		Extract the physical page address associated
971 *		with the given map/virtual_address pair.
972 */
973vm_paddr_t
974pmap_extract(pmap_t pmap, vm_offset_t va)
975{
976	vm_paddr_t rtval;
977	pt_entry_t *pte;
978	pd_entry_t pde, *pdep;
979
980	rtval = 0;
981	PMAP_LOCK(pmap);
982	pdep = pmap_pde(pmap, va);
983	if (pdep != NULL) {
984		pde = *pdep;
985		if (pde) {
986			if ((pde & PG_PS) != 0) {
987				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
988				PMAP_UNLOCK(pmap);
989				return rtval;
990			}
991			pte = pmap_pde_to_pte(pdep, va);
992			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
993		}
994	}
995	PMAP_UNLOCK(pmap);
996	return (rtval);
997}
998
999/*
1000 *	Routine:	pmap_extract_and_hold
1001 *	Function:
1002 *		Atomically extract and hold the physical page
1003 *		with the given pmap and virtual address pair
1004 *		if that mapping permits the given protection.
1005 */
1006vm_page_t
1007pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1008{
1009	pd_entry_t pde, *pdep;
1010	pt_entry_t pte;
1011	vm_page_t m;
1012
1013	m = NULL;
1014	vm_page_lock_queues();
1015	PMAP_LOCK(pmap);
1016	pdep = pmap_pde(pmap, va);
1017	if (pdep != NULL && (pde = *pdep)) {
1018		if (pde & PG_PS) {
1019			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1020				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1021				    (va & PDRMASK));
1022				vm_page_hold(m);
1023			}
1024		} else {
1025			pte = *pmap_pde_to_pte(pdep, va);
1026			if ((pte & PG_V) &&
1027			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1028				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1029				vm_page_hold(m);
1030			}
1031		}
1032	}
1033	vm_page_unlock_queues();
1034	PMAP_UNLOCK(pmap);
1035	return (m);
1036}
1037
1038vm_paddr_t
1039pmap_kextract(vm_offset_t va)
1040{
1041	pd_entry_t *pde;
1042	vm_paddr_t pa;
1043
1044	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1045		pa = DMAP_TO_PHYS(va);
1046	} else {
1047		pde = vtopde(va);
1048		if (*pde & PG_PS) {
1049			pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
1050		} else {
1051			pa = *vtopte(va);
1052			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1053		}
1054	}
1055	return pa;
1056}
1057
1058/***************************************************
1059 * Low level mapping routines.....
1060 ***************************************************/
1061
1062/*
1063 * Add a wired page to the kva.
1064 * Note: not SMP coherent.
1065 */
1066PMAP_INLINE void
1067pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1068{
1069	pt_entry_t *pte;
1070
1071	pte = vtopte(va);
1072	pte_store(pte, pa | PG_RW | PG_V | PG_G);
1073}
1074
1075PMAP_INLINE void
1076pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1077{
1078	pt_entry_t *pte;
1079
1080	pte = vtopte(va);
1081	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
1082}
1083
1084/*
1085 * Remove a page from the kernel pagetables.
1086 * Note: not SMP coherent.
1087 */
1088PMAP_INLINE void
1089pmap_kremove(vm_offset_t va)
1090{
1091	pt_entry_t *pte;
1092
1093	pte = vtopte(va);
1094	pte_clear(pte);
1095}
1096
1097/*
1098 *	Used to map a range of physical addresses into kernel
1099 *	virtual address space.
1100 *
1101 *	The value passed in '*virt' is a suggested virtual address for
1102 *	the mapping. Architectures which can support a direct-mapped
1103 *	physical to virtual region can return the appropriate address
1104 *	within that region, leaving '*virt' unchanged. Other
1105 *	architectures should map the pages starting at '*virt' and
1106 *	update '*virt' with the first usable address after the mapped
1107 *	region.
1108 */
1109vm_offset_t
1110pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1111{
1112	return PHYS_TO_DMAP(start);
1113}
1114
1115
1116/*
1117 * Add a list of wired pages to the kva.
1118 * This routine is only used for temporary
1119 * kernel mappings that do not need to have
1120 * page modification or references recorded.
1121 * Note that old mappings are simply written
1122 * over.  The page *must* be wired.
1123 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1124 */
1125void
1126pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1127{
1128	pt_entry_t *endpte, oldpte, *pte;
1129
1130	oldpte = 0;
1131	pte = vtopte(sva);
1132	endpte = pte + count;
1133	while (pte < endpte) {
1134		oldpte |= *pte;
1135		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V);
1136		pte++;
1137		ma++;
1138	}
1139	if ((oldpte & PG_V) != 0)
1140		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1141		    PAGE_SIZE);
1142}
1143
1144/*
1145 * This routine tears out page mappings from the
1146 * kernel -- it is meant only for temporary mappings.
1147 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1148 */
1149void
1150pmap_qremove(vm_offset_t sva, int count)
1151{
1152	vm_offset_t va;
1153
1154	va = sva;
1155	while (count-- > 0) {
1156		pmap_kremove(va);
1157		va += PAGE_SIZE;
1158	}
1159	pmap_invalidate_range(kernel_pmap, sva, va);
1160}
1161
1162/***************************************************
1163 * Page table page management routines.....
1164 ***************************************************/
1165static __inline void
1166pmap_free_zero_pages(vm_page_t free)
1167{
1168	vm_page_t m;
1169
1170	while (free != NULL) {
1171		m = free;
1172		free = m->right;
1173		/* Preserve the page's PG_ZERO setting. */
1174		vm_page_free_toq(m);
1175	}
1176}
1177
1178/*
1179 * Schedule the specified unused page table page to be freed.  Specifically,
1180 * add the page to the specified list of pages that will be released to the
1181 * physical memory manager after the TLB has been updated.
1182 */
1183static __inline void
1184pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1185{
1186
1187	if (set_PG_ZERO)
1188		m->flags |= PG_ZERO;
1189	else
1190		m->flags &= ~PG_ZERO;
1191	m->right = *free;
1192	*free = m;
1193}
1194
1195/*
1196 * Inserts the specified page table page into the specified pmap's collection
1197 * of idle page table pages.  Each of a pmap's page table pages is responsible
1198 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1199 * ordered by this virtual address range.
1200 */
1201static void
1202pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1203{
1204	vm_page_t root;
1205
1206	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1207	root = pmap->pm_root;
1208	if (root == NULL) {
1209		mpte->left = NULL;
1210		mpte->right = NULL;
1211	} else {
1212		root = vm_page_splay(mpte->pindex, root);
1213		if (mpte->pindex < root->pindex) {
1214			mpte->left = root->left;
1215			mpte->right = root;
1216			root->left = NULL;
1217		} else if (mpte->pindex == root->pindex)
1218			panic("pmap_insert_pt_page: pindex already inserted");
1219		else {
1220			mpte->right = root->right;
1221			mpte->left = root;
1222			root->right = NULL;
1223		}
1224	}
1225	pmap->pm_root = mpte;
1226}
1227
1228/*
1229 * Looks for a page table page mapping the specified virtual address in the
1230 * specified pmap's collection of idle page table pages.  Returns NULL if there
1231 * is no page table page corresponding to the specified virtual address.
1232 */
1233static vm_page_t
1234pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1235{
1236	vm_page_t mpte;
1237	vm_pindex_t pindex = pmap_pde_pindex(va);
1238
1239	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1240	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1241		mpte = vm_page_splay(pindex, mpte);
1242		if ((pmap->pm_root = mpte)->pindex != pindex)
1243			mpte = NULL;
1244	}
1245	return (mpte);
1246}
1247
1248/*
1249 * Removes the specified page table page from the specified pmap's collection
1250 * of idle page table pages.  The specified page table page must be a member of
1251 * the pmap's collection.
1252 */
1253static void
1254pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1255{
1256	vm_page_t root;
1257
1258	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1259	if (mpte != pmap->pm_root) {
1260		root = vm_page_splay(mpte->pindex, pmap->pm_root);
1261		KASSERT(mpte == root,
1262		    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
1263		    mpte, pmap));
1264	}
1265	if (mpte->left == NULL)
1266		root = mpte->right;
1267	else {
1268		root = vm_page_splay(mpte->pindex, mpte->left);
1269		root->right = mpte->right;
1270	}
1271	pmap->pm_root = root;
1272}
1273
1274/*
1275 * This routine drops a page table page's wire count, and when the
1276 * count reaches zero it unmaps and frees the page.
1277 */
1278static __inline int
1279pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1280{
1281
1282	--m->wire_count;
1283	if (m->wire_count == 0)
1284		return _pmap_unwire_pte_hold(pmap, va, m, free);
1285	else
1286		return 0;
1287}
1288
1289static int
1290_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
1291    vm_page_t *free)
1292{
1293	vm_offset_t pteva;
1294
1295	/*
1296	 * unmap the page table page
1297	 */
1298	if (m->pindex >= (NUPDE + NUPDPE)) {
1299		/* PDP page */
1300		pml4_entry_t *pml4;
1301		pml4 = pmap_pml4e(pmap, va);
1302		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
1303		*pml4 = 0;
1304	} else if (m->pindex >= NUPDE) {
1305		/* PD page */
1306		pdp_entry_t *pdp;
1307		pdp = pmap_pdpe(pmap, va);
1308		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
1309		*pdp = 0;
1310	} else {
1311		/* PTE page */
1312		pd_entry_t *pd;
1313		pd = pmap_pde(pmap, va);
1314		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
1315		*pd = 0;
1316	}
1317	--pmap->pm_stats.resident_count;
1318	if (m->pindex < NUPDE) {
1319		/* We just released a PT, unhold the matching PD */
1320		vm_page_t pdpg;
1321
1322		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1323		pmap_unwire_pte_hold(pmap, va, pdpg, free);
1324	}
1325	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1326		/* We just released a PD, unhold the matching PDP */
1327		vm_page_t pdppg;
1328
1329		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1330		pmap_unwire_pte_hold(pmap, va, pdppg, free);
1331	}
1332
1333	/*
1334	 * This is a release store so that the ordinary store unmapping
1335	 * the page table page is globally performed before TLB shoot-
1336	 * down is begun.
1337	 */
1338	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1339
1340	/*
1341	 * Do an invltlb to make the invalidated mapping
1342	 * take effect immediately.
1343	 */
1344	pmap_invalidate_page(pmap, pteva);
1345
1346	/*
1347	 * Put page on a list so that it is released after
1348	 * *ALL* TLB shootdown is done
1349	 */
1350	pmap_add_delayed_free_list(m, free, TRUE);
1351
1352	return 1;
1353}
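/*
 * Note the cascade above: dropping the last reference on a page table
 * page unholds its parent page directory page, which in turn may
 * unhold the parent PDP page, so an entire empty branch of the
 * four-level tree can be torn down by a single call.
 */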
1354
1355/*
1356 * After removing a page table entry, this routine is used to
1357 * conditionally free the page, and manage the hold/wire counts.
1358 */
1359static int
1360pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1361{
1362	vm_page_t mpte;
1363
1364	if (va >= VM_MAXUSER_ADDRESS)
1365		return 0;
1366	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1367	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1368	return pmap_unwire_pte_hold(pmap, va, mpte, free);
1369}
1370
1371void
1372pmap_pinit0(pmap_t pmap)
1373{
1374
1375	PMAP_LOCK_INIT(pmap);
1376	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1377	pmap->pm_root = NULL;
1378	pmap->pm_active = 0;
1379	TAILQ_INIT(&pmap->pm_pvchunk);
1380	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1381}
1382
1383/*
1384 * Initialize a preallocated and zeroed pmap structure,
1385 * such as one in a vmspace structure.
1386 */
1387int
1388pmap_pinit(pmap_t pmap)
1389{
1390	vm_page_t pml4pg;
1391	static vm_pindex_t color;
1392
1393	PMAP_LOCK_INIT(pmap);
1394
1395	/*
1396	 * allocate the page directory page
1397	 */
1398	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1399	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1400		VM_WAIT;
1401
1402	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1403
1404	if ((pml4pg->flags & PG_ZERO) == 0)
1405		pagezero(pmap->pm_pml4);
1406
1407	/* Wire in kernel global address entries. */
1408	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1409	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1410
1411	/* install self-referential address mapping entry(s) */
1412	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1413
1414	pmap->pm_root = NULL;
1415	pmap->pm_active = 0;
1416	TAILQ_INIT(&pmap->pm_pvchunk);
1417	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1418
1419	return (1);
1420}
1421
1422/*
1423 * this routine is called if the page table page is not
1424 * mapped correctly.
1425 *
1426 * Note: If a page allocation fails at page table level two or three,
1427 * one or two pages may be held during the wait, only to be released
1428 * afterwards.  This conservative approach is easily argued to avoid
1429 * race conditions.
1430 */
1431static vm_page_t
1432_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1433{
1434	vm_page_t m, pdppg, pdpg;
1435
1436	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1437	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1438	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1439
1440	/*
1441	 * Allocate a page table page.
1442	 */
1443	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1444	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1445		if (flags & M_WAITOK) {
1446			PMAP_UNLOCK(pmap);
1447			vm_page_unlock_queues();
1448			VM_WAIT;
1449			vm_page_lock_queues();
1450			PMAP_LOCK(pmap);
1451		}
1452
1453		/*
1454		 * Indicate the need to retry.  While waiting, the page table
1455		 * page may have been allocated.
1456		 */
1457		return (NULL);
1458	}
1459	if ((m->flags & PG_ZERO) == 0)
1460		pmap_zero_page(m);
1461
1462	/*
1463	 * Map the pagetable page into the process address space, if
1464	 * it isn't already there.
1465	 */
1466
1467	pmap->pm_stats.resident_count++;
1468
1469	if (ptepindex >= (NUPDE + NUPDPE)) {
1470		pml4_entry_t *pml4;
1471		vm_pindex_t pml4index;
1472
1473		/* Wire up a new PDPE page */
1474		pml4index = ptepindex - (NUPDE + NUPDPE);
1475		pml4 = &pmap->pm_pml4[pml4index];
1476		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1477
1478	} else if (ptepindex >= NUPDE) {
1479		vm_pindex_t pml4index;
1480		vm_pindex_t pdpindex;
1481		pml4_entry_t *pml4;
1482		pdp_entry_t *pdp;
1483
1484		/* Wire up a new PDE page */
1485		pdpindex = ptepindex - NUPDE;
1486		pml4index = pdpindex >> NPML4EPGSHIFT;
1487
1488		pml4 = &pmap->pm_pml4[pml4index];
1489		if ((*pml4 & PG_V) == 0) {
1490			/* Have to allocate a new pdp, recurse */
1491			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1492			    flags) == NULL) {
1493				--m->wire_count;
1494				vm_page_free(m);
1495				return (NULL);
1496			}
1497		} else {
1498			/* Add reference to pdp page */
1499			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1500			pdppg->wire_count++;
1501		}
1502		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1503
1504		/* Now find the pdp page */
1505		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1506		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1507
1508	} else {
1509		vm_pindex_t pml4index;
1510		vm_pindex_t pdpindex;
1511		pml4_entry_t *pml4;
1512		pdp_entry_t *pdp;
1513		pd_entry_t *pd;
1514
1515		/* Wire up a new PTE page */
1516		pdpindex = ptepindex >> NPDPEPGSHIFT;
1517		pml4index = pdpindex >> NPML4EPGSHIFT;
1518
1519		/* First, find the pdp and check that it's valid. */
1520		pml4 = &pmap->pm_pml4[pml4index];
1521		if ((*pml4 & PG_V) == 0) {
1522			/* Have to allocate a new pd, recurse */
1523			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1524			    flags) == NULL) {
1525				--m->wire_count;
1526				vm_page_free(m);
1527				return (NULL);
1528			}
1529			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1530			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1531		} else {
1532			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1533			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1534			if ((*pdp & PG_V) == 0) {
1535				/* Have to allocate a new pd, recurse */
1536				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1537				    flags) == NULL) {
1538					--m->wire_count;
1539					vm_page_free(m);
1540					return (NULL);
1541				}
1542			} else {
1543				/* Add reference to the pd page */
1544				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1545				pdpg->wire_count++;
1546			}
1547		}
1548		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1549
1550		/* Now we know where the page directory page is */
1551		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1552		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1553	}
1554
1555	return m;
1556}
1557
1558static vm_page_t
1559pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1560{
1561	vm_pindex_t pdpindex, ptepindex;
1562	pdp_entry_t *pdpe;
1563	vm_page_t pdpg;
1564
1565	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1566	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1567	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1568retry:
1569	pdpe = pmap_pdpe(pmap, va);
1570	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1571		/* Add a reference to the pd page. */
1572		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1573		pdpg->wire_count++;
1574	} else {
1575		/* Allocate a pd page. */
1576		ptepindex = pmap_pde_pindex(va);
1577		pdpindex = ptepindex >> NPDPEPGSHIFT;
1578		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1579		if (pdpg == NULL && (flags & M_WAITOK))
1580			goto retry;
1581	}
1582	return (pdpg);
1583}
1584
1585static vm_page_t
1586pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1587{
1588	vm_pindex_t ptepindex;
1589	pd_entry_t *pd;
1590	vm_page_t m;
1591
1592	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1593	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1594	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1595
1596	/*
1597	 * Calculate pagetable page index
1598	 */
1599	ptepindex = pmap_pde_pindex(va);
1600retry:
1601	/*
1602	 * Get the page directory entry
1603	 */
1604	pd = pmap_pde(pmap, va);
1605
1606	/*
1607	 * This supports switching from a 2MB page to a
1608	 * normal 4K page.
1609	 */
1610	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1611		if (!pmap_demote_pde(pmap, pd, va)) {
1612			/*
1613			 * Invalidation of the 2MB page mapping may have caused
1614			 * the deallocation of the underlying PD page.
1615			 */
1616			pd = NULL;
1617		}
1618	}
1619
1620	/*
1621	 * If the page table page is mapped, we just increment the
1622	 * hold count, and activate it.
1623	 */
1624	if (pd != NULL && (*pd & PG_V) != 0) {
1625		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1626		m->wire_count++;
1627	} else {
1628		/*
1629		 * Here if the pte page isn't mapped, or if it has been
1630		 * deallocated.
1631		 */
1632		m = _pmap_allocpte(pmap, ptepindex, flags);
1633		if (m == NULL && (flags & M_WAITOK))
1634			goto retry;
1635	}
1636	return (m);
1637}
1638
1639
1640/***************************************************
1641 * Pmap allocation/deallocation routines.
1642 ***************************************************/
1643
1644/*
1645 * Release any resources held by the given physical map.
1646 * Called when a pmap initialized by pmap_pinit is being released.
1647 * Should only be called if the map contains no valid mappings.
1648 */
1649void
1650pmap_release(pmap_t pmap)
1651{
1652	vm_page_t m;
1653
1654	KASSERT(pmap->pm_stats.resident_count == 0,
1655	    ("pmap_release: pmap resident count %ld != 0",
1656	    pmap->pm_stats.resident_count));
1657	KASSERT(pmap->pm_root == NULL,
1658	    ("pmap_release: pmap has reserved page table page(s)"));
1659
1660	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1661
1662	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1663	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1664	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1665
1666	m->wire_count--;
1667	atomic_subtract_int(&cnt.v_wire_count, 1);
1668	vm_page_free_zero(m);
1669	PMAP_LOCK_DESTROY(pmap);
1670}
1671
1672static int
1673kvm_size(SYSCTL_HANDLER_ARGS)
1674{
1675	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1676
1677	return sysctl_handle_long(oidp, &ksize, 0, req);
1678}
1679SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1680    0, 0, kvm_size, "LU", "Size of KVM");
1681
1682static int
1683kvm_free(SYSCTL_HANDLER_ARGS)
1684{
1685	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1686
1687	return sysctl_handle_long(oidp, &kfree, 0, req);
1688}
1689SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1690    0, 0, kvm_free, "LU", "Amount of KVM free");
1691
1692/*
1693 * grow the number of kernel page table entries, if needed
1694 */
1695void
1696pmap_growkernel(vm_offset_t addr)
1697{
1698	vm_paddr_t paddr;
1699	vm_page_t nkpg;
1700	pd_entry_t *pde, newpdir;
1701	pdp_entry_t newpdp;
1702
1703	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1704	if (kernel_vm_end == 0) {
1705		kernel_vm_end = KERNBASE;
1706		nkpt = 0;
1707		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1708			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1709			nkpt++;
1710			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1711				kernel_vm_end = kernel_map->max_offset;
1712				break;
1713			}
1714		}
1715	}
1716	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1717	if (addr - 1 >= kernel_map->max_offset)
1718		addr = kernel_map->max_offset;
1719	while (kernel_vm_end < addr) {
1720		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1721		if (pde == NULL) {
1722			/* We need a new PDP entry */
1723			nkpg = vm_page_alloc(NULL, nkpt,
1724			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1725			if (nkpg == NULL)
1726				panic("pmap_growkernel: no memory to grow kernel");
1727			pmap_zero_page(nkpg);
1728			paddr = VM_PAGE_TO_PHYS(nkpg);
1729			newpdp = (pdp_entry_t)
1730				(paddr | PG_V | PG_RW | PG_A | PG_M);
1731			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1732			continue; /* try again */
1733		}
1734		if ((*pde & PG_V) != 0) {
1735			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1736			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1737				kernel_vm_end = kernel_map->max_offset;
1738				break;
1739			}
1740			continue;
1741		}
1742
1743		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
1744		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1745		if (nkpg == NULL)
1746			panic("pmap_growkernel: no memory to grow kernel");
1747
1748		nkpt++;
1749
1750		pmap_zero_page(nkpg);
1751		paddr = VM_PAGE_TO_PHYS(nkpg);
1752		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1753		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1754
1755		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1756		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1757			kernel_vm_end = kernel_map->max_offset;
1758			break;
1759		}
1760	}
1761}
1762
1763
1764/***************************************************
1765 * page management routines.
1766 ***************************************************/
1767
1768CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1769CTASSERT(_NPCM == 3);
1770CTASSERT(_NPCPV == 168);
1771
1772static __inline struct pv_chunk *
1773pv_to_chunk(pv_entry_t pv)
1774{
1775
1776	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1777}
1778
1779#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1780
1781#define	PC_FREE0	0xfffffffffffffffful
1782#define	PC_FREE1	0xfffffffffffffffful
1783#define	PC_FREE2	0x000000fffffffffful
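/*
 * Each pv_chunk carries a free bitmap of 168 entries: two full 64-bit
 * words plus 40 bits of a third (168 == 64 + 64 + 40), which is why
 * _NPCM is 3 and PC_FREE2 has only its low 40 bits set.
 */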
1784
1785static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1786
1787SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1788	"Current number of pv entries");
1789
1790#ifdef PV_STATS
1791static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1792
1793SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1794	"Current number of pv entry chunks");
1795SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1796	"Current number of pv entry chunks allocated");
1797SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1798	"Current number of pv entry chunk frees");
1799SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1800	"Number of times tried to get a chunk page but failed.");
1801
1802static long pv_entry_frees, pv_entry_allocs;
1803static int pv_entry_spare;
1804
1805SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1806	"Current number of pv entry frees");
1807SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1808	"Current number of pv entry allocs");
1809SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1810	"Current number of spare pv entries");
1811
1812static int pmap_collect_inactive, pmap_collect_active;
1813
1814SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1815	"Current number of times pmap_collect called on inactive queue");
1816SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1817	"Current number of times pmap_collect called on active queue");
1818#endif
1819
1820/*
1821 * We are in a serious low memory condition.  Resort to
1822 * drastic measures to free some pages so we can allocate
1823 * another pv entry chunk.  This is normally called to
1824 * unmap inactive pages, and if necessary, active pages.
1825 *
1826 * We do not, however, unmap 2mpages because subsequent accesses will
1827 * allocate per-page pv entries until repromotion occurs, thereby
1828 * exacerbating the shortage of free pv entries.
1829 */
1830static void
1831pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1832{
1833	struct md_page *pvh;
1834	pd_entry_t *pde;
1835	pmap_t pmap;
1836	pt_entry_t *pte, tpte;
1837	pv_entry_t next_pv, pv;
1838	vm_offset_t va;
1839	vm_page_t m, free;
1840
1841	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1842		if (m->hold_count || m->busy)
1843			continue;
1844		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1845			va = pv->pv_va;
1846			pmap = PV_PMAP(pv);
1847			/* Avoid deadlock and lock recursion. */
1848			if (pmap > locked_pmap)
1849				PMAP_LOCK(pmap);
1850			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1851				continue;
1852			pmap->pm_stats.resident_count--;
1853			pde = pmap_pde(pmap, va);
1854			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
1855			    " a 2mpage in page %p's pv list", m));
1856			pte = pmap_pde_to_pte(pde, va);
1857			tpte = pte_load_clear(pte);
1858			KASSERT((tpte & PG_W) == 0,
1859			    ("pmap_collect: wired pte %#lx", tpte));
1860			if (tpte & PG_A)
1861				vm_page_flag_set(m, PG_REFERENCED);
1862			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1863				vm_page_dirty(m);
1864			free = NULL;
1865			pmap_unuse_pt(pmap, va, *pde, &free);
1866			pmap_invalidate_page(pmap, va);
1867			pmap_free_zero_pages(free);
1868			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1869			if (TAILQ_EMPTY(&m->md.pv_list)) {
1870				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1871				if (TAILQ_EMPTY(&pvh->pv_list))
1872					vm_page_flag_clear(m, PG_WRITEABLE);
1873			}
1874			free_pv_entry(pmap, pv);
1875			if (pmap != locked_pmap)
1876				PMAP_UNLOCK(pmap);
1877		}
1878	}
1879}
1880
1881
1882/*
1883 * free the pv_entry back to the free list
1884 */
1885static void
1886free_pv_entry(pmap_t pmap, pv_entry_t pv)
1887{
1888	vm_page_t m;
1889	struct pv_chunk *pc;
1890	int idx, field, bit;
1891
1892	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1893	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1894	PV_STAT(pv_entry_frees++);
1895	PV_STAT(pv_entry_spare++);
1896	pv_entry_count--;
1897	pc = pv_to_chunk(pv);
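	/*
	 * Compute the entry's index within its chunk and set the
	 * corresponding bit in the chunk's free bitmap; each pc_map
	 * word covers 64 consecutive pv entries.
	 */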
1898	idx = pv - &pc->pc_pventry[0];
1899	field = idx / 64;
1900	bit = idx % 64;
1901	pc->pc_map[field] |= 1ul << bit;
1902	/* move to head of list */
1903	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1904	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1905	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1906	    pc->pc_map[2] != PC_FREE2)
1907		return;
1908	PV_STAT(pv_entry_spare -= _NPCPV);
1909	PV_STAT(pc_chunk_count--);
1910	PV_STAT(pc_chunk_frees++);
1911	/* entire chunk is free, return it */
1912	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1913	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1914	dump_drop_page(m->phys_addr);
1915	vm_page_unwire(m, 0);
1916	vm_page_free(m);
1917}
1918
1919/*
1920 * get a new pv_entry, allocating a block from the system
1921 * when needed.
1922 */
1923static pv_entry_t
1924get_pv_entry(pmap_t pmap, int try)
1925{
1926	static const struct timeval printinterval = { 60, 0 };
1927	static struct timeval lastprint;
1928	static vm_pindex_t colour;
1929	struct vpgqueues *pq;
1930	int bit, field;
1931	pv_entry_t pv;
1932	struct pv_chunk *pc;
1933	vm_page_t m;
1934
1935	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1936	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1937	PV_STAT(pv_entry_allocs++);
1938	pv_entry_count++;
1939	if (pv_entry_count > pv_entry_high_water)
1940		if (ratecheck(&lastprint, &printinterval))
1941			printf("Approaching the limit on PV entries, consider "
1942			    "increasing either the vm.pmap.shpgperproc or the "
1943			    "vm.pmap.pv_entry_max sysctl.\n");
1944	pq = NULL;
1945retry:
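	/*
	 * Try to carve a free entry out of the chunk at the head of
	 * this pmap's chunk list.  If none is available, allocate a
	 * new chunk page, reclaiming pv entries from other mappings
	 * if the allocation fails.
	 */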
1946	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1947	if (pc != NULL) {
1948		for (field = 0; field < _NPCM; field++) {
1949			if (pc->pc_map[field]) {
1950				bit = bsfq(pc->pc_map[field]);
1951				break;
1952			}
1953		}
1954		if (field < _NPCM) {
1955			pv = &pc->pc_pventry[field * 64 + bit];
1956			pc->pc_map[field] &= ~(1ul << bit);
1957			/* If this was the last item, move it to tail */
1958			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1959			    pc->pc_map[2] == 0) {
1960				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1961				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1962			}
1963			PV_STAT(pv_entry_spare--);
1964			return (pv);
1965		}
1966	}
1967	/* No free items, allocate another chunk */
1968	m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
1969	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
1970	    VM_ALLOC_WIRED);
1971	if (m == NULL) {
1972		if (try) {
1973			pv_entry_count--;
1974			PV_STAT(pc_chunk_tryfail++);
1975			return (NULL);
1976		}
1977		/*
1978		 * Reclaim pv entries: At first, destroy mappings to inactive
1979		 * pages.  After that, if a pv chunk entry is still needed,
1980		 * destroy mappings to active pages.
1981		 */
1982		if (pq == NULL) {
1983			PV_STAT(pmap_collect_inactive++);
1984			pq = &vm_page_queues[PQ_INACTIVE];
1985		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
1986			PV_STAT(pmap_collect_active++);
1987			pq = &vm_page_queues[PQ_ACTIVE];
1988		} else
1989			panic("get_pv_entry: increase vm.pmap.shpgperproc");
1990		pmap_collect(pmap, pq);
1991		goto retry;
1992	}
1993	PV_STAT(pc_chunk_count++);
1994	PV_STAT(pc_chunk_allocs++);
1995	colour++;
1996	dump_add_page(m->phys_addr);
1997	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1998	pc->pc_pmap = pmap;
1999	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2000	pc->pc_map[1] = PC_FREE1;
2001	pc->pc_map[2] = PC_FREE2;
2002	pv = &pc->pc_pventry[0];
2003	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2004	PV_STAT(pv_entry_spare += _NPCPV - 1);
2005	return (pv);
2006}
2007
2008/*
2009 * First find and then remove the pv entry for the specified pmap and virtual
2010 * address from the specified pv list.  Returns the pv entry if found and NULL
2011 * otherwise.  This operation can be performed on pv lists for either 4KB or
2012 * 2MB page mappings.
2013 */
2014static __inline pv_entry_t
2015pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2016{
2017	pv_entry_t pv;
2018
2019	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2020	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2021		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2022			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2023			break;
2024		}
2025	}
2026	return (pv);
2027}
2028
2029/*
2030 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2031 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2032 * entries for each of the 4KB page mappings.
2033 */
2034static void
2035pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2036{
2037	struct md_page *pvh;
2038	pv_entry_t pv;
2039	vm_offset_t va_last;
2040	vm_page_t m;
2041
2042	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2043	KASSERT((pa & PDRMASK) == 0,
2044	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
2045
2046	/*
2047	 * Transfer the 2mpage's pv entry for this mapping to the first
2048	 * page's pv list.
2049	 */
2050	pvh = pa_to_pvh(pa);
2051	va = trunc_2mpage(va);
2052	pv = pmap_pvh_remove(pvh, pmap, va);
2053	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2054	m = PHYS_TO_VM_PAGE(pa);
2055	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2056	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2057	va_last = va + NBPDR - PAGE_SIZE;
2058	do {
2059		m++;
2060		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2061		    ("pmap_pv_demote_pde: page %p is not managed", m));
2062		va += PAGE_SIZE;
2063		pmap_insert_entry(pmap, va, m);
2064	} while (va < va_last);
2065}
2066
2067/*
2068 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2069 * replace the many pv entries for the 4KB page mappings by a single pv entry
2070 * for the 2MB page mapping.
2071 */
2072static void
2073pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2074{
2075	struct md_page *pvh;
2076	pv_entry_t pv;
2077	vm_offset_t va_last;
2078	vm_page_t m;
2079
2080	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2081	KASSERT((pa & PDRMASK) == 0,
2082	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
2083
2084	/*
2085	 * Transfer the first page's pv entry for this mapping to the
2086	 * 2mpage's pv list.  Aside from avoiding the cost of a call
2087	 * to get_pv_entry(), a transfer avoids the possibility that
2088	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2089	 * removes one of the mappings that is being promoted.
2090	 */
2091	m = PHYS_TO_VM_PAGE(pa);
2092	va = trunc_2mpage(va);
2093	pv = pmap_pvh_remove(&m->md, pmap, va);
2094	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2095	pvh = pa_to_pvh(pa);
2096	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2097	/* Free the remaining NPTEPG - 1 pv entries. */
2098	va_last = va + NBPDR - PAGE_SIZE;
2099	do {
2100		m++;
2101		va += PAGE_SIZE;
2102		pmap_pvh_free(&m->md, pmap, va);
2103	} while (va < va_last);
2104}
2105
2106/*
2107 * First find and then destroy the pv entry for the specified pmap and virtual
2108 * address.  This operation can be performed on pv lists for either 4KB or 2MB
2109 * page mappings.
2110 */
2111static void
2112pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2113{
2114	pv_entry_t pv;
2115
2116	pv = pmap_pvh_remove(pvh, pmap, va);
2117	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2118	free_pv_entry(pmap, pv);
2119}
2120
2121static void
2122pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2123{
2124	struct md_page *pvh;
2125
2126	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2127	pmap_pvh_free(&m->md, pmap, va);
2128	if (TAILQ_EMPTY(&m->md.pv_list)) {
2129		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2130		if (TAILQ_EMPTY(&pvh->pv_list))
2131			vm_page_flag_clear(m, PG_WRITEABLE);
2132	}
2133}
2134
2135/*
2136 * Create a pv entry for the page at pa for
2137 * (pmap, va).
2138 */
2139static void
2140pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2141{
2142	pv_entry_t pv;
2143
2144	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2145	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2146	pv = get_pv_entry(pmap, FALSE);
2147	pv->pv_va = va;
2148	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2149}
2150
2151/*
2152 * Conditionally create a pv entry.
2153 */
2154static boolean_t
2155pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2156{
2157	pv_entry_t pv;
2158
2159	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2160	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2161	if (pv_entry_count < pv_entry_high_water &&
2162	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2163		pv->pv_va = va;
2164		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2165		return (TRUE);
2166	} else
2167		return (FALSE);
2168}
2169
2170/*
2171 * Create the pv entry for a 2MB page mapping.
2172 */
2173static boolean_t
2174pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m)
2175{
2176	struct md_page *pvh;
2177	pv_entry_t pv;
2178
2179	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2180	if (pv_entry_count < pv_entry_high_water &&
2181	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2182		pv->pv_va = va;
2183		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2184		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2185		return (TRUE);
2186	} else
2187		return (FALSE);
2188}
2189
2190/*
2191 * Tries to demote a 2MB page mapping.
2192 */
2193static boolean_t
2194pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2195{
2196	pd_entry_t newpde, oldpde;
2197	pt_entry_t *firstpte, newpte, *pte;
2198	vm_paddr_t mptepa;
2199	vm_page_t free, mpte;
2200
2201	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2202	mpte = pmap_lookup_pt_page(pmap, va);
2203	if (mpte != NULL)
2204		pmap_remove_pt_page(pmap, mpte);
2205	else {
2206		KASSERT((*pde & PG_W) == 0,
2207		    ("pmap_demote_pde: page table page for a wired mapping"
2208		    " is missing"));
2209		free = NULL;
2210		pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
2211		pmap_invalidate_page(pmap, trunc_2mpage(va));
2212		pmap_free_zero_pages(free);
2213		CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2214		    " in pmap %p", va, pmap);
2215		return (FALSE);
2216	}
2217	mptepa = VM_PAGE_TO_PHYS(mpte);
2218	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2219	oldpde = *pde;
2220	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2221	KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V),
2222	    ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V"));
2223	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2224	    ("pmap_demote_pde: oldpde is missing PG_M"));
2225	KASSERT((oldpde & PG_PS) != 0,
2226	    ("pmap_demote_pde: oldpde is missing PG_PS"));
2227	newpte = oldpde & ~PG_PS;
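	/*
	 * The PAT index occupies a different bit position in a PDE
	 * (PG_PDE_PAT) than in a PTE (PG_PTE_PAT), so relocate it.
	 */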
2228	if ((newpte & PG_PDE_PAT) != 0)
2229		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2230
2231	/*
2232	 * If the mapping has changed attributes, update the page table
2233	 * entries.
2234	 */
2235	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2236	    ("pmap_demote_pde: firstpte and newpte map different physical"
2237	    " addresses"));
2238	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2239		for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2240			*pte = newpte;
2241			newpte += PAGE_SIZE;
2242		}
2243
2244	/*
2245	 * Demote the mapping.  This pmap is locked.  The old PDE has
2246	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2247	 * set.  Thus, there is no danger of a race with another
2248	 * processor changing the setting of PG_A and/or PG_M between
2249	 * the read above and the store below.
2250	 */
2251	pde_store(pde, newpde);
2252
2253	/*
2254	 * Invalidate a stale mapping of the page table page.
2255	 */
2256	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2257
2258	/*
2259	 * Demote the pv entry.  This depends on the earlier demotion
2260	 * of the mapping.  Specifically, the (re)creation of a per-
2261	 * page pv entry might trigger the execution of pmap_collect(),
2262	 * which might reclaim a newly (re)created per-page pv entry
2263	 * and destroy the associated mapping.  In order to destroy
2264	 * the mapping, the PDE must have already changed from mapping
2265	 * the 2mpage to referencing the page table page.
2266	 */
2267	if ((oldpde & PG_MANAGED) != 0)
2268		pmap_pv_demote_pde(pmap, va, oldpde & PG_FRAME);
2269
2270	pmap_pde_demotions++;
2271	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2272	    " in pmap %p", va, pmap);
2273	return (TRUE);
2274}
2275
2276/*
2277 * pmap_remove_pde: unmap a 2MB superpage mapping from a process's address space
2278 */
2279static int
2280pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2281    vm_page_t *free)
2282{
2283	struct md_page *pvh;
2284	pd_entry_t oldpde;
2285	vm_offset_t eva, va;
2286	vm_page_t m, mpte;
2287
2288	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2289	KASSERT((sva & PDRMASK) == 0,
2290	    ("pmap_remove_pde: sva is not 2mpage aligned"));
2291	oldpde = pte_load_clear(pdq);
2292	if (oldpde & PG_W)
2293		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2294
2295	/*
2296	 * Machines that don't support invlpg also don't
2297	 * support PG_G.
2298	 */
2299	if (oldpde & PG_G)
2300		pmap_invalidate_page(kernel_pmap, sva);
2301	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2302	if (oldpde & PG_MANAGED) {
2303		pvh = pa_to_pvh(oldpde & PG_FRAME);
2304		pmap_pvh_free(pvh, pmap, sva);
2305		eva = sva + NBPDR;
2306		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
2307		    va < eva; va += PAGE_SIZE, m++) {
2308			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2309				vm_page_dirty(m);
2310			if (oldpde & PG_A)
2311				vm_page_flag_set(m, PG_REFERENCED);
2312			if (TAILQ_EMPTY(&m->md.pv_list) &&
2313			    TAILQ_EMPTY(&pvh->pv_list))
2314				vm_page_flag_clear(m, PG_WRITEABLE);
2315		}
2316	}
2317	if (pmap == kernel_pmap) {
2318		if (!pmap_demote_pde(pmap, pdq, sva))
2319			panic("pmap_remove_pde: failed demotion");
2320	} else {
2321		mpte = pmap_lookup_pt_page(pmap, sva);
2322		if (mpte != NULL) {
2323			pmap_remove_pt_page(pmap, mpte);
2324			KASSERT(mpte->wire_count == NPTEPG,
2325			    ("pmap_remove_pde: pte page wire count error"));
2326			mpte->wire_count = 0;
2327			pmap_add_delayed_free_list(mpte, free, FALSE);
2328			atomic_subtract_int(&cnt.v_wire_count, 1);
2329		}
2330	}
2331	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2332}
2333
2334/*
2335 * pmap_remove_pte: unmap a 4KB page mapping from a process's address space
2336 */
2337static int
2338pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2339    pd_entry_t ptepde, vm_page_t *free)
2340{
2341	pt_entry_t oldpte;
2342	vm_page_t m;
2343
2344	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2345	oldpte = pte_load_clear(ptq);
2346	if (oldpte & PG_W)
2347		pmap->pm_stats.wired_count -= 1;
2348	/*
2349	 * Machines that don't support invlpg also don't
2350	 * support PG_G.
2351	 */
2352	if (oldpte & PG_G)
2353		pmap_invalidate_page(kernel_pmap, va);
2354	pmap->pm_stats.resident_count -= 1;
2355	if (oldpte & PG_MANAGED) {
2356		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2357		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2358			vm_page_dirty(m);
2359		if (oldpte & PG_A)
2360			vm_page_flag_set(m, PG_REFERENCED);
2361		pmap_remove_entry(pmap, m, va);
2362	}
2363	return (pmap_unuse_pt(pmap, va, ptepde, free));
2364}
2365
2366/*
2367 * Remove a single page from a process address space
2368 */
2369static void
2370pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2371{
2372	pt_entry_t *pte;
2373
2374	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2375	if ((*pde & PG_V) == 0)
2376		return;
2377	pte = pmap_pde_to_pte(pde, va);
2378	if ((*pte & PG_V) == 0)
2379		return;
2380	pmap_remove_pte(pmap, pte, va, *pde, free);
2381	pmap_invalidate_page(pmap, va);
2382}
2383
2384/*
2385 *	Remove the given range of addresses from the specified map.
2386 *
2387 *	It is assumed that the start and end are properly
2388 *	rounded to the page size.
2389 */
2390void
2391pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2392{
2393	vm_offset_t va_next;
2394	pml4_entry_t *pml4e;
2395	pdp_entry_t *pdpe;
2396	pd_entry_t ptpaddr, *pde;
2397	pt_entry_t *pte;
2398	vm_page_t free = NULL;
2399	int anyvalid;
2400
2401	/*
2402	 * Perform an unsynchronized read.  This is, however, safe.
2403	 */
2404	if (pmap->pm_stats.resident_count == 0)
2405		return;
2406
2407	anyvalid = 0;
2408
2409	vm_page_lock_queues();
2410	PMAP_LOCK(pmap);
2411
2412	/*
2413	 * Special handling for removing a single page: it is
2414	 * a very common operation and allows some code to be
2415	 * short-circuited.
2416	 */
2417	if (sva + PAGE_SIZE == eva) {
2418		pde = pmap_pde(pmap, sva);
2419		if (pde && (*pde & PG_PS) == 0) {
2420			pmap_remove_page(pmap, sva, pde, &free);
2421			goto out;
2422		}
2423	}
2424
2425	for (; sva < eva; sva = va_next) {
2426
2427		if (pmap->pm_stats.resident_count == 0)
2428			break;
2429
2430		pml4e = pmap_pml4e(pmap, sva);
2431		if ((*pml4e & PG_V) == 0) {
2432			va_next = (sva + NBPML4) & ~PML4MASK;
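			/* If va_next wrapped around, clamp it to eva. */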
2433			if (va_next < sva)
2434				va_next = eva;
2435			continue;
2436		}
2437
2438		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2439		if ((*pdpe & PG_V) == 0) {
2440			va_next = (sva + NBPDP) & ~PDPMASK;
2441			if (va_next < sva)
2442				va_next = eva;
2443			continue;
2444		}
2445
2446		/*
2447		 * Calculate index for next page table.
2448		 */
2449		va_next = (sva + NBPDR) & ~PDRMASK;
2450		if (va_next < sva)
2451			va_next = eva;
2452
2453		pde = pmap_pdpe_to_pde(pdpe, sva);
2454		ptpaddr = *pde;
2455
2456		/*
2457		 * Weed out invalid mappings.
2458		 */
2459		if (ptpaddr == 0)
2460			continue;
2461
2462		/*
2463		 * Check for large page.
2464		 */
2465		if ((ptpaddr & PG_PS) != 0) {
2466			/*
2467			 * Are we removing the entire large page?  If not,
2468			 * demote the mapping and fall through.
2469			 */
2470			if (sva + NBPDR == va_next && eva >= va_next) {
2471				/*
2472				 * The TLB entry for a PG_G mapping is
2473				 * invalidated by pmap_remove_pde().
2474				 */
2475				if ((ptpaddr & PG_G) == 0)
2476					anyvalid = 1;
2477				pmap_remove_pde(pmap, pde, sva, &free);
2478				continue;
2479			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2480				/* The large page mapping was destroyed. */
2481				continue;
2482			} else
2483				ptpaddr = *pde;
2484		}
2485
2486		/*
2487		 * Limit our scan to either the end of the va represented
2488		 * by the current page table page, or to the end of the
2489		 * range being removed.
2490		 */
2491		if (va_next > eva)
2492			va_next = eva;
2493
2494		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2495		    sva += PAGE_SIZE) {
2496			if (*pte == 0)
2497				continue;
2498
2499			/*
2500			 * The TLB entry for a PG_G mapping is invalidated
2501			 * by pmap_remove_pte().
2502			 */
2503			if ((*pte & PG_G) == 0)
2504				anyvalid = 1;
2505			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
2506				break;
2507		}
2508	}
2509out:
2510	if (anyvalid)
2511		pmap_invalidate_all(pmap);
2512	vm_page_unlock_queues();
2513	PMAP_UNLOCK(pmap);
2514	pmap_free_zero_pages(free);
2515}
2516
2517/*
2518 *	Routine:	pmap_remove_all
2519 *	Function:
2520 *		Removes this physical page from
2521 *		all physical maps in which it resides.
2522 *		Reflects back modify bits to the pager.
2523 *
2524 *	Notes:
2525 *		Original versions of this routine were very
2526 *		inefficient because they iteratively called
2527 *		pmap_remove (slow...)
2528 */
2529
2530void
2531pmap_remove_all(vm_page_t m)
2532{
2533	struct md_page *pvh;
2534	pv_entry_t pv;
2535	pmap_t pmap;
2536	pt_entry_t *pte, tpte;
2537	pd_entry_t *pde;
2538	vm_offset_t va;
2539	vm_page_t free;
2540
2541	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2542	    ("pmap_remove_all: page %p is fictitious", m));
2543	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2544	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
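	/*
	 * First demote any 2MB mappings that include the page so that
	 * the loop below only has to deal with 4KB mappings.
	 */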
2545	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2546		va = pv->pv_va;
2547		pmap = PV_PMAP(pv);
2548		PMAP_LOCK(pmap);
2549		pde = pmap_pde(pmap, va);
2550		(void)pmap_demote_pde(pmap, pde, va);
2551		PMAP_UNLOCK(pmap);
2552	}
2553	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2554		pmap = PV_PMAP(pv);
2555		PMAP_LOCK(pmap);
2556		pmap->pm_stats.resident_count--;
2557		pde = pmap_pde(pmap, pv->pv_va);
2558		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2559		    " a 2mpage in page %p's pv list", m));
2560		pte = pmap_pde_to_pte(pde, pv->pv_va);
2561		tpte = pte_load_clear(pte);
2562		if (tpte & PG_W)
2563			pmap->pm_stats.wired_count--;
2564		if (tpte & PG_A)
2565			vm_page_flag_set(m, PG_REFERENCED);
2566
2567		/*
2568		 * Update the vm_page_t clean and reference bits.
2569		 */
2570		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2571			vm_page_dirty(m);
2572		free = NULL;
2573		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
2574		pmap_invalidate_page(pmap, pv->pv_va);
2575		pmap_free_zero_pages(free);
2576		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2577		free_pv_entry(pmap, pv);
2578		PMAP_UNLOCK(pmap);
2579	}
2580	vm_page_flag_clear(m, PG_WRITEABLE);
2581}
2582
2583/*
2584 * pmap_protect_pde: set the protection on a 2mpage mapping in a process's address space
2585 */
2586static boolean_t
2587pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2588{
2589	pd_entry_t newpde, oldpde;
2590	vm_offset_t eva, va;
2591	vm_page_t m;
2592	boolean_t anychanged;
2593
2594	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2595	KASSERT((sva & PDRMASK) == 0,
2596	    ("pmap_protect_pde: sva is not 2mpage aligned"));
2597	anychanged = FALSE;
2598retry:
2599	oldpde = newpde = *pde;
2600	if (oldpde & PG_MANAGED) {
2601		eva = sva + NBPDR;
2602		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
2603		    va < eva; va += PAGE_SIZE, m++) {
2604			/*
2605			 * In contrast to the analogous operation on a 4KB page
2606			 * mapping, the mapping's PG_A flag is not cleared and
2607			 * the page's PG_REFERENCED flag is not set.  The
2608			 * reason is that pmap_demote_pde() expects that a 2MB
2609			 * page mapping with a stored page table page has PG_A
2610			 * set.
2611			 */
2612			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2613				vm_page_dirty(m);
2614		}
2615	}
2616	if ((prot & VM_PROT_WRITE) == 0)
2617		newpde &= ~(PG_RW | PG_M);
2618	if ((prot & VM_PROT_EXECUTE) == 0)
2619		newpde |= pg_nx;
2620	if (newpde != oldpde) {
2621		if (!atomic_cmpset_long(pde, oldpde, newpde))
2622			goto retry;
2623		if (oldpde & PG_G)
2624			pmap_invalidate_page(pmap, sva);
2625		else
2626			anychanged = TRUE;
2627	}
2628	return (anychanged);
2629}
2630
2631/*
2632 *	Set the physical protection on the
2633 *	specified range of this map as requested.
2634 */
2635void
2636pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2637{
2638	vm_offset_t va_next;
2639	pml4_entry_t *pml4e;
2640	pdp_entry_t *pdpe;
2641	pd_entry_t ptpaddr, *pde;
2642	pt_entry_t *pte;
2643	int anychanged;
2644
2645	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2646		pmap_remove(pmap, sva, eva);
2647		return;
2648	}
2649
2650	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2651	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2652		return;
2653
2654	anychanged = 0;
2655
2656	vm_page_lock_queues();
2657	PMAP_LOCK(pmap);
2658	for (; sva < eva; sva = va_next) {
2659
2660		pml4e = pmap_pml4e(pmap, sva);
2661		if ((*pml4e & PG_V) == 0) {
2662			va_next = (sva + NBPML4) & ~PML4MASK;
2663			if (va_next < sva)
2664				va_next = eva;
2665			continue;
2666		}
2667
2668		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2669		if ((*pdpe & PG_V) == 0) {
2670			va_next = (sva + NBPDP) & ~PDPMASK;
2671			if (va_next < sva)
2672				va_next = eva;
2673			continue;
2674		}
2675
2676		va_next = (sva + NBPDR) & ~PDRMASK;
2677		if (va_next < sva)
2678			va_next = eva;
2679
2680		pde = pmap_pdpe_to_pde(pdpe, sva);
2681		ptpaddr = *pde;
2682
2683		/*
2684		 * Weed out invalid mappings.
2685		 */
2686		if (ptpaddr == 0)
2687			continue;
2688
2689		/*
2690		 * Check for large page.
2691		 */
2692		if ((ptpaddr & PG_PS) != 0) {
2693			/*
2694			 * Are we protecting the entire large page?  If not,
2695			 * demote the mapping and fall through.
2696			 */
2697			if (sva + NBPDR == va_next && eva >= va_next) {
2698				/*
2699				 * The TLB entry for a PG_G mapping is
2700				 * invalidated by pmap_protect_pde().
2701				 */
2702				if (pmap_protect_pde(pmap, pde, sva, prot))
2703					anychanged = 1;
2704				continue;
2705			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2706				/* The large page mapping was destroyed. */
2707				continue;
2708			}
2709		}
2710
2711		if (va_next > eva)
2712			va_next = eva;
2713
2714		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2715		    sva += PAGE_SIZE) {
2716			pt_entry_t obits, pbits;
2717			vm_page_t m;
2718
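			/*
			 * The pte is updated with a compare-and-swap so
			 * that a concurrent hardware update of PG_A or
			 * PG_M is not lost; if the pte changes underneath
			 * us, the update is retried.
			 */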
2719retry:
2720			obits = pbits = *pte;
2721			if ((pbits & PG_V) == 0)
2722				continue;
2723			if (pbits & PG_MANAGED) {
2724				m = NULL;
2725				if (pbits & PG_A) {
2726					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2727					vm_page_flag_set(m, PG_REFERENCED);
2728					pbits &= ~PG_A;
2729				}
2730				if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2731					if (m == NULL)
2732						m = PHYS_TO_VM_PAGE(pbits &
2733						    PG_FRAME);
2734					vm_page_dirty(m);
2735				}
2736			}
2737
2738			if ((prot & VM_PROT_WRITE) == 0)
2739				pbits &= ~(PG_RW | PG_M);
2740			if ((prot & VM_PROT_EXECUTE) == 0)
2741				pbits |= pg_nx;
2742
2743			if (pbits != obits) {
2744				if (!atomic_cmpset_long(pte, obits, pbits))
2745					goto retry;
2746				if (obits & PG_G)
2747					pmap_invalidate_page(pmap, sva);
2748				else
2749					anychanged = 1;
2750			}
2751		}
2752	}
2753	if (anychanged)
2754		pmap_invalidate_all(pmap);
2755	vm_page_unlock_queues();
2756	PMAP_UNLOCK(pmap);
2757}
2758
2759/*
2760 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2761 * single page table page to a single 2MB page mapping.  For promotion to
2762 * occur, two conditions must be met: (1) the 4KB page mappings must map
2763 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2764 * identical characteristics.
2765 */
2766static void
2767pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2768{
2769	pd_entry_t newpde;
2770	pt_entry_t *firstpte, oldpte, *pte;
2771	vm_offset_t oldpteva;
2772	vm_paddr_t pa;
2773	vm_page_t mpte;
2774
2775	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2776	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
2777	KASSERT((*firstpte & PG_V) != 0,
2778	    ("pmap_promote_pde: firstpte is missing PG_V"));
2779	if ((*firstpte & PG_A) == 0) {
2780		pmap_pde_p_failures++;
2781		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2782		    " in pmap %p", va, pmap);
2783		return;
2784	}
2785	pa = *firstpte & PG_PS_FRAME;
2786	newpde = *firstpte;
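	/*
	 * If the first pte is writable but not modified, clear PG_RW
	 * in the prospective 2MB mapping; pmap_demote_pde() relies on
	 * PG_RW implying PG_M for a superpage mapping.
	 */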
2787	if ((newpde & (PG_M | PG_RW)) == PG_RW)
2788		newpde &= ~PG_RW;
2789
2790	/*
2791	 * Check all the ptes before promotion
2792	 */
2793	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2794retry:
2795		oldpte = *pte;
2796		if ((oldpte & PG_FRAME) != pa) {
2797			pmap_pde_p_failures++;
2798			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2799			    " in pmap %p", va, pmap);
2800			return;
2801		}
2802		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2803			/*
2804			 * When PG_M is already clear, PG_RW can be cleared
2805			 * without a TLB invalidation.
2806			 */
2807			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
2808				goto retry;
2809			oldpte &= ~PG_RW;
2810			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
2811			    (va & ~PDRMASK);
2812			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
2813			    " in pmap %p", oldpteva, pmap);
2814		}
2815		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2816			pmap_pde_p_failures++;
2817			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2818			    " in pmap %p", va, pmap);
2819			return;
2820		}
2821		pa += PAGE_SIZE;
2822	}
2823
2824	/*
2825	 * Save the page table page in its current state until the PDE
2826	 * mapping the superpage is demoted by pmap_demote_pde() or
2827	 * destroyed by pmap_remove_pde().
2828	 */
2829	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
2830	KASSERT(mpte >= vm_page_array &&
2831	    mpte < &vm_page_array[vm_page_array_size],
2832	    ("pmap_promote_pde: page table page is out of range"));
2833	KASSERT(mpte->pindex == pmap_pde_pindex(va),
2834	    ("pmap_promote_pde: page table page's pindex is wrong"));
2835	pmap_insert_pt_page(pmap, mpte);
2836
2837	/*
2838	 * Promote the pv entries.
2839	 */
2840	if ((newpde & PG_MANAGED) != 0)
2841		pmap_pv_promote_pde(pmap, va, newpde & PG_FRAME);
2842
2843	/*
2844	 * Propagate the PAT index to its proper position.
2845	 */
2846	if ((newpde & PG_PTE_PAT) != 0)
2847		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
2848
2849	/*
2850	 * Map the superpage.
2851	 */
2852	pde_store(pde, PG_PS | newpde);
2853
2854	pmap_pde_promotions++;
2855	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
2856	    " in pmap %p", va, pmap);
2857}
2858
2859/*
2860 *	Insert the given physical page (p) at
2861 *	the specified virtual address (v) in the
2862 *	target physical map with the protection requested.
2863 *
2864 *	If specified, the page will be wired down, meaning
2865 *	that the related pte cannot be reclaimed.
2866 *
2867 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2868 *	or lose information.  That is, this routine must actually
2869 *	insert this page into the given map NOW.
2870 */
2871void
2872pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
2873    vm_prot_t prot, boolean_t wired)
2874{
2875	vm_paddr_t pa;
2876	pd_entry_t *pde;
2877	pt_entry_t *pte;
2878	vm_paddr_t opa;
2879	pt_entry_t origpte, newpte;
2880	vm_page_t mpte, om;
2881	boolean_t invlva;
2882
2883	va = trunc_page(va);
2884	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2885	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
2886	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
2887
2888	mpte = NULL;
2889
2890	vm_page_lock_queues();
2891	PMAP_LOCK(pmap);
2892
2893	/*
2894	 * If the page table page is not resident,
2895	 * create it here.
2896	 */
2897	if (va < VM_MAXUSER_ADDRESS) {
2898		mpte = pmap_allocpte(pmap, va, M_WAITOK);
2899	}
2900
2901	pde = pmap_pde(pmap, va);
2902	if (pde != NULL && (*pde & PG_V) != 0) {
2903		if ((*pde & PG_PS) != 0)
2904			panic("pmap_enter: attempted pmap_enter on 2MB page");
2905		pte = pmap_pde_to_pte(pde, va);
2906	} else
2907		pte = NULL;
2908
2909	/*
2910	 * Page directory entry not valid; we need a new PT page.
2911	 */
2912	if (pte == NULL)
2913		panic("pmap_enter: invalid page directory va=%#lx", va);
2914
2915	pa = VM_PAGE_TO_PHYS(m);
2916	om = NULL;
2917	origpte = *pte;
2918	opa = origpte & PG_FRAME;
2919
2920	/*
2921	 * The mapping has not changed; this must be a protection or wiring change.
2922	 */
2923	if (origpte && (opa == pa)) {
2924		/*
2925		 * Wiring change, just update stats. We don't worry about
2926		 * wiring PT pages as they remain resident as long as there
2927		 * are valid mappings in them. Hence, if a user page is wired,
2928		 * the PT page will be also.
2929		 */
2930		if (wired && ((origpte & PG_W) == 0))
2931			pmap->pm_stats.wired_count++;
2932		else if (!wired && (origpte & PG_W))
2933			pmap->pm_stats.wired_count--;
2934
2935		/*
2936		 * Remove extra pte reference
2937		 */
2938		if (mpte)
2939			mpte->wire_count--;
2940
2941		/*
2942		 * We might be turning off write access to the page,
2943		 * so we go ahead and sense modify status.
2944		 */
2945		if (origpte & PG_MANAGED) {
2946			om = m;
2947			pa |= PG_MANAGED;
2948		}
2949		goto validate;
2950	}
2951	/*
2952	 * The mapping has changed; invalidate the old range and fall
2953	 * through to validate the new mapping.
2954	 */
2955	if (opa) {
2956		if (origpte & PG_W)
2957			pmap->pm_stats.wired_count--;
2958		if (origpte & PG_MANAGED) {
2959			om = PHYS_TO_VM_PAGE(opa);
2960			pmap_remove_entry(pmap, om, va);
2961		}
2962		if (mpte != NULL) {
2963			mpte->wire_count--;
2964			KASSERT(mpte->wire_count > 0,
2965			    ("pmap_enter: missing reference to page table page,"
2966			     " va: 0x%lx", va));
2967		}
2968	} else
2969		pmap->pm_stats.resident_count++;
2970
2971	/*
2972	 * Enter on the PV list if part of our managed memory.
2973	 */
2974	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2975		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2976		    ("pmap_enter: managed mapping within the clean submap"));
2977		pmap_insert_entry(pmap, va, m);
2978		pa |= PG_MANAGED;
2979	}
2980
2981	/*
2982	 * Increment counters
2983	 */
2984	if (wired)
2985		pmap->pm_stats.wired_count++;
2986
2987validate:
2988	/*
2989	 * Now validate mapping with desired protection/wiring.
2990	 */
2991	newpte = (pt_entry_t)(pa | PG_V);
2992	if ((prot & VM_PROT_WRITE) != 0) {
2993		newpte |= PG_RW;
2994		vm_page_flag_set(m, PG_WRITEABLE);
2995	}
2996	if ((prot & VM_PROT_EXECUTE) == 0)
2997		newpte |= pg_nx;
2998	if (wired)
2999		newpte |= PG_W;
3000	if (va < VM_MAXUSER_ADDRESS)
3001		newpte |= PG_U;
3002	if (pmap == kernel_pmap)
3003		newpte |= PG_G;
3004
3005	/*
3006	 * if the mapping or permission bits are different, we need
3007	 * to update the pte.
3008	 */
3009	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3010		newpte |= PG_A;
3011		if ((access & VM_PROT_WRITE) != 0)
3012			newpte |= PG_M;
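		/*
		 * If the old mapping was valid, swap in the new pte and
		 * use the old PG_A and PG_M bits to decide whether a
		 * TLB invalidation is required.
		 */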
3013		if (origpte & PG_V) {
3014			invlva = FALSE;
3015			origpte = pte_load_store(pte, newpte);
3016			if (origpte & PG_A) {
3017				if (origpte & PG_MANAGED)
3018					vm_page_flag_set(om, PG_REFERENCED);
3019				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
3020				    PG_NX) == 0 && (newpte & PG_NX)))
3021					invlva = TRUE;
3022			}
3023			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3024				if ((origpte & PG_MANAGED) != 0)
3025					vm_page_dirty(om);
3026				if ((newpte & PG_RW) == 0)
3027					invlva = TRUE;
3028			}
3029			if (invlva)
3030				pmap_invalidate_page(pmap, va);
3031		} else
3032			pte_store(pte, newpte);
3033	}
3034
3035	/*
3036	 * If both the page table page and the reservation are fully
3037	 * populated, then attempt promotion.
3038	 */
3039	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3040	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3041		pmap_promote_pde(pmap, pde, va);
3042
3043	vm_page_unlock_queues();
3044	PMAP_UNLOCK(pmap);
3045}
3046
3047/*
3048 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
3049 * otherwise.  Fails if (1) a page table page cannot be allocated without
3050 * blocking, (2) a mapping already exists at the specified virtual address, or
3051 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3052 */
3053static boolean_t
3054pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3055{
3056	pd_entry_t *pde, newpde;
3057	vm_page_t free, mpde;
3058
3059	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3060	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3061	if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3062		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3063		    " in pmap %p", va, pmap);
3064		return (FALSE);
3065	}
3066	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
3067	pde = &pde[pmap_pde_index(va)];
3068	if ((*pde & PG_V) != 0) {
3069		KASSERT(mpde->wire_count > 1,
3070		    ("pmap_enter_pde: mpde's wire count is too low"));
3071		mpde->wire_count--;
3072		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3073		    " in pmap %p", va, pmap);
3074		return (FALSE);
3075	}
3076	newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V;
3077	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3078		newpde |= PG_MANAGED;
3079
3080		/*
3081		 * Abort this mapping if its PV entry could not be created.
3082		 */
3083		if (!pmap_pv_insert_pde(pmap, va, m)) {
3084			free = NULL;
3085			if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
3086				pmap_invalidate_page(pmap, va);
3087				pmap_free_zero_pages(free);
3088			}
3089			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3090			    " in pmap %p", va, pmap);
3091			return (FALSE);
3092		}
3093	}
3094	if ((prot & VM_PROT_EXECUTE) == 0)
3095		newpde |= pg_nx;
3096	if (va < VM_MAXUSER_ADDRESS)
3097		newpde |= PG_U;
3098
3099	/*
3100	 * Increment counters.
3101	 */
3102	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3103
3104	/*
3105	 * Map the superpage.
3106	 */
3107	pde_store(pde, newpde);
3108
3109	pmap_pde_mappings++;
3110	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3111	    " in pmap %p", va, pmap);
3112	return (TRUE);
3113}
3114
3115/*
3116 * Maps a sequence of resident pages belonging to the same object.
3117 * The sequence begins with the given page m_start.  This page is
3118 * mapped at the given virtual address start.  Each subsequent page is
3119 * mapped at a virtual address that is offset from start by the same
3120 * amount as the page is offset from m_start within the object.  The
3121 * last page in the sequence is the page with the largest offset from
3122 * m_start that can be mapped at a virtual address less than the given
3123 * virtual address end.  Not every virtual page between start and end
3124 * is mapped; only those for which a resident page exists with the
3125 * corresponding offset from m_start are mapped.
3126 */
3127void
3128pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3129    vm_page_t m_start, vm_prot_t prot)
3130{
3131	vm_offset_t va;
3132	vm_page_t m, mpte;
3133	vm_pindex_t diff, psize;
3134
3135	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3136	psize = atop(end - start);
3137	mpte = NULL;
3138	m = m_start;
3139	PMAP_LOCK(pmap);
3140	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3141		va = start + ptoa(diff);
3142		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3143		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3144		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3145		    pmap_enter_pde(pmap, va, m, prot))
3146			m = &m[NBPDR / PAGE_SIZE - 1];
3147		else
3148			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3149			    mpte);
3150		m = TAILQ_NEXT(m, listq);
3151	}
3152	PMAP_UNLOCK(pmap);
3153}
3154
3155/*
3156 * This code makes some *MAJOR* assumptions:
3157 * 1. The current pmap and the target pmap exist.
3158 * 2. Not wired.
3159 * 3. Read access.
3160 * 4. No page table pages.
3161 * but it is *MUCH* faster than pmap_enter...
3162 */
3163
3164void
3165pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3166{
3167
3168	PMAP_LOCK(pmap);
3169	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3170	PMAP_UNLOCK(pmap);
3171}
3172
3173static vm_page_t
3174pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3175    vm_prot_t prot, vm_page_t mpte)
3176{
3177	vm_page_t free;
3178	pt_entry_t *pte;
3179	vm_paddr_t pa;
3180
3181	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3182	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3183	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3184	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3185	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3186
3187	/*
3188	 * If the page table page is not resident,
3189	 * create it here.
3190	 */
3191	if (va < VM_MAXUSER_ADDRESS) {
3192		vm_pindex_t ptepindex;
3193		pd_entry_t *ptepa;
3194
3195		/*
3196		 * Calculate pagetable page index
3197		 */
3198		ptepindex = pmap_pde_pindex(va);
3199		if (mpte && (mpte->pindex == ptepindex)) {
3200			mpte->wire_count++;
3201		} else {
3202			/*
3203			 * Get the page directory entry
3204			 */
3205			ptepa = pmap_pde(pmap, va);
3206
3207			/*
3208			 * If the page table page is mapped, we just
3209			 * increment its wire count.
3210			 */
3211			if (ptepa && (*ptepa & PG_V) != 0) {
3212				if (*ptepa & PG_PS)
3213					return (NULL);
3214				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3215				mpte->wire_count++;
3216			} else {
3217				mpte = _pmap_allocpte(pmap, ptepindex,
3218				    M_NOWAIT);
3219				if (mpte == NULL)
3220					return (mpte);
3221			}
3222		}
3223	} else {
3224		mpte = NULL;
3225	}
3226
3227	/*
3228	 * This call to vtopte makes the assumption that we are
3229	 * entering the page into the current pmap.  In order to support
3230	 * quick entry into any pmap, one would likely use pmap_pte.
3231	 * But that isn't as quick as vtopte.
3232	 */
3233	pte = vtopte(va);
3234	if (*pte) {
3235		if (mpte != NULL) {
3236			mpte->wire_count--;
3237			mpte = NULL;
3238		}
3239		return (mpte);
3240	}
3241
3242	/*
3243	 * Enter on the PV list if part of our managed memory.
3244	 */
3245	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3246	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3247		if (mpte != NULL) {
3248			free = NULL;
3249			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
3250				pmap_invalidate_page(pmap, va);
3251				pmap_free_zero_pages(free);
3252			}
3253			mpte = NULL;
3254		}
3255		return (mpte);
3256	}
3257
3258	/*
3259	 * Increment counters
3260	 */
3261	pmap->pm_stats.resident_count++;
3262
3263	pa = VM_PAGE_TO_PHYS(m);
3264	if ((prot & VM_PROT_EXECUTE) == 0)
3265		pa |= pg_nx;
3266
3267	/*
3268	 * Now validate mapping with RO protection
3269	 */
3270	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3271		pte_store(pte, pa | PG_V | PG_U);
3272	else
3273		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3274	return (mpte);
3275}
3276
3277/*
3278 * Make a temporary mapping for a physical address.  This is only intended
3279 * to be used for panic dumps.
3280 */
3281void *
3282pmap_kenter_temporary(vm_paddr_t pa, int i)
3283{
3284	vm_offset_t va;
3285
3286	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3287	pmap_kenter(va, pa);
3288	invlpg(va);
3289	return ((void *)crashdumpmap);
3290}
3291
3292/*
3293 * This code maps large physical mmap regions into the
3294 * processor address space.  Note that some shortcuts
3295 * are taken, but the code works.
3296 */
3297void
3298pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
3299		    vm_object_t object, vm_pindex_t pindex,
3300		    vm_size_t size)
3301{
3302	vm_offset_t va;
3303	vm_page_t p, pdpg;
3304
3305	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3306	KASSERT(object->type == OBJT_DEVICE,
3307	    ("pmap_object_init_pt: non-device object"));
3308	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
3309		vm_page_t m[1];
3310		pd_entry_t ptepa, *pde;
3311
3312		PMAP_LOCK(pmap);
3313		pde = pmap_pde(pmap, addr);
3314		if (pde != 0 && (*pde & PG_V) != 0)
3315			goto out;
3316		PMAP_UNLOCK(pmap);
3317retry:
3318		p = vm_page_lookup(object, pindex);
3319		if (p != NULL) {
3320			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
3321				goto retry;
3322		} else {
3323			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
3324			if (p == NULL)
3325				return;
3326			m[0] = p;
3327
3328			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
3329				vm_page_lock_queues();
3330				vm_page_free(p);
3331				vm_page_unlock_queues();
3332				return;
3333			}
3334
3335			p = vm_page_lookup(object, pindex);
3336			vm_page_lock_queues();
3337			vm_page_wakeup(p);
3338			vm_page_unlock_queues();
3339		}
3340
3341		ptepa = VM_PAGE_TO_PHYS(p);
3342		if (ptepa & (NBPDR - 1))
3343			return;
3344
3345		p->valid = VM_PAGE_BITS_ALL;
3346
3347		PMAP_LOCK(pmap);
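		/*
		 * Map the region with 2MB page mappings, allocating
		 * page directory pages as needed.
		 */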
3348		for (va = addr; va < addr + size; va += NBPDR) {
3349			while ((pdpg =
3350			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3351				PMAP_UNLOCK(pmap);
3352				vm_page_lock_queues();
3353				vm_page_busy(p);
3354				vm_page_unlock_queues();
3355				VM_OBJECT_UNLOCK(object);
3356				VM_WAIT;
3357				VM_OBJECT_LOCK(object);
3358				vm_page_lock_queues();
3359				vm_page_wakeup(p);
3360				vm_page_unlock_queues();
3361				PMAP_LOCK(pmap);
3362			}
3363			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3364			pde = &pde[pmap_pde_index(va)];
3365			if ((*pde & PG_V) == 0) {
3366				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
3367				    PG_U | PG_RW | PG_V);
3368				pmap->pm_stats.resident_count +=
3369				    NBPDR / PAGE_SIZE;
3370			} else {
3371				pdpg->wire_count--;
3372				KASSERT(pdpg->wire_count > 0,
3373				    ("pmap_object_init_pt: missing reference "
3374				     "to page directory page, va: 0x%lx", va));
3375			}
3376			ptepa += NBPDR;
3377		}
3378		pmap_invalidate_all(pmap);
3379out:
3380		PMAP_UNLOCK(pmap);
3381	}
3382}
3383
3384/*
3385 *	Routine:	pmap_change_wiring
3386 *	Function:	Change the wiring attribute for a map/virtual-address
3387 *			pair.
3388 *	In/out conditions:
3389 *			The mapping must already exist in the pmap.
3390 */
3391void
3392pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3393{
3394	pd_entry_t *pde;
3395	pt_entry_t *pte;
3396	boolean_t are_queues_locked;
3397
3398	are_queues_locked = FALSE;
3399
3400	/*
3401	 * Wiring is not a hardware characteristic so there is no need to
3402	 * invalidate TLB.
3403	 */
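	/*
	 * Changing the wiring of a 2MB mapping requires demoting it,
	 * and demotion requires the page queues lock, which must be
	 * acquired before the pmap lock.  If the trylock below fails,
	 * drop the pmap lock, take the page queues lock, and retry.
	 */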
3404retry:
3405	PMAP_LOCK(pmap);
3406	pde = pmap_pde(pmap, va);
3407	if ((*pde & PG_PS) != 0) {
3408		if (!wired != ((*pde & PG_W) == 0)) {
3409			if (!are_queues_locked) {
3410				are_queues_locked = TRUE;
3411				if (!mtx_trylock(&vm_page_queue_mtx)) {
3412					PMAP_UNLOCK(pmap);
3413					vm_page_lock_queues();
3414					goto retry;
3415				}
3416			}
3417			if (!pmap_demote_pde(pmap, pde, va))
3418				panic("pmap_change_wiring: demotion failed");
3419		} else
3420			goto out;
3421	}
3422	pte = pmap_pde_to_pte(pde, va);
3423	if (wired && (*pte & PG_W) == 0) {
3424		pmap->pm_stats.wired_count++;
3425		atomic_set_long(pte, PG_W);
3426	} else if (!wired && (*pte & PG_W) != 0) {
3427		pmap->pm_stats.wired_count--;
3428		atomic_clear_long(pte, PG_W);
3429	}
3430out:
3431	if (are_queues_locked)
3432		vm_page_unlock_queues();
3433	PMAP_UNLOCK(pmap);
3434}
3435
3436
3437
3438/*
3439 *	Copy the range specified by src_addr/len
3440 *	from the source map to the range dst_addr/len
3441 *	in the destination map.
3442 *
3443 *	This routine is only advisory and need not do anything.
3444 */
3445
3446void
3447pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3448	  vm_offset_t src_addr)
3449{
3450	vm_page_t   free;
3451	vm_offset_t addr;
3452	vm_offset_t end_addr = src_addr + len;
3453	vm_offset_t va_next;
3454
3455	if (dst_addr != src_addr)
3456		return;
3457
3458	if (!pmap_is_current(src_pmap))
3459		return;
3460
3461	vm_page_lock_queues();
3462	if (dst_pmap < src_pmap) {
3463		PMAP_LOCK(dst_pmap);
3464		PMAP_LOCK(src_pmap);
3465	} else {
3466		PMAP_LOCK(src_pmap);
3467		PMAP_LOCK(dst_pmap);
3468	}
3469	for (addr = src_addr; addr < end_addr; addr = va_next) {
3470		pt_entry_t *src_pte, *dst_pte;
3471		vm_page_t dstmpde, dstmpte, srcmpte;
3472		pml4_entry_t *pml4e;
3473		pdp_entry_t *pdpe;
3474		pd_entry_t srcptepaddr, *pde;
3475
3476		KASSERT(addr < UPT_MIN_ADDRESS,
3477		    ("pmap_copy: invalid to pmap_copy page tables"));
3478
3479		pml4e = pmap_pml4e(src_pmap, addr);
3480		if ((*pml4e & PG_V) == 0) {
3481			va_next = (addr + NBPML4) & ~PML4MASK;
3482			if (va_next < addr)
3483				va_next = end_addr;
3484			continue;
3485		}
3486
3487		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
3488		if ((*pdpe & PG_V) == 0) {
3489			va_next = (addr + NBPDP) & ~PDPMASK;
3490			if (va_next < addr)
3491				va_next = end_addr;
3492			continue;
3493		}
3494
3495		va_next = (addr + NBPDR) & ~PDRMASK;
3496		if (va_next < addr)
3497			va_next = end_addr;
3498
3499		pde = pmap_pdpe_to_pde(pdpe, addr);
3500		srcptepaddr = *pde;
3501		if (srcptepaddr == 0)
3502			continue;
3503
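		/*
		 * If the source mapping is a superpage, copy the whole
		 * pde at once, provided the destination pde is empty
		 * and, for a managed mapping, a pv entry can be
		 * allocated.
		 */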
3504		if (srcptepaddr & PG_PS) {
3505			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
3506			if (dstmpde == NULL)
3507				break;
3508			pde = (pd_entry_t *)
3509			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
3510			pde = &pde[pmap_pde_index(addr)];
3511			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
3512			    pmap_pv_insert_pde(dst_pmap, addr,
3513			    PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME)))) {
3514				*pde = srcptepaddr & ~PG_W;
3515				dst_pmap->pm_stats.resident_count +=
3516				    NBPDR / PAGE_SIZE;
3517			} else
3518				dstmpde->wire_count--;
3519			continue;
3520		}
3521
3522		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3523		KASSERT(srcmpte->wire_count > 0,
3524		    ("pmap_copy: source page table page is unused"));
3525
3526		if (va_next > end_addr)
3527			va_next = end_addr;
3528
3529		src_pte = vtopte(addr);
3530		while (addr < va_next) {
3531			pt_entry_t ptetemp;
3532			ptetemp = *src_pte;
3533			/*
3534			 * We only copy mappings of managed pages.
3535			 */
3536			if ((ptetemp & PG_MANAGED) != 0) {
3537				dstmpte = pmap_allocpte(dst_pmap, addr,
3538				    M_NOWAIT);
3539				if (dstmpte == NULL)
3540					break;
3541				dst_pte = (pt_entry_t *)
3542				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
3543				dst_pte = &dst_pte[pmap_pte_index(addr)];
3544				if (*dst_pte == 0 &&
3545				    pmap_try_insert_pv_entry(dst_pmap, addr,
3546				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3547					/*
3548					 * Clear the wired, modified, and
3549					 * accessed (referenced) bits
3550					 * during the copy.
3551					 */
3552					*dst_pte = ptetemp & ~(PG_W | PG_M |
3553					    PG_A);
3554					dst_pmap->pm_stats.resident_count++;
3555				} else {
3556					free = NULL;
3557					if (pmap_unwire_pte_hold(dst_pmap,
3558					    addr, dstmpte, &free)) {
3559						pmap_invalidate_page(dst_pmap,
3560						    addr);
3561						pmap_free_zero_pages(free);
3562					}
3563				}
3564				if (dstmpte->wire_count >= srcmpte->wire_count)
3565					break;
3566			}
3567			addr += PAGE_SIZE;
3568			src_pte++;
3569		}
3570	}
3571	vm_page_unlock_queues();
3572	PMAP_UNLOCK(src_pmap);
3573	PMAP_UNLOCK(dst_pmap);
3574}
3575
3576/*
3577 *	pmap_zero_page zeros the specified hardware page by
3578 *	addressing it through the direct map.
3579 */
3580void
3581pmap_zero_page(vm_page_t m)
3582{
3583	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3584
3585	pagezero((void *)va);
3586}
3587
3588/*
3589 *	pmap_zero_page_area zeros a portion of the specified hardware
3590 *	page by addressing it through the direct map.
3591 *
3592 *	off and size may not cover an area beyond a single hardware page.
3593 */
3594void
3595pmap_zero_page_area(vm_page_t m, int off, int size)
3596{
3597	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3598
3599	if (off == 0 && size == PAGE_SIZE)
3600		pagezero((void *)va);
3601	else
3602		bzero((char *)va + off, size);
3603}
3604
3605/*
3606 *	pmap_zero_page_idle zeros the specified hardware page by
3607 *	addressing it through the direct map.  This
3608 *	is intended to be called from the vm_pagezero process only and
3609 *	outside of Giant.
3610 */
3611void
3612pmap_zero_page_idle(vm_page_t m)
3613{
3614	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3615
3616	pagezero((void *)va);
3617}
3618
3619/*
3620 *	pmap_copy_page copies the specified (machine independent)
3621 *	page by addressing both the source and the destination
3622 *	through the direct map, one machine dependent page at a
3623 *	time.
3624 */
3625void
3626pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3627{
3628	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3629	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3630
3631	pagecopy((void *)src, (void *)dst);
3632}
3633
3634/*
3635 * Returns true if the pmap's pv is one of the first
3636 * 16 pvs linked to from this page.  This count may
3637 * be changed upwards or downwards in the future; it
3638 * is only necessary that true be returned for a small
3639 * subset of pmaps for proper page aging.
3640 */
3641boolean_t
3642pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3643{
3644	struct md_page *pvh;
3645	pv_entry_t pv;
3646	int loops = 0;
3647
3648	if (m->flags & PG_FICTITIOUS)
3649		return (FALSE);
3650
3651	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3652	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3653		if (PV_PMAP(pv) == pmap) {
3654			return (TRUE);
3655		}
3656		loops++;
3657		if (loops >= 16)
3658			break;
3659	}
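	/*
	 * If fewer than 16 mappings were found above, also check the
	 * pv list for 2MB mappings that include this page.
	 */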
3660	if (loops < 16) {
3661		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3662		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3663			if (PV_PMAP(pv) == pmap)
3664				return (TRUE);
3665			loops++;
3666			if (loops >= 16)
3667				break;
3668		}
3669	}
3670	return (FALSE);
3671}
3672
3673/*
3674 *	pmap_page_wired_mappings:
3675 *
3676 *	Return the number of managed mappings to the given physical page
3677 *	that are wired.
3678 */
3679int
3680pmap_page_wired_mappings(vm_page_t m)
3681{
3682	pv_entry_t pv;
3683	pt_entry_t *pte;
3684	pmap_t pmap;
3685	int count;
3686
3687	count = 0;
3688	if ((m->flags & PG_FICTITIOUS) != 0)
3689		return (count);
3690	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3691	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3692		pmap = PV_PMAP(pv);
3693		PMAP_LOCK(pmap);
3694		pte = pmap_pte(pmap, pv->pv_va);
3695		if ((*pte & PG_W) != 0)
3696			count++;
3697		PMAP_UNLOCK(pmap);
3698	}
3699	return (count);
3700}
3701
3702/*
3703 * Returns TRUE if the given page is mapped individually or as part of
3704 * a 2mpage.  Otherwise, returns FALSE.
3705 */
3706boolean_t
3707pmap_page_is_mapped(vm_page_t m)
3708{
3709	struct md_page *pvh;
3710
3711	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3712		return (FALSE);
3713	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3714	if (TAILQ_EMPTY(&m->md.pv_list)) {
3715		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3716		return (!TAILQ_EMPTY(&pvh->pv_list));
3717	} else
3718		return (TRUE);
3719}
3720
3721/*
3722 * Remove all pages from the specified address space;
3723 * this aids process exit speeds.  Also, this code
3724 * is special cased for the current process only, but
3725 * can have the more generic (and slightly slower)
3726 * mode enabled.  This is much faster than pmap_remove
3727 * in the case of running down an entire address space.
3728 */
3729void
3730pmap_remove_pages(pmap_t pmap)
3731{
3732	pd_entry_t *pde;
3733	pt_entry_t *pte, tpte;
3734	vm_page_t free = NULL;
3735	vm_page_t m, mpte, mt;
3736	pv_entry_t pv;
3737	struct md_page *pvh;
3738	struct pv_chunk *pc, *npc;
3739	int field, idx;
3740	int64_t bit;
3741	uint64_t inuse, bitmask;
3742	int allfree;
3743
3744	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3745		printf("warning: pmap_remove_pages called with non-current pmap\n");
3746		return;
3747	}
3748	vm_page_lock_queues();
3749	PMAP_LOCK(pmap);
3750	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3751		allfree = 1;
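		/*
		 * Scan the chunk's allocation bitmap: inuse has a bit
		 * set for each pv entry that is currently in use.
		 */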
3752		for (field = 0; field < _NPCM; field++) {
3753			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3754			while (inuse != 0) {
3755				bit = bsfq(inuse);
3756				bitmask = 1UL << bit;
3757				idx = field * 64 + bit;
3758				pv = &pc->pc_pventry[idx];
3759				inuse &= ~bitmask;
3760
3761				pde = vtopde(pv->pv_va);
3762				tpte = *pde;
3763				if ((tpte & PG_PS) != 0)
3764					pte = pde;
3765				else {
3766					pte = vtopte(pv->pv_va);
3767					tpte = *pte & ~PG_PTE_PAT;
3768				}
3769
3770				if (tpte == 0) {
3771					printf(
3772					    "TPTE at %p  IS ZERO @ VA %08lx\n",
3773					    pte, pv->pv_va);
3774					panic("bad pte");
3775				}
3776
3777/*
3778 * We cannot remove wired pages from a process' mapping at this time
3779 */
3780				if (tpte & PG_W) {
3781					allfree = 0;
3782					continue;
3783				}
3784
3785				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3786				KASSERT(m->phys_addr == (tpte & PG_FRAME),
3787				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3788				    m, (uintmax_t)m->phys_addr,
3789				    (uintmax_t)tpte));
3790
3791				KASSERT(m < &vm_page_array[vm_page_array_size],
3792					("pmap_remove_pages: bad tpte %#jx",
3793					(uintmax_t)tpte));
3794
3795				pte_clear(pte);
3796
3797				/*
3798				 * Update the vm_page_t clean/reference bits.
3799				 */
3800				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3801					if ((tpte & PG_PS) != 0) {
3802						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3803							vm_page_dirty(mt);
3804					} else
3805						vm_page_dirty(m);
3806				}
3807
3808				/* Mark free */
3809				PV_STAT(pv_entry_frees++);
3810				PV_STAT(pv_entry_spare++);
3811				pv_entry_count--;
3812				pc->pc_map[field] |= bitmask;
3813				if ((tpte & PG_PS) != 0) {
3814					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3815					pvh = pa_to_pvh(tpte & PG_FRAME);
3816					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3817					if (TAILQ_EMPTY(&pvh->pv_list)) {
3818						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3819							if (TAILQ_EMPTY(&mt->md.pv_list))
3820								vm_page_flag_clear(mt, PG_WRITEABLE);
3821					}
3822					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3823					if (mpte != NULL) {
3824						pmap_remove_pt_page(pmap, mpte);
3825						KASSERT(mpte->wire_count == NPTEPG,
3826						    ("pmap_remove_pages: pte page wire count error"));
3827						mpte->wire_count = 0;
3828						pmap_add_delayed_free_list(mpte, &free, FALSE);
3829						atomic_subtract_int(&cnt.v_wire_count, 1);
3830					}
3831					pmap_unuse_pt(pmap, pv->pv_va,
3832					    *pmap_pdpe(pmap, pv->pv_va), &free);
3833				} else {
3834					pmap->pm_stats.resident_count--;
3835					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3836					if (TAILQ_EMPTY(&m->md.pv_list)) {
3837						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3838						if (TAILQ_EMPTY(&pvh->pv_list))
3839							vm_page_flag_clear(m, PG_WRITEABLE);
3840					}
3841					pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3842				}
3843			}
3844		}
3845		if (allfree) {
3846			PV_STAT(pv_entry_spare -= _NPCPV);
3847			PV_STAT(pc_chunk_count--);
3848			PV_STAT(pc_chunk_frees++);
3849			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3850			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3851			dump_drop_page(m->phys_addr);
3852			vm_page_unwire(m, 0);
3853			vm_page_free(m);
3854		}
3855	}
3856	pmap_invalidate_all(pmap);
3857	vm_page_unlock_queues();
3858	PMAP_UNLOCK(pmap);
3859	pmap_free_zero_pages(free);
3860}
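
/*
 * Illustrative sketch only (not referenced by the kernel): as the check at
 * the top of pmap_remove_pages() implies, the function is meant to be
 * applied to the current process's own pmap, typically during address
 * space teardown.  The wrapper below is hypothetical.
 */
static __unused void
pmap_example_remove_own_pages(void)
{

	pmap_remove_pages(vmspace_pmap(curthread->td_proc->p_vmspace));
}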
3861
3862/*
3863 *	pmap_is_modified:
3864 *
3865 *	Return whether or not the specified physical page was modified
3866 *	in any physical maps.
3867 */
3868boolean_t
3869pmap_is_modified(vm_page_t m)
3870{
3871
3872	if (m->flags & PG_FICTITIOUS)
3873		return (FALSE);
3874	if (pmap_is_modified_pvh(&m->md))
3875		return (TRUE);
3876	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
3877}
3878
3879/*
3880 * Returns TRUE if any of the given mappings were used to modify
3881 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
3882 * mappings are supported.
3883 */
3884static boolean_t
3885pmap_is_modified_pvh(struct md_page *pvh)
3886{
3887	pv_entry_t pv;
3888	pt_entry_t *pte;
3889	pmap_t pmap;
3890	boolean_t rv;
3891
3892	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3893	rv = FALSE;
3894	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3895		pmap = PV_PMAP(pv);
3896		PMAP_LOCK(pmap);
3897		pte = pmap_pte(pmap, pv->pv_va);
3898		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
3899		PMAP_UNLOCK(pmap);
3900		if (rv)
3901			break;
3902	}
3903	return (rv);
3904}
3905
3906/*
3907 *	pmap_is_prefaultable:
3908 *
3909 *	Return whether or not the specified virtual address is eligible
3910 *	for prefault.
3911 */
3912boolean_t
3913pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3914{
3915	pd_entry_t *pde;
3916	pt_entry_t *pte;
3917	boolean_t rv;
3918
3919	rv = FALSE;
3920	PMAP_LOCK(pmap);
3921	pde = pmap_pde(pmap, addr);
3922	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
3923		pte = pmap_pde_to_pte(pde, addr);
3924		rv = (*pte & PG_V) == 0;
3925	}
3926	PMAP_UNLOCK(pmap);
3927	return (rv);
3928}
3929
3930/*
3931 * Clear the write and modified bits in each of the given page's mappings.
3932 */
3933void
3934pmap_remove_write(vm_page_t m)
3935{
3936	struct md_page *pvh;
3937	pmap_t pmap;
3938	pv_entry_t next_pv, pv;
3939	pd_entry_t *pde;
3940	pt_entry_t oldpte, *pte;
3941	vm_offset_t va;
3942
3943	if ((m->flags & PG_FICTITIOUS) != 0 ||
3944	    (m->flags & PG_WRITEABLE) == 0)
3945		return;
3946	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3947	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3948	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
3949		va = pv->pv_va;
3950		pmap = PV_PMAP(pv);
3951		PMAP_LOCK(pmap);
3952		pde = pmap_pde(pmap, va);
3953		if ((*pde & PG_RW) != 0)
3954			(void)pmap_demote_pde(pmap, pde, va);
3955		PMAP_UNLOCK(pmap);
3956	}
3957	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3958		pmap = PV_PMAP(pv);
3959		PMAP_LOCK(pmap);
3960		pde = pmap_pde(pmap, pv->pv_va);
3961		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
3962		    " a 2mpage in page %p's pv list", m));
3963		pte = pmap_pde_to_pte(pde, pv->pv_va);
3964retry:
3965		oldpte = *pte;
3966		if (oldpte & PG_RW) {
3967			if (!atomic_cmpset_long(pte, oldpte, oldpte &
3968			    ~(PG_RW | PG_M)))
3969				goto retry;
3970			if ((oldpte & PG_M) != 0)
3971				vm_page_dirty(m);
3972			pmap_invalidate_page(pmap, pv->pv_va);
3973		}
3974		PMAP_UNLOCK(pmap);
3975	}
3976	vm_page_flag_clear(m, PG_WRITEABLE);
3977}
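
/*
 * Illustrative sketch only (not referenced by the kernel): a typical caller
 * revokes write access with pmap_remove_write() before treating the page as
 * clean, so that any modified (PG_M) state has already been transferred to
 * the machine-independent dirty field by the time the page is examined.
 * The helper name below is hypothetical.
 */
static __unused void
pmap_example_make_readonly(vm_page_t m)
{

	vm_page_lock_queues();
	pmap_remove_write(m);
	vm_page_unlock_queues();
}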
3978
3979/*
3980 *	pmap_ts_referenced:
3981 *
3982 *	Return a count of reference bits for a page, clearing those bits.
3983 *	It is not necessary for every reference bit to be cleared, but it
3984 *	is necessary that 0 only be returned when there are truly no
3985 *	reference bits set.
3986 *
3987 *	XXX: The exact number of bits to check and clear is a matter that
3988 *	should be tested and standardized at some point in the future for
3989 *	optimal aging of shared pages.
3990 */
3991int
3992pmap_ts_referenced(vm_page_t m)
3993{
3994	struct md_page *pvh;
3995	pv_entry_t pv, pvf, pvn;
3996	pmap_t pmap;
3997	pd_entry_t oldpde, *pde;
3998	pt_entry_t *pte;
3999	vm_offset_t va;
4000	int rtval = 0;
4001
4002	if (m->flags & PG_FICTITIOUS)
4003		return (rtval);
4004	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4005	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4006	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4007		va = pv->pv_va;
4008		pmap = PV_PMAP(pv);
4009		PMAP_LOCK(pmap);
4010		pde = pmap_pde(pmap, va);
4011		oldpde = *pde;
4012		if ((oldpde & PG_A) != 0) {
4013			if (pmap_demote_pde(pmap, pde, va)) {
4014				if ((oldpde & PG_W) == 0) {
4015					/*
4016					 * Remove the mapping to a single page
4017					 * so that a subsequent access may
4018					 * repromote.  Since the underlying
4019					 * page table page is fully populated,
4020					 * this removal never frees a page
4021					 * table page.
4022					 */
4023					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4024					    PG_FRAME);
4025					pmap_remove_page(pmap, va, pde, NULL);
4026					rtval++;
4027					if (rtval > 4) {
4028						PMAP_UNLOCK(pmap);
4029						return (rtval);
4030					}
4031				}
4032			}
4033		}
4034		PMAP_UNLOCK(pmap);
4035	}
4036	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4037		pvf = pv;
4038		do {
4039			pvn = TAILQ_NEXT(pv, pv_list);
4040			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4041			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4042			pmap = PV_PMAP(pv);
4043			PMAP_LOCK(pmap);
4044			pde = pmap_pde(pmap, pv->pv_va);
4045			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4046			    " found a 2mpage in page %p's pv list", m));
4047			pte = pmap_pde_to_pte(pde, pv->pv_va);
4048			if ((*pte & PG_A) != 0) {
4049				atomic_clear_long(pte, PG_A);
4050				pmap_invalidate_page(pmap, pv->pv_va);
4051				rtval++;
4052				if (rtval > 4)
4053					pvn = NULL;
4054			}
4055			PMAP_UNLOCK(pmap);
4056		} while ((pv = pvn) != NULL && pv != pvf);
4057	}
4058	return (rtval);
4059}
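
/*
 * Illustrative sketch only (not referenced by the kernel): how a page-aging
 * loop might use the count returned by pmap_ts_referenced().  The
 * "act_count" adjustment is hypothetical; it merely demonstrates that the
 * return value is a count of cleared reference bits and that zero means no
 * references were found.
 */
static __unused int
pmap_example_age_page(vm_page_t m, int act_count)
{
	int refs;

	vm_page_lock_queues();
	refs = pmap_ts_referenced(m);
	vm_page_unlock_queues();
	if (refs > 0)
		act_count += refs;	/* The page was recently referenced. */
	else
		act_count--;		/* The page is going idle. */
	return (act_count);
}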
4060
4061/*
4062 *	Clear the modify bits on the specified physical page.
4063 */
4064void
4065pmap_clear_modify(vm_page_t m)
4066{
4067	struct md_page *pvh;
4068	pmap_t pmap;
4069	pv_entry_t next_pv, pv;
4070	pd_entry_t oldpde, *pde;
4071	pt_entry_t oldpte, *pte;
4072	vm_offset_t va;
4073
4074	if ((m->flags & PG_FICTITIOUS) != 0)
4075		return;
4076	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4077	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4078	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4079		va = pv->pv_va;
4080		pmap = PV_PMAP(pv);
4081		PMAP_LOCK(pmap);
4082		pde = pmap_pde(pmap, va);
4083		oldpde = *pde;
4084		if ((oldpde & PG_RW) != 0) {
4085			if (pmap_demote_pde(pmap, pde, va)) {
4086				if ((oldpde & PG_W) == 0) {
4087					/*
4088					 * Write protect the mapping to a
4089					 * single page so that a subsequent
4090					 * write access may repromote.
4091					 */
4092					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4093					    PG_FRAME);
4094					pte = pmap_pde_to_pte(pde, va);
4095					oldpte = *pte;
4096					if ((oldpte & PG_V) != 0) {
4097						while (!atomic_cmpset_long(pte,
4098						    oldpte,
4099						    oldpte & ~(PG_M | PG_RW)))
4100							oldpte = *pte;
4101						vm_page_dirty(m);
4102						pmap_invalidate_page(pmap, va);
4103					}
4104				}
4105			}
4106		}
4107		PMAP_UNLOCK(pmap);
4108	}
4109	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4110		pmap = PV_PMAP(pv);
4111		PMAP_LOCK(pmap);
4112		pde = pmap_pde(pmap, pv->pv_va);
4113		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4114		    " a 2mpage in page %p's pv list", m));
4115		pte = pmap_pde_to_pte(pde, pv->pv_va);
4116		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4117			atomic_clear_long(pte, PG_M);
4118			pmap_invalidate_page(pmap, pv->pv_va);
4119		}
4120		PMAP_UNLOCK(pmap);
4121	}
4122}
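
/*
 * Illustrative sketch only (not referenced by the kernel): a caller that
 * wants to treat a page as clean typically records any outstanding
 * modification first and then clears the modify bits, so that a later
 * pmap_is_modified() reflects only new writes.  The helper name below is
 * hypothetical.
 */
static __unused void
pmap_example_launder(vm_page_t m)
{

	vm_page_lock_queues();
	if (pmap_is_modified(m))
		vm_page_dirty(m);
	pmap_clear_modify(m);
	vm_page_unlock_queues();
}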
4123
4124/*
4125 *	pmap_clear_reference:
4126 *
4127 *	Clear the reference bit on the specified physical page.
4128 */
4129void
4130pmap_clear_reference(vm_page_t m)
4131{
4132	struct md_page *pvh;
4133	pmap_t pmap;
4134	pv_entry_t next_pv, pv;
4135	pd_entry_t oldpde, *pde;
4136	pt_entry_t *pte;
4137	vm_offset_t va;
4138
4139	if ((m->flags & PG_FICTITIOUS) != 0)
4140		return;
4141	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4142	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4143	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4144		va = pv->pv_va;
4145		pmap = PV_PMAP(pv);
4146		PMAP_LOCK(pmap);
4147		pde = pmap_pde(pmap, va);
4148		oldpde = *pde;
4149		if ((oldpde & PG_A) != 0) {
4150			if (pmap_demote_pde(pmap, pde, va)) {
4151				/*
4152				 * Remove the mapping to a single page so
4153				 * that a subsequent access may repromote.
4154				 * Since the underlying page table page is
4155				 * fully populated, this removal never frees
4156				 * a page table page.
4157				 */
4158				va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME);
4159				pmap_remove_page(pmap, va, pde, NULL);
4160			}
4161		}
4162		PMAP_UNLOCK(pmap);
4163	}
4164	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4165		pmap = PV_PMAP(pv);
4166		PMAP_LOCK(pmap);
4167		pde = pmap_pde(pmap, pv->pv_va);
4168		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4169		    " a 2mpage in page %p's pv list", m));
4170		pte = pmap_pde_to_pte(pde, pv->pv_va);
4171		if (*pte & PG_A) {
4172			atomic_clear_long(pte, PG_A);
4173			pmap_invalidate_page(pmap, pv->pv_va);
4174		}
4175		PMAP_UNLOCK(pmap);
4176	}
4177}
4178
4179/*
4180 * Miscellaneous support routines follow
4181 */
4182
4183/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4184static __inline void
4185pmap_pte_attr(vm_offset_t va, int mode)
4186{
4187	pt_entry_t *pte;
4188	u_int opte, npte;
4189
4190	pte = vtopte(va);
4191
4192	/*
4193	 * The cache mode bits are all in the low 32 bits of the
4194	 * PTE, so we can just spin on updating the low 32 bits.
4195	 */
4196	do {
4197		opte = *(u_int *)pte;
4198		npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
4199		npte |= pmap_cache_bits(mode, 0);
4200	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4201}
4202
4203/* Adjust the cache mode for a 2MB page mapped via a PDE. */
4204static __inline void
4205pmap_pde_attr(vm_offset_t va, int mode)
4206{
4207	pd_entry_t *pde;
4208	u_int opde, npde;
4209
4210	pde = pmap_pde(kernel_pmap, va);
4211
4212	/*
4213	 * The cache mode bits are all in the low 32 bits of the
4214	 * PDE, so we can just spin on updating the low 32 bits.
4215	 */
4216	do {
4217		opde = *(u_int *)pde;
4218		npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT);
4219		npde |= pmap_cache_bits(mode, 1);
4220	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4221}
4222
4223/*
4224 * Map a set of physical memory pages into the kernel virtual
4225 * address space. Return a pointer to where it is mapped. This
4226 * routine is intended to be used for mapping device memory,
4227 * NOT real memory.
4228 */
4229void *
4230pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4231{
4232	vm_offset_t va, tmpva, offset;
4233
4234	/*
4235	 * If the request fits within the direct map window and uses the
4236	 * WB caching mode, use the direct map.
4237	 */
4238	if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK)
4239		return ((void *)PHYS_TO_DMAP(pa));
4240	offset = pa & PAGE_MASK;
4241	size = roundup(offset + size, PAGE_SIZE);
4242	va = kmem_alloc_nofault(kernel_map, size);
4243	if (!va)
4244		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4245	pa = trunc_page(pa);
4246	for (tmpva = va; size > 0; ) {
4247		pmap_kenter_attr(tmpva, pa, mode);
4248		size -= PAGE_SIZE;
4249		tmpva += PAGE_SIZE;
4250		pa += PAGE_SIZE;
4251	}
4252	pmap_invalidate_range(kernel_pmap, va, tmpva);
4253	pmap_invalidate_cache();
4254	return ((void *)(va + offset));
4255}
4256
4257void *
4258pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4259{
4260
4261	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4262}
4263
4264void *
4265pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4266{
4267
4268	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4269}
4270
4271void
4272pmap_unmapdev(vm_offset_t va, vm_size_t size)
4273{
4274	vm_offset_t base, offset, tmpva;
4275
4276	/* If pmap_mapdev() gave out a direct map address, do nothing. */
4277	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
4278		return;
4279	base = trunc_page(va);
4280	offset = va & PAGE_MASK;
4281	size = roundup(offset + size, PAGE_SIZE);
4282	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4283		pmap_kremove(tmpva);
4284	pmap_invalidate_range(kernel_pmap, va, tmpva);
4285	kmem_free(kernel_map, base, size);
4286}
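
/*
 * Illustrative sketch only (not referenced by the kernel): mapping a device
 * register window uncacheably and releasing it again.  The physical address
 * and size are placeholders; real callers obtain them from bus resources.
 */
static __unused void
pmap_example_map_device(vm_paddr_t regbase, vm_size_t regsize)
{
	void *va;

	va = pmap_mapdev_attr(regbase, regsize, PAT_UNCACHEABLE);
	/* ... access the device registers through "va" ... */
	pmap_unmapdev((vm_offset_t)va, regsize);
}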
4287
4288int
4289pmap_change_attr(va, size, mode)
4290	vm_offset_t va;
4291	vm_size_t size;
4292	int mode;
4293{
4294	vm_offset_t base, offset, tmpva;
4295	pd_entry_t *pde;
4296	pt_entry_t *pte;
4297
4298	base = trunc_page(va);
4299	offset = va & PAGE_MASK;
4300	size = roundup(offset + size, PAGE_SIZE);
4301
4302	/* Only supported on kernel virtual addresses. */
4303	if (base <= VM_MAXUSER_ADDRESS)
4304		return (EINVAL);
4305
4306	/*
4307	 * XXX: We have to support tearing 2MB pages down into 4k pages if
4308	 * needed here.
4309	 */
4310	/* Pages that aren't mapped aren't supported. */
4311	for (tmpva = base; tmpva < (base + size); ) {
4312		pde = pmap_pde(kernel_pmap, tmpva);
4313		if (*pde == 0)
4314			return (EINVAL);
4315		if (*pde & PG_PS) {
4316			/* Handle 2MB pages that are completely contained. */
4317			if (size >= NBPDR) {
4318				tmpva += NBPDR;
4319				continue;
4320			}
4321			return (EINVAL);
4322		}
4323		pte = vtopte(tmpva);
4324		if (*pte == 0)
4325			return (EINVAL);
4326		tmpva += PAGE_SIZE;
4327	}
4328
4329	/*
4330	 * Ok, all the pages exist, so run through them updating their
4331	 * cache mode.
4332	 */
4333	for (tmpva = base; size > 0; ) {
4334		pde = pmap_pde(kernel_pmap, tmpva);
4335		if (*pde & PG_PS) {
4336			pmap_pde_attr(tmpva, mode);
4337			tmpva += NBPDR;
4338			size -= NBPDR;
4339		} else {
4340			pmap_pte_attr(tmpva, mode);
4341			tmpva += PAGE_SIZE;
4342			size -= PAGE_SIZE;
4343		}
4344	}
4345
4346	/*
4347	 * Flush the CPU caches so that stale data is not left behind
4348	 * under the old caching attributes.
4349	 */
4350	pmap_invalidate_range(kernel_pmap, base, tmpva);
4351	pmap_invalidate_cache();
4352	return (0);
4353}
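
/*
 * Illustrative sketch only (not referenced by the kernel): switching an
 * existing kernel mapping, such as one returned by pmap_mapbios(), to
 * uncacheable.  The return value must be checked because, per the XXX
 * above, requests that would require splitting a 2MB mapping fail with
 * EINVAL.
 */
static __unused int
pmap_example_make_uncacheable(vm_offset_t va, vm_size_t size)
{

	return (pmap_change_attr(va, size, PAT_UNCACHEABLE));
}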
4354
4355/*
4356 * Perform the pmap work for mincore(2).
4357 */
4358int
4359pmap_mincore(pmap_t pmap, vm_offset_t addr)
4360{
4361	pd_entry_t *pdep;
4362	pt_entry_t pte;
4363	vm_paddr_t pa;
4364	vm_page_t m;
4365	int val = 0;
4366
4367	PMAP_LOCK(pmap);
4368	pdep = pmap_pde(pmap, addr);
4369	if (pdep != NULL && (*pdep & PG_V)) {
4370		if (*pdep & PG_PS) {
4371			pte = *pdep;
4372			val = MINCORE_SUPER;
4373			/* Compute the physical address of the 4KB page. */
4374			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4375			    PG_FRAME;
4376		} else {
4377			pte = *pmap_pde_to_pte(pdep, addr);
4378			pa = pte & PG_FRAME;
4379		}
4380	} else {
4381		pte = 0;
4382		pa = 0;
4383	}
4384	PMAP_UNLOCK(pmap);
4385
4386	if (pte != 0) {
4387		val |= MINCORE_INCORE;
4388		if ((pte & PG_MANAGED) == 0)
4389			return (val);
4390
4391		m = PHYS_TO_VM_PAGE(pa);
4392
4393		/*
4394		 * Modified by us
4395		 */
4396		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4397			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4398		else {
4399			/*
4400			 * Modified by someone else
4401			 */
4402			vm_page_lock_queues();
4403			if (m->dirty || pmap_is_modified(m))
4404				val |= MINCORE_MODIFIED_OTHER;
4405			vm_page_unlock_queues();
4406		}
4407		/*
4408		 * Referenced by us
4409		 */
4410		if (pte & PG_A)
4411			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4412		else {
4413			/*
4414			 * Referenced by someone else
4415			 */
4416			vm_page_lock_queues();
4417			if ((m->flags & PG_REFERENCED) ||
4418			    pmap_ts_referenced(m)) {
4419				val |= MINCORE_REFERENCED_OTHER;
4420				vm_page_flag_set(m, PG_REFERENCED);
4421			}
4422			vm_page_unlock_queues();
4423		}
4424	}
4425	return (val);
4426}
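
/*
 * Illustrative sketch only (not referenced by the kernel): decoding the
 * flags returned by pmap_mincore().  The printf() is purely for
 * demonstration; mincore(2) instead copies the per-page flags out to a
 * user buffer.
 */
static __unused void
pmap_example_mincore_report(pmap_t pmap, vm_offset_t addr)
{
	int val;

	val = pmap_mincore(pmap, addr);
	if ((val & MINCORE_INCORE) == 0)
		return;
	printf("va %#lx: resident%s%s%s\n", addr,
	    (val & MINCORE_SUPER) != 0 ? ", 2MB mapping" : "",
	    (val & (MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER)) != 0 ?
	    ", modified" : "",
	    (val & (MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER)) != 0 ?
	    ", referenced" : "");
}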
4427
4428void
4429pmap_activate(struct thread *td)
4430{
4431	pmap_t	pmap, oldpmap;
4432	u_int64_t  cr3;
4433
4434	critical_enter();
4435	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4436	oldpmap = PCPU_GET(curpmap);
4437#ifdef SMP
4438if (oldpmap)	/* XXX FIXME */
4439	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4440	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4441#else
4442if (oldpmap)	/* XXX FIXME */
4443	oldpmap->pm_active &= ~PCPU_GET(cpumask);
4444	pmap->pm_active |= PCPU_GET(cpumask);
4445#endif
4446	cr3 = vtophys(pmap->pm_pml4);
4447	td->td_pcb->pcb_cr3 = cr3;
4448	load_cr3(cr3);
4449	critical_exit();
4450}
4451
4452vm_offset_t
4453pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
4454{
4455
4456	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
4457		return (addr);
4458	}
4459
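	/*
	 * Round the hint up to the next 2MB (NBPDR) boundary so that the
	 * object can be mapped with 2MB pages.  For example, with
	 * NBPDR == 0x200000, a hint of 0x200123456 becomes 0x200200000.
	 */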
4460	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
4461	return (addr);
4462}
4463