pmap.c revision 176803
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 176803 2008-03-04 18:50:15Z alc $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	In addition to hardware address maps, this
86 *	module is called upon to provide software-use-only
87 *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or protection-reduction
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include "opt_msgbuf.h"
109#include "opt_pmap.h"
110#include "opt_vm.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/sx.h>
123#include <sys/vmmeter.h>
124#include <sys/sched.h>
125#include <sys/sysctl.h>
126#ifdef SMP
127#include <sys/smp.h>
128#endif
129
130#include <vm/vm.h>
131#include <vm/vm_param.h>
132#include <vm/vm_kern.h>
133#include <vm/vm_page.h>
134#include <vm/vm_map.h>
135#include <vm/vm_object.h>
136#include <vm/vm_extern.h>
137#include <vm/vm_pageout.h>
138#include <vm/vm_pager.h>
139#include <vm/vm_reserv.h>
140#include <vm/uma.h>
141
142#include <machine/cpu.h>
143#include <machine/cputypes.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146#include <machine/specialreg.h>
147#ifdef SMP
148#include <machine/smp.h>
149#endif
150
151#ifndef PMAP_SHPGPERPROC
152#define PMAP_SHPGPERPROC 200
153#endif
154
155#if !defined(DIAGNOSTIC)
156#define PMAP_INLINE	__gnu89_inline
157#else
158#define PMAP_INLINE
159#endif
160
161#define PV_STATS
162#ifdef PV_STATS
163#define PV_STAT(x)	do { x ; } while (0)
164#else
165#define PV_STAT(x)	do { } while (0)
166#endif
167
168#define	pa_index(pa)	((pa) >> PDRSHIFT)
169#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
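/*
 * Illustrative example (not part of the original file): pa_index() selects
 * the 2MB superpage that contains a physical address, so pv_table keeps one
 * struct md_page per 2MB of managed memory.  With PDRSHIFT == 21,
 *
 *	pa_index(0x40200000) == 0x40200000 >> 21 == 0x201
 *
 * and pa_to_pvh(0x40200000) is &pv_table[0x201].
 */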
170
171struct pmap kernel_pmap_store;
172
173vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
174vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
175
176static int nkpt;
177static int ndmpdp;
178static vm_paddr_t dmaplimit;
179vm_offset_t kernel_vm_end;
180pt_entry_t pg_nx;
181
182SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
183
184static int pg_ps_enabled;
185SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0,
186    "Are large page mappings enabled?");
187
188static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
189static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
190u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
191u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
192
193static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
194static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
195
196/*
197 * Data for the pv entry allocation mechanism
198 */
199static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
200static struct md_page *pv_table;
201static int shpgperproc = PMAP_SHPGPERPROC;
202
203/*
204 * All those kernel PT submaps that BSD is so fond of
205 */
206pt_entry_t *CMAP1 = 0;
207caddr_t CADDR1 = 0;
208struct msgbuf *msgbufp = 0;
209
210/*
211 * Crashdump maps.
212 */
213static caddr_t crashdumpmap;
214
215static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
216static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
217static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
218static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m);
219static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
220static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
221static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
222		    vm_offset_t va);
223
224static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
225static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
226    vm_prot_t prot);
227static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
228    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
229static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
230static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
231static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
232static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
233static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
234    vm_prot_t prot);
235static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
236		vm_page_t *free);
237static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
238		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
239static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
240static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
241    vm_page_t *free);
242static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
243		vm_offset_t va);
244static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
245static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
246    vm_page_t m);
247
248static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
249static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
250
251static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
252static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
253                vm_page_t* free);
254static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
255static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
256
257CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
258CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
259
260/*
261 * Move the kernel virtual free pointer to the next
262 * 2MB.  This is used to help improve performance
263 * by using a large (2MB) page for much of the kernel
264 * (.text, .data, .bss)
265 */
266static vm_offset_t
267pmap_kmem_choose(vm_offset_t addr)
268{
269	vm_offset_t newaddr = addr;
270
271	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
272	return newaddr;
273}
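/*
 * Worked example (illustrative values only): with NBPDR == 2MB (0x200000),
 * an address such as 0xffffffff80321000 rounds up to the next superpage
 * boundary:
 *
 *	(0xffffffff80321000 + 0x1fffff) & ~0x1fffff == 0xffffffff80400000
 */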
274
275/********************/
276/* Inline functions */
277/********************/
278
279/* Return a non-clipped PD index for a given VA */
280static __inline vm_pindex_t
281pmap_pde_pindex(vm_offset_t va)
282{
283	return va >> PDRSHIFT;
284}
285
286
287/* Return various clipped indexes for a given VA */
288static __inline vm_pindex_t
289pmap_pte_index(vm_offset_t va)
290{
291
292	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
293}
294
295static __inline vm_pindex_t
296pmap_pde_index(vm_offset_t va)
297{
298
299	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
300}
301
302static __inline vm_pindex_t
303pmap_pdpe_index(vm_offset_t va)
304{
305
306	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
307}
308
309static __inline vm_pindex_t
310pmap_pml4e_index(vm_offset_t va)
311{
312
313	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
314}
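/*
 * Illustrative decomposition (not from the original source): each of the
 * four index functions above extracts one 9-bit field of the virtual
 * address.  For example, with KERNBASE == 0xffffffff80000000:
 *
 *	pmap_pml4e_index(KERNBASE) == 511
 *	pmap_pdpe_index(KERNBASE)  == 510
 *	pmap_pde_index(KERNBASE)   == 0
 *	pmap_pte_index(KERNBASE)   == 0
 *
 * which is why the kernel lives in the last PML4 slot and the
 * second-to-last PDP slot beneath it.
 */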
315
316/* Return a pointer to the PML4 slot that corresponds to a VA */
317static __inline pml4_entry_t *
318pmap_pml4e(pmap_t pmap, vm_offset_t va)
319{
320
321	if (!pmap)
322		return NULL;
323	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
324}
325
326/* Return a pointer to the PDP slot that corresponds to a VA */
327static __inline pdp_entry_t *
328pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
329{
330	pdp_entry_t *pdpe;
331
332	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
333	return (&pdpe[pmap_pdpe_index(va)]);
334}
335
336/* Return a pointer to the PDP slot that corresponds to a VA */
337static __inline pdp_entry_t *
338pmap_pdpe(pmap_t pmap, vm_offset_t va)
339{
340	pml4_entry_t *pml4e;
341
342	pml4e = pmap_pml4e(pmap, va);
343	if (pml4e == NULL || (*pml4e & PG_V) == 0)
344		return NULL;
345	return (pmap_pml4e_to_pdpe(pml4e, va));
346}
347
348/* Return a pointer to the PD slot that corresponds to a VA */
349static __inline pd_entry_t *
350pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
351{
352	pd_entry_t *pde;
353
354	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
355	return (&pde[pmap_pde_index(va)]);
356}
357
358/* Return a pointer to the PD slot that corresponds to a VA */
359static __inline pd_entry_t *
360pmap_pde(pmap_t pmap, vm_offset_t va)
361{
362	pdp_entry_t *pdpe;
363
364	pdpe = pmap_pdpe(pmap, va);
365	if (pdpe == NULL || (*pdpe & PG_V) == 0)
366		 return NULL;
367	return (pmap_pdpe_to_pde(pdpe, va));
368}
369
370/* Return a pointer to the PT slot that corresponds to a VA */
371static __inline pt_entry_t *
372pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
373{
374	pt_entry_t *pte;
375
376	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
377	return (&pte[pmap_pte_index(va)]);
378}
379
380/* Return a pointer to the PT slot that corresponds to a VA */
381static __inline pt_entry_t *
382pmap_pte(pmap_t pmap, vm_offset_t va)
383{
384	pd_entry_t *pde;
385
386	pde = pmap_pde(pmap, va);
387	if (pde == NULL || (*pde & PG_V) == 0)
388		return NULL;
389	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
390		return ((pt_entry_t *)pde);
391	return (pmap_pde_to_pte(pde, va));
392}
393
394
395PMAP_INLINE pt_entry_t *
396vtopte(vm_offset_t va)
397{
398	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
399
400	return (PTmap + ((va >> PAGE_SHIFT) & mask));
401}
402
403static __inline pd_entry_t *
404vtopde(vm_offset_t va)
405{
406	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
407
408	return (PDmap + ((va >> PDRSHIFT) & mask));
409}
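/*
 * A note on the two helpers above (descriptive, based on the recursive
 * PML4 slot set up in create_pagetables()): because the PML4 maps itself,
 * every page-table page appears at a fixed kernel virtual address.  PTmap
 * is the base of that window, so vtopte() can reach the PTE for any VA
 * with a single array index instead of a software page-table walk; the
 * mask keeps the low 9 + 9 + 9 + 9 = 36 index bits.  vtopde() does the
 * same one level up via PDmap.  Both resolve against whatever page tables
 * are currently active, which is why they are used here only for kernel
 * addresses.
 */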
410
411static u_int64_t
412allocpages(vm_paddr_t *firstaddr, int n)
413{
414	u_int64_t ret;
415
416	ret = *firstaddr;
417	bzero((void *)ret, n * PAGE_SIZE);
418	*firstaddr += n * PAGE_SIZE;
419	return (ret);
420}
421
422static void
423create_pagetables(vm_paddr_t *firstaddr)
424{
425	int i;
426
427	/* Allocate pages */
428	KPTphys = allocpages(firstaddr, NKPT);
429	KPML4phys = allocpages(firstaddr, 1);
430	KPDPphys = allocpages(firstaddr, NKPML4E);
431	KPDphys = allocpages(firstaddr, NKPDPE);
432
433	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
434	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
435		ndmpdp = 4;
436	DMPDPphys = allocpages(firstaddr, NDMPML4E);
437	if ((amd_feature & AMDID_PAGE1GB) == 0)
438		DMPDphys = allocpages(firstaddr, ndmpdp);
439	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
440
441	/* Fill in the underlying page table pages */
442	/* Read-only from zero to physfree */
443	/* XXX not fully used, underneath 2M pages */
444	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
445		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
446		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
447	}
448
449	/* Now map the page tables at their location within PTmap */
450	for (i = 0; i < NKPT; i++) {
451		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
452		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
453	}
454
455	/* Map from zero to end of allocations under 2M pages */
456	/* This replaces some of the KPTphys entries above */
457	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
458		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
459		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
460	}
461
462	/* And connect up the PD to the PDP */
463	for (i = 0; i < NKPDPE; i++) {
464		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
465		    (i << PAGE_SHIFT);
466		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
467	}
468
469	/* Now set up the direct map space using either 2MB or 1GB pages */
470	if ((amd_feature & AMDID_PAGE1GB) == 0) {
471		for (i = 0; i < NPDEPG * ndmpdp; i++) {
472			((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
473			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
474			    PG_G;
475		}
476		/* And the direct map space's PDP */
477		for (i = 0; i < ndmpdp; i++) {
478			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
479			    (i << PAGE_SHIFT);
480			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
481		}
482	} else {
483		for (i = 0; i < ndmpdp; i++) {
484			((pdp_entry_t *)DMPDPphys)[i] =
485			    (vm_paddr_t)i << PDPSHIFT;
486			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
487			    PG_G;
488		}
489	}
490
491	/* And recursively map PML4 to itself in order to get PTmap */
492	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
493	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
494
495	/* Connect the Direct Map slot up to the PML4 */
496	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
497	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
498
499	/* Connect the KVA slot up to the PML4 */
500	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
501	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
502}
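/*
 * A brief summary of the layout built above: KPML4phys is the new root.
 * Its KPML4I slot leads to KPDPphys -> KPDphys -> KPTphys, mapping the
 * kernel image area; physical memory up to *firstaddr is covered both by
 * 4K entries in KPTphys and by 2MB PG_PS entries in KPDphys, and the 2MB
 * mappings are the ones that take effect.  The DMPML4I slot points at
 * DMPDPphys, giving the direct map (2MB pages, or 1GB pages when
 * AMDID_PAGE1GB is present).  The PML4PML4I slot points back at KPML4phys
 * itself, creating the recursive mapping that PTmap/PDmap and
 * vtopte()/vtopde() rely on.
 */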
503
504/*
505 *	Bootstrap the system enough to run with virtual memory.
506 *
507 *	On amd64 this is called after mapping has already been enabled
508 *	and just syncs the pmap module with what has already been done.
509 *	[We can't call it easily with mapping off since the kernel is not
510 *	mapped with PA == VA, hence we would have to relocate every address
511 *	from the linked base (virtual) address "KERNBASE" to the actual
512 *	(physical) address starting relative to 0]
513 */
514void
515pmap_bootstrap(vm_paddr_t *firstaddr)
516{
517	vm_offset_t va;
518	pt_entry_t *pte, *unused;
519
520	/*
521	 * Create an initial set of page tables to run the kernel in.
522	 */
523	create_pagetables(firstaddr);
524
525	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
526	virtual_avail = pmap_kmem_choose(virtual_avail);
527
528	virtual_end = VM_MAX_KERNEL_ADDRESS;
529
530
531	/* XXX do %cr0 as well */
532	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
533	load_cr3(KPML4phys);
534
535	/*
536	 * Initialize the kernel pmap (which is statically allocated).
537	 */
538	PMAP_LOCK_INIT(kernel_pmap);
539	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
540	kernel_pmap->pm_root = NULL;
541	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
542	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
543	nkpt = NKPT;
544
545	/*
546	 * Reserve some special page table entries/VA space for temporary
547	 * mapping of pages.
548	 */
549#define	SYSMAP(c, p, v, n)	\
550	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
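	/*
	 * For illustration, the first use below,
	 *
	 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	 *
	 * expands to
	 *
	 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
	 *
	 * i.e. it carves one page of KVA and remembers the PTE that maps it.
	 */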
551
552	va = virtual_avail;
553	pte = vtopte(va);
554
555	/*
556	 * CMAP1 is only used for the memory test.
557	 */
558	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
559
560	/*
561	 * Crashdump maps.
562	 */
563	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
564
565	/*
566	 * msgbufp is used to map the system message buffer.
567	 */
568	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
569
570	virtual_avail = va;
571
572	*CMAP1 = 0;
573
574	invltlb();
575
576	/* Initialize the PAT MSR. */
577	pmap_init_pat();
578}
579
580/*
581 * Set up the PAT MSR.
582 */
583void
584pmap_init_pat(void)
585{
586	uint64_t pat_msr;
587
588	/* Bail if this CPU doesn't implement PAT. */
589	if (!(cpu_feature & CPUID_PAT))
590		panic("no PAT??");
591
592#ifdef PAT_WORKS
593	/*
594	 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
595	 * Program 4 and 5 as WP and WC.
596	 * Leave 6 and 7 as UC and UC-.
597	 */
598	pat_msr = rdmsr(MSR_PAT);
599	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
600	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
601	    PAT_VALUE(5, PAT_WRITE_COMBINING);
602#else
603	/*
604	 * Due to some Intel errata, we can only safely use the lower 4
605	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
606	 * of UC-.
607	 *
608	 *   Intel Pentium III Processor Specification Update
609	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
610	 * or Mode C Paging)
611	 *
612	 *   Intel Pentium IV  Processor Specification Update
613	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
614	 */
615	pat_msr = rdmsr(MSR_PAT);
616	pat_msr &= ~PAT_MASK(2);
617	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
618#endif
619	wrmsr(MSR_PAT, pat_msr);
620}
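/*
 * For reference (derived from the comments and code above): in the
 * non-PAT_WORKS case the MSR ends up with index 0 still WB, index 1 still
 * WT, index 2 changed to WC (it defaults to UC-), index 3 still
 * uncacheable, and indices 4-7 left unchanged.  This matches the index
 * chosen for PAT_WRITE_COMBINING in pmap_cache_bits() below.
 */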
621
622/*
623 *	Initialize a vm_page's machine-dependent fields.
624 */
625void
626pmap_page_init(vm_page_t m)
627{
628
629	TAILQ_INIT(&m->md.pv_list);
630}
631
632/*
633 *	Initialize the pmap module.
634 *	Called by vm_init, to initialize any structures that the pmap
635 *	system needs to map virtual memory.
636 */
637void
638pmap_init(void)
639{
640	pd_entry_t *pd;
641	vm_page_t mpte;
642	vm_size_t s;
643	int i, pv_npg;
644
645	/*
646	 * Initialize the vm page array entries for the kernel pmap's
647	 * page table pages.
648	 */
649	pd = pmap_pde(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
650	for (i = 0; i < nkpt; i++) {
651		if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V))
652			continue;
653		mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME);
654		KASSERT(mpte >= vm_page_array &&
655		    mpte < &vm_page_array[vm_page_array_size],
656		    ("pmap_init: page table page is out of range"));
657		mpte->pindex = pmap_pde_pindex(VM_MIN_KERNEL_ADDRESS) + i;
658		mpte->phys_addr = pd[i] & PG_FRAME;
659	}
660
661	/*
662	 * Initialize the address space (zone) for the pv entries.  Set a
663	 * high water mark so that the system can recover from excessive
664	 * numbers of pv entries.
665	 */
666	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
667	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
668	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
669	pv_entry_high_water = 9 * (pv_entry_max / 10);
670
671	/*
672	 * Are large page mappings enabled?
673	 */
674	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
675
676	/*
677	 * Calculate the size of the pv head table for superpages.
678	 */
679	for (i = 0; phys_avail[i + 1]; i += 2);
680	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
681
682	/*
683	 * Allocate memory for the pv head table for superpages.
684	 */
685	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
686	s = round_page(s);
687	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
688	for (i = 0; i < pv_npg; i++)
689		TAILQ_INIT(&pv_table[i].pv_list);
690}
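/*
 * Note on the pv_table sizing above (illustrative numbers): one
 * struct md_page is kept per 2MB superpage of managed physical memory,
 * up to the end of the last phys_avail[] segment rounded to a 2MB
 * boundary.  On a machine whose memory ends near 4GB that is roughly
 * 4GB / 2MB == 2048 entries.
 */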
691
692static int
693pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
694{
695	int error;
696
697	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
698	if (error == 0 && req->newptr) {
699		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
700		pv_entry_high_water = 9 * (pv_entry_max / 10);
701	}
702	return (error);
703}
704SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
705    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
706
707static int
708pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
709{
710	int error;
711
712	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
713	if (error == 0 && req->newptr) {
714		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
715		pv_entry_high_water = 9 * (pv_entry_max / 10);
716	}
717	return (error);
718}
719SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
720    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
721
722SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
723    "2MB page mapping counters");
724
725static u_long pmap_pde_demotions;
726SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
727    &pmap_pde_demotions, 0, "2MB page demotions");
728
729static u_long pmap_pde_mappings;
730SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
731    &pmap_pde_mappings, 0, "2MB page mappings");
732
733static u_long pmap_pde_p_failures;
734SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
735    &pmap_pde_p_failures, 0, "2MB page promotion failures");
736
737static u_long pmap_pde_promotions;
738SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
739    &pmap_pde_promotions, 0, "2MB page promotions");
740
741
742/***************************************************
743 * Low level helper routines.....
744 ***************************************************/
745
746/*
747 * Determine the appropriate bits to set in a PTE or PDE for a specified
748 * caching mode.
749 */
750static int
751pmap_cache_bits(int mode, boolean_t is_pde)
752{
753	int pat_flag, pat_index, cache_bits;
754
755	/* The PAT bit is different for PTEs and PDEs. */
756	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
757
758	/* If we don't support PAT, map extended modes to older ones. */
759	if (!(cpu_feature & CPUID_PAT)) {
760		switch (mode) {
761		case PAT_UNCACHEABLE:
762		case PAT_WRITE_THROUGH:
763		case PAT_WRITE_BACK:
764			break;
765		case PAT_UNCACHED:
766		case PAT_WRITE_COMBINING:
767		case PAT_WRITE_PROTECTED:
768			mode = PAT_UNCACHEABLE;
769			break;
770		}
771	}
772
773	/* Map the caching mode to a PAT index. */
774	switch (mode) {
775#ifdef PAT_WORKS
776	case PAT_UNCACHEABLE:
777		pat_index = 3;
778		break;
779	case PAT_WRITE_THROUGH:
780		pat_index = 1;
781		break;
782	case PAT_WRITE_BACK:
783		pat_index = 0;
784		break;
785	case PAT_UNCACHED:
786		pat_index = 2;
787		break;
788	case PAT_WRITE_COMBINING:
789		pat_index = 5;
790		break;
791	case PAT_WRITE_PROTECTED:
792		pat_index = 4;
793		break;
794#else
795	case PAT_UNCACHED:
796	case PAT_UNCACHEABLE:
797	case PAT_WRITE_PROTECTED:
798		pat_index = 3;
799		break;
800	case PAT_WRITE_THROUGH:
801		pat_index = 1;
802		break;
803	case PAT_WRITE_BACK:
804		pat_index = 0;
805		break;
806	case PAT_WRITE_COMBINING:
807		pat_index = 2;
808		break;
809#endif
810	default:
811		panic("Unknown caching mode %d\n", mode);
812	}
813
814	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
815	cache_bits = 0;
816	if (pat_index & 0x4)
817		cache_bits |= pat_flag;
818	if (pat_index & 0x2)
819		cache_bits |= PG_NC_PCD;
820	if (pat_index & 0x1)
821		cache_bits |= PG_NC_PWT;
822	return (cache_bits);
823}
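/*
 * Worked example (illustrative): without PAT_WORKS, a request for
 * PAT_WRITE_COMBINING on a PTE selects pat_index 2, so only the 0x2 bit
 * of the index is set and the function returns PG_NC_PCD; the PAT and
 * PG_NC_PWT bits stay clear.
 */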
824#ifdef SMP
825/*
826 * For SMP, these functions have to use the IPI mechanism for coherence.
827 *
828 * N.B.: Before calling any of the following TLB invalidation functions,
829 * the calling processor must ensure that all stores updating a non-
830 * kernel page table are globally performed.  Otherwise, another
831 * processor could cache an old, pre-update entry without being
832 * invalidated.  This can happen one of two ways: (1) The pmap becomes
833 * active on another processor after its pm_active field is checked by
834 * one of the following functions but before a store updating the page
835 * table is globally performed. (2) The pmap becomes active on another
836 * processor before its pm_active field is checked but due to
837 * speculative loads one of the following functions still reads the
838 * pmap as inactive on the other processor.
839 *
840 * The kernel page table is exempt because its pm_active field is
841 * immutable.  The kernel page table is always active on every
842 * processor.
843 */
844void
845pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
846{
847	u_int cpumask;
848	u_int other_cpus;
849
850	sched_pin();
851	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
852		invlpg(va);
853		smp_invlpg(va);
854	} else {
855		cpumask = PCPU_GET(cpumask);
856		other_cpus = PCPU_GET(other_cpus);
857		if (pmap->pm_active & cpumask)
858			invlpg(va);
859		if (pmap->pm_active & other_cpus)
860			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
861	}
862	sched_unpin();
863}
864
865void
866pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
867{
868	u_int cpumask;
869	u_int other_cpus;
870	vm_offset_t addr;
871
872	sched_pin();
873	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
874		for (addr = sva; addr < eva; addr += PAGE_SIZE)
875			invlpg(addr);
876		smp_invlpg_range(sva, eva);
877	} else {
878		cpumask = PCPU_GET(cpumask);
879		other_cpus = PCPU_GET(other_cpus);
880		if (pmap->pm_active & cpumask)
881			for (addr = sva; addr < eva; addr += PAGE_SIZE)
882				invlpg(addr);
883		if (pmap->pm_active & other_cpus)
884			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
885			    sva, eva);
886	}
887	sched_unpin();
888}
889
890void
891pmap_invalidate_all(pmap_t pmap)
892{
893	u_int cpumask;
894	u_int other_cpus;
895
896	sched_pin();
897	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
898		invltlb();
899		smp_invltlb();
900	} else {
901		cpumask = PCPU_GET(cpumask);
902		other_cpus = PCPU_GET(other_cpus);
903		if (pmap->pm_active & cpumask)
904			invltlb();
905		if (pmap->pm_active & other_cpus)
906			smp_masked_invltlb(pmap->pm_active & other_cpus);
907	}
908	sched_unpin();
909}
910
911void
912pmap_invalidate_cache(void)
913{
914
915	sched_pin();
916	wbinvd();
917	smp_cache_flush();
918	sched_unpin();
919}
920#else /* !SMP */
921/*
922 * Normal, non-SMP, invalidation functions.
923 * We inline these within pmap.c for speed.
924 */
925PMAP_INLINE void
926pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
927{
928
929	if (pmap == kernel_pmap || pmap->pm_active)
930		invlpg(va);
931}
932
933PMAP_INLINE void
934pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
935{
936	vm_offset_t addr;
937
938	if (pmap == kernel_pmap || pmap->pm_active)
939		for (addr = sva; addr < eva; addr += PAGE_SIZE)
940			invlpg(addr);
941}
942
943PMAP_INLINE void
944pmap_invalidate_all(pmap_t pmap)
945{
946
947	if (pmap == kernel_pmap || pmap->pm_active)
948		invltlb();
949}
950
951PMAP_INLINE void
952pmap_invalidate_cache(void)
953{
954
955	wbinvd();
956}
957#endif /* !SMP */
958
959/*
960 * Is the given pmap the current address space or the kernel pmap?
961 */
962static __inline int
963pmap_is_current(pmap_t pmap)
964{
965	return (pmap == kernel_pmap ||
966	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
967}
968
969/*
970 *	Routine:	pmap_extract
971 *	Function:
972 *		Extract the physical page address associated
973 *		with the given map/virtual_address pair.
974 */
975vm_paddr_t
976pmap_extract(pmap_t pmap, vm_offset_t va)
977{
978	vm_paddr_t rtval;
979	pt_entry_t *pte;
980	pd_entry_t pde, *pdep;
981
982	rtval = 0;
983	PMAP_LOCK(pmap);
984	pdep = pmap_pde(pmap, va);
985	if (pdep != NULL) {
986		pde = *pdep;
987		if (pde) {
988			if ((pde & PG_PS) != 0) {
989				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
990				PMAP_UNLOCK(pmap);
991				return rtval;
992			}
993			pte = pmap_pde_to_pte(pdep, va);
994			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
995		}
996	}
997	PMAP_UNLOCK(pmap);
998	return (rtval);
999}
1000
1001/*
1002 *	Routine:	pmap_extract_and_hold
1003 *	Function:
1004 *		Atomically extract and hold the physical page
1005 *		with the given pmap and virtual address pair
1006 *		if that mapping permits the given protection.
1007 */
1008vm_page_t
1009pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1010{
1011	pd_entry_t pde, *pdep;
1012	pt_entry_t pte;
1013	vm_page_t m;
1014
1015	m = NULL;
1016	vm_page_lock_queues();
1017	PMAP_LOCK(pmap);
1018	pdep = pmap_pde(pmap, va);
1019	if (pdep != NULL && (pde = *pdep)) {
1020		if (pde & PG_PS) {
1021			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1022				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1023				    (va & PDRMASK));
1024				vm_page_hold(m);
1025			}
1026		} else {
1027			pte = *pmap_pde_to_pte(pdep, va);
1028			if ((pte & PG_V) &&
1029			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1030				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1031				vm_page_hold(m);
1032			}
1033		}
1034	}
1035	vm_page_unlock_queues();
1036	PMAP_UNLOCK(pmap);
1037	return (m);
1038}
1039
1040vm_paddr_t
1041pmap_kextract(vm_offset_t va)
1042{
1043	pd_entry_t *pde;
1044	vm_paddr_t pa;
1045
1046	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1047		pa = DMAP_TO_PHYS(va);
1048	} else {
1049		pde = vtopde(va);
1050		if (*pde & PG_PS) {
1051			pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
1052		} else {
1053			pa = *vtopte(va);
1054			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1055		}
1056	}
1057	return pa;
1058}
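/*
 * Example of the PG_PS case above (illustrative addresses): a kernel VA
 * covered by a 2MB mapping returns the 2MB frame from the PDE plus the
 * low 21 bits of the VA,
 *
 *	pa = (*pde & PG_PS_FRAME) | (va & PDRMASK)
 *
 * so a VA that is 0x1234 bytes into a superpage mapped at physical
 * 0x40000000 yields 0x40001234.
 */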
1059
1060/***************************************************
1061 * Low level mapping routines.....
1062 ***************************************************/
1063
1064/*
1065 * Add a wired page to the kva.
1066 * Note: not SMP coherent.
1067 */
1068PMAP_INLINE void
1069pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1070{
1071	pt_entry_t *pte;
1072
1073	pte = vtopte(va);
1074	pte_store(pte, pa | PG_RW | PG_V | PG_G);
1075}
1076
1077PMAP_INLINE void
1078pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1079{
1080	pt_entry_t *pte;
1081
1082	pte = vtopte(va);
1083	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
1084}
1085
1086/*
1087 * Remove a page from the kernel pagetables.
1088 * Note: not SMP coherent.
1089 */
1090PMAP_INLINE void
1091pmap_kremove(vm_offset_t va)
1092{
1093	pt_entry_t *pte;
1094
1095	pte = vtopte(va);
1096	pte_clear(pte);
1097}
1098
1099/*
1100 *	Used to map a range of physical addresses into kernel
1101 *	virtual address space.
1102 *
1103 *	The value passed in '*virt' is a suggested virtual address for
1104 *	the mapping. Architectures which can support a direct-mapped
1105 *	physical to virtual region can return the appropriate address
1106 *	within that region, leaving '*virt' unchanged. Other
1107 *	architectures should map the pages starting at '*virt' and
1108 *	update '*virt' with the first usable address after the mapped
1109 *	region.
1110 */
1111vm_offset_t
1112pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1113{
1114	return PHYS_TO_DMAP(start);
1115}
1116
1117
1118/*
1119 * Add a list of wired pages to the kva.
1120 * This routine is only used for temporary
1121 * kernel mappings that do not need to have
1122 * page modification or references recorded.
1123 * Note that old mappings are simply written
1124 * over.  The page *must* be wired.
1125 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1126 */
1127void
1128pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1129{
1130	pt_entry_t *endpte, oldpte, *pte;
1131
1132	oldpte = 0;
1133	pte = vtopte(sva);
1134	endpte = pte + count;
1135	while (pte < endpte) {
1136		oldpte |= *pte;
1137		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V);
1138		pte++;
1139		ma++;
1140	}
1141	if ((oldpte & PG_V) != 0)
1142		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1143		    PAGE_SIZE);
1144}
1145
1146/*
1147 * This routine tears out page mappings from the
1148 * kernel -- it is meant only for temporary mappings.
1149 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1150 */
1151void
1152pmap_qremove(vm_offset_t sva, int count)
1153{
1154	vm_offset_t va;
1155
1156	va = sva;
1157	while (count-- > 0) {
1158		pmap_kremove(va);
1159		va += PAGE_SIZE;
1160	}
1161	pmap_invalidate_range(kernel_pmap, sva, va);
1162}
1163
1164/***************************************************
1165 * Page table page management routines.....
1166 ***************************************************/
1167static __inline void
1168pmap_free_zero_pages(vm_page_t free)
1169{
1170	vm_page_t m;
1171
1172	while (free != NULL) {
1173		m = free;
1174		free = m->right;
1175		/* Preserve the page's PG_ZERO setting. */
1176		vm_page_free_toq(m);
1177	}
1178}
1179
1180/*
1181 * Schedule the specified unused page table page to be freed.  Specifically,
1182 * add the page to the specified list of pages that will be released to the
1183 * physical memory manager after the TLB has been updated.
1184 */
1185static __inline void
1186pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1187{
1188
1189	if (set_PG_ZERO)
1190		m->flags |= PG_ZERO;
1191	else
1192		m->flags &= ~PG_ZERO;
1193	m->right = *free;
1194	*free = m;
1195}
1196
1197/*
1198 * Inserts the specified page table page into the specified pmap's collection
1199 * of idle page table pages.  Each of a pmap's page table pages is responsible
1200 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1201 * ordered by this virtual address range.
1202 */
1203static void
1204pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1205{
1206	vm_page_t root;
1207
1208	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1209	root = pmap->pm_root;
1210	if (root == NULL) {
1211		mpte->left = NULL;
1212		mpte->right = NULL;
1213	} else {
1214		root = vm_page_splay(mpte->pindex, root);
1215		if (mpte->pindex < root->pindex) {
1216			mpte->left = root->left;
1217			mpte->right = root;
1218			root->left = NULL;
1219		} else if (mpte->pindex == root->pindex)
1220			panic("pmap_insert_pt_page: pindex already inserted");
1221		else {
1222			mpte->right = root->right;
1223			mpte->left = root;
1224			root->right = NULL;
1225		}
1226	}
1227	pmap->pm_root = mpte;
1228}
1229
1230/*
1231 * Looks for a page table page mapping the specified virtual address in the
1232 * specified pmap's collection of idle page table pages.  Returns NULL if there
1233 * is no page table page corresponding to the specified virtual address.
1234 */
1235static vm_page_t
1236pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1237{
1238	vm_page_t mpte;
1239	vm_pindex_t pindex = pmap_pde_pindex(va);
1240
1241	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1242	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1243		mpte = vm_page_splay(pindex, mpte);
1244		if ((pmap->pm_root = mpte)->pindex != pindex)
1245			mpte = NULL;
1246	}
1247	return (mpte);
1248}
1249
1250/*
1251 * Removes the specified page table page from the specified pmap's collection
1252 * of idle page table pages.  The specified page table page must be a member of
1253 * the pmap's collection.
1254 */
1255static void
1256pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1257{
1258	vm_page_t root;
1259
1260	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1261	if (mpte != pmap->pm_root) {
1262		root = vm_page_splay(mpte->pindex, pmap->pm_root);
1263		KASSERT(mpte == root,
1264		    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
1265		    mpte, pmap));
1266	}
1267	if (mpte->left == NULL)
1268		root = mpte->right;
1269	else {
1270		root = vm_page_splay(mpte->pindex, mpte->left);
1271		root->right = mpte->right;
1272	}
1273	pmap->pm_root = root;
1274}
1275
1276/*
1277 * This routine drops a page table page's wire count; once the count
1278 * reaches zero, the page is unmapped and freed.
1279 */
1280static __inline int
1281pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1282{
1283
1284	--m->wire_count;
1285	if (m->wire_count == 0)
1286		return _pmap_unwire_pte_hold(pmap, va, m, free);
1287	else
1288		return 0;
1289}
1290
1291static int
1292_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
1293    vm_page_t *free)
1294{
1295	vm_offset_t pteva;
1296
1297	/*
1298	 * unmap the page table page
1299	 */
1300	if (m->pindex >= (NUPDE + NUPDPE)) {
1301		/* PDP page */
1302		pml4_entry_t *pml4;
1303		pml4 = pmap_pml4e(pmap, va);
1304		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
1305		*pml4 = 0;
1306	} else if (m->pindex >= NUPDE) {
1307		/* PD page */
1308		pdp_entry_t *pdp;
1309		pdp = pmap_pdpe(pmap, va);
1310		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
1311		*pdp = 0;
1312	} else {
1313		/* PTE page */
1314		pd_entry_t *pd;
1315		pd = pmap_pde(pmap, va);
1316		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
1317		*pd = 0;
1318	}
1319	--pmap->pm_stats.resident_count;
1320	if (m->pindex < NUPDE) {
1321		/* We just released a PT, unhold the matching PD */
1322		vm_page_t pdpg;
1323
1324		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1325		pmap_unwire_pte_hold(pmap, va, pdpg, free);
1326	}
1327	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1328		/* We just released a PD, unhold the matching PDP */
1329		vm_page_t pdppg;
1330
1331		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1332		pmap_unwire_pte_hold(pmap, va, pdppg, free);
1333	}
1334
1335	/*
1336	 * This is a release store so that the ordinary store unmapping
1337	 * the page table page is globally performed before TLB shoot-
1338	 * down is begun.
1339	 */
1340	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1341
1342	/*
1343	 * Do an invltlb to make the invalidated mapping
1344	 * take effect immediately.
1345	 */
1346	pmap_invalidate_page(pmap, pteva);
1347
1348	/*
1349	 * Put page on a list so that it is released after
1350	 * *ALL* TLB shootdown is done
1351	 */
1352	pmap_add_delayed_free_list(m, free, TRUE);
1353
1354	return 1;
1355}
1356
1357/*
1358 * After removing a page table entry, this routine is used to
1359 * conditionally free the page, and manage the hold/wire counts.
1360 */
1361static int
1362pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1363{
1364	vm_page_t mpte;
1365
1366	if (va >= VM_MAXUSER_ADDRESS)
1367		return 0;
1368	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1369	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1370	return pmap_unwire_pte_hold(pmap, va, mpte, free);
1371}
1372
1373void
1374pmap_pinit0(pmap_t pmap)
1375{
1376
1377	PMAP_LOCK_INIT(pmap);
1378	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1379	pmap->pm_root = NULL;
1380	pmap->pm_active = 0;
1381	TAILQ_INIT(&pmap->pm_pvchunk);
1382	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1383}
1384
1385/*
1386 * Initialize a preallocated and zeroed pmap structure,
1387 * such as one in a vmspace structure.
1388 */
1389int
1390pmap_pinit(pmap_t pmap)
1391{
1392	vm_page_t pml4pg;
1393	static vm_pindex_t color;
1394
1395	PMAP_LOCK_INIT(pmap);
1396
1397	/*
1398	 * allocate the page directory page
1399	 */
1400	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1401	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1402		VM_WAIT;
1403
1404	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1405
1406	if ((pml4pg->flags & PG_ZERO) == 0)
1407		pagezero(pmap->pm_pml4);
1408
1409	/* Wire in kernel global address entries. */
1410	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1411	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1412
1413	/* install self-referential address mapping entry(s) */
1414	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1415
1416	pmap->pm_root = NULL;
1417	pmap->pm_active = 0;
1418	TAILQ_INIT(&pmap->pm_pvchunk);
1419	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1420
1421	return (1);
1422}
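/*
 * A note on the wiring above (descriptive): every user pmap shares the
 * kernel's KPDP and DMPDP pages by copying the KPML4I and DMPML4I
 * entries, so kernel text/data and the direct map are visible in every
 * address space, while the PML4PML4I entry points at this pmap's own
 * PML4 page to give it a private recursive mapping.
 */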
1423
1424/*
1425 * this routine is called if the page table page is not
1426 * mapped correctly.
1427 *
1428 * Note: If a page allocation fails at page table level two or three,
1429 * one or two pages may be held during the wait, only to be released
1430 * afterwards.  This conservative approach is easily argued to avoid
1431 * race conditions.
1432 */
1433static vm_page_t
1434_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1435{
1436	vm_page_t m, pdppg, pdpg;
1437
1438	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1439	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1440	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1441
1442	/*
1443	 * Allocate a page table page.
1444	 */
1445	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1446	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1447		if (flags & M_WAITOK) {
1448			PMAP_UNLOCK(pmap);
1449			vm_page_unlock_queues();
1450			VM_WAIT;
1451			vm_page_lock_queues();
1452			PMAP_LOCK(pmap);
1453		}
1454
1455		/*
1456		 * Indicate the need to retry.  While waiting, the page table
1457		 * page may have been allocated.
1458		 */
1459		return (NULL);
1460	}
1461	if ((m->flags & PG_ZERO) == 0)
1462		pmap_zero_page(m);
1463
1464	/*
1465	 * Map the pagetable page into the process address space, if
1466	 * it isn't already there.
1467	 */
1468
1469	pmap->pm_stats.resident_count++;
1470
1471	if (ptepindex >= (NUPDE + NUPDPE)) {
1472		pml4_entry_t *pml4;
1473		vm_pindex_t pml4index;
1474
1475		/* Wire up a new PDPE page */
1476		pml4index = ptepindex - (NUPDE + NUPDPE);
1477		pml4 = &pmap->pm_pml4[pml4index];
1478		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1479
1480	} else if (ptepindex >= NUPDE) {
1481		vm_pindex_t pml4index;
1482		vm_pindex_t pdpindex;
1483		pml4_entry_t *pml4;
1484		pdp_entry_t *pdp;
1485
1486		/* Wire up a new PDE page */
1487		pdpindex = ptepindex - NUPDE;
1488		pml4index = pdpindex >> NPML4EPGSHIFT;
1489
1490		pml4 = &pmap->pm_pml4[pml4index];
1491		if ((*pml4 & PG_V) == 0) {
1492			/* Have to allocate a new pdp, recurse */
1493			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1494			    flags) == NULL) {
1495				--m->wire_count;
1496				vm_page_free(m);
1497				return (NULL);
1498			}
1499		} else {
1500			/* Add reference to pdp page */
1501			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1502			pdppg->wire_count++;
1503		}
1504		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1505
1506		/* Now find the pdp page */
1507		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1508		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1509
1510	} else {
1511		vm_pindex_t pml4index;
1512		vm_pindex_t pdpindex;
1513		pml4_entry_t *pml4;
1514		pdp_entry_t *pdp;
1515		pd_entry_t *pd;
1516
1517		/* Wire up a new PTE page */
1518		pdpindex = ptepindex >> NPDPEPGSHIFT;
1519		pml4index = pdpindex >> NPML4EPGSHIFT;
1520
1521		/* First, find the pdp and check that it's valid. */
1522		pml4 = &pmap->pm_pml4[pml4index];
1523		if ((*pml4 & PG_V) == 0) {
1524			/* Have to allocate a new pd, recurse */
1525			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1526			    flags) == NULL) {
1527				--m->wire_count;
1528				vm_page_free(m);
1529				return (NULL);
1530			}
1531			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1532			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1533		} else {
1534			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1535			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1536			if ((*pdp & PG_V) == 0) {
1537				/* Have to allocate a new pd, recurse */
1538				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1539				    flags) == NULL) {
1540					--m->wire_count;
1541					vm_page_free(m);
1542					return (NULL);
1543				}
1544			} else {
1545				/* Add reference to the pd page */
1546				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1547				pdpg->wire_count++;
1548			}
1549		}
1550		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1551
1552		/* Now we know where the page directory page is */
1553		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1554		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1555	}
1556
1557	return m;
1558}
1559
1560static vm_page_t
1561pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1562{
1563	vm_pindex_t pdpindex, ptepindex;
1564	pdp_entry_t *pdpe;
1565	vm_page_t pdpg;
1566
1567	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1568	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1569	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1570retry:
1571	pdpe = pmap_pdpe(pmap, va);
1572	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1573		/* Add a reference to the pd page. */
1574		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1575		pdpg->wire_count++;
1576	} else {
1577		/* Allocate a pd page. */
1578		ptepindex = pmap_pde_pindex(va);
1579		pdpindex = ptepindex >> NPDPEPGSHIFT;
1580		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1581		if (pdpg == NULL && (flags & M_WAITOK))
1582			goto retry;
1583	}
1584	return (pdpg);
1585}
1586
1587static vm_page_t
1588pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1589{
1590	vm_pindex_t ptepindex;
1591	pd_entry_t *pd;
1592	vm_page_t m;
1593
1594	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1595	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1596	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1597
1598	/*
1599	 * Calculate pagetable page index
1600	 */
1601	ptepindex = pmap_pde_pindex(va);
1602retry:
1603	/*
1604	 * Get the page directory entry
1605	 */
1606	pd = pmap_pde(pmap, va);
1607
1608	/*
1609	 * This supports switching from a 2MB page to a
1610	 * normal 4K page.
1611	 */
1612	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1613		if (!pmap_demote_pde(pmap, pd, va)) {
1614			/*
1615			 * Invalidation of the 2MB page mapping may have caused
1616			 * the deallocation of the underlying PD page.
1617			 */
1618			pd = NULL;
1619		}
1620	}
1621
1622	/*
1623	 * If the page table page is mapped, we just increment the
1624	 * hold count, and activate it.
1625	 */
1626	if (pd != NULL && (*pd & PG_V) != 0) {
1627		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1628		m->wire_count++;
1629	} else {
1630		/*
1631		 * Here if the pte page isn't mapped, or if it has been
1632		 * deallocated.
1633		 */
1634		m = _pmap_allocpte(pmap, ptepindex, flags);
1635		if (m == NULL && (flags & M_WAITOK))
1636			goto retry;
1637	}
1638	return (m);
1639}
1640
1641
1642/***************************************************
1643 * Pmap allocation/deallocation routines.
1644 ***************************************************/
1645
1646/*
1647 * Release any resources held by the given physical map.
1648 * Called when a pmap initialized by pmap_pinit is being released.
1649 * Should only be called if the map contains no valid mappings.
1650 */
1651void
1652pmap_release(pmap_t pmap)
1653{
1654	vm_page_t m;
1655
1656	KASSERT(pmap->pm_stats.resident_count == 0,
1657	    ("pmap_release: pmap resident count %ld != 0",
1658	    pmap->pm_stats.resident_count));
1659	KASSERT(pmap->pm_root == NULL,
1660	    ("pmap_release: pmap has reserved page table page(s)"));
1661
1662	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1663
1664	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1665	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1666	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1667
1668	m->wire_count--;
1669	atomic_subtract_int(&cnt.v_wire_count, 1);
1670	vm_page_free_zero(m);
1671	PMAP_LOCK_DESTROY(pmap);
1672}
1673
1674static int
1675kvm_size(SYSCTL_HANDLER_ARGS)
1676{
1677	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1678
1679	return sysctl_handle_long(oidp, &ksize, 0, req);
1680}
1681SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1682    0, 0, kvm_size, "LU", "Size of KVM");
1683
1684static int
1685kvm_free(SYSCTL_HANDLER_ARGS)
1686{
1687	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1688
1689	return sysctl_handle_long(oidp, &kfree, 0, req);
1690}
1691SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1692    0, 0, kvm_free, "LU", "Amount of KVM free");
1693
1694/*
1695 * grow the number of kernel page table entries, if needed
1696 */
1697void
1698pmap_growkernel(vm_offset_t addr)
1699{
1700	vm_paddr_t paddr;
1701	vm_page_t nkpg;
1702	pd_entry_t *pde, newpdir;
1703	pdp_entry_t newpdp;
1704
1705	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1706	if (kernel_vm_end == 0) {
1707		kernel_vm_end = KERNBASE;
1708		nkpt = 0;
1709		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1710			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1711			nkpt++;
1712			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1713				kernel_vm_end = kernel_map->max_offset;
1714				break;
1715			}
1716		}
1717	}
1718	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1719	if (addr - 1 >= kernel_map->max_offset)
1720		addr = kernel_map->max_offset;
1721	while (kernel_vm_end < addr) {
1722		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1723		if (pde == NULL) {
1724			/* We need a new PDP entry */
1725			nkpg = vm_page_alloc(NULL, nkpt,
1726			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1727			if (nkpg == NULL)
1728				panic("pmap_growkernel: no memory to grow kernel");
1729			pmap_zero_page(nkpg);
1730			paddr = VM_PAGE_TO_PHYS(nkpg);
1731			newpdp = (pdp_entry_t)
1732				(paddr | PG_V | PG_RW | PG_A | PG_M);
1733			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1734			continue; /* try again */
1735		}
1736		if ((*pde & PG_V) != 0) {
1737			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1738			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1739				kernel_vm_end = kernel_map->max_offset;
1740				break;
1741			}
1742			continue;
1743		}
1744
1745		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
1746		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1747		if (nkpg == NULL)
1748			panic("pmap_growkernel: no memory to grow kernel");
1749
1750		nkpt++;
1751
1752		pmap_zero_page(nkpg);
1753		paddr = VM_PAGE_TO_PHYS(nkpg);
1754		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1755		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1756
1757		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1758		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1759			kernel_vm_end = kernel_map->max_offset;
1760			break;
1761		}
1762	}
1763}
1764
1765
1766/***************************************************
1767 * page management routines.
1768 ***************************************************/
1769
1770CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1771CTASSERT(_NPCM == 3);
1772CTASSERT(_NPCPV == 168);
1773
1774static __inline struct pv_chunk *
1775pv_to_chunk(pv_entry_t pv)
1776{
1777
1778	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1779}
1780
1781#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1782
1783#define	PC_FREE0	0xfffffffffffffffful
1784#define	PC_FREE1	0xfffffffffffffffful
1785#define	PC_FREE2	0x000000fffffffffful
1786
1787static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
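/*
 * A quick sanity check on the constants above (illustrative arithmetic):
 * a chunk carries _NPCM == 3 64-bit bitmap words, i.e. 192 bits, but only
 * _NPCPV == 168 pv entries fit in the page, so the last 192 - 168 == 24
 * bits are never used.  Hence PC_FREE2 has only its low 40 bits set
 * (0x000000fffffffffful) while PC_FREE0 and PC_FREE1 are all ones.
 */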
1788
1789SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1790	"Current number of pv entries");
1791
1792#ifdef PV_STATS
1793static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1794
1795SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1796	"Current number of pv entry chunks");
1797SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1798	"Current number of pv entry chunks allocated");
1799SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1800	"Total number of pv entry chunk frees");
1801SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1802	"Number of times tried to get a chunk page but failed.");
1803
1804static long pv_entry_frees, pv_entry_allocs;
1805static int pv_entry_spare;
1806
1807SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1808	"Current number of pv entry frees");
1809SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1810	"Current number of pv entry allocs");
1811SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1812	"Current number of spare pv entries");
1813
1814static int pmap_collect_inactive, pmap_collect_active;
1815
1816SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1817	"Number of times pmap_collect was called on the inactive queue");
1818SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1819	"Number of times pmap_collect was called on the active queue");
1820#endif
1821
1822/*
1823 * We are in a serious low memory condition.  Resort to
1824 * drastic measures to free some pages so we can allocate
1825 * another pv entry chunk.  This is normally called to
1826 * unmap inactive pages, and if necessary, active pages.
1827 *
1828 * We do not, however, unmap 2mpages because subsequent accesses will
1829 * allocate per-page pv entries until repromotion occurs, thereby
1830 * exacerbating the shortage of free pv entries.
1831 */
1832static void
1833pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1834{
1835	struct md_page *pvh;
1836	pd_entry_t *pde;
1837	pmap_t pmap;
1838	pt_entry_t *pte, tpte;
1839	pv_entry_t next_pv, pv;
1840	vm_offset_t va;
1841	vm_page_t m, free;
1842
1843	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1844		if (m->hold_count || m->busy)
1845			continue;
1846		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1847			va = pv->pv_va;
1848			pmap = PV_PMAP(pv);
1849			/* Avoid deadlock and lock recursion. */
1850			if (pmap > locked_pmap)
1851				PMAP_LOCK(pmap);
1852			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1853				continue;
1854			pmap->pm_stats.resident_count--;
1855			pde = pmap_pde(pmap, va);
1856			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
1857			    " a 2mpage in page %p's pv list", m));
1858			pte = pmap_pde_to_pte(pde, va);
1859			tpte = pte_load_clear(pte);
1860			KASSERT((tpte & PG_W) == 0,
1861			    ("pmap_collect: wired pte %#lx", tpte));
1862			if (tpte & PG_A)
1863				vm_page_flag_set(m, PG_REFERENCED);
1864			if (tpte & PG_M) {
1865				KASSERT((tpte & PG_RW),
1866	("pmap_collect: modified page not writable: va: %#lx, pte: %#lx",
1867				    va, tpte));
1868				vm_page_dirty(m);
1869			}
1870			free = NULL;
1871			pmap_unuse_pt(pmap, va, *pde, &free);
1872			pmap_invalidate_page(pmap, va);
1873			pmap_free_zero_pages(free);
1874			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1875			if (TAILQ_EMPTY(&m->md.pv_list)) {
1876				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1877				if (TAILQ_EMPTY(&pvh->pv_list))
1878					vm_page_flag_clear(m, PG_WRITEABLE);
1879			}
1880			free_pv_entry(pmap, pv);
1881			if (pmap != locked_pmap)
1882				PMAP_UNLOCK(pmap);
1883		}
1884	}
1885}
1886
1887
1888/*
1889 * free the pv_entry back to the free list
1890 */
1891static void
1892free_pv_entry(pmap_t pmap, pv_entry_t pv)
1893{
1894	vm_page_t m;
1895	struct pv_chunk *pc;
1896	int idx, field, bit;
1897
1898	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1899	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1900	PV_STAT(pv_entry_frees++);
1901	PV_STAT(pv_entry_spare++);
1902	pv_entry_count--;
1903	pc = pv_to_chunk(pv);
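	/*
	 * Compute this entry's index within its chunk and set the
	 * corresponding bit in the chunk's free bitmap (64 entries per
	 * bitmap word).
	 */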
1904	idx = pv - &pc->pc_pventry[0];
1905	field = idx / 64;
1906	bit = idx % 64;
1907	pc->pc_map[field] |= 1ul << bit;
1908	/* move to head of list */
1909	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1910	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1911	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1912	    pc->pc_map[2] != PC_FREE2)
1913		return;
1914	PV_STAT(pv_entry_spare -= _NPCPV);
1915	PV_STAT(pc_chunk_count--);
1916	PV_STAT(pc_chunk_frees++);
1917	/* entire chunk is free, return it */
1918	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1919	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1920	dump_drop_page(m->phys_addr);
1921	vm_page_unwire(m, 0);
1922	vm_page_free(m);
1923}
1924
1925/*
1926 * get a new pv_entry, allocating a block from the system
1927 * when needed.
1928 */
1929static pv_entry_t
1930get_pv_entry(pmap_t pmap, int try)
1931{
1932	static const struct timeval printinterval = { 60, 0 };
1933	static struct timeval lastprint;
1934	static vm_pindex_t colour;
1935	struct vpgqueues *pq;
1936	int bit, field;
1937	pv_entry_t pv;
1938	struct pv_chunk *pc;
1939	vm_page_t m;
1940
1941	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1942	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1943	PV_STAT(pv_entry_allocs++);
1944	pv_entry_count++;
1945	if (pv_entry_count > pv_entry_high_water)
1946		if (ratecheck(&lastprint, &printinterval))
1947			printf("Approaching the limit on PV entries, consider "
1948			    "increasing either the vm.pmap.shpgperproc or the "
1949			    "vm.pmap.pv_entry_max sysctl.\n");
1950	pq = NULL;
1951retry:
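	/*
	 * First look for a free slot in a chunk already assigned to this
	 * pmap.  bsfq() returns the index of the lowest set bit, i.e. the
	 * lowest-numbered free entry in that bitmap word.
	 */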
1952	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1953	if (pc != NULL) {
1954		for (field = 0; field < _NPCM; field++) {
1955			if (pc->pc_map[field]) {
1956				bit = bsfq(pc->pc_map[field]);
1957				break;
1958			}
1959		}
1960		if (field < _NPCM) {
1961			pv = &pc->pc_pventry[field * 64 + bit];
1962			pc->pc_map[field] &= ~(1ul << bit);
1963			/* If this was the last item, move it to tail */
1964			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1965			    pc->pc_map[2] == 0) {
1966				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1967				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1968			}
1969			PV_STAT(pv_entry_spare--);
1970			return (pv);
1971		}
1972	}
1973	/* No free items, allocate another chunk */
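	/*
	 * Once both the inactive and the active queue have been reclaimed
	 * from (pq has advanced to PQ_ACTIVE), allocate from the system
	 * reserve rather than retrying a normal allocation.
	 */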
1974	m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
1975	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
1976	    VM_ALLOC_WIRED);
1977	if (m == NULL) {
1978		if (try) {
1979			pv_entry_count--;
1980			PV_STAT(pc_chunk_tryfail++);
1981			return (NULL);
1982		}
1983		/*
1984		 * Reclaim pv entries: At first, destroy mappings to inactive
1985		 * pages.  After that, if a pv chunk entry is still needed,
1986		 * destroy mappings to active pages.
1987		 */
1988		if (pq == NULL) {
1989			PV_STAT(pmap_collect_inactive++);
1990			pq = &vm_page_queues[PQ_INACTIVE];
1991		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
1992			PV_STAT(pmap_collect_active++);
1993			pq = &vm_page_queues[PQ_ACTIVE];
1994		} else
1995			panic("get_pv_entry: increase vm.pmap.shpgperproc");
1996		pmap_collect(pmap, pq);
1997		goto retry;
1998	}
1999	PV_STAT(pc_chunk_count++);
2000	PV_STAT(pc_chunk_allocs++);
2001	colour++;
2002	dump_add_page(m->phys_addr);
2003	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2004	pc->pc_pmap = pmap;
2005	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2006	pc->pc_map[1] = PC_FREE1;
2007	pc->pc_map[2] = PC_FREE2;
2008	pv = &pc->pc_pventry[0];
2009	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2010	PV_STAT(pv_entry_spare += _NPCPV - 1);
2011	return (pv);
2012}
2013
2014/*
2015 * First find and then remove the pv entry for the specified pmap and virtual
2016 * address from the specified pv list.  Returns the pv entry if found and NULL
2017 * otherwise.  This operation can be performed on pv lists for either 4KB or
2018 * 2MB page mappings.
2019 */
2020static __inline pv_entry_t
2021pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2022{
2023	pv_entry_t pv;
2024
2025	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2026	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2027		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2028			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2029			break;
2030		}
2031	}
2032	return (pv);
2033}
2034
2035/*
2036 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2037 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2038 * entries for each of the 4KB page mappings.
2039 */
2040static void
2041pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2042{
2043	struct md_page *pvh;
2044	pv_entry_t pv;
2045	vm_offset_t va_last;
2046	vm_page_t m;
2047
2048	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2049	KASSERT((pa & PDRMASK) == 0,
2050	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
2051
2052	/*
2053	 * Transfer the 2mpage's pv entry for this mapping to the first
2054	 * page's pv list.
2055	 */
2056	pvh = pa_to_pvh(pa);
2057	va = trunc_2mpage(va);
2058	pv = pmap_pvh_remove(pvh, pmap, va);
2059	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2060	m = PHYS_TO_VM_PAGE(pa);
2061	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2062	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2063	va_last = va + NBPDR - PAGE_SIZE;
2064	do {
2065		m++;
2066		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2067		    ("pmap_pv_demote_pde: page %p is not managed", m));
2068		va += PAGE_SIZE;
2069		pmap_insert_entry(pmap, va, m);
2070	} while (va < va_last);
2071}
2072
2073/*
2074 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2075 * replace the many pv entries for the 4KB page mappings by a single pv entry
2076 * for the 2MB page mapping.
2077 */
2078static void
2079pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2080{
2081	struct md_page *pvh;
2082	pv_entry_t pv;
2083	vm_offset_t va_last;
2084	vm_page_t m;
2085
2086	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2087	KASSERT((pa & PDRMASK) == 0,
2088	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
2089
2090	/*
2091	 * Transfer the first page's pv entry for this mapping to the
2092	 * 2mpage's pv list.  Aside from avoiding the cost of a call
2093	 * to get_pv_entry(), a transfer avoids the possibility that
2094	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2095	 * removes one of the mappings that is being promoted.
2096	 */
2097	m = PHYS_TO_VM_PAGE(pa);
2098	va = trunc_2mpage(va);
2099	pv = pmap_pvh_remove(&m->md, pmap, va);
2100	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2101	pvh = pa_to_pvh(pa);
2102	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2103	/* Free the remaining NPTEPG - 1 pv entries. */
2104	va_last = va + NBPDR - PAGE_SIZE;
2105	do {
2106		m++;
2107		va += PAGE_SIZE;
2108		pmap_pvh_free(&m->md, pmap, va);
2109	} while (va < va_last);
2110}
2111
2112/*
2113 * First find and then destroy the pv entry for the specified pmap and virtual
2114 * address.  This operation can be performed on pv lists for either 4KB or 2MB
2115 * page mappings.
2116 */
2117static void
2118pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2119{
2120	pv_entry_t pv;
2121
2122	pv = pmap_pvh_remove(pvh, pmap, va);
2123	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2124	free_pv_entry(pmap, pv);
2125}
2126
2127static void
2128pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2129{
2130	struct md_page *pvh;
2131
2132	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2133	pmap_pvh_free(&m->md, pmap, va);
2134	if (TAILQ_EMPTY(&m->md.pv_list)) {
2135		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2136		if (TAILQ_EMPTY(&pvh->pv_list))
2137			vm_page_flag_clear(m, PG_WRITEABLE);
2138	}
2139}
2140
2141/*
2142 * Create a pv entry for the page m mapped at virtual address va
2143 * in the given pmap.
2144 */
2145static void
2146pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2147{
2148	pv_entry_t pv;
2149
2150	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2151	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2152	pv = get_pv_entry(pmap, FALSE);
2153	pv->pv_va = va;
2154	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2155}
2156
2157/*
2158 * Conditionally create a pv entry.
2159 */
2160static boolean_t
2161pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2162{
2163	pv_entry_t pv;
2164
2165	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2166	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2167	if (pv_entry_count < pv_entry_high_water &&
2168	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2169		pv->pv_va = va;
2170		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2171		return (TRUE);
2172	} else
2173		return (FALSE);
2174}
2175
2176/*
2177 * Create the pv entry for a 2MB page mapping.
2178 */
2179static boolean_t
2180pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m)
2181{
2182	struct md_page *pvh;
2183	pv_entry_t pv;
2184
2185	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2186	if (pv_entry_count < pv_entry_high_water &&
2187	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2188		pv->pv_va = va;
2189		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2190		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2191		return (TRUE);
2192	} else
2193		return (FALSE);
2194}
2195
2196/*
2197 * Tries to demote a 2MB page mapping.
2198 */
2199static boolean_t
2200pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2201{
2202	pd_entry_t newpde, oldpde;
2203	pt_entry_t *firstpte, newpte, *pte;
2204	vm_paddr_t mptepa;
2205	vm_page_t free, mpte;
2206
2207	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2208	mpte = pmap_lookup_pt_page(pmap, va);
2209	if (mpte != NULL)
2210		pmap_remove_pt_page(pmap, mpte);
2211	else {
2212		KASSERT((*pde & PG_W) == 0,
2213		    ("pmap_demote_pde: page table page for a wired mapping"
2214		    " is missing"));
2215		free = NULL;
2216		pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
2217		pmap_invalidate_page(pmap, trunc_2mpage(va));
2218		pmap_free_zero_pages(free);
2219		CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2220		    " in pmap %p", va, pmap);
2221		return (FALSE);
2222	}
2223	mptepa = VM_PAGE_TO_PHYS(mpte);
2224	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2225	oldpde = *pde;
2226	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2227	KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V),
2228	    ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V"));
2229	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2230	    ("pmap_demote_pde: oldpde is missing PG_M"));
2231	KASSERT((oldpde & PG_PS) != 0,
2232	    ("pmap_demote_pde: oldpde is missing PG_PS"));
2233	newpte = oldpde & ~PG_PS;
2234	if ((newpte & PG_PDE_PAT) != 0)
2235		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
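	/*
	 * The PAT index bit sits in a different position in a PDE
	 * (PG_PDE_PAT) than in a PTE (PG_PTE_PAT, which overlaps PG_PS in a
	 * PDE), so the XOR above relocates it to the PTE position.
	 */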
2236
2237	/*
2238	 * If the mapping has changed attributes, update the page table
2239	 * entries.
2240	 */
2241	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2242	    ("pmap_demote_pde: firstpte and newpte map different physical"
2243	    " addresses"));
2244	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2245		for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2246			*pte = newpte;
2247			newpte += PAGE_SIZE;
2248		}
2249
2250	/*
2251	 * Demote the mapping.  This pmap is locked.  The old PDE has
2252	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2253	 * set.  Thus, there is no danger of a race with another
2254	 * processor changing the setting of PG_A and/or PG_M between
2255	 * the read above and the store below.
2256	 */
2257	pde_store(pde, newpde);
2258
2259	/*
2260	 * Invalidate a stale mapping of the page table page.
2261	 */
2262	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2263
2264	/*
2265	 * Demote the pv entry.  This depends on the earlier demotion
2266	 * of the mapping.  Specifically, the (re)creation of a per-
2267	 * page pv entry might trigger the execution of pmap_collect(),
2268	 * which might reclaim a newly (re)created per-page pv entry
2269	 * and destroy the associated mapping.  In order to destroy
2270	 * the mapping, the PDE must have already changed from mapping
2271	 * the 2mpage to referencing the page table page.
2272	 */
2273	if ((oldpde & PG_MANAGED) != 0)
2274		pmap_pv_demote_pde(pmap, va, oldpde & PG_FRAME);
2275
2276	pmap_pde_demotions++;
2277	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2278	    " in pmap %p", va, pmap);
2279	return (TRUE);
2280}
2281
2282/*
2283 * pmap_remove_pde: unmap a 2MB superpage mapping from the given pmap.
2284 */
2285static int
2286pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2287    vm_page_t *free)
2288{
2289	struct md_page *pvh;
2290	pd_entry_t oldpde;
2291	vm_offset_t eva, va;
2292	vm_page_t m, mpte;
2293
2294	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2295	KASSERT((sva & PDRMASK) == 0,
2296	    ("pmap_remove_pde: sva is not 2mpage aligned"));
2297	oldpde = pte_load_clear(pdq);
2298	if (oldpde & PG_W)
2299		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2300
2301	/*
2302	 * Machines that don't support invlpg also don't support
2303	 * PG_G.
2304	 */
2305	if (oldpde & PG_G)
2306		pmap_invalidate_page(kernel_pmap, sva);
2307	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2308	if (oldpde & PG_MANAGED) {
2309		pvh = pa_to_pvh(oldpde & PG_FRAME);
2310		pmap_pvh_free(pvh, pmap, sva);
2311		eva = sva + NBPDR;
2312		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
2313		    va < eva; va += PAGE_SIZE, m++) {
2314			if (oldpde & PG_M) {
2315				KASSERT((oldpde & PG_RW) != 0,
2316	("pmap_remove_pde: modified 2mpage not writable: va: %#lx, pde: %#lx",
2317				    va, oldpde));
2318				vm_page_dirty(m);
2319			}
2320			if (oldpde & PG_A)
2321				vm_page_flag_set(m, PG_REFERENCED);
2322			if (TAILQ_EMPTY(&m->md.pv_list) &&
2323			    TAILQ_EMPTY(&pvh->pv_list))
2324				vm_page_flag_clear(m, PG_WRITEABLE);
2325		}
2326	}
2327	if (pmap == kernel_pmap) {
2328		if (!pmap_demote_pde(pmap, pdq, sva))
2329			panic("pmap_remove_pde: failed demotion");
2330	} else {
2331		mpte = pmap_lookup_pt_page(pmap, sva);
2332		if (mpte != NULL) {
2333			pmap_remove_pt_page(pmap, mpte);
2334			KASSERT(mpte->wire_count == NPTEPG,
2335			    ("pmap_remove_pde: pte page wire count error"));
2336			mpte->wire_count = 0;
2337			pmap_add_delayed_free_list(mpte, free, FALSE);
2338			atomic_subtract_int(&cnt.v_wire_count, 1);
2339		}
2340	}
2341	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2342}
2343
2344/*
2345 * pmap_remove_pte: unmap a single 4KB page from the given pmap.
2346 */
2347static int
2348pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2349    pd_entry_t ptepde, vm_page_t *free)
2350{
2351	pt_entry_t oldpte;
2352	vm_page_t m;
2353
2354	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2355	oldpte = pte_load_clear(ptq);
2356	if (oldpte & PG_W)
2357		pmap->pm_stats.wired_count -= 1;
2358	/*
2359	 * Machines that don't support invlpg also don't support
2360	 * PG_G.
2361	 */
2362	if (oldpte & PG_G)
2363		pmap_invalidate_page(kernel_pmap, va);
2364	pmap->pm_stats.resident_count -= 1;
2365	if (oldpte & PG_MANAGED) {
2366		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2367		if (oldpte & PG_M) {
2368			KASSERT((oldpte & PG_RW),
2369	("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx",
2370			    va, oldpte));
2371			vm_page_dirty(m);
2372		}
2373		if (oldpte & PG_A)
2374			vm_page_flag_set(m, PG_REFERENCED);
2375		pmap_remove_entry(pmap, m, va);
2376	}
2377	return (pmap_unuse_pt(pmap, va, ptepde, free));
2378}
2379
2380/*
2381 * Remove a single page from a process address space
2382 */
2383static void
2384pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2385{
2386	pt_entry_t *pte;
2387
2388	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2389	if ((*pde & PG_V) == 0)
2390		return;
2391	pte = pmap_pde_to_pte(pde, va);
2392	if ((*pte & PG_V) == 0)
2393		return;
2394	pmap_remove_pte(pmap, pte, va, *pde, free);
2395	pmap_invalidate_page(pmap, va);
2396}
2397
2398/*
2399 *	Remove the given range of addresses from the specified map.
2400 *
2401 *	It is assumed that the start and end are properly
2402 *	rounded to the page size.
2403 */
2404void
2405pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2406{
2407	vm_offset_t va_next;
2408	pml4_entry_t *pml4e;
2409	pdp_entry_t *pdpe;
2410	pd_entry_t ptpaddr, *pde;
2411	pt_entry_t *pte;
2412	vm_page_t free = NULL;
2413	int anyvalid;
2414
2415	/*
2416	 * Perform an unsynchronized read of the resident count.  This is
	 * safe: a count of zero means there is nothing to remove.
2417	 */
2418	if (pmap->pm_stats.resident_count == 0)
2419		return;
2420
2421	anyvalid = 0;
2422
2423	vm_page_lock_queues();
2424	PMAP_LOCK(pmap);
2425
2426	/*
2427	 * Special case: removing a single page is a very common
2428	 * operation, so short circuit the full page table walk below
2429	 * when possible.
2430	 */
2431	if (sva + PAGE_SIZE == eva) {
2432		pde = pmap_pde(pmap, sva);
2433		if (pde && (*pde & PG_PS) == 0) {
2434			pmap_remove_page(pmap, sva, pde, &free);
2435			goto out;
2436		}
2437	}
2438
2439	for (; sva < eva; sva = va_next) {
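		/*
		 * Walk the four-level page table: an invalid PML4 entry
		 * skips a 512GB region (NBPML4), an invalid PDP entry skips
		 * a 1GB region (NBPDP), and each iteration otherwise covers
		 * one 2MB page-directory entry (NBPDR).
		 */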
2440
2441		if (pmap->pm_stats.resident_count == 0)
2442			break;
2443
2444		pml4e = pmap_pml4e(pmap, sva);
2445		if ((*pml4e & PG_V) == 0) {
2446			va_next = (sva + NBPML4) & ~PML4MASK;
2447			continue;
2448		}
2449
2450		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2451		if ((*pdpe & PG_V) == 0) {
2452			va_next = (sva + NBPDP) & ~PDPMASK;
2453			continue;
2454		}
2455
2456		/*
2457		 * Calculate index for next page table.
2458		 */
2459		va_next = (sva + NBPDR) & ~PDRMASK;
2460
2461		pde = pmap_pdpe_to_pde(pdpe, sva);
2462		ptpaddr = *pde;
2463
2464		/*
2465		 * Weed out invalid mappings.
2466		 */
2467		if (ptpaddr == 0)
2468			continue;
2469
2470		/*
2471		 * Check for large page.
2472		 */
2473		if ((ptpaddr & PG_PS) != 0) {
2474			/*
2475			 * Are we removing the entire large page?  If not,
2476			 * demote the mapping and fall through.
2477			 */
2478			if (sva + NBPDR == va_next && eva >= va_next) {
2479				/*
2480				 * The TLB entry for a PG_G mapping is
2481				 * invalidated by pmap_remove_pde().
2482				 */
2483				if ((ptpaddr & PG_G) == 0)
2484					anyvalid = 1;
2485				pmap_remove_pde(pmap, pde, sva, &free);
2486				continue;
2487			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2488				/* The large page mapping was destroyed. */
2489				continue;
2490			} else
2491				ptpaddr = *pde;
2492		}
2493
2494		/*
2495		 * Limit our scan to either the end of the va represented
2496		 * by the current page table page, or to the end of the
2497		 * range being removed.
2498		 */
2499		if (va_next > eva)
2500			va_next = eva;
2501
2502		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2503		    sva += PAGE_SIZE) {
2504			if (*pte == 0)
2505				continue;
2506
2507			/*
2508			 * The TLB entry for a PG_G mapping is invalidated
2509			 * by pmap_remove_pte().
2510			 */
2511			if ((*pte & PG_G) == 0)
2512				anyvalid = 1;
2513			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
2514				break;
2515		}
2516	}
2517out:
2518	if (anyvalid)
2519		pmap_invalidate_all(pmap);
2520	vm_page_unlock_queues();
2521	PMAP_UNLOCK(pmap);
2522	pmap_free_zero_pages(free);
2523}
2524
2525/*
2526 *	Routine:	pmap_remove_all
2527 *	Function:
2528 *		Removes this physical page from
2529 *		all physical maps in which it resides.
2530 *		Reflects back modify bits to the pager.
2531 *
2532 *	Notes:
2533 *		Original versions of this routine were very
2534 *		inefficient because they iteratively called
2535 *		pmap_remove (slow...)
2536 */
2537
2538void
2539pmap_remove_all(vm_page_t m)
2540{
2541	struct md_page *pvh;
2542	pv_entry_t pv;
2543	pmap_t pmap;
2544	pt_entry_t *pte, tpte;
2545	pd_entry_t *pde;
2546	vm_offset_t va;
2547	vm_page_t free;
2548
2549	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2550	    ("pmap_remove_all: page %p is fictitious", m));
2551	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2552	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2553	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2554		va = pv->pv_va;
2555		pmap = PV_PMAP(pv);
2556		PMAP_LOCK(pmap);
2557		pde = pmap_pde(pmap, va);
2558		(void)pmap_demote_pde(pmap, pde, va);
2559		PMAP_UNLOCK(pmap);
2560	}
2561	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2562		pmap = PV_PMAP(pv);
2563		PMAP_LOCK(pmap);
2564		pmap->pm_stats.resident_count--;
2565		pde = pmap_pde(pmap, pv->pv_va);
2566		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2567		    " a 2mpage in page %p's pv list", m));
2568		pte = pmap_pde_to_pte(pde, pv->pv_va);
2569		tpte = pte_load_clear(pte);
2570		if (tpte & PG_W)
2571			pmap->pm_stats.wired_count--;
2572		if (tpte & PG_A)
2573			vm_page_flag_set(m, PG_REFERENCED);
2574
2575		/*
2576		 * Update the vm_page_t clean and reference bits.
2577		 */
2578		if (tpte & PG_M) {
2579			KASSERT((tpte & PG_RW),
2580	("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx",
2581			    pv->pv_va, tpte));
2582			vm_page_dirty(m);
2583		}
2584		free = NULL;
2585		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
2586		pmap_invalidate_page(pmap, pv->pv_va);
2587		pmap_free_zero_pages(free);
2588		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2589		free_pv_entry(pmap, pv);
2590		PMAP_UNLOCK(pmap);
2591	}
2592	vm_page_flag_clear(m, PG_WRITEABLE);
2593}
2594
2595/*
2596 * pmap_protect_pde: apply the given protection to a 2MB page mapping.
2597 */
2598static boolean_t
2599pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2600{
2601	pd_entry_t newpde, oldpde;
2602	vm_offset_t eva, va;
2603	vm_page_t m;
2604	boolean_t anychanged;
2605
2606	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2607	KASSERT((sva & PDRMASK) == 0,
2608	    ("pmap_protect_pde: sva is not 2mpage aligned"));
2609	anychanged = FALSE;
2610retry:
2611	oldpde = newpde = *pde;
2612	if (oldpde & PG_MANAGED) {
2613		eva = sva + NBPDR;
2614		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME);
2615		    va < eva; va += PAGE_SIZE, m++) {
2616			/*
2617			 * In contrast to the analogous operation on a 4KB page
2618			 * mapping, the mapping's PG_A flag is not cleared and
2619			 * the page's PG_REFERENCED flag is not set.  The
2620			 * reason is that pmap_demote_pde() expects that a 2MB
2621			 * page mapping with a stored page table page has PG_A
2622			 * set.
2623			 */
2624			if ((oldpde & PG_M) != 0)
2625				vm_page_dirty(m);
2626		}
2627	}
2628	if ((prot & VM_PROT_WRITE) == 0)
2629		newpde &= ~(PG_RW | PG_M);
2630	if ((prot & VM_PROT_EXECUTE) == 0)
2631		newpde |= pg_nx;
2632	if (newpde != oldpde) {
2633		if (!atomic_cmpset_long(pde, oldpde, newpde))
2634			goto retry;
2635		if (oldpde & PG_G)
2636			pmap_invalidate_page(pmap, sva);
2637		else
2638			anychanged = TRUE;
2639	}
2640	return (anychanged);
2641}
2642
2643/*
2644 *	Set the physical protection on the
2645 *	specified range of this map as requested.
2646 */
2647void
2648pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2649{
2650	vm_offset_t va_next;
2651	pml4_entry_t *pml4e;
2652	pdp_entry_t *pdpe;
2653	pd_entry_t ptpaddr, *pde;
2654	pt_entry_t *pte;
2655	int anychanged;
2656
2657	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2658		pmap_remove(pmap, sva, eva);
2659		return;
2660	}
2661
2662	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2663	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2664		return;
2665
2666	anychanged = 0;
2667
2668	vm_page_lock_queues();
2669	PMAP_LOCK(pmap);
2670	for (; sva < eva; sva = va_next) {
2671
2672		pml4e = pmap_pml4e(pmap, sva);
2673		if ((*pml4e & PG_V) == 0) {
2674			va_next = (sva + NBPML4) & ~PML4MASK;
2675			continue;
2676		}
2677
2678		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2679		if ((*pdpe & PG_V) == 0) {
2680			va_next = (sva + NBPDP) & ~PDPMASK;
2681			continue;
2682		}
2683
2684		va_next = (sva + NBPDR) & ~PDRMASK;
2685
2686		pde = pmap_pdpe_to_pde(pdpe, sva);
2687		ptpaddr = *pde;
2688
2689		/*
2690		 * Weed out invalid mappings.
2691		 */
2692		if (ptpaddr == 0)
2693			continue;
2694
2695		/*
2696		 * Check for large page.
2697		 */
2698		if ((ptpaddr & PG_PS) != 0) {
2699			/*
2700			 * Are we protecting the entire large page?  If not,
2701			 * demote the mapping and fall through.
2702			 */
2703			if (sva + NBPDR == va_next && eva >= va_next) {
2704				/*
2705				 * The TLB entry for a PG_G mapping is
2706				 * invalidated by pmap_protect_pde().
2707				 */
2708				if (pmap_protect_pde(pmap, pde, sva, prot))
2709					anychanged = 1;
2710				continue;
2711			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2712				/* The large page mapping was destroyed. */
2713				continue;
2714			}
2715		}
2716
2717		if (va_next > eva)
2718			va_next = eva;
2719
2720		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2721		    sva += PAGE_SIZE) {
2722			pt_entry_t obits, pbits;
2723			vm_page_t m;
2724
2725retry:
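			/*
			 * Update the PTE with a compare-and-swap so that
			 * PG_A or PG_M bits set by the hardware between the
			 * read and the write are not lost; on failure the
			 * entry is simply reloaded and reprocessed.
			 */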
2726			obits = pbits = *pte;
2727			if ((pbits & PG_V) == 0)
2728				continue;
2729			if (pbits & PG_MANAGED) {
2730				m = NULL;
2731				if (pbits & PG_A) {
2732					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2733					vm_page_flag_set(m, PG_REFERENCED);
2734					pbits &= ~PG_A;
2735				}
2736				if ((pbits & PG_M) != 0) {
2737					if (m == NULL)
2738						m = PHYS_TO_VM_PAGE(pbits &
2739						    PG_FRAME);
2740					vm_page_dirty(m);
2741				}
2742			}
2743
2744			if ((prot & VM_PROT_WRITE) == 0)
2745				pbits &= ~(PG_RW | PG_M);
2746			if ((prot & VM_PROT_EXECUTE) == 0)
2747				pbits |= pg_nx;
2748
2749			if (pbits != obits) {
2750				if (!atomic_cmpset_long(pte, obits, pbits))
2751					goto retry;
2752				if (obits & PG_G)
2753					pmap_invalidate_page(pmap, sva);
2754				else
2755					anychanged = 1;
2756			}
2757		}
2758	}
2759	if (anychanged)
2760		pmap_invalidate_all(pmap);
2761	vm_page_unlock_queues();
2762	PMAP_UNLOCK(pmap);
2763}
2764
2765/*
2766 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2767 * single page table page to a single 2MB page mapping.  For promotion to
2768 * occur, two conditions must be met: (1) the 4KB page mappings must map
2769 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2770 * identical characteristics.
2771 */
2772static void
2773pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2774{
2775	pd_entry_t newpde;
2776	pt_entry_t *firstpte, oldpte, *pte;
2777	vm_offset_t oldpteva;
2778	vm_paddr_t pa;
2779	vm_page_t mpte;
2780
2781	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2782	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
2783	KASSERT((*firstpte & PG_V) != 0,
2784	    ("pmap_promote_pde: firstpte is missing PG_V"));
2785	if ((*firstpte & PG_A) == 0) {
2786		pmap_pde_p_failures++;
2787		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2788		    " in pmap %p", va, pmap);
2789		return;
2790	}
2791	pa = *firstpte & PG_PS_FRAME;
2792	newpde = *firstpte;
2793	if ((newpde & (PG_M | PG_RW)) == PG_RW)
2794		newpde &= ~PG_RW;
2795
2796	/*
2797	 * Check all the ptes before promotion
2798	 */
2799	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2800retry:
2801		oldpte = *pte;
2802		if ((oldpte & PG_FRAME) != pa) {
2803			pmap_pde_p_failures++;
2804			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2805			    " in pmap %p", va, pmap);
2806			return;
2807		}
2808		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2809			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
2810				goto retry;
2811			oldpte &= ~PG_RW;
2812			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
2813			    (va & ~PDRMASK);
2814			pmap_invalidate_page(pmap, oldpteva);
2815			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
2816			    " in pmap %p", oldpteva, pmap);
2817		}
2818		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2819			pmap_pde_p_failures++;
2820			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2821			    " in pmap %p", va, pmap);
2822			return;
2823		}
2824		pa += PAGE_SIZE;
2825	}
2826
2827	/*
2828	 * Save the page table page in its current state until the PDE
2829	 * mapping the superpage is demoted by pmap_demote_pde() or
2830	 * destroyed by pmap_remove_pde().
2831	 */
2832	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
2833	KASSERT(mpte >= vm_page_array &&
2834	    mpte < &vm_page_array[vm_page_array_size],
2835	    ("pmap_promote_pde: page table page is out of range"));
2836	KASSERT(mpte->pindex == pmap_pde_pindex(va),
2837	    ("pmap_promote_pde: page table page's pindex is wrong"));
2838	pmap_insert_pt_page(pmap, mpte);
2839
2840	/*
2841	 * Promote the pv entries.
2842	 */
2843	if ((newpde & PG_MANAGED) != 0)
2844		pmap_pv_promote_pde(pmap, va, newpde & PG_FRAME);
2845
2846	/*
2847	 * Propagate the PAT index to its proper position.
2848	 */
2849	if ((newpde & PG_PTE_PAT) != 0)
2850		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
2851
2852	/*
2853	 * Map the superpage.
2854	 */
2855	pde_store(pde, PG_PS | newpde);
2856
2857	pmap_pde_promotions++;
2858	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
2859	    " in pmap %p", va, pmap);
2860}
2861
2862/*
2863 *	Insert the given physical page (p) at
2864 *	the specified virtual address (v) in the
2865 *	target physical map with the protection requested.
2866 *
2867 *	If specified, the page will be wired down, meaning
2868 *	that the related pte can not be reclaimed.
2869 *
2870 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2871 *	or lose information.  That is, this routine must actually
2872 *	insert this page into the given map NOW.
2873 */
2874void
2875pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
2876    vm_prot_t prot, boolean_t wired)
2877{
2878	vm_paddr_t pa;
2879	pd_entry_t *pde;
2880	pt_entry_t *pte;
2881	vm_paddr_t opa;
2882	pt_entry_t origpte, newpte;
2883	vm_page_t mpte, om;
2884	boolean_t invlva;
2885
2886	va = trunc_page(va);
2887	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2888	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
2889	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
2890
2891	mpte = NULL;
2892
2893	vm_page_lock_queues();
2894	PMAP_LOCK(pmap);
2895
2896	/*
2897	 * In the case that a page table page is not
2898	 * resident, we are creating it here.
2899	 */
2900	if (va < VM_MAXUSER_ADDRESS) {
2901		mpte = pmap_allocpte(pmap, va, M_WAITOK);
2902	}
2903
2904	pde = pmap_pde(pmap, va);
2905	if (pde != NULL && (*pde & PG_V) != 0) {
2906		if ((*pde & PG_PS) != 0)
2907			panic("pmap_enter: attempted pmap_enter on 2MB page");
2908		pte = pmap_pde_to_pte(pde, va);
2909	} else
2910		pte = NULL;
2911
2912	/*
2913	 * A page table page should already exist here: it was either just
	 * allocated above (user addresses) or created by pmap_growkernel()
	 * (kernel addresses), so a missing entry is a fatal error.
2914	 */
2915	if (pte == NULL)
2916		panic("pmap_enter: invalid page directory va=%#lx", va);
2917
2918	pa = VM_PAGE_TO_PHYS(m);
2919	om = NULL;
2920	origpte = *pte;
2921	opa = origpte & PG_FRAME;
2922
2923	/*
2924	 * Mapping has not changed, must be protection or wiring change.
2925	 */
2926	if (origpte && (opa == pa)) {
2927		/*
2928		 * Wiring change, just update stats. We don't worry about
2929		 * wiring PT pages as they remain resident as long as there
2930		 * are valid mappings in them. Hence, if a user page is wired,
2931		 * the PT page will be also.
2932		 */
2933		if (wired && ((origpte & PG_W) == 0))
2934			pmap->pm_stats.wired_count++;
2935		else if (!wired && (origpte & PG_W))
2936			pmap->pm_stats.wired_count--;
2937
2938		/*
2939		 * Remove extra pte reference
2940		 */
2941		if (mpte)
2942			mpte->wire_count--;
2943
2944		/*
2945		 * We might be turning off write access to the page,
2946		 * so we go ahead and sense modify status.
2947		 */
2948		if (origpte & PG_MANAGED) {
2949			om = m;
2950			pa |= PG_MANAGED;
2951		}
2952		goto validate;
2953	}
2954	/*
2955	 * Mapping has changed, invalidate old range and fall through to
2956	 * handle validating new mapping.
2957	 */
2958	if (opa) {
2959		if (origpte & PG_W)
2960			pmap->pm_stats.wired_count--;
2961		if (origpte & PG_MANAGED) {
2962			om = PHYS_TO_VM_PAGE(opa);
2963			pmap_remove_entry(pmap, om, va);
2964		}
2965		if (mpte != NULL) {
2966			mpte->wire_count--;
2967			KASSERT(mpte->wire_count > 0,
2968			    ("pmap_enter: missing reference to page table page,"
2969			     " va: 0x%lx", va));
2970		}
2971	} else
2972		pmap->pm_stats.resident_count++;
2973
2974	/*
2975	 * Enter on the PV list if part of our managed memory.
2976	 */
2977	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2978		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2979		    ("pmap_enter: managed mapping within the clean submap"));
2980		pmap_insert_entry(pmap, va, m);
2981		pa |= PG_MANAGED;
2982	}
2983
2984	/*
2985	 * Increment counters
2986	 */
2987	if (wired)
2988		pmap->pm_stats.wired_count++;
2989
2990validate:
2991	/*
2992	 * Now validate mapping with desired protection/wiring.
2993	 */
2994	newpte = (pt_entry_t)(pa | PG_V);
2995	if ((prot & VM_PROT_WRITE) != 0) {
2996		newpte |= PG_RW;
2997		vm_page_flag_set(m, PG_WRITEABLE);
2998	}
2999	if ((prot & VM_PROT_EXECUTE) == 0)
3000		newpte |= pg_nx;
3001	if (wired)
3002		newpte |= PG_W;
3003	if (va < VM_MAXUSER_ADDRESS)
3004		newpte |= PG_U;
3005	if (pmap == kernel_pmap)
3006		newpte |= PG_G;
3007
3008	/*
3009	 * if the mapping or permission bits are different, we need
3010	 * to update the pte.
3011	 */
3012	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3013		newpte |= PG_A;
3014		if ((access & VM_PROT_WRITE) != 0)
3015			newpte |= PG_M;
3016		if (origpte & PG_V) {
3017			invlva = FALSE;
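			/*
			 * Decide whether the old TLB entry must be flushed:
			 * it could have been cached (PG_A or PG_M set) with
			 * a frame or permissions that the new PTE no longer
			 * grants.
			 */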
3018			origpte = pte_load_store(pte, newpte);
3019			if (origpte & PG_A) {
3020				if (origpte & PG_MANAGED)
3021					vm_page_flag_set(om, PG_REFERENCED);
3022				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
3023				    PG_NX) == 0 && (newpte & PG_NX)))
3024					invlva = TRUE;
3025			}
3026			if (origpte & PG_M) {
3027				KASSERT((origpte & PG_RW),
3028	("pmap_enter: modified page not writable: va: %#lx, pte: %#lx",
3029				    va, origpte));
3030				if ((origpte & PG_MANAGED) != 0)
3031					vm_page_dirty(om);
3032				if ((newpte & PG_RW) == 0)
3033					invlva = TRUE;
3034			}
3035			if (invlva)
3036				pmap_invalidate_page(pmap, va);
3037		} else
3038			pte_store(pte, newpte);
3039	}
3040
3041	/*
3042	 * If both the page table page and the reservation are fully
3043	 * populated, then attempt promotion.
3044	 */
3045	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3046	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3047		pmap_promote_pde(pmap, pde, va);
3048
3049	vm_page_unlock_queues();
3050	PMAP_UNLOCK(pmap);
3051}
3052
3053/*
3054 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
3055 * otherwise.  Fails if (1) a page table page cannot be allocated without
3056 * blocking, (2) a mapping already exists at the specified virtual address, or
3057 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3058 */
3059static boolean_t
3060pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3061{
3062	pd_entry_t *pde, newpde;
3063	vm_page_t free, mpde;
3064
3065	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3066	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3067	if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3068		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3069		    " in pmap %p", va, pmap);
3070		return (FALSE);
3071	}
3072	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
3073	pde = &pde[pmap_pde_index(va)];
3074	if ((*pde & PG_V) != 0) {
3075		KASSERT(mpde->wire_count > 1,
3076		    ("pmap_enter_pde: mpde's wire count is too low"));
3077		mpde->wire_count--;
3078		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3079		    " in pmap %p", va, pmap);
3080		return (FALSE);
3081	}
3082	newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V;
3083	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3084		newpde |= PG_MANAGED;
3085
3086		/*
3087		 * Create a PV entry for each of the managed pages.
3088		 */
3089		if (!pmap_pv_insert_pde(pmap, va, m)) {
3090			free = NULL;
3091			if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
3092				pmap_invalidate_page(pmap, va);
3093				pmap_free_zero_pages(free);
3094			}
3095			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3096			    " in pmap %p", va, pmap);
3097			return (FALSE);
3098		}
3099	}
3100	if ((prot & VM_PROT_EXECUTE) == 0)
3101		newpde |= pg_nx;
3102	if (va < VM_MAXUSER_ADDRESS)
3103		newpde |= PG_U;
3104
3105	/*
3106	 * Increment counters.
3107	 */
3108	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3109
3110	/*
3111	 * Map the superpage.
3112	 */
3113	pde_store(pde, newpde);
3114
3115	pmap_pde_mappings++;
3116	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3117	    " in pmap %p", va, pmap);
3118	return (TRUE);
3119}
3120
3121/*
3122 * Maps a sequence of resident pages belonging to the same object.
3123 * The sequence begins with the given page m_start.  This page is
3124 * mapped at the given virtual address start.  Each subsequent page is
3125 * mapped at a virtual address that is offset from start by the same
3126 * amount as the page is offset from m_start within the object.  The
3127 * last page in the sequence is the page with the largest offset from
3128 * m_start that can be mapped at a virtual address less than the given
3129 * virtual address end.  Not every virtual page between start and end
3130 * is mapped; only those for which a resident page exists with the
3131 * corresponding offset from m_start are mapped.
3132 */
3133void
3134pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3135    vm_page_t m_start, vm_prot_t prot)
3136{
3137	vm_offset_t va;
3138	vm_page_t m, mpte;
3139	vm_pindex_t diff, psize;
3140
3141	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3142	psize = atop(end - start);
3143	mpte = NULL;
3144	m = m_start;
3145	PMAP_LOCK(pmap);
3146	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3147		va = start + ptoa(diff);
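		/*
		 * Try a 2MB mapping when the virtual and physical addresses
		 * are both 2MB aligned, the run fits below end, superpages
		 * are enabled, and the reservation is fully populated;
		 * otherwise fall back to a 4KB mapping.
		 */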
3148		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3149		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3150		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3151		    pmap_enter_pde(pmap, va, m, prot))
3152			m = &m[NBPDR / PAGE_SIZE - 1];
3153		else
3154			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3155			    mpte);
3156		m = TAILQ_NEXT(m, listq);
3157	}
3158 	PMAP_UNLOCK(pmap);
3159}
3160
3161/*
3162 * This code makes some *MAJOR* assumptions:
3163 * 1. The current pmap and the target pmap exist.
3164 * 2. The mapping is not wired.
3165 * 3. Read access only.
3166 * 4. No page table pages.
3167 * In exchange, it is *MUCH* faster than pmap_enter...
3168 */
3169
3170void
3171pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3172{
3173
3174	PMAP_LOCK(pmap);
3175	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3176	PMAP_UNLOCK(pmap);
3177}
3178
3179static vm_page_t
3180pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3181    vm_prot_t prot, vm_page_t mpte)
3182{
3183	vm_page_t free;
3184	pt_entry_t *pte;
3185	vm_paddr_t pa;
3186
3187	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3188	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3189	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3190	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3191	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3192
3193	/*
3194	 * In the case that a page table page is not
3195	 * resident, we are creating it here.
3196	 */
3197	if (va < VM_MAXUSER_ADDRESS) {
3198		vm_pindex_t ptepindex;
3199		pd_entry_t *ptepa;
3200
3201		/*
3202		 * Calculate pagetable page index
3203		 */
3204		ptepindex = pmap_pde_pindex(va);
3205		if (mpte && (mpte->pindex == ptepindex)) {
3206			mpte->wire_count++;
3207		} else {
3208			/*
3209			 * Get the page directory entry
3210			 */
3211			ptepa = pmap_pde(pmap, va);
3212
3213			/*
3214			 * If the page table page is mapped, we just increment
3215			 * the hold count, and activate it.
3216			 */
3217			if (ptepa && (*ptepa & PG_V) != 0) {
3218				if (*ptepa & PG_PS)
3219					return (NULL);
3220				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3221				mpte->wire_count++;
3222			} else {
3223				mpte = _pmap_allocpte(pmap, ptepindex,
3224				    M_NOWAIT);
3225				if (mpte == NULL)
3226					return (mpte);
3227			}
3228		}
3229	} else {
3230		mpte = NULL;
3231	}
3232
3233	/*
3234	 * This call to vtopte makes the assumption that we are
3235	 * entering the page into the current pmap.  In order to support
3236	 * quick entry into any pmap, one would likely use pmap_pte.
3237	 * But that isn't as quick as vtopte.
3238	 */
3239	pte = vtopte(va);
3240	if (*pte) {
3241		if (mpte != NULL) {
3242			mpte->wire_count--;
3243			mpte = NULL;
3244		}
3245		return (mpte);
3246	}
3247
3248	/*
3249	 * Enter on the PV list if part of our managed memory.
3250	 */
3251	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3252	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3253		if (mpte != NULL) {
3254			free = NULL;
3255			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
3256				pmap_invalidate_page(pmap, va);
3257				pmap_free_zero_pages(free);
3258			}
3259			mpte = NULL;
3260		}
3261		return (mpte);
3262	}
3263
3264	/*
3265	 * Increment counters
3266	 */
3267	pmap->pm_stats.resident_count++;
3268
3269	pa = VM_PAGE_TO_PHYS(m);
3270	if ((prot & VM_PROT_EXECUTE) == 0)
3271		pa |= pg_nx;
3272
3273	/*
3274	 * Now validate mapping with RO protection
3275	 */
3276	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3277		pte_store(pte, pa | PG_V | PG_U);
3278	else
3279		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3280	return mpte;
3281}
3282
3283/*
3284 * Make a temporary mapping for a physical address.  This is only intended
3285 * to be used for panic dumps.
3286 */
3287void *
3288pmap_kenter_temporary(vm_paddr_t pa, int i)
3289{
3290	vm_offset_t va;
3291
3292	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3293	pmap_kenter(va, pa);
3294	invlpg(va);
3295	return ((void *)crashdumpmap);
3296}
3297
3298/*
3299 * This code maps large physical mmap regions into the
3300 * processor address space.  Note that some shortcuts
3301 * are taken, but the code works.
3302 */
3303void
3304pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
3305		    vm_object_t object, vm_pindex_t pindex,
3306		    vm_size_t size)
3307{
3308	vm_offset_t va;
3309	vm_page_t p, pdpg;
3310
3311	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3312	KASSERT(object->type == OBJT_DEVICE,
3313	    ("pmap_object_init_pt: non-device object"));
3314	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
3315		vm_page_t m[1];
3316		pd_entry_t ptepa, *pde;
3317
3318		PMAP_LOCK(pmap);
3319		pde = pmap_pde(pmap, addr);
3320		if (pde != 0 && (*pde & PG_V) != 0)
3321			goto out;
3322		PMAP_UNLOCK(pmap);
3323retry:
3324		p = vm_page_lookup(object, pindex);
3325		if (p != NULL) {
3326			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
3327				goto retry;
3328		} else {
3329			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
3330			if (p == NULL)
3331				return;
3332			m[0] = p;
3333
3334			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
3335				vm_page_lock_queues();
3336				vm_page_free(p);
3337				vm_page_unlock_queues();
3338				return;
3339			}
3340
3341			p = vm_page_lookup(object, pindex);
3342			vm_page_lock_queues();
3343			vm_page_wakeup(p);
3344			vm_page_unlock_queues();
3345		}
3346
3347		ptepa = VM_PAGE_TO_PHYS(p);
3348		if (ptepa & (NBPDR - 1))
3349			return;
3350
3351		p->valid = VM_PAGE_BITS_ALL;
3352
3353		PMAP_LOCK(pmap);
3354		for (va = addr; va < addr + size; va += NBPDR) {
3355			while ((pdpg =
3356			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3357				PMAP_UNLOCK(pmap);
3358				vm_page_lock_queues();
3359				vm_page_busy(p);
3360				vm_page_unlock_queues();
3361				VM_OBJECT_UNLOCK(object);
3362				VM_WAIT;
3363				VM_OBJECT_LOCK(object);
3364				vm_page_lock_queues();
3365				vm_page_wakeup(p);
3366				vm_page_unlock_queues();
3367				PMAP_LOCK(pmap);
3368			}
3369			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3370			pde = &pde[pmap_pde_index(va)];
3371			if ((*pde & PG_V) == 0) {
3372				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
3373				    PG_U | PG_RW | PG_V);
3374				pmap->pm_stats.resident_count +=
3375				    NBPDR / PAGE_SIZE;
3376			} else {
3377				pdpg->wire_count--;
3378				KASSERT(pdpg->wire_count > 0,
3379				    ("pmap_object_init_pt: missing reference "
3380				     "to page directory page, va: 0x%lx", va));
3381			}
3382			ptepa += NBPDR;
3383		}
3384		pmap_invalidate_all(pmap);
3385out:
3386		PMAP_UNLOCK(pmap);
3387	}
3388}
3389
3390/*
3391 *	Routine:	pmap_change_wiring
3392 *	Function:	Change the wiring attribute for a map/virtual-address
3393 *			pair.
3394 *	In/out conditions:
3395 *			The mapping must already exist in the pmap.
3396 */
3397void
3398pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3399{
3400	pd_entry_t *pde;
3401	pt_entry_t *pte;
3402	boolean_t are_queues_locked;
3403
3404	are_queues_locked = FALSE;
3405
3406	/*
3407	 * Wiring is not a hardware characteristic so there is no need to
3408	 * invalidate TLB.
3409	 */
3410retry:
3411	PMAP_LOCK(pmap);
3412	pde = pmap_pde(pmap, va);
3413	if ((*pde & PG_PS) != 0) {
3414		if (!wired != ((*pde & PG_W) == 0)) {
3415			if (!are_queues_locked) {
3416				are_queues_locked = TRUE;
3417				if (!mtx_trylock(&vm_page_queue_mtx)) {
3418					PMAP_UNLOCK(pmap);
3419					vm_page_lock_queues();
3420					goto retry;
3421				}
3422			}
3423			if (!pmap_demote_pde(pmap, pde, va))
3424				panic("pmap_change_wiring: demotion failed");
3425		} else
3426			goto out;
3427	}
3428	pte = pmap_pde_to_pte(pde, va);
3429	if (wired && (*pte & PG_W) == 0) {
3430		pmap->pm_stats.wired_count++;
3431		atomic_set_long(pte, PG_W);
3432	} else if (!wired && (*pte & PG_W) != 0) {
3433		pmap->pm_stats.wired_count--;
3434		atomic_clear_long(pte, PG_W);
3435	}
3436out:
3437	if (are_queues_locked)
3438		vm_page_unlock_queues();
3439	PMAP_UNLOCK(pmap);
3440}
3441
3442
3443
3444/*
3445 *	Copy the range specified by src_addr/len
3446 *	from the source map to the range dst_addr/len
3447 *	in the destination map.
3448 *
3449 *	This routine is only advisory and need not do anything.
3450 */
3451
3452void
3453pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3454	  vm_offset_t src_addr)
3455{
3456	vm_page_t   free;
3457	vm_offset_t addr;
3458	vm_offset_t end_addr = src_addr + len;
3459	vm_offset_t va_next;
3460
3461	if (dst_addr != src_addr)
3462		return;
3463
3464	if (!pmap_is_current(src_pmap))
3465		return;
3466
3467	vm_page_lock_queues();
3468	if (dst_pmap < src_pmap) {
3469		PMAP_LOCK(dst_pmap);
3470		PMAP_LOCK(src_pmap);
3471	} else {
3472		PMAP_LOCK(src_pmap);
3473		PMAP_LOCK(dst_pmap);
3474	}
3475	for (addr = src_addr; addr < end_addr; addr = va_next) {
3476		pt_entry_t *src_pte, *dst_pte;
3477		vm_page_t dstmpde, dstmpte, srcmpte;
3478		pml4_entry_t *pml4e;
3479		pdp_entry_t *pdpe;
3480		pd_entry_t srcptepaddr, *pde;
3481
3482		KASSERT(addr < UPT_MIN_ADDRESS,
3483		    ("pmap_copy: invalid to pmap_copy page tables"));
3484
3485		pml4e = pmap_pml4e(src_pmap, addr);
3486		if ((*pml4e & PG_V) == 0) {
3487			va_next = (addr + NBPML4) & ~PML4MASK;
3488			continue;
3489		}
3490
3491		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
3492		if ((*pdpe & PG_V) == 0) {
3493			va_next = (addr + NBPDP) & ~PDPMASK;
3494			continue;
3495		}
3496
3497		va_next = (addr + NBPDR) & ~PDRMASK;
3498
3499		pde = pmap_pdpe_to_pde(pdpe, addr);
3500		srcptepaddr = *pde;
3501		if (srcptepaddr == 0)
3502			continue;
3503
3504		if (srcptepaddr & PG_PS) {
3505			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
3506			if (dstmpde == NULL)
3507				break;
3508			pde = (pd_entry_t *)
3509			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
3510			pde = &pde[pmap_pde_index(addr)];
3511			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
3512			    pmap_pv_insert_pde(dst_pmap, addr,
3513			    PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME)))) {
3514				*pde = srcptepaddr & ~PG_W;
3515				dst_pmap->pm_stats.resident_count +=
3516				    NBPDR / PAGE_SIZE;
3517			} else
3518				dstmpde->wire_count--;
3519			continue;
3520		}
3521
3522		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3523		KASSERT(srcmpte->wire_count > 0,
3524		    ("pmap_copy: source page table page is unused"));
3525
3526		if (va_next > end_addr)
3527			va_next = end_addr;
3528
3529		src_pte = vtopte(addr);
3530		while (addr < va_next) {
3531			pt_entry_t ptetemp;
3532			ptetemp = *src_pte;
3533			/*
3534			 * Only mappings of managed pages are copied.
3535			 */
3536			if ((ptetemp & PG_MANAGED) != 0) {
3537				dstmpte = pmap_allocpte(dst_pmap, addr,
3538				    M_NOWAIT);
3539				if (dstmpte == NULL)
3540					break;
3541				dst_pte = (pt_entry_t *)
3542				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
3543				dst_pte = &dst_pte[pmap_pte_index(addr)];
3544				if (*dst_pte == 0 &&
3545				    pmap_try_insert_pv_entry(dst_pmap, addr,
3546				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3547					/*
3548					 * Clear the wired, modified, and
3549					 * accessed (referenced) bits
3550					 * during the copy.
3551					 */
3552					*dst_pte = ptetemp & ~(PG_W | PG_M |
3553					    PG_A);
3554					dst_pmap->pm_stats.resident_count++;
3555	 			} else {
3556					free = NULL;
3557					if (pmap_unwire_pte_hold(dst_pmap,
3558					    addr, dstmpte, &free)) {
3559					    	pmap_invalidate_page(dst_pmap,
3560					 	    addr);
3561				    	    	pmap_free_zero_pages(free);
3562					}
3563				}
3564				if (dstmpte->wire_count >= srcmpte->wire_count)
3565					break;
3566			}
3567			addr += PAGE_SIZE;
3568			src_pte++;
3569		}
3570	}
3571	vm_page_unlock_queues();
3572	PMAP_UNLOCK(src_pmap);
3573	PMAP_UNLOCK(dst_pmap);
3574}
3575
3576/*
3577 *	pmap_zero_page zeros the specified hardware page through the
3578 *	direct map, using pagezero() to clear its contents.
3579 */
3580void
3581pmap_zero_page(vm_page_t m)
3582{
3583	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3584
3585	pagezero((void *)va);
3586}
3587
3588/*
3589 *	pmap_zero_page_area zeros part of the specified hardware page
3590 *	through the direct map, using pagezero() or bzero().
3591 *
3592 *	off and size may not cover an area beyond a single hardware page.
3593 */
3594void
3595pmap_zero_page_area(vm_page_t m, int off, int size)
3596{
3597	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3598
3599	if (off == 0 && size == PAGE_SIZE)
3600		pagezero((void *)va);
3601	else
3602		bzero((char *)va + off, size);
3603}
3604
3605/*
3606 *	pmap_zero_page_idle zeros the specified hardware page through
3607 *	the direct map, using pagezero() to clear its contents.  This
3608 *	is intended to be called from the vm_pagezero process only and
3609 *	outside of Giant.
3610 */
3611void
3612pmap_zero_page_idle(vm_page_t m)
3613{
3614	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3615
3616	pagezero((void *)va);
3617}
3618
3619/*
3620 *	pmap_copy_page copies the specified (machine independent)
3621 *	page through the direct map, using pagecopy() to copy its
3622 *	contents.
3624 */
3625void
3626pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3627{
3628	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3629	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3630
3631	pagecopy((void *)src, (void *)dst);
3632}
3633
3634/*
3635 * Returns true if the pmap's pv is one of the first
3636 * 16 pvs linked to from this page.  This count may
3637 * be changed upwards or downwards in the future; it
3638 * is only necessary that true be returned for a small
3639 * subset of pmaps for proper page aging.
3640 */
3641boolean_t
3642pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3643{
3644	struct md_page *pvh;
3645	pv_entry_t pv;
3646	int loops = 0;
3647
3648	if (m->flags & PG_FICTITIOUS)
3649		return FALSE;
3650
3651	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3652	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3653		if (PV_PMAP(pv) == pmap) {
3654			return TRUE;
3655		}
3656		loops++;
3657		if (loops >= 16)
3658			break;
3659	}
3660	if (loops < 16) {
3661		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3662		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3663			if (PV_PMAP(pv) == pmap)
3664				return (TRUE);
3665			loops++;
3666			if (loops >= 16)
3667				break;
3668		}
3669	}
3670	return (FALSE);
3671}
3672
3673/*
3674 *	pmap_page_wired_mappings:
3675 *
3676 *	Return the number of managed mappings to the given physical page
3677 *	that are wired.
3678 */
3679int
3680pmap_page_wired_mappings(vm_page_t m)
3681{
3682	pv_entry_t pv;
3683	pt_entry_t *pte;
3684	pmap_t pmap;
3685	int count;
3686
3687	count = 0;
3688	if ((m->flags & PG_FICTITIOUS) != 0)
3689		return (count);
3690	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3691	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3692		pmap = PV_PMAP(pv);
3693		PMAP_LOCK(pmap);
3694		pte = pmap_pte(pmap, pv->pv_va);
3695		if ((*pte & PG_W) != 0)
3696			count++;
3697		PMAP_UNLOCK(pmap);
3698	}
3699	return (count);
3700}
3701
3702/*
3703 * Returns TRUE if the given page is mapped individually or as part of
3704 * a 2mpage.  Otherwise, returns FALSE.
3705 */
3706boolean_t
3707pmap_page_is_mapped(vm_page_t m)
3708{
3709	struct md_page *pvh;
3710
3711	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3712		return (FALSE);
3713	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3714	if (TAILQ_EMPTY(&m->md.pv_list)) {
3715		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3716		return (!TAILQ_EMPTY(&pvh->pv_list));
3717	} else
3718		return (TRUE);
3719}
3720
3721/*
3722 * Remove all pages from specified address space
3723 * this aids process exit speeds.  Also, this code
3724 * is special cased for current process only, but
3725 * can have the more generic (and slightly slower)
3726 * mode enabled.  This is much faster than pmap_remove
3727 * in the case of running down an entire address space.
3728 */
3729void
3730pmap_remove_pages(pmap_t pmap)
3731{
3732	pd_entry_t *pde;
3733	pt_entry_t *pte, tpte;
3734	vm_page_t free = NULL;
3735	vm_page_t m, mpte, mt;
3736	pv_entry_t pv;
3737	struct md_page *pvh;
3738	struct pv_chunk *pc, *npc;
3739	int field, idx;
3740	int64_t bit;
3741	uint64_t inuse, bitmask;
3742	int allfree;
3743
3744	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3745		printf("warning: pmap_remove_pages called with non-current pmap\n");
3746		return;
3747	}
3748	vm_page_lock_queues();
3749	PMAP_LOCK(pmap);
3750	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3751		allfree = 1;
3752		for (field = 0; field < _NPCM; field++) {
3753			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3754			while (inuse != 0) {
3755				bit = bsfq(inuse);
3756				bitmask = 1UL << bit;
3757				idx = field * 64 + bit;
3758				pv = &pc->pc_pventry[idx];
3759				inuse &= ~bitmask;
3760
3761				pde = vtopde(pv->pv_va);
3762				tpte = *pde;
3763				if ((tpte & PG_PS) != 0)
3764					pte = pde;
3765				else {
3766					pte = vtopte(pv->pv_va);
3767					tpte = *pte & ~PG_PTE_PAT;
3768				}
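				/*
				 * PG_PTE_PAT occupies the same bit as PG_PS
				 * does in a PDE, so it is masked off above to
				 * keep the PG_PS checks below from mistaking
				 * a 4KB mapping for a superpage.
				 */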
3769
3770				if (tpte == 0) {
3771					printf(
3772					    "TPTE at %p  IS ZERO @ VA %08lx\n",
3773					    pte, pv->pv_va);
3774					panic("bad pte");
3775				}
3776
3777/*
3778 * We cannot remove wired pages from a process' mapping at this time
3779 */
3780				if (tpte & PG_W) {
3781					allfree = 0;
3782					continue;
3783				}
3784
3785				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3786				KASSERT(m->phys_addr == (tpte & PG_FRAME),
3787				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3788				    m, (uintmax_t)m->phys_addr,
3789				    (uintmax_t)tpte));
3790
3791				KASSERT(m < &vm_page_array[vm_page_array_size],
3792					("pmap_remove_pages: bad tpte %#jx",
3793					(uintmax_t)tpte));
3794
3795				pte_clear(pte);
3796
3797				/*
3798				 * Update the vm_page_t clean/reference bits.
3799				 */
3800				if (tpte & PG_M) {
3801					KASSERT((tpte & PG_RW) != 0,
3802	("pmap_remove_pages: modified page not writable: va: %#lx, pte: %#lx",
3803					    pv->pv_va, tpte));
3804					if ((tpte & PG_PS) != 0) {
3805						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3806							vm_page_dirty(mt);
3807					} else
3808						vm_page_dirty(m);
3809				}
3810
3811				/* Mark free */
3812				PV_STAT(pv_entry_frees++);
3813				PV_STAT(pv_entry_spare++);
3814				pv_entry_count--;
3815				pc->pc_map[field] |= bitmask;
3816				if ((tpte & PG_PS) != 0) {
3817					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3818					pvh = pa_to_pvh(tpte & PG_FRAME);
3819					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3820					if (TAILQ_EMPTY(&pvh->pv_list)) {
3821						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3822							if (TAILQ_EMPTY(&mt->md.pv_list))
3823								vm_page_flag_clear(mt, PG_WRITEABLE);
3824					}
3825					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3826					if (mpte != NULL) {
3827						pmap_remove_pt_page(pmap, mpte);
3828						KASSERT(mpte->wire_count == NPTEPG,
3829						    ("pmap_remove_pages: pte page wire count error"));
3830						mpte->wire_count = 0;
3831						pmap_add_delayed_free_list(mpte, &free, FALSE);
3832						atomic_subtract_int(&cnt.v_wire_count, 1);
3833					}
3834					pmap_unuse_pt(pmap, pv->pv_va,
3835					    *pmap_pdpe(pmap, pv->pv_va), &free);
3836				} else {
3837					pmap->pm_stats.resident_count--;
3838					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3839					if (TAILQ_EMPTY(&m->md.pv_list)) {
3840						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3841						if (TAILQ_EMPTY(&pvh->pv_list))
3842							vm_page_flag_clear(m, PG_WRITEABLE);
3843					}
3844					pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3845				}
3846			}
3847		}
3848		if (allfree) {
3849			PV_STAT(pv_entry_spare -= _NPCPV);
3850			PV_STAT(pc_chunk_count--);
3851			PV_STAT(pc_chunk_frees++);
3852			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3853			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3854			dump_drop_page(m->phys_addr);
3855			vm_page_unwire(m, 0);
3856			vm_page_free(m);
3857		}
3858	}
3859	pmap_invalidate_all(pmap);
3860	vm_page_unlock_queues();
3861	PMAP_UNLOCK(pmap);
3862	pmap_free_zero_pages(free);
3863}
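
/*
 * The loop above scans each pv chunk with an inverted free bitmap:
 * ~pc_map[] & pc_freemask[] yields the in-use slots, bsfq() returns the
 * index of the lowest set bit, and that bit is cleared before the next
 * iteration.  A minimal sketch of the same idiom on one 64-bit word,
 * with hypothetical names:
 *
 *	uint64_t inuse = ~map_word & freemask_word;
 *	while (inuse != 0) {
 *		int bit = bsfq(inuse);
 *		process_slot(field * 64 + bit);
 *		inuse &= ~(1UL << bit);
 *	}
 *
 * Here map_word, freemask_word, field, and process_slot() stand in for
 * pc->pc_map[field], pc_freemask[field], the word index, and the
 * per-entry work done above.
 */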
3864
3865/*
3866 *	pmap_is_modified:
3867 *
3868 *	Return whether or not the specified physical page was modified
3869 *	in any physical maps.
3870 */
3871boolean_t
3872pmap_is_modified(vm_page_t m)
3873{
3874
3875	if (m->flags & PG_FICTITIOUS)
3876		return (FALSE);
3877	if (pmap_is_modified_pvh(&m->md))
3878		return (TRUE);
3879	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
3880}
3881
3882/*
3883 * Returns TRUE if any of the given mappings were used to modify
3884 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
3885 * mappings are supported.
3886 */
3887static boolean_t
3888pmap_is_modified_pvh(struct md_page *pvh)
3889{
3890	pv_entry_t pv;
3891	pt_entry_t *pte;
3892	pmap_t pmap;
3893	boolean_t rv;
3894
3895	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3896	rv = FALSE;
3897	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3898		pmap = PV_PMAP(pv);
3899		PMAP_LOCK(pmap);
3900		pte = pmap_pte(pmap, pv->pv_va);
3901		rv = (*pte & PG_M) != 0;
3902		PMAP_UNLOCK(pmap);
3903		if (rv)
3904			break;
3905	}
3906	return (rv);
3907}
3908
3909/*
3910 *	pmap_is_prefaultable:
3911 *
3912 *	Return whether or not the specified virtual address is eligible
3913 *	for prefault.
3914 */
3915boolean_t
3916pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3917{
3918	pd_entry_t *pde;
3919	pt_entry_t *pte;
3920	boolean_t rv;
3921
3922	rv = FALSE;
3923	PMAP_LOCK(pmap);
3924	pde = pmap_pde(pmap, addr);
3925	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
3926		pte = pmap_pde_to_pte(pde, addr);
3927		rv = (*pte & PG_V) == 0;
3928	}
3929	PMAP_UNLOCK(pmap);
3930	return (rv);
3931}
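
/*
 * Usage sketch (illustrative only): a fault handler could probe the
 * pages surrounding a faulting address and install speculative mappings
 * only where no valid PTE exists yet.  The loop below is hypothetical;
 * start, end, and example_prefault_one() are stand-ins for the fault
 * code's own bookkeeping.
 *
 *	for (va = start; va < end; va += PAGE_SIZE) {
 *		if (pmap_is_prefaultable(pmap, va))
 *			example_prefault_one(pmap, va);
 *	}
 */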
3932
3933/*
3934 * Clear the write and modified bits in each of the given page's mappings.
3935 */
3936void
3937pmap_remove_write(vm_page_t m)
3938{
3939	struct md_page *pvh;
3940	pmap_t pmap;
3941	pv_entry_t next_pv, pv;
3942	pd_entry_t *pde;
3943	pt_entry_t oldpte, *pte;
3944	vm_offset_t va;
3945
3946	if ((m->flags & PG_FICTITIOUS) != 0 ||
3947	    (m->flags & PG_WRITEABLE) == 0)
3948		return;
3949	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3950	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3951	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
3952		va = pv->pv_va;
3953		pmap = PV_PMAP(pv);
3954		PMAP_LOCK(pmap);
3955		pde = pmap_pde(pmap, va);
3956		if ((*pde & PG_RW) != 0)
3957			(void)pmap_demote_pde(pmap, pde, va);
3958		PMAP_UNLOCK(pmap);
3959	}
3960	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3961		pmap = PV_PMAP(pv);
3962		PMAP_LOCK(pmap);
3963		pde = pmap_pde(pmap, pv->pv_va);
3964		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
3965		    " a 2mpage in page %p's pv list", m));
3966		pte = pmap_pde_to_pte(pde, pv->pv_va);
3967retry:
3968		oldpte = *pte;
3969		if (oldpte & PG_RW) {
3970			if (!atomic_cmpset_long(pte, oldpte, oldpte &
3971			    ~(PG_RW | PG_M)))
3972				goto retry;
3973			if ((oldpte & PG_M) != 0)
3974				vm_page_dirty(m);
3975			pmap_invalidate_page(pmap, pv->pv_va);
3976		}
3977		PMAP_UNLOCK(pmap);
3978	}
3979	vm_page_flag_clear(m, PG_WRITEABLE);
3980}
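
/*
 * The "retry" loop above is the usual lock-free PTE update idiom:
 * reread the PTE, compute the new value, and retry if another CPU
 * changed the entry between the read and the compare-and-set.  An
 * equivalent formulation without the goto, shown only as a sketch and
 * omitting the dirty/invalidate bookkeeping:
 *
 *	do {
 *		oldpte = *pte;
 *		if ((oldpte & PG_RW) == 0)
 *			break;
 *	} while (!atomic_cmpset_long(pte, oldpte,
 *	    oldpte & ~(PG_RW | PG_M)));
 */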
3981
3982/*
3983 *	pmap_ts_referenced:
3984 *
3985 *	Return a count of reference bits for a page, clearing those bits.
3986 *	It is not necessary for every reference bit to be cleared, but it
3987 *	is necessary that 0 only be returned when there are truly no
3988 *	reference bits set.
3989 *
3990 *	XXX: The exact number of bits to check and clear is a matter that
3991 *	should be tested and standardized at some point in the future for
3992 *	optimal aging of shared pages.
3993 */
3994int
3995pmap_ts_referenced(vm_page_t m)
3996{
3997	struct md_page *pvh;
3998	pv_entry_t pv, pvf, pvn;
3999	pmap_t pmap;
4000	pd_entry_t oldpde, *pde;
4001	pt_entry_t *pte;
4002	vm_offset_t va;
4003	int rtval = 0;
4004
4005	if (m->flags & PG_FICTITIOUS)
4006		return (rtval);
4007	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4008	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4009	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4010		va = pv->pv_va;
4011		pmap = PV_PMAP(pv);
4012		PMAP_LOCK(pmap);
4013		pde = pmap_pde(pmap, va);
4014		oldpde = *pde;
4015		if ((oldpde & PG_A) != 0) {
4016			if (pmap_demote_pde(pmap, pde, va)) {
4017				if ((oldpde & PG_W) == 0) {
4018					/*
4019					 * Remove the mapping to a single page
4020					 * so that a subsequent access may
4021					 * repromote.  Since the underlying
4022					 * page table page is fully populated,
4023					 * this removal never frees a page
4024					 * table page.
4025					 */
4026					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4027					    PG_FRAME);
4028					pmap_remove_page(pmap, va, pde, NULL);
4029					rtval++;
4030					if (rtval > 4) {
4031						PMAP_UNLOCK(pmap);
4032						return (rtval);
4033					}
4034				}
4035			}
4036		}
4037		PMAP_UNLOCK(pmap);
4038	}
4039	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4040		pvf = pv;
4041		do {
4042			pvn = TAILQ_NEXT(pv, pv_list);
4043			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4044			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4045			pmap = PV_PMAP(pv);
4046			PMAP_LOCK(pmap);
4047			pde = pmap_pde(pmap, pv->pv_va);
4048			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4049			    " found a 2mpage in page %p's pv list", m));
4050			pte = pmap_pde_to_pte(pde, pv->pv_va);
4051			if ((*pte & PG_A) != 0) {
4052				atomic_clear_long(pte, PG_A);
4053				pmap_invalidate_page(pmap, pv->pv_va);
4054				rtval++;
4055				if (rtval > 4)
4056					pvn = NULL;
4057			}
4058			PMAP_UNLOCK(pmap);
4059		} while ((pv = pvn) != NULL && pv != pvf);
4060	}
4061	return (rtval);
4062}
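
/*
 * Usage sketch (simplified, not a quote of the page daemon): an active
 * queue scan might feed the returned count into a per-page activity
 * counter to approximate LRU aging, e.g.
 *
 *	refs = pmap_ts_referenced(m);
 *	if (refs > 0)
 *		m->act_count = min(m->act_count + refs + ACT_ADVANCE,
 *		    ACT_MAX);
 *	else if (m->act_count > 0)
 *		m->act_count--;
 *
 * where refs is a local int and ACT_ADVANCE/ACT_MAX are the usual
 * activity-count tuning constants.
 */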
4063
4064/*
4065 *	Clear the modify bits on the specified physical page.
4066 */
4067void
4068pmap_clear_modify(vm_page_t m)
4069{
4070	struct md_page *pvh;
4071	pmap_t pmap;
4072	pv_entry_t next_pv, pv;
4073	pd_entry_t oldpde, *pde;
4074	pt_entry_t oldpte, *pte;
4075	vm_offset_t va;
4076
4077	if ((m->flags & PG_FICTITIOUS) != 0)
4078		return;
4079	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4080	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4081	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4082		va = pv->pv_va;
4083		pmap = PV_PMAP(pv);
4084		PMAP_LOCK(pmap);
4085		pde = pmap_pde(pmap, va);
4086		oldpde = *pde;
4087		if ((oldpde & PG_RW) != 0) {
4088			if (pmap_demote_pde(pmap, pde, va)) {
4089				if ((oldpde & PG_W) == 0) {
4090					/*
4091					 * Write protect the mapping to a
4092					 * single page so that a subsequent
4093					 * write access may repromote.
4094					 */
4095					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4096					    PG_FRAME);
4097					pte = pmap_pde_to_pte(pde, va);
4098					oldpte = *pte;
4099					if ((oldpte & PG_V) != 0) {
4100						while (!atomic_cmpset_long(pte,
4101						    oldpte,
4102						    oldpte & ~(PG_M | PG_RW)))
4103							oldpte = *pte;
4104						vm_page_dirty(m);
4105						pmap_invalidate_page(pmap, va);
4106					}
4107				}
4108			}
4109		} else
4110			KASSERT((oldpde & PG_M) == 0,
4111			    ("pmap_clear_modify: modified page not writable"));
4112		PMAP_UNLOCK(pmap);
4113	}
4114	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4115		pmap = PV_PMAP(pv);
4116		PMAP_LOCK(pmap);
4117		pde = pmap_pde(pmap, pv->pv_va);
4118		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4119		    " a 2mpage in page %p's pv list", m));
4120		pte = pmap_pde_to_pte(pde, pv->pv_va);
4121		if (*pte & PG_M) {
4122			atomic_clear_long(pte, PG_M);
4123			pmap_invalidate_page(pmap, pv->pv_va);
4124		}
4125		PMAP_UNLOCK(pmap);
4126	}
4127}
4128
4129/*
4130 *	pmap_clear_reference:
4131 *
4132 *	Clear the reference bit on the specified physical page.
4133 */
4134void
4135pmap_clear_reference(vm_page_t m)
4136{
4137	struct md_page *pvh;
4138	pmap_t pmap;
4139	pv_entry_t next_pv, pv;
4140	pd_entry_t oldpde, *pde;
4141	pt_entry_t *pte;
4142	vm_offset_t va;
4143
4144	if ((m->flags & PG_FICTITIOUS) != 0)
4145		return;
4146	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4147	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4148	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4149		va = pv->pv_va;
4150		pmap = PV_PMAP(pv);
4151		PMAP_LOCK(pmap);
4152		pde = pmap_pde(pmap, va);
4153		oldpde = *pde;
4154		if ((oldpde & PG_A) != 0) {
4155			if (pmap_demote_pde(pmap, pde, va)) {
4156				/*
4157				 * Remove the mapping to a single page so
4158				 * that a subsequent access may repromote.
4159				 * Since the underlying page table page is
4160				 * fully populated, this removal never frees
4161				 * a page table page.
4162				 */
4163				va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME);
4164				pmap_remove_page(pmap, va, pde, NULL);
4165			}
4166		}
4167		PMAP_UNLOCK(pmap);
4168	}
4169	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4170		pmap = PV_PMAP(pv);
4171		PMAP_LOCK(pmap);
4172		pde = pmap_pde(pmap, pv->pv_va);
4173		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4174		    " a 2mpage in page %p's pv list", m));
4175		pte = pmap_pde_to_pte(pde, pv->pv_va);
4176		if (*pte & PG_A) {
4177			atomic_clear_long(pte, PG_A);
4178			pmap_invalidate_page(pmap, pv->pv_va);
4179		}
4180		PMAP_UNLOCK(pmap);
4181	}
4182}
4183
4184/*
4185 * Miscellaneous support routines follow
4186 */
4187
4188/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4189static __inline void
4190pmap_pte_attr(vm_offset_t va, int mode)
4191{
4192	pt_entry_t *pte;
4193	u_int opte, npte;
4194
4195	pte = vtopte(va);
4196
4197	/*
4198	 * The cache mode bits are all in the low 32-bits of the
4199	 * PTE, so we can just spin on updating the low 32-bits.
4200	 */
4201	do {
4202		opte = *(u_int *)pte;
4203		npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
4204		npte |= pmap_cache_bits(mode, 0);
4205	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4206}
4207
4208/* Adjust the cache mode for a 2MB page mapped via a PDE. */
4209static __inline void
4210pmap_pde_attr(vm_offset_t va, int mode)
4211{
4212	pd_entry_t *pde;
4213	u_int opde, npde;
4214
4215	pde = pmap_pde(kernel_pmap, va);
4216
4217	/*
4218	 * The cache mode bits are all in the low 32-bits of the
4219	 * PDE, so we can just spin on updating the low 32-bits.
4220	 */
4221	do {
4222		opde = *(u_int *)pde;
4223		npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT);
4224		npde |= pmap_cache_bits(mode, 1);
4225	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4226}
4227
4228/*
4229 * Map a set of physical memory pages into the kernel virtual
4230 * address space. Return a pointer to where it is mapped. This
4231 * routine is intended to be used for mapping device memory,
4232 * NOT real memory.
4233 */
4234void *
4235pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4236{
4237	vm_offset_t va, tmpva, offset;
4238
4239	/*
4240	 * If this fits within the direct map window and uses WB caching
4241	 * mode, use the direct map.
4242	 */
4243	if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK)
4244		return ((void *)PHYS_TO_DMAP(pa));
4245	offset = pa & PAGE_MASK;
4246	size = roundup(offset + size, PAGE_SIZE);
4247	va = kmem_alloc_nofault(kernel_map, size);
4248	if (!va)
4249		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4250	pa = trunc_page(pa);
4251	for (tmpva = va; size > 0; ) {
4252		pmap_kenter_attr(tmpva, pa, mode);
4253		size -= PAGE_SIZE;
4254		tmpva += PAGE_SIZE;
4255		pa += PAGE_SIZE;
4256	}
4257	pmap_invalidate_range(kernel_pmap, va, tmpva);
4258	pmap_invalidate_cache();
4259	return ((void *)(va + offset));
4260}
4261
4262void *
4263pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4264{
4265
4266	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4267}
4268
4269void *
4270pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4271{
4272
4273	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4274}
4275
4276void
4277pmap_unmapdev(vm_offset_t va, vm_size_t size)
4278{
4279	vm_offset_t base, offset, tmpva;
4280
4281	/* If pmap_mapdev() gave out a direct map region, do nothing. */
4282	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
4283		return;
4284	base = trunc_page(va);
4285	offset = va & PAGE_MASK;
4286	size = roundup(offset + size, PAGE_SIZE);
4287	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4288		pmap_kremove(tmpva);
4289	pmap_invalidate_range(kernel_pmap, va, tmpva);
4290	kmem_free(kernel_map, base, size);
4291}
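
/*
 * Usage sketch (illustrative only): a driver needing a temporary
 * uncacheable mapping of a device register window might pair these
 * routines as follows; regs_pa and REGS_SIZE are hypothetical.
 *
 *	void *regs;
 *	uint32_t status;
 *
 *	regs = pmap_mapdev(regs_pa, REGS_SIZE);
 *	status = *(volatile uint32_t *)regs;
 *	pmap_unmapdev((vm_offset_t)regs, REGS_SIZE);
 *
 * If the request falls within the direct map and asks for write-back
 * caching, pmap_mapdev_attr() simply returns a direct map address, and
 * pmap_unmapdev() recognizes such addresses and does nothing.
 */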
4292
4293int
4294pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4298{
4299	vm_offset_t base, offset, tmpva;
4300	pd_entry_t *pde;
4301	pt_entry_t *pte;
4302
4303	base = trunc_page(va);
4304	offset = va & PAGE_MASK;
4305	size = roundup(offset + size, PAGE_SIZE);
4306
4307	/* Only supported on kernel virtual addresses. */
4308	if (base <= VM_MAXUSER_ADDRESS)
4309		return (EINVAL);
4310
4311	/*
4312	 * XXX: We should support demoting 2MB pages into 4KB pages here
4313	 * when a 2MB mapping is only partially covered by the request.
4314	 */
4315	/* Pages that aren't mapped aren't supported. */
4316	for (tmpva = base; tmpva < (base + size); ) {
4317		pde = pmap_pde(kernel_pmap, tmpva);
4318		if (*pde == 0)
4319			return (EINVAL);
4320		if (*pde & PG_PS) {
4321			/* Handle 2MB pages that are completely contained. */
4322			if (size >= NBPDR) {
4323				tmpva += NBPDR;
4324				continue;
4325			}
4326			return (EINVAL);
4327		}
4328		pte = vtopte(tmpva);
4329		if (*pte == 0)
4330			return (EINVAL);
4331		tmpva += PAGE_SIZE;
4332	}
4333
4334	/*
4335	 * Ok, all the pages exist, so run through them updating their
4336	 * cache mode.
4337	 */
4338	for (tmpva = base; size > 0; ) {
4339		pde = pmap_pde(kernel_pmap, tmpva);
4340		if (*pde & PG_PS) {
4341			pmap_pde_attr(tmpva, mode);
4342			tmpva += NBPDR;
4343			size -= NBPDR;
4344		} else {
4345			pmap_pte_attr(tmpva, mode);
4346			tmpva += PAGE_SIZE;
4347			size -= PAGE_SIZE;
4348		}
4349	}
4350
4351	/*
4352	 * Flush CPU caches to make sure any data isn't cached that shouldn't
4353	 * be, etc.
4354	 */
4355	pmap_invalidate_range(kernel_pmap, base, tmpva);
4356	pmap_invalidate_cache();
4357	return (0);
4358}
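
/*
 * Usage sketch (illustrative only): after setting up a kernel buffer
 * that a device will access without snooping the caches, a caller
 * could switch the buffer's KVA range to an uncacheable mode.  buf and
 * BUF_SIZE are hypothetical.
 *
 *	int error;
 *
 *	error = pmap_change_attr((vm_offset_t)buf, BUF_SIZE,
 *	    PAT_UNCACHEABLE);
 *	if (error != 0)
 *		printf("example: pmap_change_attr failed: %d\n", error);
 */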
4359
4360/*
4361 * perform the pmap work for mincore
4362 */
4363int
4364pmap_mincore(pmap_t pmap, vm_offset_t addr)
4365{
4366	pd_entry_t *pdep;
4367	pt_entry_t pte;
4368	vm_paddr_t pa;
4369	vm_page_t m;
4370	int val = 0;
4371
4372	PMAP_LOCK(pmap);
4373	pdep = pmap_pde(pmap, addr);
4374	if (pdep != NULL && (*pdep & PG_V)) {
4375		if (*pdep & PG_PS) {
4376			KASSERT((*pdep & PG_FRAME & PDRMASK) == 0,
4377			    ("pmap_mincore: bad pde"));
4378			pte = *pdep;
4379			pa = (*pdep & PG_FRAME) | (addr & PDRMASK);
4380		} else {
4381			pte = *pmap_pde_to_pte(pdep, addr);
4382			pa = pte & PG_FRAME;
4383		}
4384	} else {
4385		pte = 0;
4386		pa = 0;
4387	}
4388	PMAP_UNLOCK(pmap);
4389
4390	if (pte != 0) {
4391		val = MINCORE_INCORE;
4392		if ((pte & PG_MANAGED) == 0)
4393			return (val);
4394
4395		m = PHYS_TO_VM_PAGE(pa);
4396
4397		/*
4398		 * Modified by us
4399		 */
4400		if (pte & PG_M)
4401			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4402		else {
4403			/*
4404			 * Modified by someone else
4405			 */
4406			vm_page_lock_queues();
4407			if (m->dirty || pmap_is_modified(m))
4408				val |= MINCORE_MODIFIED_OTHER;
4409			vm_page_unlock_queues();
4410		}
4411		/*
4412		 * Referenced by us
4413		 */
4414		if (pte & PG_A)
4415			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4416		else {
4417			/*
4418			 * Referenced by someone else
4419			 */
4420			vm_page_lock_queues();
4421			if ((m->flags & PG_REFERENCED) ||
4422			    pmap_ts_referenced(m)) {
4423				val |= MINCORE_REFERENCED_OTHER;
4424				vm_page_flag_set(m, PG_REFERENCED);
4425			}
4426			vm_page_unlock_queues();
4427		}
4428	}
4429	return (val);
4430}
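
/*
 * Usage sketch (simplified): the mincore(2) system call layer walks the
 * user address range one page at a time and records the returned flag
 * word in the per-page status vector, roughly:
 *
 *	for (va = first; va < last; va += PAGE_SIZE, vecp++)
 *		*vecp = (char)pmap_mincore(pmap, va);
 *
 * where first, last, and vecp are hypothetical locals standing in for
 * the syscall's own bookkeeping, which also copies the vector out to
 * user space and revalidates the map afterwards.
 */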
4431
4432void
4433pmap_activate(struct thread *td)
4434{
4435	pmap_t	pmap, oldpmap;
4436	u_int64_t  cr3;
4437
4438	critical_enter();
4439	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4440	oldpmap = PCPU_GET(curpmap);
4441#ifdef SMP
4442	if (oldpmap)	/* XXX FIXME */
4443		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4444	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4445#else
4446	if (oldpmap)	/* XXX FIXME */
4447		oldpmap->pm_active &= ~PCPU_GET(cpumask);
4448	pmap->pm_active |= PCPU_GET(cpumask);
4449#endif
4450	cr3 = vtophys(pmap->pm_pml4);
4451	td->td_pcb->pcb_cr3 = cr3;
4452	load_cr3(cr3);
4453	critical_exit();
4454}
4455
4456vm_offset_t
4457pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
4458{
4459
4460	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
4461		return (addr);
4462	}
4463
4464	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
4465	return (addr);
4466}
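
/*
 * Worked example of the rounding above (NBPDR is 2MB on amd64): a hint
 * of 0x201000 becomes 0x400000, so that a large device mapping starts
 * on a 2MB boundary and remains eligible for superpage mappings:
 *
 *	(0x201000 + (NBPDR - 1)) & ~(NBPDR - 1) == 0x400000
 */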
4467