pmap.c revision 207210
1145326Snyan/*-
2145326Snyan * Copyright (c) 1991 Regents of the University of California.
3145326Snyan * All rights reserved.
4145326Snyan * Copyright (c) 1994 John S. Dyson
5145326Snyan * All rights reserved.
6145326Snyan * Copyright (c) 1994 David Greenman
7145326Snyan * All rights reserved.
8145326Snyan * Copyright (c) 2003 Peter Wemm
9145326Snyan * All rights reserved.
10145326Snyan * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11145326Snyan * All rights reserved.
12145326Snyan *
13145326Snyan * This code is derived from software contributed to Berkeley by
14145326Snyan * the Systems Programming Group of the University of Utah Computer
15145326Snyan * Science Department and William Jolitz of UUNET Technologies Inc.
16145326Snyan *
17145326Snyan * Redistribution and use in source and binary forms, with or without
18145326Snyan * modification, are permitted provided that the following conditions
19145326Snyan * are met:
20145326Snyan * 1. Redistributions of source code must retain the above copyright
21145326Snyan *    notice, this list of conditions and the following disclaimer.
22145326Snyan * 2. Redistributions in binary form must reproduce the above copyright
23145326Snyan *    notice, this list of conditions and the following disclaimer in the
24145326Snyan *    documentation and/or other materials provided with the distribution.
25145326Snyan * 3. All advertising materials mentioning features or use of this software
26145326Snyan *    must display the following acknowledgement:
27145326Snyan *	This product includes software developed by the University of
28145326Snyan *	California, Berkeley and its contributors.
29145326Snyan * 4. Neither the name of the University nor the names of its contributors
30145326Snyan *    may be used to endorse or promote products derived from this software
31145326Snyan *    without specific prior written permission.
32145326Snyan *
33145326Snyan * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34145326Snyan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35145326Snyan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36145326Snyan * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37145326Snyan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38145326Snyan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39145326Snyan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40145326Snyan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41145326Snyan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42145326Snyan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43145326Snyan * SUCH DAMAGE.
44145326Snyan *
45145326Snyan *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46145326Snyan */
47145326Snyan/*-
48145326Snyan * Copyright (c) 2003 Networks Associates Technology, Inc.
49145326Snyan * All rights reserved.
50145326Snyan *
51145326Snyan * This software was developed for the FreeBSD Project by Jake Burkholder,
52145326Snyan * Safeport Network Services, and Network Associates Laboratories, the
53145326Snyan * Security Research Division of Network Associates, Inc. under
54145326Snyan * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55145326Snyan * CHATS research program.
56145326Snyan *
57145326Snyan * Redistribution and use in source and binary forms, with or without
58145326Snyan * modification, are permitted provided that the following conditions
59145326Snyan * are met:
60145326Snyan * 1. Redistributions of source code must retain the above copyright
61145326Snyan *    notice, this list of conditions and the following disclaimer.
62145326Snyan * 2. Redistributions in binary form must reproduce the above copyright
63145326Snyan *    notice, this list of conditions and the following disclaimer in the
64145326Snyan *    documentation and/or other materials provided with the distribution.
65145326Snyan *
66145326Snyan * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67145326Snyan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68145326Snyan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69145326Snyan * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70145326Snyan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71145326Snyan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72145326Snyan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73145326Snyan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74145326Snyan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75145326Snyan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76145326Snyan * SUCH DAMAGE.
77145326Snyan */
78145326Snyan
79145326Snyan#include <sys/cdefs.h>
80145326Snyan__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 207210 2010-04-25 23:18:02Z kmacy $");
81145326Snyan
82145326Snyan/*
83145326Snyan *	Manages physical address maps.
84145326Snyan *
85145326Snyan *	In addition to hardware address maps, this
86145326Snyan *	module is called upon to provide software-use-only
87145326Snyan *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidate or reduced protection
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include "opt_pmap.h"
109#include "opt_vm.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/kernel.h>
114#include <sys/ktr.h>
115#include <sys/lock.h>
116#include <sys/malloc.h>
117#include <sys/mman.h>
118#include <sys/mutex.h>
119#include <sys/proc.h>
120#include <sys/sx.h>
121#include <sys/vmmeter.h>
122#include <sys/sched.h>
123#include <sys/sysctl.h>
124#ifdef SMP
125#include <sys/smp.h>
126#endif
127
128#include <vm/vm.h>
129#include <vm/vm_param.h>
130#include <vm/vm_kern.h>
131#include <vm/vm_page.h>
132#include <vm/vm_map.h>
133#include <vm/vm_object.h>
134#include <vm/vm_extern.h>
135#include <vm/vm_pageout.h>
136#include <vm/vm_pager.h>
137#include <vm/vm_reserv.h>
138#include <vm/uma.h>
139
140#include <machine/cpu.h>
141#include <machine/cputypes.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144#include <machine/specialreg.h>
145#ifdef SMP
146#include <machine/smp.h>
147#endif
148
149#ifndef PMAP_SHPGPERPROC
150#define PMAP_SHPGPERPROC 200
151#endif
152
153#if !defined(DIAGNOSTIC)
154#ifdef __GNUC_GNU_INLINE__
155#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
156#else
157#define PMAP_INLINE	extern inline
158#endif
159#else
160#define PMAP_INLINE
161#endif
162
163#define PV_STATS
164#ifdef PV_STATS
165#define PV_STAT(x)	do { x ; } while (0)
166#else
167#define PV_STAT(x)	do { } while (0)
168#endif
169
170#define	pa_index(pa)	((pa) >> PDRSHIFT)
171#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
172
173struct pmap kernel_pmap_store;
174
175vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
176vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
177
178static int ndmpdp;
179static vm_paddr_t dmaplimit;
180vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
181pt_entry_t pg_nx;
182
183static int pat_works = 0;		/* Is page attribute table sane? */
184
185SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
186
187static int pg_ps_enabled = 1;
188SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
189    "Are large page mappings enabled?");
190
191static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
192static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
193u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
194u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
195
196static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
197static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
198
199/*
200 * Data for the pv entry allocation mechanism
201 */
202static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
203static struct md_page *pv_table;
204static int shpgperproc = PMAP_SHPGPERPROC;
205
206/*
207 * All those kernel PT submaps that BSD is so fond of
208 */
209pt_entry_t *CMAP1 = 0;
210caddr_t CADDR1 = 0;
211
212/*
213 * Crashdump maps.
214 */
215static caddr_t crashdumpmap;
216
217static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
218static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
219static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
220static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
221static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
222static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
223static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
224		    vm_offset_t va);
225static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
226
227static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
228static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
229static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
230    vm_offset_t va);
231static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
232    vm_prot_t prot);
233static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
234    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
235static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
236static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
237static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
238static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
239static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
240static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
241static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
242static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
243static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
244static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
245    vm_prot_t prot);
246static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
247static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
248		vm_page_t *free);
249static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
250		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
251static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
252static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
253    vm_page_t *free);
254static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
255		vm_offset_t va);
256static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
257static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
258    vm_page_t m);
259static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
260    pd_entry_t newpde);
261static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
262
263static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
264static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
265
266static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
267static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
268                vm_page_t* free);
269static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
270static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
271
272CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
273CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
274
275/*
276 * Move the kernel virtual free pointer to the next
277 * 2MB.  This is used to help improve performance
278 * by using a large (2MB) page for much of the kernel
279 * (.text, .data, .bss)
280 */
281static vm_offset_t
282pmap_kmem_choose(vm_offset_t addr)
283{
284	vm_offset_t newaddr = addr;
285
286	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
287	return (newaddr);
288}
289
290/********************/
291/* Inline functions */
292/********************/
293
294/* Return a non-clipped PD index for a given VA */
295static __inline vm_pindex_t
296pmap_pde_pindex(vm_offset_t va)
297{
298	return (va >> PDRSHIFT);
299}
300
301
302/* Return various clipped indexes for a given VA */
303static __inline vm_pindex_t
304pmap_pte_index(vm_offset_t va)
305{
306
307	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
308}
309
310static __inline vm_pindex_t
311pmap_pde_index(vm_offset_t va)
312{
313
314	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
315}
316
317static __inline vm_pindex_t
318pmap_pdpe_index(vm_offset_t va)
319{
320
321	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
322}
323
324static __inline vm_pindex_t
325pmap_pml4e_index(vm_offset_t va)
326{
327
328	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
329}
330
331/* Return a pointer to the PML4 slot that corresponds to a VA */
332static __inline pml4_entry_t *
333pmap_pml4e(pmap_t pmap, vm_offset_t va)
334{
335
336	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
337}
338
339/* Return a pointer to the PDP slot that corresponds to a VA */
340static __inline pdp_entry_t *
341pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
342{
343	pdp_entry_t *pdpe;
344
345	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
346	return (&pdpe[pmap_pdpe_index(va)]);
347}
348
349/* Return a pointer to the PDP slot that corresponds to a VA */
350static __inline pdp_entry_t *
351pmap_pdpe(pmap_t pmap, vm_offset_t va)
352{
353	pml4_entry_t *pml4e;
354
355	pml4e = pmap_pml4e(pmap, va);
356	if ((*pml4e & PG_V) == 0)
357		return (NULL);
358	return (pmap_pml4e_to_pdpe(pml4e, va));
359}
360
361/* Return a pointer to the PD slot that corresponds to a VA */
362static __inline pd_entry_t *
363pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
364{
365	pd_entry_t *pde;
366
367	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
368	return (&pde[pmap_pde_index(va)]);
369}
370
371/* Return a pointer to the PD slot that corresponds to a VA */
372static __inline pd_entry_t *
373pmap_pde(pmap_t pmap, vm_offset_t va)
374{
375	pdp_entry_t *pdpe;
376
377	pdpe = pmap_pdpe(pmap, va);
378	if (pdpe == NULL || (*pdpe & PG_V) == 0)
379		return (NULL);
380	return (pmap_pdpe_to_pde(pdpe, va));
381}
382
383/* Return a pointer to the PT slot that corresponds to a VA */
384static __inline pt_entry_t *
385pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
386{
387	pt_entry_t *pte;
388
389	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
390	return (&pte[pmap_pte_index(va)]);
391}
392
393/* Return a pointer to the PT slot that corresponds to a VA */
394static __inline pt_entry_t *
395pmap_pte(pmap_t pmap, vm_offset_t va)
396{
397	pd_entry_t *pde;
398
399	pde = pmap_pde(pmap, va);
400	if (pde == NULL || (*pde & PG_V) == 0)
401		return (NULL);
402	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
403		return ((pt_entry_t *)pde);
404	return (pmap_pde_to_pte(pde, va));
405}
406
407static __inline void
408pmap_resident_count_inc(pmap_t pmap, int count)
409{
410
411	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
412	pmap->pm_stats.resident_count += count;
413}
414
415static __inline void
416pmap_resident_count_dec(pmap_t pmap, int count)
417{
418
419	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
420	pmap->pm_stats.resident_count -= count;
421}
422
423PMAP_INLINE pt_entry_t *
424vtopte(vm_offset_t va)
425{
426	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
427
428	return (PTmap + ((va >> PAGE_SHIFT) & mask));
429}
430
431static __inline pd_entry_t *
432vtopde(vm_offset_t va)
433{
434	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
435
436	return (PDmap + ((va >> PDRSHIFT) & mask));
437}
438
439static u_int64_t
440allocpages(vm_paddr_t *firstaddr, int n)
441{
442	u_int64_t ret;
443
444	ret = *firstaddr;
445	bzero((void *)ret, n * PAGE_SIZE);
446	*firstaddr += n * PAGE_SIZE;
447	return (ret);
448}
449
450static void
451create_pagetables(vm_paddr_t *firstaddr)
452{
453	int i;
454
455	/* Allocate pages */
456	KPTphys = allocpages(firstaddr, NKPT);
457	KPML4phys = allocpages(firstaddr, 1);
458	KPDPphys = allocpages(firstaddr, NKPML4E);
459	KPDphys = allocpages(firstaddr, NKPDPE);
460
461	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
462	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
463		ndmpdp = 4;
464	DMPDPphys = allocpages(firstaddr, NDMPML4E);
465	if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0)
466		DMPDphys = allocpages(firstaddr, ndmpdp);
467	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
468
469	/* Fill in the underlying page table pages */
470	/* Read-only from zero to physfree */
471	/* XXX not fully used, underneath 2M pages */
472	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
473		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
474		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
475	}
476
477	/* Now map the page tables at their location within PTmap */
478	for (i = 0; i < NKPT; i++) {
479		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
480		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
481	}
482
483	/* Map from zero to end of allocations under 2M pages */
484	/* This replaces some of the KPTphys entries above */
485	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
486		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
487		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
488	}
489
490	/* And connect up the PD to the PDP */
491	for (i = 0; i < NKPDPE; i++) {
492		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
493		    (i << PAGE_SHIFT);
494		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
495	}
496
497	/* Now set up the direct map space using either 2MB or 1GB pages */
498	/* Preset PG_M and PG_A because demotion expects it */
499	if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) {
500		for (i = 0; i < NPDEPG * ndmpdp; i++) {
501			((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
502			((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
503			    PG_G | PG_M | PG_A;
504		}
505		/* And the direct map space's PDP */
506		for (i = 0; i < ndmpdp; i++) {
507			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
508			    (i << PAGE_SHIFT);
509			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
510		}
511	} else {
512		for (i = 0; i < ndmpdp; i++) {
513			((pdp_entry_t *)DMPDPphys)[i] =
514			    (vm_paddr_t)i << PDPSHIFT;
515			((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
516			    PG_G | PG_M | PG_A;
517		}
518	}
519
520	/* And recursively map PML4 to itself in order to get PTmap */
521	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
522	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
523
524	/* Connect the Direct Map slot up to the PML4 */
525	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
526	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
527
528	/* Connect the KVA slot up to the PML4 */
529	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
530	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
531}
532
533/*
534 *	Bootstrap the system enough to run with virtual memory.
535 *
536 *	On amd64 this is called after mapping has already been enabled
537 *	and just syncs the pmap module with what has already been done.
538 *	[We can't call it easily with mapping off since the kernel is not
539 *	mapped with PA == VA, hence we would have to relocate every address
540 *	from the linked base (virtual) address "KERNBASE" to the actual
541 *	(physical) address starting relative to 0]
542 */
543void
544pmap_bootstrap(vm_paddr_t *firstaddr)
545{
546	vm_offset_t va;
547	pt_entry_t *pte, *unused;
548
549	/*
550	 * Create an initial set of page tables to run the kernel in.
551	 */
552	create_pagetables(firstaddr);
553
554	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
555	virtual_avail = pmap_kmem_choose(virtual_avail);
556
557	virtual_end = VM_MAX_KERNEL_ADDRESS;
558
559
560	/* XXX do %cr0 as well */
561	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
562	load_cr3(KPML4phys);
563
564	/*
565	 * Initialize the kernel pmap (which is statically allocated).
566	 */
567	PMAP_LOCK_INIT(kernel_pmap);
568	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
569	kernel_pmap->pm_root = NULL;
570	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
571	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
572
573	/*
574	 * Reserve some special page table entries/VA space for temporary
575	 * mapping of pages.
576	 */
577#define	SYSMAP(c, p, v, n)	\
578	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
579
580	va = virtual_avail;
581	pte = vtopte(va);
582
583	/*
584	 * CMAP1 is only used for the memory test.
585	 */
586	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
587
588	/*
589	 * Crashdump maps.
590	 */
591	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
592
593	virtual_avail = va;
594
595	/* Initialize the PAT MSR. */
596	pmap_init_pat();
597}
598
599/*
600 * Setup the PAT MSR.
601 */
602void
603pmap_init_pat(void)
604{
605	uint64_t pat_msr;
606	char *sysenv;
607	static int pat_tested = 0;
608
609	/* Bail if this CPU doesn't implement PAT. */
610	if (!(cpu_feature & CPUID_PAT))
611		panic("no PAT??");
612
613	/*
614	 * Some Apple Macs based on nVidia chipsets cannot enter ACPI mode
615	 * via SMI# when we use upper 4 PAT entries for unknown reason.
616	 */
617	if (!pat_tested) {
618		pat_works = 1;
619		sysenv = getenv("smbios.system.product");
620		if (sysenv != NULL) {
621			if (strncmp(sysenv, "MacBook5,1", 10) == 0 ||
622			    strncmp(sysenv, "MacBookPro5,5", 13) == 0 ||
623			    strncmp(sysenv, "Macmini3,1", 10) == 0)
624				pat_works = 0;
625			freeenv(sysenv);
626		}
627		pat_tested = 1;
628	}
629
630	/* Initialize default PAT entries. */
631	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
632	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
633	    PAT_VALUE(2, PAT_UNCACHED) |
634	    PAT_VALUE(3, PAT_UNCACHEABLE) |
635	    PAT_VALUE(4, PAT_WRITE_BACK) |
636	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
637	    PAT_VALUE(6, PAT_UNCACHED) |
638	    PAT_VALUE(7, PAT_UNCACHEABLE);
639
640	if (pat_works) {
641		/*
642		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
643		 * Program 4 and 5 as WP and WC.
644		 * Leave 6 and 7 as UC- and UC.
645		 */
646		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
647		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
648		    PAT_VALUE(5, PAT_WRITE_COMBINING);
649	} else {
650		/*
651		 * Just replace PAT Index 2 with WC instead of UC-.
652		 */
653		pat_msr &= ~PAT_MASK(2);
654		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
655	}
656	wrmsr(MSR_PAT, pat_msr);
657}
658
659/*
660 *	Initialize a vm_page's machine-dependent fields.
661 */
662void
663pmap_page_init(vm_page_t m)
664{
665
666	TAILQ_INIT(&m->md.pv_list);
667	m->md.pat_mode = PAT_WRITE_BACK;
668}
669
670/*
671 *	Initialize the pmap module.
672 *	Called by vm_init, to initialize any structures that the pmap
673 *	system needs to map virtual memory.
674 */
675void
676pmap_init(void)
677{
678	vm_page_t mpte;
679	vm_size_t s;
680	int i, pv_npg;
681
682	/*
683	 * Initialize the vm page array entries for the kernel pmap's
684	 * page table pages.
685	 */
686	for (i = 0; i < NKPT; i++) {
687		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
688		KASSERT(mpte >= vm_page_array &&
689		    mpte < &vm_page_array[vm_page_array_size],
690		    ("pmap_init: page table page is out of range"));
691		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
692		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
693	}
694
695	/*
696	 * Initialize the address space (zone) for the pv entries.  Set a
697	 * high water mark so that the system can recover from excessive
698	 * numbers of pv entries.
699	 */
700	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
701	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
702	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
703	pv_entry_high_water = 9 * (pv_entry_max / 10);
704
705	/*
706	 * If the kernel is running in a virtual machine on an AMD Family 10h
707	 * processor, then it must assume that MCA is enabled by the virtual
708	 * machine monitor.
709	 */
710	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
711	    CPUID_TO_FAMILY(cpu_id) == 0x10)
712		workaround_erratum383 = 1;
713
714	/*
715	 * Are large page mappings enabled?
716	 */
717	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
718	if (pg_ps_enabled) {
719		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
720		    ("pmap_init: can't assign to pagesizes[1]"));
721		pagesizes[1] = NBPDR;
722	}
723
724	/*
725	 * Calculate the size of the pv head table for superpages.
726	 */
727	for (i = 0; phys_avail[i + 1]; i += 2);
728	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
729
730	/*
731	 * Allocate memory for the pv head table for superpages.
732	 */
733	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
734	s = round_page(s);
735	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
736	for (i = 0; i < pv_npg; i++)
737		TAILQ_INIT(&pv_table[i].pv_list);
738}
739
740static int
741pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
742{
743	int error;
744
745	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
746	if (error == 0 && req->newptr) {
747		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
748		pv_entry_high_water = 9 * (pv_entry_max / 10);
749	}
750	return (error);
751}
752SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
753    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
754
755static int
756pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
757{
758	int error;
759
760	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
761	if (error == 0 && req->newptr) {
762		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
763		pv_entry_high_water = 9 * (pv_entry_max / 10);
764	}
765	return (error);
766}
767SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
768    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
769
770SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
771    "2MB page mapping counters");
772
773static u_long pmap_pde_demotions;
774SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
775    &pmap_pde_demotions, 0, "2MB page demotions");
776
777static u_long pmap_pde_mappings;
778SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
779    &pmap_pde_mappings, 0, "2MB page mappings");
780
781static u_long pmap_pde_p_failures;
782SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
783    &pmap_pde_p_failures, 0, "2MB page promotion failures");
784
785static u_long pmap_pde_promotions;
786SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
787    &pmap_pde_promotions, 0, "2MB page promotions");
788
789SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
790    "1GB page mapping counters");
791
792static u_long pmap_pdpe_demotions;
793SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
794    &pmap_pdpe_demotions, 0, "1GB page demotions");
795
796
797/***************************************************
798 * Low level helper routines.....
799 ***************************************************/
800
801/*
802 * Determine the appropriate bits to set in a PTE or PDE for a specified
803 * caching mode.
804 */
805static int
806pmap_cache_bits(int mode, boolean_t is_pde)
807{
808	int pat_flag, pat_index, cache_bits;
809
810	/* The PAT bit is different for PTE's and PDE's. */
811	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
812
813	/* Map the caching mode to a PAT index. */
814	if (pat_works) {
815		switch (mode) {
816		case PAT_UNCACHEABLE:
817			pat_index = 3;
818			break;
819		case PAT_WRITE_THROUGH:
820			pat_index = 1;
821			break;
822		case PAT_WRITE_BACK:
823			pat_index = 0;
824			break;
825		case PAT_UNCACHED:
826			pat_index = 2;
827			break;
828		case PAT_WRITE_COMBINING:
829			pat_index = 5;
830			break;
831		case PAT_WRITE_PROTECTED:
832			pat_index = 4;
833			break;
834		default:
835			panic("Unknown caching mode %d\n", mode);
836		}
837	} else {
838		switch (mode) {
839		case PAT_UNCACHED:
840		case PAT_UNCACHEABLE:
841		case PAT_WRITE_PROTECTED:
842			pat_index = 3;
843			break;
844		case PAT_WRITE_THROUGH:
845			pat_index = 1;
846			break;
847		case PAT_WRITE_BACK:
848			pat_index = 0;
849			break;
850		case PAT_WRITE_COMBINING:
851			pat_index = 2;
852			break;
853		default:
854			panic("Unknown caching mode %d\n", mode);
855		}
856	}
857
858	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
859	cache_bits = 0;
860	if (pat_index & 0x4)
861		cache_bits |= pat_flag;
862	if (pat_index & 0x2)
863		cache_bits |= PG_NC_PCD;
864	if (pat_index & 0x1)
865		cache_bits |= PG_NC_PWT;
866	return (cache_bits);
867}
868
869/*
870 * After changing the page size for the specified virtual address in the page
871 * table, flush the corresponding entries from the processor's TLB.  Only the
872 * calling processor's TLB is affected.
873 *
874 * The calling thread must be pinned to a processor.
875 */
876static void
877pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
878{
879	u_long cr4;
880
881	if ((newpde & PG_PS) == 0)
882		/* Demotion: flush a specific 2MB page mapping. */
883		invlpg(va);
884	else if ((newpde & PG_G) == 0)
885		/*
886		 * Promotion: flush every 4KB page mapping from the TLB
887		 * because there are too many to flush individually.
888		 */
889		invltlb();
890	else {
891		/*
892		 * Promotion: flush every 4KB page mapping from the TLB,
893		 * including any global (PG_G) mappings.
894		 */
895		cr4 = rcr4();
896		load_cr4(cr4 & ~CR4_PGE);
897		/*
898		 * Although preemption at this point could be detrimental to
899		 * performance, it would not lead to an error.  PG_G is simply
900		 * ignored if CR4.PGE is clear.  Moreover, in case this block
901		 * is re-entered, the load_cr4() either above or below will
902		 * modify CR4.PGE flushing the TLB.
903		 */
904		load_cr4(cr4 | CR4_PGE);
905	}
906}
907#ifdef SMP
908/*
909 * For SMP, these functions have to use the IPI mechanism for coherence.
910 *
911 * N.B.: Before calling any of the following TLB invalidation functions,
912 * the calling processor must ensure that all stores updating a non-
913 * kernel page table are globally performed.  Otherwise, another
914 * processor could cache an old, pre-update entry without being
915 * invalidated.  This can happen one of two ways: (1) The pmap becomes
916 * active on another processor after its pm_active field is checked by
917 * one of the following functions but before a store updating the page
918 * table is globally performed. (2) The pmap becomes active on another
919 * processor before its pm_active field is checked but due to
920 * speculative loads one of the following functions stills reads the
921 * pmap as inactive on the other processor.
922 *
923 * The kernel page table is exempt because its pm_active field is
924 * immutable.  The kernel page table is always active on every
925 * processor.
926 */
927void
928pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
929{
930	u_int cpumask;
931	u_int other_cpus;
932
933	sched_pin();
934	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
935		invlpg(va);
936		smp_invlpg(va);
937	} else {
938		cpumask = PCPU_GET(cpumask);
939		other_cpus = PCPU_GET(other_cpus);
940		if (pmap->pm_active & cpumask)
941			invlpg(va);
942		if (pmap->pm_active & other_cpus)
943			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
944	}
945	sched_unpin();
946}
947
948void
949pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
950{
951	u_int cpumask;
952	u_int other_cpus;
953	vm_offset_t addr;
954
955	sched_pin();
956	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
957		for (addr = sva; addr < eva; addr += PAGE_SIZE)
958			invlpg(addr);
959		smp_invlpg_range(sva, eva);
960	} else {
961		cpumask = PCPU_GET(cpumask);
962		other_cpus = PCPU_GET(other_cpus);
963		if (pmap->pm_active & cpumask)
964			for (addr = sva; addr < eva; addr += PAGE_SIZE)
965				invlpg(addr);
966		if (pmap->pm_active & other_cpus)
967			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
968			    sva, eva);
969	}
970	sched_unpin();
971}
972
973void
974pmap_invalidate_all(pmap_t pmap)
975{
976	u_int cpumask;
977	u_int other_cpus;
978
979	sched_pin();
980	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
981		invltlb();
982		smp_invltlb();
983	} else {
984		cpumask = PCPU_GET(cpumask);
985		other_cpus = PCPU_GET(other_cpus);
986		if (pmap->pm_active & cpumask)
987			invltlb();
988		if (pmap->pm_active & other_cpus)
989			smp_masked_invltlb(pmap->pm_active & other_cpus);
990	}
991	sched_unpin();
992}
993
994void
995pmap_invalidate_cache(void)
996{
997
998	sched_pin();
999	wbinvd();
1000	smp_cache_flush();
1001	sched_unpin();
1002}
1003
1004struct pde_action {
1005	cpumask_t store;	/* processor that updates the PDE */
1006	cpumask_t invalidate;	/* processors that invalidate their TLB */
1007	vm_offset_t va;
1008	pd_entry_t *pde;
1009	pd_entry_t newpde;
1010};
1011
1012static void
1013pmap_update_pde_action(void *arg)
1014{
1015	struct pde_action *act = arg;
1016
1017	if (act->store == PCPU_GET(cpumask))
1018		pde_store(act->pde, act->newpde);
1019}
1020
1021static void
1022pmap_update_pde_teardown(void *arg)
1023{
1024	struct pde_action *act = arg;
1025
1026	if ((act->invalidate & PCPU_GET(cpumask)) != 0)
1027		pmap_update_pde_invalidate(act->va, act->newpde);
1028}
1029
1030/*
1031 * Change the page size for the specified virtual address in a way that
1032 * prevents any possibility of the TLB ever having two entries that map the
1033 * same virtual address using different page sizes.  This is the recommended
1034 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1035 * machine check exception for a TLB state that is improperly diagnosed as a
1036 * hardware error.
1037 */
1038static void
1039pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1040{
1041	struct pde_action act;
1042	cpumask_t active, cpumask;
1043
1044	sched_pin();
1045	cpumask = PCPU_GET(cpumask);
1046	if (pmap == kernel_pmap)
1047		active = all_cpus;
1048	else
1049		active = pmap->pm_active;
1050	if ((active & PCPU_GET(other_cpus)) != 0) {
1051		act.store = cpumask;
1052		act.invalidate = active;
1053		act.va = va;
1054		act.pde = pde;
1055		act.newpde = newpde;
1056		smp_rendezvous_cpus(cpumask | active,
1057		    smp_no_rendevous_barrier, pmap_update_pde_action,
1058		    pmap_update_pde_teardown, &act);
1059	} else {
1060		pde_store(pde, newpde);
1061		if ((active & cpumask) != 0)
1062			pmap_update_pde_invalidate(va, newpde);
1063	}
1064	sched_unpin();
1065}
1066#else /* !SMP */
1067/*
1068 * Normal, non-SMP, invalidation functions.
1069 * We inline these within pmap.c for speed.
1070 */
1071PMAP_INLINE void
1072pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1073{
1074
1075	if (pmap == kernel_pmap || pmap->pm_active)
1076		invlpg(va);
1077}
1078
1079PMAP_INLINE void
1080pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1081{
1082	vm_offset_t addr;
1083
1084	if (pmap == kernel_pmap || pmap->pm_active)
1085		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1086			invlpg(addr);
1087}
1088
1089PMAP_INLINE void
1090pmap_invalidate_all(pmap_t pmap)
1091{
1092
1093	if (pmap == kernel_pmap || pmap->pm_active)
1094		invltlb();
1095}
1096
1097PMAP_INLINE void
1098pmap_invalidate_cache(void)
1099{
1100
1101	wbinvd();
1102}
1103
1104static void
1105pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1106{
1107
1108	pde_store(pde, newpde);
1109	if (pmap == kernel_pmap || pmap->pm_active)
1110		pmap_update_pde_invalidate(va, newpde);
1111}
1112#endif /* !SMP */
1113
1114static void
1115pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1116{
1117
1118	KASSERT((sva & PAGE_MASK) == 0,
1119	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1120	KASSERT((eva & PAGE_MASK) == 0,
1121	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1122
1123	if (cpu_feature & CPUID_SS)
1124		; /* If "Self Snoop" is supported, do nothing. */
1125	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1126		 eva - sva < 2 * 1024 * 1024) {
1127
1128		/*
1129		 * Otherwise, do per-cache line flush.  Use the mfence
1130		 * instruction to insure that previous stores are
1131		 * included in the write-back.  The processor
1132		 * propagates flush to other processors in the cache
1133		 * coherence domain.
1134		 */
1135		mfence();
1136		for (; sva < eva; sva += cpu_clflush_line_size)
1137			clflush(sva);
1138		mfence();
1139	} else {
1140
1141		/*
1142		 * No targeted cache flush methods are supported by CPU,
1143		 * or the supplied range is bigger than 2MB.
1144		 * Globally invalidate cache.
1145		 */
1146		pmap_invalidate_cache();
1147	}
1148}
1149
1150/*
1151 * Are we current address space or kernel?
1152 */
1153static __inline int
1154pmap_is_current(pmap_t pmap)
1155{
1156	return (pmap == kernel_pmap ||
1157	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
1158}
1159
1160/*
1161 *	Routine:	pmap_extract
1162 *	Function:
1163 *		Extract the physical page address associated
1164 *		with the given map/virtual_address pair.
1165 */
1166vm_paddr_t
1167pmap_extract(pmap_t pmap, vm_offset_t va)
1168{
1169	vm_paddr_t rtval;
1170	pt_entry_t *pte;
1171	pd_entry_t pde, *pdep;
1172
1173	rtval = 0;
1174	PMAP_LOCK(pmap);
1175	pdep = pmap_pde(pmap, va);
1176	if (pdep != NULL) {
1177		pde = *pdep;
1178		if (pde) {
1179			if ((pde & PG_PS) != 0)
1180				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1181			else {
1182				pte = pmap_pde_to_pte(pdep, va);
1183				rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1184			}
1185		}
1186	}
1187	PMAP_UNLOCK(pmap);
1188	return (rtval);
1189}
1190
1191/*
1192 *	Routine:	pmap_extract_and_hold
1193 *	Function:
1194 *		Atomically extract and hold the physical page
1195 *		with the given pmap and virtual address pair
1196 *		if that mapping permits the given protection.
1197 */
1198vm_page_t
1199pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1200{
1201	pd_entry_t pde, *pdep;
1202	pt_entry_t pte;
1203	vm_page_t m;
1204
1205	m = NULL;
1206	vm_page_lock_queues();
1207	PMAP_LOCK(pmap);
1208	pdep = pmap_pde(pmap, va);
1209	if (pdep != NULL && (pde = *pdep)) {
1210		if (pde & PG_PS) {
1211			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1212				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1213				    (va & PDRMASK));
1214				vm_page_hold(m);
1215			}
1216		} else {
1217			pte = *pmap_pde_to_pte(pdep, va);
1218			if ((pte & PG_V) &&
1219			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1220				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1221				vm_page_hold(m);
1222			}
1223		}
1224	}
1225	vm_page_unlock_queues();
1226	PMAP_UNLOCK(pmap);
1227	return (m);
1228}
1229
1230vm_paddr_t
1231pmap_kextract(vm_offset_t va)
1232{
1233	pd_entry_t pde;
1234	vm_paddr_t pa;
1235
1236	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1237		pa = DMAP_TO_PHYS(va);
1238	} else {
1239		pde = *vtopde(va);
1240		if (pde & PG_PS) {
1241			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1242		} else {
1243			/*
1244			 * Beware of a concurrent promotion that changes the
1245			 * PDE at this point!  For example, vtopte() must not
1246			 * be used to access the PTE because it would use the
1247			 * new PDE.  It is, however, safe to use the old PDE
1248			 * because the page table page is preserved by the
1249			 * promotion.
1250			 */
1251			pa = *pmap_pde_to_pte(&pde, va);
1252			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1253		}
1254	}
1255	return (pa);
1256}
1257
1258/***************************************************
1259 * Low level mapping routines.....
1260 ***************************************************/
1261
1262/*
1263 * Add a wired page to the kva.
1264 * Note: not SMP coherent.
1265 */
1266PMAP_INLINE void
1267pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1268{
1269	pt_entry_t *pte;
1270
1271	pte = vtopte(va);
1272	pte_store(pte, pa | PG_RW | PG_V | PG_G);
1273}
1274
1275static __inline void
1276pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1277{
1278	pt_entry_t *pte;
1279
1280	pte = vtopte(va);
1281	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
1282}
1283
1284/*
1285 * Remove a page from the kernel pagetables.
1286 * Note: not SMP coherent.
1287 */
1288PMAP_INLINE void
1289pmap_kremove(vm_offset_t va)
1290{
1291	pt_entry_t *pte;
1292
1293	pte = vtopte(va);
1294	pte_clear(pte);
1295}
1296
1297/*
1298 *	Used to map a range of physical addresses into kernel
1299 *	virtual address space.
1300 *
1301 *	The value passed in '*virt' is a suggested virtual address for
1302 *	the mapping. Architectures which can support a direct-mapped
1303 *	physical to virtual region can return the appropriate address
1304 *	within that region, leaving '*virt' unchanged. Other
1305 *	architectures should map the pages starting at '*virt' and
1306 *	update '*virt' with the first usable address after the mapped
1307 *	region.
1308 */
1309vm_offset_t
1310pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1311{
1312	return PHYS_TO_DMAP(start);
1313}
1314
1315
1316/*
1317 * Add a list of wired pages to the kva
1318 * this routine is only used for temporary
1319 * kernel mappings that do not need to have
1320 * page modification or references recorded.
1321 * Note that old mappings are simply written
1322 * over.  The page *must* be wired.
1323 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1324 */
1325void
1326pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1327{
1328	pt_entry_t *endpte, oldpte, *pte;
1329
1330	oldpte = 0;
1331	pte = vtopte(sva);
1332	endpte = pte + count;
1333	while (pte < endpte) {
1334		oldpte |= *pte;
1335		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G |
1336		    pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
1337		pte++;
1338		ma++;
1339	}
1340	if ((oldpte & PG_V) != 0)
1341		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1342		    PAGE_SIZE);
1343}
1344
1345/*
1346 * This routine tears out page mappings from the
1347 * kernel -- it is meant only for temporary mappings.
1348 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1349 */
1350void
1351pmap_qremove(vm_offset_t sva, int count)
1352{
1353	vm_offset_t va;
1354
1355	va = sva;
1356	while (count-- > 0) {
1357		pmap_kremove(va);
1358		va += PAGE_SIZE;
1359	}
1360	pmap_invalidate_range(kernel_pmap, sva, va);
1361}
1362
1363/***************************************************
1364 * Page table page management routines.....
1365 ***************************************************/
1366static __inline void
1367pmap_free_zero_pages(vm_page_t free)
1368{
1369	vm_page_t m;
1370
1371	while (free != NULL) {
1372		m = free;
1373		free = m->right;
1374		/* Preserve the page's PG_ZERO setting. */
1375		vm_page_free_toq(m);
1376	}
1377}
1378
1379/*
1380 * Schedule the specified unused page table page to be freed.  Specifically,
1381 * add the page to the specified list of pages that will be released to the
1382 * physical memory manager after the TLB has been updated.
1383 */
1384static __inline void
1385pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1386{
1387
1388	if (set_PG_ZERO)
1389		m->flags |= PG_ZERO;
1390	else
1391		m->flags &= ~PG_ZERO;
1392	m->right = *free;
1393	*free = m;
1394}
1395
1396/*
1397 * Inserts the specified page table page into the specified pmap's collection
1398 * of idle page table pages.  Each of a pmap's page table pages is responsible
1399 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1400 * ordered by this virtual address range.
1401 */
1402static void
1403pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1404{
1405	vm_page_t root;
1406
1407	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1408	root = pmap->pm_root;
1409	if (root == NULL) {
1410		mpte->left = NULL;
1411		mpte->right = NULL;
1412	} else {
1413		root = vm_page_splay(mpte->pindex, root);
1414		if (mpte->pindex < root->pindex) {
1415			mpte->left = root->left;
1416			mpte->right = root;
1417			root->left = NULL;
1418		} else if (mpte->pindex == root->pindex)
1419			panic("pmap_insert_pt_page: pindex already inserted");
1420		else {
1421			mpte->right = root->right;
1422			mpte->left = root;
1423			root->right = NULL;
1424		}
1425	}
1426	pmap->pm_root = mpte;
1427}
1428
1429/*
1430 * Looks for a page table page mapping the specified virtual address in the
1431 * specified pmap's collection of idle page table pages.  Returns NULL if there
1432 * is no page table page corresponding to the specified virtual address.
1433 */
1434static vm_page_t
1435pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1436{
1437	vm_page_t mpte;
1438	vm_pindex_t pindex = pmap_pde_pindex(va);
1439
1440	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1441	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1442		mpte = vm_page_splay(pindex, mpte);
1443		if ((pmap->pm_root = mpte)->pindex != pindex)
1444			mpte = NULL;
1445	}
1446	return (mpte);
1447}
1448
1449/*
1450 * Removes the specified page table page from the specified pmap's collection
1451 * of idle page table pages.  The specified page table page must be a member of
1452 * the pmap's collection.
1453 */
1454static void
1455pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1456{
1457	vm_page_t root;
1458
1459	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1460	if (mpte != pmap->pm_root) {
1461		root = vm_page_splay(mpte->pindex, pmap->pm_root);
1462		KASSERT(mpte == root,
1463		    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
1464		    mpte, pmap));
1465	}
1466	if (mpte->left == NULL)
1467		root = mpte->right;
1468	else {
1469		root = vm_page_splay(mpte->pindex, mpte->left);
1470		root->right = mpte->right;
1471	}
1472	pmap->pm_root = root;
1473}
1474
1475/*
1476 * This routine unholds page table pages, and if the hold count
1477 * drops to zero, then it decrements the wire count.
1478 */
1479static __inline int
1480pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1481{
1482
1483	--m->wire_count;
1484	if (m->wire_count == 0)
1485		return (_pmap_unwire_pte_hold(pmap, va, m, free));
1486	else
1487		return (0);
1488}
1489
1490static int
1491_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
1492    vm_page_t *free)
1493{
1494
1495	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1496	/*
1497	 * unmap the page table page
1498	 */
1499	if (m->pindex >= (NUPDE + NUPDPE)) {
1500		/* PDP page */
1501		pml4_entry_t *pml4;
1502		pml4 = pmap_pml4e(pmap, va);
1503		*pml4 = 0;
1504	} else if (m->pindex >= NUPDE) {
1505		/* PD page */
1506		pdp_entry_t *pdp;
1507		pdp = pmap_pdpe(pmap, va);
1508		*pdp = 0;
1509	} else {
1510		/* PTE page */
1511		pd_entry_t *pd;
1512		pd = pmap_pde(pmap, va);
1513		*pd = 0;
1514	}
1515	pmap_resident_count_dec(pmap, 1);
1516	if (m->pindex < NUPDE) {
1517		/* We just released a PT, unhold the matching PD */
1518		vm_page_t pdpg;
1519
1520		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1521		pmap_unwire_pte_hold(pmap, va, pdpg, free);
1522	}
1523	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1524		/* We just released a PD, unhold the matching PDP */
1525		vm_page_t pdppg;
1526
1527		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1528		pmap_unwire_pte_hold(pmap, va, pdppg, free);
1529	}
1530
1531	/*
1532	 * This is a release store so that the ordinary store unmapping
1533	 * the page table page is globally performed before TLB shoot-
1534	 * down is begun.
1535	 */
1536	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1537
1538	/*
1539	 * Put page on a list so that it is released after
1540	 * *ALL* TLB shootdown is done
1541	 */
1542	pmap_add_delayed_free_list(m, free, TRUE);
1543
1544	return (1);
1545}
1546
1547/*
1548 * After removing a page table entry, this routine is used to
1549 * conditionally free the page, and manage the hold/wire counts.
1550 */
1551static int
1552pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1553{
1554	vm_page_t mpte;
1555
1556	if (va >= VM_MAXUSER_ADDRESS)
1557		return (0);
1558	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1559	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1560	return (pmap_unwire_pte_hold(pmap, va, mpte, free));
1561}
1562
1563void
1564pmap_pinit0(pmap_t pmap)
1565{
1566
1567	PMAP_LOCK_INIT(pmap);
1568	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
1569	pmap->pm_root = NULL;
1570	pmap->pm_active = 0;
1571	TAILQ_INIT(&pmap->pm_pvchunk);
1572	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1573}
1574
1575/*
1576 * Initialize a preallocated and zeroed pmap structure,
1577 * such as one in a vmspace structure.
1578 */
1579int
1580pmap_pinit(pmap_t pmap)
1581{
1582	vm_page_t pml4pg;
1583	static vm_pindex_t color;
1584
1585	PMAP_LOCK_INIT(pmap);
1586
1587	/*
1588	 * allocate the page directory page
1589	 */
1590	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1591	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1592		VM_WAIT;
1593
1594	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1595
1596	if ((pml4pg->flags & PG_ZERO) == 0)
1597		pagezero(pmap->pm_pml4);
1598
1599	/* Wire in kernel global address entries. */
1600	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1601	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1602
1603	/* install self-referential address mapping entry(s) */
1604	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1605
1606	pmap->pm_root = NULL;
1607	pmap->pm_active = 0;
1608	TAILQ_INIT(&pmap->pm_pvchunk);
1609	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1610
1611	return (1);
1612}
1613
1614/*
1615 * this routine is called if the page table page is not
1616 * mapped correctly.
1617 *
1618 * Note: If a page allocation fails at page table level two or three,
1619 * one or two pages may be held during the wait, only to be released
1620 * afterwards.  This conservative approach is easily argued to avoid
1621 * race conditions.
1622 */
1623static vm_page_t
1624_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1625{
1626	vm_page_t m, pdppg, pdpg;
1627
1628	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1629	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1630	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1631
1632	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1633	/*
1634	 * Allocate a page table page.
1635	 */
1636	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1637	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1638		if (flags & M_WAITOK) {
1639			PMAP_UNLOCK(pmap);
1640			vm_page_unlock_queues();
1641			VM_WAIT;
1642			vm_page_lock_queues();
1643			PMAP_LOCK(pmap);
1644		}
1645
1646		/*
1647		 * Indicate the need to retry.  While waiting, the page table
1648		 * page may have been allocated.
1649		 */
1650		return (NULL);
1651	}
1652	if ((m->flags & PG_ZERO) == 0)
1653		pmap_zero_page(m);
1654
1655	/*
1656	 * Map the pagetable page into the process address space, if
1657	 * it isn't already there.
1658	 */
1659
1660	if (ptepindex >= (NUPDE + NUPDPE)) {
1661		pml4_entry_t *pml4;
1662		vm_pindex_t pml4index;
1663
1664		/* Wire up a new PDPE page */
1665		pml4index = ptepindex - (NUPDE + NUPDPE);
1666		pml4 = &pmap->pm_pml4[pml4index];
1667		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1668
1669	} else if (ptepindex >= NUPDE) {
1670		vm_pindex_t pml4index;
1671		vm_pindex_t pdpindex;
1672		pml4_entry_t *pml4;
1673		pdp_entry_t *pdp;
1674
1675		/* Wire up a new PDE page */
1676		pdpindex = ptepindex - NUPDE;
1677		pml4index = pdpindex >> NPML4EPGSHIFT;
1678
1679		pml4 = &pmap->pm_pml4[pml4index];
1680		if ((*pml4 & PG_V) == 0) {
1681			/* Have to allocate a new pdp, recurse */
1682			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1683			    flags) == NULL) {
1684				--m->wire_count;
1685				atomic_subtract_int(&cnt.v_wire_count, 1);
1686				vm_page_free_zero(m);
1687				return (NULL);
1688			}
1689		} else {
1690			/* Add reference to pdp page */
1691			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1692			pdppg->wire_count++;
1693		}
1694		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1695
1696		/* Now find the pdp page */
1697		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1698		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1699
1700	} else {
1701		vm_pindex_t pml4index;
1702		vm_pindex_t pdpindex;
1703		pml4_entry_t *pml4;
1704		pdp_entry_t *pdp;
1705		pd_entry_t *pd;
1706
1707		/* Wire up a new PTE page */
1708		pdpindex = ptepindex >> NPDPEPGSHIFT;
1709		pml4index = pdpindex >> NPML4EPGSHIFT;
1710
1711		/* First, find the pdp and check that its valid. */
1712		pml4 = &pmap->pm_pml4[pml4index];
1713		if ((*pml4 & PG_V) == 0) {
1714			/* Have to allocate a new pd, recurse */
1715			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1716			    flags) == NULL) {
1717				--m->wire_count;
1718				atomic_subtract_int(&cnt.v_wire_count, 1);
1719				vm_page_free_zero(m);
1720				return (NULL);
1721			}
1722			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1723			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1724		} else {
1725			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1726			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1727			if ((*pdp & PG_V) == 0) {
1728				/* Have to allocate a new pd, recurse */
1729				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1730				    flags) == NULL) {
1731					--m->wire_count;
1732					atomic_subtract_int(&cnt.v_wire_count,
1733					    1);
1734					vm_page_free_zero(m);
1735					return (NULL);
1736				}
1737			} else {
1738				/* Add reference to the pd page */
1739				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1740				pdpg->wire_count++;
1741			}
1742		}
1743		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1744
1745		/* Now we know where the page directory page is */
1746		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1747		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1748	}
1749
1750	pmap_resident_count_inc(pmap, 1);
1751
1752	return (m);
1753}
1754
1755static vm_page_t
1756pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1757{
1758	vm_pindex_t pdpindex, ptepindex;
1759	pdp_entry_t *pdpe;
1760	vm_page_t pdpg;
1761
1762	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1763	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1764	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1765retry:
1766	pdpe = pmap_pdpe(pmap, va);
1767	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1768		/* Add a reference to the pd page. */
1769		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1770		pdpg->wire_count++;
1771	} else {
1772		/* Allocate a pd page. */
1773		ptepindex = pmap_pde_pindex(va);
1774		pdpindex = ptepindex >> NPDPEPGSHIFT;
1775		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1776		if (pdpg == NULL && (flags & M_WAITOK))
1777			goto retry;
1778	}
1779	return (pdpg);
1780}
1781
1782static vm_page_t
1783pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1784{
1785	vm_pindex_t ptepindex;
1786	pd_entry_t *pd;
1787	vm_page_t m;
1788
1789	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1790	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1791	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1792
1793	/*
1794	 * Calculate pagetable page index
1795	 */
1796	ptepindex = pmap_pde_pindex(va);
1797retry:
1798	/*
1799	 * Get the page directory entry
1800	 */
1801	pd = pmap_pde(pmap, va);
1802
1803	/*
1804	 * This supports switching from a 2MB page to a
1805	 * normal 4K page.
1806	 */
1807	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1808		if (!pmap_demote_pde(pmap, pd, va)) {
1809			/*
1810			 * Invalidation of the 2MB page mapping may have caused
1811			 * the deallocation of the underlying PD page.
1812			 */
1813			pd = NULL;
1814		}
1815	}
1816
1817	/*
1818	 * If the page table page is already mapped, we just
1819	 * increment its wire count.
1820	 */
1821	if (pd != NULL && (*pd & PG_V) != 0) {
1822		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1823		m->wire_count++;
1824	} else {
1825		/*
1826		 * The pte page is not mapped, or it has been
1827		 * deallocated; allocate a new one.
1828		 */
1829		m = _pmap_allocpte(pmap, ptepindex, flags);
1830		if (m == NULL && (flags & M_WAITOK))
1831			goto retry;
1832	}
1833	return (m);
1834}
1835
1836
1837/***************************************************
1838 * Pmap allocation/deallocation routines.
1839 ***************************************************/
1840
1841/*
1842 * Release any resources held by the given physical map.
1843 * Called when a pmap initialized by pmap_pinit is being released.
1844 * Should only be called if the map contains no valid mappings.
1845 */
1846void
1847pmap_release(pmap_t pmap)
1848{
1849	vm_page_t m;
1850
1851	KASSERT(pmap->pm_stats.resident_count == 0,
1852	    ("pmap_release: pmap resident count %ld != 0",
1853	    pmap->pm_stats.resident_count));
1854	KASSERT(pmap->pm_root == NULL,
1855	    ("pmap_release: pmap has reserved page table page(s)"));
1856
1857	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1858
1859	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1860	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1861	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1862
1863	m->wire_count--;
1864	atomic_subtract_int(&cnt.v_wire_count, 1);
1865	vm_page_free_zero(m);
1866	PMAP_LOCK_DESTROY(pmap);
1867}
1868
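/*
 * Sysctl handlers reporting the total size of the kernel virtual address
 * space and the amount of it above "kernel_vm_end" that is still free.
 */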
1869static int
1870kvm_size(SYSCTL_HANDLER_ARGS)
1871{
1872	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1873
1874	return sysctl_handle_long(oidp, &ksize, 0, req);
1875}
1876SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1877    0, 0, kvm_size, "LU", "Size of KVM");
1878
1879static int
1880kvm_free(SYSCTL_HANDLER_ARGS)
1881{
1882	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1883
1884	return sysctl_handle_long(oidp, &kfree, 0, req);
1885}
1886SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1887    0, 0, kvm_free, "LU", "Amount of KVM free");
1888
1889/*
1890 * grow the number of kernel page table entries, if needed
1891 */
1892void
1893pmap_growkernel(vm_offset_t addr)
1894{
1895	vm_paddr_t paddr;
1896	vm_page_t nkpg;
1897	pd_entry_t *pde, newpdir;
1898	pdp_entry_t *pdpe;
1899
1900	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1901
1902	/*
1903	 * Return if "addr" is within the range of kernel page table pages
1904	 * that were preallocated during pmap bootstrap.  Moreover, leave
1905	 * "kernel_vm_end" and the kernel page table as they were.
1906	 *
1907	 * The correctness of this action is based on the following
1908	 * argument: vm_map_findspace() allocates contiguous ranges of the
1909	 * kernel virtual address space.  It calls this function if a range
1910	 * ends after "kernel_vm_end".  If the kernel is mapped between
1911	 * "kernel_vm_end" and "addr", then the range cannot begin at
1912	 * "kernel_vm_end".  In fact, its beginning address cannot be less
1913	 * "kernel_vm_end".  In fact, its beginning address cannot be less
1914	 * than that of the kernel.  Thus, there is no immediate need to allocate
1915	 * "KERNBASE".
1916	 */
1917	if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
1918		return;
1919
1920	addr = roundup2(addr, NBPDR);
1921	if (addr - 1 >= kernel_map->max_offset)
1922		addr = kernel_map->max_offset;
1923	while (kernel_vm_end < addr) {
1924		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
1925		if ((*pdpe & PG_V) == 0) {
1926			/* We need a new PDP entry */
1927			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
1928			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1929			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1930			if (nkpg == NULL)
1931				panic("pmap_growkernel: no memory to grow kernel");
1932			if ((nkpg->flags & PG_ZERO) == 0)
1933				pmap_zero_page(nkpg);
1934			paddr = VM_PAGE_TO_PHYS(nkpg);
1935			*pdpe = (pdp_entry_t)
1936				(paddr | PG_V | PG_RW | PG_A | PG_M);
1937			continue; /* try again */
1938		}
1939		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
1940		if ((*pde & PG_V) != 0) {
1941			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1942			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1943				kernel_vm_end = kernel_map->max_offset;
1944				break;
1945			}
1946			continue;
1947		}
1948
1949		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
1950		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1951		    VM_ALLOC_ZERO);
1952		if (nkpg == NULL)
1953			panic("pmap_growkernel: no memory to grow kernel");
1954		if ((nkpg->flags & PG_ZERO) == 0)
1955			pmap_zero_page(nkpg);
1956		paddr = VM_PAGE_TO_PHYS(nkpg);
1957		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1958		pde_store(pde, newpdir);
1959
1960		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1961		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1962			kernel_vm_end = kernel_map->max_offset;
1963			break;
1964		}
1965	}
1966}
1967
1968
1969/***************************************************
1970 * Page management routines.
1971 ***************************************************/
1972
1973CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1974CTASSERT(_NPCM == 3);
1975CTASSERT(_NPCPV == 168);
1976
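/*
 * Each pv_chunk occupies exactly one page and holds _NPCPV (168) pv
 * entries, with a bitmap of _NPCM (3) 64-bit words tracking which entries
 * are free.  Because chunks are page aligned, pv_to_chunk() recovers a pv
 * entry's chunk by simply masking off the page offset.
 */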
1977static __inline struct pv_chunk *
1978pv_to_chunk(pv_entry_t pv)
1979{
1980
1981	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1982}
1983
1984#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1985
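/*
 * Bitmap values describing a chunk with all 168 entries free: the first two
 * 64-bit words are entirely set and the third has only its low
 * 168 - 128 = 40 bits set.
 */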
1986#define	PC_FREE0	0xfffffffffffffffful
1987#define	PC_FREE1	0xfffffffffffffffful
1988#define	PC_FREE2	0x000000fffffffffful
1989
1990static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1991
1992SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1993	"Current number of pv entries");
1994
1995#ifdef PV_STATS
1996static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1997
1998SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1999	"Current number of pv entry chunks");
2000SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2001	"Total number of pv entry chunks allocated");
2002SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2003	"Total number of pv entry chunks freed");
2004SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2005	"Number of failed attempts to allocate a pv entry chunk page");
2006
2007static long pv_entry_frees, pv_entry_allocs;
2008static int pv_entry_spare;
2009
2010SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2011	"Total number of pv entry frees");
2012SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2013	"Total number of pv entry allocs");
2014SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2015	"Current number of spare pv entries");
2016
2017static int pmap_collect_inactive, pmap_collect_active;
2018
2019SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
2020	"Number of times pmap_collect has been called on the inactive queue");
2021SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
2022	"Number of times pmap_collect has been called on the active queue");
2023#endif
2024
2025/*
2026 * We are in a serious low memory condition.  Resort to
2027 * drastic measures to free some pages so we can allocate
2028 * another pv entry chunk.  This is normally called to
2029 * unmap inactive pages, and if necessary, active pages.
2030 *
2031 * We do not, however, unmap 2mpages because subsequent accesses will
2032 * allocate per-page pv entries until repromotion occurs, thereby
2033 * exacerbating the shortage of free pv entries.
2034 */
2035static void
2036pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
2037{
2038	struct md_page *pvh;
2039	pd_entry_t *pde;
2040	pmap_t pmap;
2041	pt_entry_t *pte, tpte;
2042	pv_entry_t next_pv, pv;
2043	vm_offset_t va;
2044	vm_page_t m, free;
2045
2046	TAILQ_FOREACH(m, &vpq->pl, pageq) {
2047		if (m->hold_count || m->busy)
2048			continue;
2049		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
2050			va = pv->pv_va;
2051			pmap = PV_PMAP(pv);
2052			/* Avoid deadlock and lock recursion. */
2053			if (pmap > locked_pmap)
2054				PMAP_LOCK(pmap);
2055			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2056				continue;
2057			pmap_resident_count_dec(pmap, 1);
2058			pde = pmap_pde(pmap, va);
2059			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2060			    " a 2mpage in page %p's pv list", m));
2061			pte = pmap_pde_to_pte(pde, va);
2062			tpte = pte_load_clear(pte);
2063			KASSERT((tpte & PG_W) == 0,
2064			    ("pmap_collect: wired pte %#lx", tpte));
2065			if (tpte & PG_A)
2066				vm_page_flag_set(m, PG_REFERENCED);
2067			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2068				vm_page_dirty(m);
2069			free = NULL;
2070			pmap_unuse_pt(pmap, va, *pde, &free);
2071			pmap_invalidate_page(pmap, va);
2072			pmap_free_zero_pages(free);
2073			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2074			if (TAILQ_EMPTY(&m->md.pv_list)) {
2075				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2076				if (TAILQ_EMPTY(&pvh->pv_list))
2077					vm_page_flag_clear(m, PG_WRITEABLE);
2078			}
2079			free_pv_entry(pmap, pv);
2080			if (pmap != locked_pmap)
2081				PMAP_UNLOCK(pmap);
2082		}
2083	}
2084}
2085
2086
2087/*
2088 * free the pv_entry back to the free list
2089 */
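/*
 * The entry's index within its chunk determines which bitmap word and bit
 * to set; for example, index 70 corresponds to pc_map[1], bit 6
 * (70 / 64 == 1, 70 % 64 == 6).
 */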
2090static void
2091free_pv_entry(pmap_t pmap, pv_entry_t pv)
2092{
2093	vm_page_t m;
2094	struct pv_chunk *pc;
2095	int idx, field, bit;
2096
2097	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2098	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2099	PV_STAT(pv_entry_frees++);
2100	PV_STAT(pv_entry_spare++);
2101	pv_entry_count--;
2102	pc = pv_to_chunk(pv);
2103	idx = pv - &pc->pc_pventry[0];
2104	field = idx / 64;
2105	bit = idx % 64;
2106	pc->pc_map[field] |= 1ul << bit;
2107	/* move to head of list */
2108	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2109	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2110	    pc->pc_map[2] != PC_FREE2) {
2111		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2112		return;
2113	}
2114	PV_STAT(pv_entry_spare -= _NPCPV);
2115	PV_STAT(pc_chunk_count--);
2116	PV_STAT(pc_chunk_frees++);
2117	/* entire chunk is free, return it */
2118	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2119	dump_drop_page(m->phys_addr);
2120	vm_page_unwire(m, 0);
2121	vm_page_free(m);
2122}
2123
2124/*
2125 * get a new pv_entry, allocating a block from the system
2126 * when needed.
2127 */
2128static pv_entry_t
2129get_pv_entry(pmap_t pmap, int try)
2130{
2131	static const struct timeval printinterval = { 60, 0 };
2132	static struct timeval lastprint;
2133	static vm_pindex_t colour;
2134	struct vpgqueues *pq;
2135	int bit, field;
2136	pv_entry_t pv;
2137	struct pv_chunk *pc;
2138	vm_page_t m;
2139
2140	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2141	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2142	PV_STAT(pv_entry_allocs++);
2143	pv_entry_count++;
2144	if (pv_entry_count > pv_entry_high_water)
2145		if (ratecheck(&lastprint, &printinterval))
2146			printf("Approaching the limit on PV entries, consider "
2147			    "increasing either the vm.pmap.shpgperproc or the "
2148			    "vm.pmap.pv_entry_max sysctl.\n");
2149	pq = NULL;
2150retry:
2151	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2152	if (pc != NULL) {
2153		for (field = 0; field < _NPCM; field++) {
2154			if (pc->pc_map[field]) {
2155				bit = bsfq(pc->pc_map[field]);
2156				break;
2157			}
2158		}
2159		if (field < _NPCM) {
2160			pv = &pc->pc_pventry[field * 64 + bit];
2161			pc->pc_map[field] &= ~(1ul << bit);
2162			/* If this was the last free entry, move the chunk to the tail */
2163			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2164			    pc->pc_map[2] == 0) {
2165				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2166				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2167			}
2168			PV_STAT(pv_entry_spare--);
2169			return (pv);
2170		}
2171	}
2172	/* No free items, allocate another chunk */
2173	m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
2174	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
2175	    VM_ALLOC_WIRED);
2176	if (m == NULL) {
2177		if (try) {
2178			pv_entry_count--;
2179			PV_STAT(pc_chunk_tryfail++);
2180			return (NULL);
2181		}
2182		/*
2183		 * Reclaim pv entries: At first, destroy mappings to inactive
2184		 * pages.  After that, if a pv chunk entry is still needed,
2185		 * destroy mappings to active pages.
2186		 */
2187		if (pq == NULL) {
2188			PV_STAT(pmap_collect_inactive++);
2189			pq = &vm_page_queues[PQ_INACTIVE];
2190		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2191			PV_STAT(pmap_collect_active++);
2192			pq = &vm_page_queues[PQ_ACTIVE];
2193		} else
2194			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2195		pmap_collect(pmap, pq);
2196		goto retry;
2197	}
2198	PV_STAT(pc_chunk_count++);
2199	PV_STAT(pc_chunk_allocs++);
2200	colour++;
2201	dump_add_page(m->phys_addr);
2202	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2203	pc->pc_pmap = pmap;
2204	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2205	pc->pc_map[1] = PC_FREE1;
2206	pc->pc_map[2] = PC_FREE2;
2207	pv = &pc->pc_pventry[0];
2208	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2209	PV_STAT(pv_entry_spare += _NPCPV - 1);
2210	return (pv);
2211}
2212
2213/*
2214 * First find and then remove the pv entry for the specified pmap and virtual
2215 * address from the specified pv list.  Returns the pv entry if found and NULL
2216 * otherwise.  This operation can be performed on pv lists for either 4KB or
2217 * 2MB page mappings.
2218 */
2219static __inline pv_entry_t
2220pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2221{
2222	pv_entry_t pv;
2223
2224	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2225	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2226		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2227			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2228			break;
2229		}
2230	}
2231	return (pv);
2232}
2233
2234/*
2235 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2236 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2237 * entries for each of the 4KB page mappings.
2238 */
2239static void
2240pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2241{
2242	struct md_page *pvh;
2243	pv_entry_t pv;
2244	vm_offset_t va_last;
2245	vm_page_t m;
2246
2247	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2248	KASSERT((pa & PDRMASK) == 0,
2249	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
2250
2251	/*
2252	 * Transfer the 2mpage's pv entry for this mapping to the first
2253	 * page's pv list.
2254	 */
2255	pvh = pa_to_pvh(pa);
2256	va = trunc_2mpage(va);
2257	pv = pmap_pvh_remove(pvh, pmap, va);
2258	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2259	m = PHYS_TO_VM_PAGE(pa);
2260	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2261	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2262	va_last = va + NBPDR - PAGE_SIZE;
2263	do {
2264		m++;
2265		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2266		    ("pmap_pv_demote_pde: page %p is not managed", m));
2267		va += PAGE_SIZE;
2268		pmap_insert_entry(pmap, va, m);
2269	} while (va < va_last);
2270}
2271
2272/*
2273 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2274 * replace the many pv entries for the 4KB page mappings by a single pv entry
2275 * for the 2MB page mapping.
2276 */
2277static void
2278pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2279{
2280	struct md_page *pvh;
2281	pv_entry_t pv;
2282	vm_offset_t va_last;
2283	vm_page_t m;
2284
2285	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2286	KASSERT((pa & PDRMASK) == 0,
2287	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
2288
2289	/*
2290	 * Transfer the first page's pv entry for this mapping to the
2291	 * 2mpage's pv list.  Aside from avoiding the cost of a call
2292	 * to get_pv_entry(), a transfer avoids the possibility that
2293	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2294	 * removes one of the mappings that is being promoted.
2295	 */
2296	m = PHYS_TO_VM_PAGE(pa);
2297	va = trunc_2mpage(va);
2298	pv = pmap_pvh_remove(&m->md, pmap, va);
2299	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2300	pvh = pa_to_pvh(pa);
2301	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2302	/* Free the remaining NPTEPG - 1 pv entries. */
2303	va_last = va + NBPDR - PAGE_SIZE;
2304	do {
2305		m++;
2306		va += PAGE_SIZE;
2307		pmap_pvh_free(&m->md, pmap, va);
2308	} while (va < va_last);
2309}
2310
2311/*
2312 * First find and then destroy the pv entry for the specified pmap and virtual
2313 * address.  This operation can be performed on pv lists for either 4KB or 2MB
2314 * page mappings.
2315 */
2316static void
2317pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2318{
2319	pv_entry_t pv;
2320
2321	pv = pmap_pvh_remove(pvh, pmap, va);
2322	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2323	free_pv_entry(pmap, pv);
2324}
2325
2326static void
2327pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2328{
2329	struct md_page *pvh;
2330
2331	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2332	pmap_pvh_free(&m->md, pmap, va);
2333	if (TAILQ_EMPTY(&m->md.pv_list)) {
2334		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2335		if (TAILQ_EMPTY(&pvh->pv_list))
2336			vm_page_flag_clear(m, PG_WRITEABLE);
2337	}
2338}
2339
2340/*
2341 * Create a pv entry for the page "m" mapped at
2342 * (pmap, va).
2343 */
2344static void
2345pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2346{
2347	pv_entry_t pv;
2348
2349	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2350	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2351	pv = get_pv_entry(pmap, FALSE);
2352	pv->pv_va = va;
2353	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2354}
2355
2356/*
2357 * Conditionally create a pv entry.
2358 */
2359static boolean_t
2360pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2361{
2362	pv_entry_t pv;
2363
2364	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2365	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2366	if (pv_entry_count < pv_entry_high_water &&
2367	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2368		pv->pv_va = va;
2369		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2370		return (TRUE);
2371	} else
2372		return (FALSE);
2373}
2374
2375/*
2376 * Create the pv entry for a 2MB page mapping.
2377 */
2378static boolean_t
2379pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2380{
2381	struct md_page *pvh;
2382	pv_entry_t pv;
2383
2384	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2385	if (pv_entry_count < pv_entry_high_water &&
2386	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2387		pv->pv_va = va;
2388		pvh = pa_to_pvh(pa);
2389		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2390		return (TRUE);
2391	} else
2392		return (FALSE);
2393}
2394
2395/*
2396 * Fills a page table page with mappings to consecutive physical pages.
2397 */
2398static void
2399pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2400{
2401	pt_entry_t *pte;
2402
2403	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2404		*pte = newpte;
2405		newpte += PAGE_SIZE;
2406	}
2407}
2408
2409/*
2410 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
2411 * mapping is invalidated.
2412 */
2413static boolean_t
2414pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2415{
2416	pd_entry_t newpde, oldpde;
2417	pt_entry_t *firstpte, newpte;
2418	vm_paddr_t mptepa;
2419	vm_page_t free, mpte;
2420
2421	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2422	oldpde = *pde;
2423	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2424	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2425	mpte = pmap_lookup_pt_page(pmap, va);
2426	if (mpte != NULL)
2427		pmap_remove_pt_page(pmap, mpte);
2428	else {
2429		KASSERT((oldpde & PG_W) == 0,
2430		    ("pmap_demote_pde: page table page for a wired mapping"
2431		    " is missing"));
2432
2433		/*
2434		 * Invalidate the 2MB page mapping and return "failure" if the
2435		 * mapping was never accessed or the allocation of the new
2436		 * page table page fails.  If the 2MB page mapping belongs to
2437		 * the direct map region of the kernel's address space, then
2438		 * the page allocation request specifies the highest possible
2439		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
2440		 * normal.  Page table pages are preallocated for every other
2441		 * part of the kernel address space, so the direct map region
2442		 * is the only part of the kernel address space that must be
2443		 * handled here.
2444		 */
2445		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2446		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
2447		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
2448		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2449			free = NULL;
2450			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
2451			pmap_invalidate_page(pmap, trunc_2mpage(va));
2452			pmap_free_zero_pages(free);
2453			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2454			    " in pmap %p", va, pmap);
2455			return (FALSE);
2456		}
2457		if (va < VM_MAXUSER_ADDRESS)
2458			pmap_resident_count_inc(pmap, 1);
2459	}
2460	mptepa = VM_PAGE_TO_PHYS(mpte);
2461	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2462	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2463	KASSERT((oldpde & PG_A) != 0,
2464	    ("pmap_demote_pde: oldpde is missing PG_A"));
2465	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2466	    ("pmap_demote_pde: oldpde is missing PG_M"));
2467	newpte = oldpde & ~PG_PS;
2468	if ((newpte & PG_PDE_PAT) != 0)
2469		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2470
2471	/*
2472	 * If the page table page is new, initialize it.
2473	 */
2474	if (mpte->wire_count == 1) {
2475		mpte->wire_count = NPTEPG;
2476		pmap_fill_ptp(firstpte, newpte);
2477	}
2478	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2479	    ("pmap_demote_pde: firstpte and newpte map different physical"
2480	    " addresses"));
2481
2482	/*
2483	 * If the mapping has changed attributes, update the page table
2484	 * entries.
2485	 */
2486	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2487		pmap_fill_ptp(firstpte, newpte);
2488
2489	/*
2490	 * Demote the mapping.  This pmap is locked.  The old PDE has
2491	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2492	 * set.  Thus, there is no danger of a race with another
2493	 * processor changing the setting of PG_A and/or PG_M between
2494	 * the read above and the store below.
2495	 */
2496	if (workaround_erratum383)
2497		pmap_update_pde(pmap, va, pde, newpde);
2498	else
2499		pde_store(pde, newpde);
2500
2501	/*
2502	 * Invalidate a stale recursive mapping of the page table page.
2503	 */
2504	if (va >= VM_MAXUSER_ADDRESS)
2505		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2506
2507	/*
2508	 * Demote the pv entry.  This depends on the earlier demotion
2509	 * of the mapping.  Specifically, the (re)creation of a per-
2510	 * page pv entry might trigger the execution of pmap_collect(),
2511	 * which might reclaim a newly (re)created per-page pv entry
2512	 * and destroy the associated mapping.  In order to destroy
2513	 * the mapping, the PDE must have already changed from mapping
2514	 * the 2mpage to referencing the page table page.
2515	 */
2516	if ((oldpde & PG_MANAGED) != 0)
2517		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2518
2519	pmap_pde_demotions++;
2520	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2521	    " in pmap %p", va, pmap);
2522	return (TRUE);
2523}
2524
2525/*
2526 * pmap_remove_pde: unmap a 2MB superpage within a pmap
2527 */
2528static int
2529pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2530    vm_page_t *free)
2531{
2532	struct md_page *pvh;
2533	pd_entry_t oldpde;
2534	vm_offset_t eva, va;
2535	vm_page_t m, mpte;
2536
2537	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2538	KASSERT((sva & PDRMASK) == 0,
2539	    ("pmap_remove_pde: sva is not 2mpage aligned"));
2540	oldpde = pte_load_clear(pdq);
2541	if (oldpde & PG_W)
2542		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2543
2544	/*
2545	 * Machines that don't support invlpg also don't
2546	 * support PG_G.
2547	 */
2548	if (oldpde & PG_G)
2549		pmap_invalidate_page(kernel_pmap, sva);
2550	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
2551	if (oldpde & PG_MANAGED) {
2552		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2553		pmap_pvh_free(pvh, pmap, sva);
2554		eva = sva + NBPDR;
2555		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2556		    va < eva; va += PAGE_SIZE, m++) {
2557			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2558				vm_page_dirty(m);
2559			if (oldpde & PG_A)
2560				vm_page_flag_set(m, PG_REFERENCED);
2561			if (TAILQ_EMPTY(&m->md.pv_list) &&
2562			    TAILQ_EMPTY(&pvh->pv_list))
2563				vm_page_flag_clear(m, PG_WRITEABLE);
2564		}
2565	}
2566	if (pmap == kernel_pmap) {
2567		if (!pmap_demote_pde(pmap, pdq, sva))
2568			panic("pmap_remove_pde: failed demotion");
2569	} else {
2570		mpte = pmap_lookup_pt_page(pmap, sva);
2571		if (mpte != NULL) {
2572			pmap_remove_pt_page(pmap, mpte);
2573			pmap_resident_count_dec(pmap, 1);
2574			KASSERT(mpte->wire_count == NPTEPG,
2575			    ("pmap_remove_pde: pte page wire count error"));
2576			mpte->wire_count = 0;
2577			pmap_add_delayed_free_list(mpte, free, FALSE);
2578			atomic_subtract_int(&cnt.v_wire_count, 1);
2579		}
2580	}
2581	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2582}
2583
2584/*
2585 * pmap_remove_pte: unmap a single 4KB page within a pmap
2586 */
2587static int
2588pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2589    pd_entry_t ptepde, vm_page_t *free)
2590{
2591	pt_entry_t oldpte;
2592	vm_page_t m;
2593
2594	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2595	oldpte = pte_load_clear(ptq);
2596	if (oldpte & PG_W)
2597		pmap->pm_stats.wired_count -= 1;
2598	/*
2599	 * Machines that don't support invlpg also don't
2600	 * support PG_G.
2601	 */
2602	if (oldpte & PG_G)
2603		pmap_invalidate_page(kernel_pmap, va);
2604	pmap_resident_count_dec(pmap, 1);
2605	if (oldpte & PG_MANAGED) {
2606		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2607		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2608			vm_page_dirty(m);
2609		if (oldpte & PG_A)
2610			vm_page_flag_set(m, PG_REFERENCED);
2611		pmap_remove_entry(pmap, m, va);
2612	}
2613	return (pmap_unuse_pt(pmap, va, ptepde, free));
2614}
2615
2616/*
2617 * Remove a single page from a process address space
2618 */
2619static void
2620pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2621{
2622	pt_entry_t *pte;
2623
2624	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2625	if ((*pde & PG_V) == 0)
2626		return;
2627	pte = pmap_pde_to_pte(pde, va);
2628	if ((*pte & PG_V) == 0)
2629		return;
2630	pmap_remove_pte(pmap, pte, va, *pde, free);
2631	pmap_invalidate_page(pmap, va);
2632}
2633
2634/*
2635 *	Remove the given range of addresses from the specified map.
2636 *
2637 *	It is assumed that the start and end are properly
2638 *	rounded to the page size.
2639 */
2640void
2641pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2642{
2643	vm_offset_t va_next;
2644	pml4_entry_t *pml4e;
2645	pdp_entry_t *pdpe;
2646	pd_entry_t ptpaddr, *pde;
2647	pt_entry_t *pte;
2648	vm_page_t free = NULL;
2649	int anyvalid;
2650
2651	/*
2652	 * Perform an unsynchronized read.  This is, however, safe.
2653	 */
2654	if (pmap->pm_stats.resident_count == 0)
2655		return;
2656
2657	anyvalid = 0;
2658
2659	vm_page_lock_queues();
2660	PMAP_LOCK(pmap);
2661
2662	/*
2663	 * Special handling for removing a single page: this is
2664	 * a very common operation, and short-circuiting it here
2665	 * saves some work.
2666	 */
2667	if (sva + PAGE_SIZE == eva) {
2668		pde = pmap_pde(pmap, sva);
2669		if (pde && (*pde & PG_PS) == 0) {
2670			pmap_remove_page(pmap, sva, pde, &free);
2671			goto out;
2672		}
2673	}
2674
2675	for (; sva < eva; sva = va_next) {
2676
2677		if (pmap->pm_stats.resident_count == 0)
2678			break;
2679
2680		pml4e = pmap_pml4e(pmap, sva);
2681		if ((*pml4e & PG_V) == 0) {
2682			va_next = (sva + NBPML4) & ~PML4MASK;
2683			if (va_next < sva)
2684				va_next = eva;
2685			continue;
2686		}
2687
2688		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2689		if ((*pdpe & PG_V) == 0) {
2690			va_next = (sva + NBPDP) & ~PDPMASK;
2691			if (va_next < sva)
2692				va_next = eva;
2693			continue;
2694		}
2695
2696		/*
2697		 * Calculate index for next page table.
2698		 */
2699		va_next = (sva + NBPDR) & ~PDRMASK;
2700		if (va_next < sva)
2701			va_next = eva;
2702
2703		pde = pmap_pdpe_to_pde(pdpe, sva);
2704		ptpaddr = *pde;
2705
2706		/*
2707		 * Weed out invalid mappings.
2708		 */
2709		if (ptpaddr == 0)
2710			continue;
2711
2712		/*
2713		 * Check for large page.
2714		 */
2715		if ((ptpaddr & PG_PS) != 0) {
2716			/*
2717			 * Are we removing the entire large page?  If not,
2718			 * demote the mapping and fall through.
2719			 */
2720			if (sva + NBPDR == va_next && eva >= va_next) {
2721				/*
2722				 * The TLB entry for a PG_G mapping is
2723				 * invalidated by pmap_remove_pde().
2724				 */
2725				if ((ptpaddr & PG_G) == 0)
2726					anyvalid = 1;
2727				pmap_remove_pde(pmap, pde, sva, &free);
2728				continue;
2729			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2730				/* The large page mapping was destroyed. */
2731				continue;
2732			} else
2733				ptpaddr = *pde;
2734		}
2735
2736		/*
2737		 * Limit our scan to either the end of the va represented
2738		 * by the current page table page, or to the end of the
2739		 * range being removed.
2740		 */
2741		if (va_next > eva)
2742			va_next = eva;
2743
2744		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2745		    sva += PAGE_SIZE) {
2746			if (*pte == 0)
2747				continue;
2748
2749			/*
2750			 * The TLB entry for a PG_G mapping is invalidated
2751			 * by pmap_remove_pte().
2752			 */
2753			if ((*pte & PG_G) == 0)
2754				anyvalid = 1;
2755			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
2756				break;
2757		}
2758	}
2759out:
2760	if (anyvalid)
2761		pmap_invalidate_all(pmap);
2762	vm_page_unlock_queues();
2763	PMAP_UNLOCK(pmap);
2764	pmap_free_zero_pages(free);
2765}
2766
2767/*
2768 *	Routine:	pmap_remove_all
2769 *	Function:
2770 *		Removes this physical page from
2771 *		all physical maps in which it resides.
2772 *		Reflects back modify bits to the pager.
2773 *
2774 *	Notes:
2775 *		Original versions of this routine were very
2776 *		inefficient because they iteratively called
2777 *		pmap_remove (slow...)
2778 */
2779
2780void
2781pmap_remove_all(vm_page_t m)
2782{
2783	struct md_page *pvh;
2784	pv_entry_t pv;
2785	pmap_t pmap;
2786	pt_entry_t *pte, tpte;
2787	pd_entry_t *pde;
2788	vm_offset_t va;
2789	vm_page_t free;
2790
2791	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2792	    ("pmap_remove_all: page %p is fictitious", m));
2793	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2794	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
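	/*
	 * First demote any 2MB mappings of the page so that every mapping
	 * of it appears on the page's own 4KB pv list below.
	 */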
2795	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2796		va = pv->pv_va;
2797		pmap = PV_PMAP(pv);
2798		PMAP_LOCK(pmap);
2799		pde = pmap_pde(pmap, va);
2800		(void)pmap_demote_pde(pmap, pde, va);
2801		PMAP_UNLOCK(pmap);
2802	}
2803	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2804		pmap = PV_PMAP(pv);
2805		PMAP_LOCK(pmap);
2806		pmap_resident_count_dec(pmap, 1);
2807		pde = pmap_pde(pmap, pv->pv_va);
2808		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2809		    " a 2mpage in page %p's pv list", m));
2810		pte = pmap_pde_to_pte(pde, pv->pv_va);
2811		tpte = pte_load_clear(pte);
2812		if (tpte & PG_W)
2813			pmap->pm_stats.wired_count--;
2814		if (tpte & PG_A)
2815			vm_page_flag_set(m, PG_REFERENCED);
2816
2817		/*
2818		 * Update the vm_page_t clean and reference bits.
2819		 */
2820		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2821			vm_page_dirty(m);
2822		free = NULL;
2823		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
2824		pmap_invalidate_page(pmap, pv->pv_va);
2825		pmap_free_zero_pages(free);
2826		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2827		free_pv_entry(pmap, pv);
2828		PMAP_UNLOCK(pmap);
2829	}
2830	vm_page_flag_clear(m, PG_WRITEABLE);
2831}
2832
2833/*
2834 * pmap_protect_pde: apply the given protection to a 2MB page mapping
2835 */
2836static boolean_t
2837pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2838{
2839	pd_entry_t newpde, oldpde;
2840	vm_offset_t eva, va;
2841	vm_page_t m;
2842	boolean_t anychanged;
2843
2844	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2845	KASSERT((sva & PDRMASK) == 0,
2846	    ("pmap_protect_pde: sva is not 2mpage aligned"));
2847	anychanged = FALSE;
2848retry:
2849	oldpde = newpde = *pde;
2850	if (oldpde & PG_MANAGED) {
2851		eva = sva + NBPDR;
2852		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2853		    va < eva; va += PAGE_SIZE, m++)
2854			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2855				vm_page_dirty(m);
2856	}
2857	if ((prot & VM_PROT_WRITE) == 0)
2858		newpde &= ~(PG_RW | PG_M);
2859	if ((prot & VM_PROT_EXECUTE) == 0)
2860		newpde |= pg_nx;
2861	if (newpde != oldpde) {
2862		if (!atomic_cmpset_long(pde, oldpde, newpde))
2863			goto retry;
2864		if (oldpde & PG_G)
2865			pmap_invalidate_page(pmap, sva);
2866		else
2867			anychanged = TRUE;
2868	}
2869	return (anychanged);
2870}
2871
2872/*
2873 *	Set the physical protection on the
2874 *	specified range of this map as requested.
2875 */
2876void
2877pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2878{
2879	vm_offset_t va_next;
2880	pml4_entry_t *pml4e;
2881	pdp_entry_t *pdpe;
2882	pd_entry_t ptpaddr, *pde;
2883	pt_entry_t *pte;
2884	int anychanged;
2885
2886	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2887		pmap_remove(pmap, sva, eva);
2888		return;
2889	}
2890
2891	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2892	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2893		return;
2894
2895	anychanged = 0;
2896
2897	vm_page_lock_queues();
2898	PMAP_LOCK(pmap);
2899	for (; sva < eva; sva = va_next) {
2900
2901		pml4e = pmap_pml4e(pmap, sva);
2902		if ((*pml4e & PG_V) == 0) {
2903			va_next = (sva + NBPML4) & ~PML4MASK;
2904			if (va_next < sva)
2905				va_next = eva;
2906			continue;
2907		}
2908
2909		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2910		if ((*pdpe & PG_V) == 0) {
2911			va_next = (sva + NBPDP) & ~PDPMASK;
2912			if (va_next < sva)
2913				va_next = eva;
2914			continue;
2915		}
2916
2917		va_next = (sva + NBPDR) & ~PDRMASK;
2918		if (va_next < sva)
2919			va_next = eva;
2920
2921		pde = pmap_pdpe_to_pde(pdpe, sva);
2922		ptpaddr = *pde;
2923
2924		/*
2925		 * Weed out invalid mappings.
2926		 */
2927		if (ptpaddr == 0)
2928			continue;
2929
2930		/*
2931		 * Check for large page.
2932		 */
2933		if ((ptpaddr & PG_PS) != 0) {
2934			/*
2935			 * Are we protecting the entire large page?  If not,
2936			 * demote the mapping and fall through.
2937			 */
2938			if (sva + NBPDR == va_next && eva >= va_next) {
2939				/*
2940				 * The TLB entry for a PG_G mapping is
2941				 * invalidated by pmap_protect_pde().
2942				 */
2943				if (pmap_protect_pde(pmap, pde, sva, prot))
2944					anychanged = 1;
2945				continue;
2946			} else if (!pmap_demote_pde(pmap, pde, sva)) {
2947				/* The large page mapping was destroyed. */
2948				continue;
2949			}
2950		}
2951
2952		if (va_next > eva)
2953			va_next = eva;
2954
2955		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2956		    sva += PAGE_SIZE) {
2957			pt_entry_t obits, pbits;
2958			vm_page_t m;
2959
2960retry:
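			/*
			 * The PTE is updated with an atomic compare-and-set
			 * so that a concurrent hardware update of PG_A or
			 * PG_M by another processor is not lost; on failure
			 * the new value is recomputed from the fresh PTE.
			 */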
2961			obits = pbits = *pte;
2962			if ((pbits & PG_V) == 0)
2963				continue;
2964
2965			if ((prot & VM_PROT_WRITE) == 0) {
2966				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
2967				    (PG_MANAGED | PG_M | PG_RW)) {
2968					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2969					vm_page_dirty(m);
2970				}
2971				pbits &= ~(PG_RW | PG_M);
2972			}
2973			if ((prot & VM_PROT_EXECUTE) == 0)
2974				pbits |= pg_nx;
2975
2976			if (pbits != obits) {
2977				if (!atomic_cmpset_long(pte, obits, pbits))
2978					goto retry;
2979				if (obits & PG_G)
2980					pmap_invalidate_page(pmap, sva);
2981				else
2982					anychanged = 1;
2983			}
2984		}
2985	}
2986	if (anychanged)
2987		pmap_invalidate_all(pmap);
2988	vm_page_unlock_queues();
2989	PMAP_UNLOCK(pmap);
2990}
2991
2992/*
2993 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2994 * single page table page (PTP) to a single 2MB page mapping.  For promotion
2995 * to occur, two conditions must be met: (1) the 4KB page mappings must map
2996 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2997 * identical characteristics.
2998 */
2999static void
3000pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3001{
3002	pd_entry_t newpde;
3003	pt_entry_t *firstpte, oldpte, pa, *pte;
3004	vm_offset_t oldpteva;
3005	vm_page_t mpte;
3006
3007	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3008
3009	/*
3010	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3011	 * either invalid, unused, or does not map the first 4KB physical page
3012	 * within a 2MB page.
3013	 */
3014	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
3015setpde:
3016	newpde = *firstpte;
3017	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3018		pmap_pde_p_failures++;
3019		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3020		    " in pmap %p", va, pmap);
3021		return;
3022	}
3023	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3024		/*
3025		 * When PG_M is already clear, PG_RW can be cleared without
3026		 * a TLB invalidation.
3027		 */
3028		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
3029			goto setpde;
3030		newpde &= ~PG_RW;
3031	}
3032
3033	/*
3034	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3035	 * PTE maps an unexpected 4KB physical page or does not have identical
3036	 * characteristics to the first PTE.
3037	 */
3038	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3039	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3040setpte:
3041		oldpte = *pte;
3042		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3043			pmap_pde_p_failures++;
3044			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3045			    " in pmap %p", va, pmap);
3046			return;
3047		}
3048		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3049			/*
3050			 * When PG_M is already clear, PG_RW can be cleared
3051			 * without a TLB invalidation.
3052			 */
3053			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
3054				goto setpte;
3055			oldpte &= ~PG_RW;
3056			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3057			    (va & ~PDRMASK);
3058			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
3059			    " in pmap %p", oldpteva, pmap);
3060		}
3061		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3062			pmap_pde_p_failures++;
3063			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3064			    " in pmap %p", va, pmap);
3065			return;
3066		}
3067		pa -= PAGE_SIZE;
3068	}
3069
3070	/*
3071	 * Save the page table page in its current state until the PDE
3072	 * mapping the superpage is demoted by pmap_demote_pde() or
3073	 * destroyed by pmap_remove_pde().
3074	 */
3075	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3076	KASSERT(mpte >= vm_page_array &&
3077	    mpte < &vm_page_array[vm_page_array_size],
3078	    ("pmap_promote_pde: page table page is out of range"));
3079	KASSERT(mpte->pindex == pmap_pde_pindex(va),
3080	    ("pmap_promote_pde: page table page's pindex is wrong"));
3081	pmap_insert_pt_page(pmap, mpte);
3082
3083	/*
3084	 * Promote the pv entries.
3085	 */
3086	if ((newpde & PG_MANAGED) != 0)
3087		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3088
3089	/*
3090	 * Propagate the PAT index to its proper position.
3091	 */
3092	if ((newpde & PG_PTE_PAT) != 0)
3093		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3094
3095	/*
3096	 * Map the superpage.
3097	 */
3098	if (workaround_erratum383)
3099		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3100	else
3101		pde_store(pde, PG_PS | newpde);
3102
3103	pmap_pde_promotions++;
3104	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
3105	    " in pmap %p", va, pmap);
3106}
3107
3108/*
3109 *	Insert the given physical page (p) at
3110 *	the specified virtual address (v) in the
3111 *	target physical map with the protection requested.
3112 *
3113 *	If specified, the page will be wired down, meaning
3114 *	that the related pte can not be reclaimed.
3115 *
3116 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3117 *	or lose information.  That is, this routine must actually
3118 *	insert this page into the given map NOW.
3119 */
3120void
3121pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3122    vm_prot_t prot, boolean_t wired)
3123{
3124	vm_paddr_t pa;
3125	pd_entry_t *pde;
3126	pt_entry_t *pte;
3127	vm_paddr_t opa;
3128	pt_entry_t origpte, newpte;
3129	vm_page_t mpte, om;
3130	boolean_t invlva;
3131
3132	va = trunc_page(va);
3133	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3134	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3135	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
3136
3137	mpte = NULL;
3138
3139	vm_page_lock_queues();
3140	PMAP_LOCK(pmap);
3141
3142	/*
3143	 * In the case that a page table page is not
3144	 * resident, we are creating it here.
3145	 */
3146	if (va < VM_MAXUSER_ADDRESS) {
3147		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3148	}
3149
3150	pde = pmap_pde(pmap, va);
3151	if (pde != NULL && (*pde & PG_V) != 0) {
3152		if ((*pde & PG_PS) != 0)
3153			panic("pmap_enter: attempted pmap_enter on 2MB page");
3154		pte = pmap_pde_to_pte(pde, va);
3155	} else
3156		panic("pmap_enter: invalid page directory va=%#lx", va);
3157
3158	pa = VM_PAGE_TO_PHYS(m);
3159	om = NULL;
3160	origpte = *pte;
3161	opa = origpte & PG_FRAME;
3162
3163	/*
3164	 * Mapping has not changed, must be protection or wiring change.
3165	 */
3166	if (origpte && (opa == pa)) {
3167		/*
3168		 * Wiring change, just update stats. We don't worry about
3169		 * wiring PT pages as they remain resident as long as there
3170		 * are valid mappings in them. Hence, if a user page is wired,
3171		 * the PT page will be also.
3172		 */
3173		if (wired && ((origpte & PG_W) == 0))
3174			pmap->pm_stats.wired_count++;
3175		else if (!wired && (origpte & PG_W))
3176			pmap->pm_stats.wired_count--;
3177
3178		/*
3179		 * Remove extra pte reference
3180		 */
3181		if (mpte)
3182			mpte->wire_count--;
3183
3184		/*
3185		 * We might be turning off write access to the page,
3186		 * so we go ahead and sense modify status.
3187		 */
3188		if (origpte & PG_MANAGED) {
3189			om = m;
3190			pa |= PG_MANAGED;
3191		}
3192		goto validate;
3193	}
3194	/*
3195	 * Mapping has changed, invalidate old range and fall through to
3196	 * handle validating new mapping.
3197	 */
3198	if (opa) {
3199		if (origpte & PG_W)
3200			pmap->pm_stats.wired_count--;
3201		if (origpte & PG_MANAGED) {
3202			om = PHYS_TO_VM_PAGE(opa);
3203			pmap_remove_entry(pmap, om, va);
3204		}
3205		if (mpte != NULL) {
3206			mpte->wire_count--;
3207			KASSERT(mpte->wire_count > 0,
3208			    ("pmap_enter: missing reference to page table page,"
3209			     " va: 0x%lx", va));
3210		}
3211	} else
3212		pmap_resident_count_inc(pmap, 1);
3213
3214	/*
3215	 * Enter on the PV list if part of our managed memory.
3216	 */
3217	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3218		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3219		    ("pmap_enter: managed mapping within the clean submap"));
3220		pmap_insert_entry(pmap, va, m);
3221		pa |= PG_MANAGED;
3222	}
3223
3224	/*
3225	 * Increment counters
3226	 */
3227	if (wired)
3228		pmap->pm_stats.wired_count++;
3229
3230validate:
3231	/*
3232	 * Now validate mapping with desired protection/wiring.
3233	 */
3234	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3235	if ((prot & VM_PROT_WRITE) != 0) {
3236		newpte |= PG_RW;
3237		vm_page_flag_set(m, PG_WRITEABLE);
3238	}
3239	if ((prot & VM_PROT_EXECUTE) == 0)
3240		newpte |= pg_nx;
3241	if (wired)
3242		newpte |= PG_W;
3243	if (va < VM_MAXUSER_ADDRESS)
3244		newpte |= PG_U;
3245	if (pmap == kernel_pmap)
3246		newpte |= PG_G;
3247
3248	/*
3249	 * if the mapping or permission bits are different, we need
3250	 * to update the pte.
3251	 */
3252	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3253		newpte |= PG_A;
3254		if ((access & VM_PROT_WRITE) != 0)
3255			newpte |= PG_M;
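		/*
		 * If the old PTE was valid, replace it atomically and only
		 * invalidate the TLB when a stale, cached translation could
		 * still be exercised: the frame or NX status changed while
		 * PG_A was set, or a dirty, writable translation lost PG_RW.
		 */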
3256		if (origpte & PG_V) {
3257			invlva = FALSE;
3258			origpte = pte_load_store(pte, newpte);
3259			if (origpte & PG_A) {
3260				if (origpte & PG_MANAGED)
3261					vm_page_flag_set(om, PG_REFERENCED);
3262				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
3263				    PG_NX) == 0 && (newpte & PG_NX)))
3264					invlva = TRUE;
3265			}
3266			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3267				if ((origpte & PG_MANAGED) != 0)
3268					vm_page_dirty(om);
3269				if ((newpte & PG_RW) == 0)
3270					invlva = TRUE;
3271			}
3272			if (invlva)
3273				pmap_invalidate_page(pmap, va);
3274		} else
3275			pte_store(pte, newpte);
3276	}
3277
3278	/*
3279	 * If both the page table page and the reservation are fully
3280	 * populated, then attempt promotion.
3281	 */
3282	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3283	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3284		pmap_promote_pde(pmap, pde, va);
3285
3286	vm_page_unlock_queues();
3287	PMAP_UNLOCK(pmap);
3288}
3289
3290/*
3291 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
3292 * otherwise.  Fails if (1) a page table page cannot be allocated without
3293 * blocking, (2) a mapping already exists at the specified virtual address, or
3294 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3295 */
3296static boolean_t
3297pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3298{
3299	pd_entry_t *pde, newpde;
3300	vm_page_t free, mpde;
3301
3302	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3303	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3304	if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3305		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3306		    " in pmap %p", va, pmap);
3307		return (FALSE);
3308	}
3309	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
3310	pde = &pde[pmap_pde_index(va)];
3311	if ((*pde & PG_V) != 0) {
3312		KASSERT(mpde->wire_count > 1,
3313		    ("pmap_enter_pde: mpde's wire count is too low"));
3314		mpde->wire_count--;
3315		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3316		    " in pmap %p", va, pmap);
3317		return (FALSE);
3318	}
3319	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3320	    PG_PS | PG_V;
3321	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3322		newpde |= PG_MANAGED;
3323
3324		/*
3325		 * Abort this mapping if its PV entry could not be created.
3326		 */
3327		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3328			free = NULL;
3329			if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
3330				pmap_invalidate_page(pmap, va);
3331				pmap_free_zero_pages(free);
3332			}
3333			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3334			    " in pmap %p", va, pmap);
3335			return (FALSE);
3336		}
3337	}
3338	if ((prot & VM_PROT_EXECUTE) == 0)
3339		newpde |= pg_nx;
3340	if (va < VM_MAXUSER_ADDRESS)
3341		newpde |= PG_U;
3342
3343	/*
3344	 * Increment counters.
3345	 */
3346	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
3347
3348	/*
3349	 * Map the superpage.
3350	 */
3351	pde_store(pde, newpde);
3352
3353	pmap_pde_mappings++;
3354	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3355	    " in pmap %p", va, pmap);
3356	return (TRUE);
3357}
3358
3359/*
3360 * Maps a sequence of resident pages belonging to the same object.
3361 * The sequence begins with the given page m_start.  This page is
3362 * mapped at the given virtual address start.  Each subsequent page is
3363 * mapped at a virtual address that is offset from start by the same
3364 * amount as the page is offset from m_start within the object.  The
3365 * last page in the sequence is the page with the largest offset from
3366 * m_start that can be mapped at a virtual address less than the given
3367 * virtual address end.  Not every virtual page between start and end
3368 * is mapped; only those for which a resident page exists with the
3369 * corresponding offset from m_start are mapped.
3370 */
3371void
3372pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3373    vm_page_t m_start, vm_prot_t prot)
3374{
3375	vm_offset_t va;
3376	vm_page_t m, mpte;
3377	vm_pindex_t diff, psize;
3378
3379	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3380	psize = atop(end - start);
3381	mpte = NULL;
3382	m = m_start;
3383	PMAP_LOCK(pmap);
3384	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3385		va = start + ptoa(diff);
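		/*
		 * Attempt a 2MB mapping when "va" and the physical address
		 * are superpage aligned, the entire superpage fits below
		 * "end", superpages are enabled, and a fully populated
		 * reservation backs the page run.
		 */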
3386		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3387		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3388		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3389		    pmap_enter_pde(pmap, va, m, prot))
3390			m = &m[NBPDR / PAGE_SIZE - 1];
3391		else
3392			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3393			    mpte);
3394		m = TAILQ_NEXT(m, listq);
3395	}
3396 	PMAP_UNLOCK(pmap);
3397}
3398
3399/*
3400 * This code makes some *MAJOR* assumptions:
3401 * 1. The current pmap and the target pmap exist.
3402 * 2. Not wired.
3403 * 3. Read access.
3404 * 4. No page table pages.
3405 * but it is *MUCH* faster than pmap_enter...
3406 */
3407
3408void
3409pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3410{
3411
3412	PMAP_LOCK(pmap);
3413	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3414	PMAP_UNLOCK(pmap);
3415}
3416
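/*
 * Create a read-only, unwired mapping for "m" at "va" without sleeping or
 * reclaiming resources; the mapping is simply skipped if it cannot be
 * created cheaply.  For user addresses the page table page is returned so
 * that a caller mapping consecutive pages can pass it back and reuse it.
 */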
3417static vm_page_t
3418pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3419    vm_prot_t prot, vm_page_t mpte)
3420{
3421	vm_page_t free;
3422	pt_entry_t *pte;
3423	vm_paddr_t pa;
3424
3425	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3426	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3427	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3428	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3429	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3430
3431	/*
3432	 * In the case that a page table page is not
3433	 * resident, we are creating it here.
3434	 */
3435	if (va < VM_MAXUSER_ADDRESS) {
3436		vm_pindex_t ptepindex;
3437		pd_entry_t *ptepa;
3438
3439		/*
3440		 * Calculate pagetable page index
3441		 */
3442		ptepindex = pmap_pde_pindex(va);
3443		if (mpte && (mpte->pindex == ptepindex)) {
3444			mpte->wire_count++;
3445		} else {
3446			/*
3447			 * Get the page directory entry
3448			 */
3449			ptepa = pmap_pde(pmap, va);
3450
3451			/*
3452			 * If the page table page is already mapped, we just
3453			 * increment its wire count.
3454			 */
3455			if (ptepa && (*ptepa & PG_V) != 0) {
3456				if (*ptepa & PG_PS)
3457					return (NULL);
3458				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3459				mpte->wire_count++;
3460			} else {
3461				mpte = _pmap_allocpte(pmap, ptepindex,
3462				    M_NOWAIT);
3463				if (mpte == NULL)
3464					return (mpte);
3465			}
3466		}
3467		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3468		pte = &pte[pmap_pte_index(va)];
3469	} else {
3470		mpte = NULL;
3471		pte = vtopte(va);
3472	}
3473	if (*pte) {
3474		if (mpte != NULL) {
3475			mpte->wire_count--;
3476			mpte = NULL;
3477		}
3478		return (mpte);
3479	}
3480
3481	/*
3482	 * Enter on the PV list if part of our managed memory.
3483	 */
3484	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3485	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3486		if (mpte != NULL) {
3487			free = NULL;
3488			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
3489				pmap_invalidate_page(pmap, va);
3490				pmap_free_zero_pages(free);
3491			}
3492			mpte = NULL;
3493		}
3494		return (mpte);
3495	}
3496
3497	/*
3498	 * Increment counters
3499	 */
3500	pmap_resident_count_inc(pmap, 1);
3501
3502	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3503	if ((prot & VM_PROT_EXECUTE) == 0)
3504		pa |= pg_nx;
3505
3506	/*
3507	 * Now validate mapping with RO protection
3508	 */
3509	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3510		pte_store(pte, pa | PG_V | PG_U);
3511	else
3512		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3513	return (mpte);
3514}
3515
3516/*
3517 * Make a temporary mapping for a physical address.  This is only intended
3518 * to be used for panic dumps.
3519 */
3520void *
3521pmap_kenter_temporary(vm_paddr_t pa, int i)
3522{
3523	vm_offset_t va;
3524
3525	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3526	pmap_kenter(va, pa);
3527	invlpg(va);
3528	return ((void *)crashdumpmap);
3529}
3530
3531/*
3532 * This code maps large physical mmap regions into the
3533 * processor address space.  Note that some shortcuts
3534 * are taken, but the code works.
3535 */
3536void
3537pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3538    vm_pindex_t pindex, vm_size_t size)
3539{
3540	pd_entry_t *pde;
3541	vm_paddr_t pa, ptepa;
3542	vm_page_t p, pdpg;
3543	int pat_mode;
3544
3545	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3546	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3547	    ("pmap_object_init_pt: non-device object"));
3548	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3549		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3550			return;
3551		p = vm_page_lookup(object, pindex);
3552		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3553		    ("pmap_object_init_pt: invalid page %p", p));
3554		pat_mode = p->md.pat_mode;
3555
3556		/*
3557		 * Abort the mapping if the first page is not physically
3558		 * aligned to a 2MB page boundary.
3559		 */
3560		ptepa = VM_PAGE_TO_PHYS(p);
3561		if (ptepa & (NBPDR - 1))
3562			return;
3563
3564		/*
3565		 * Skip the first page.  Abort the mapping if the rest of
3566		 * the pages are not physically contiguous or have differing
3567		 * memory attributes.
3568		 */
3569		p = TAILQ_NEXT(p, listq);
3570		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3571		    pa += PAGE_SIZE) {
3572			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3573			    ("pmap_object_init_pt: invalid page %p", p));
3574			if (pa != VM_PAGE_TO_PHYS(p) ||
3575			    pat_mode != p->md.pat_mode)
3576				return;
3577			p = TAILQ_NEXT(p, listq);
3578		}
3579
3580		/*
3581		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
3582		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
3583		 * will not affect the termination of this loop.
3584		 */
3585		PMAP_LOCK(pmap);
3586		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3587		    size; pa += NBPDR) {
3588			pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
3589			if (pdpg == NULL) {
3590				/*
3591				 * The creation of mappings below is only an
3592				 * optimization.  If a page directory page
3593				 * cannot be allocated without blocking,
3594				 * continue on to the next mapping rather than
3595				 * blocking.
3596				 */
3597				addr += NBPDR;
3598				continue;
3599			}
3600			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3601			pde = &pde[pmap_pde_index(addr)];
3602			if ((*pde & PG_V) == 0) {
3603				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3604				    PG_U | PG_RW | PG_V);
3605				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
3606				pmap_pde_mappings++;
3607			} else {
3608				/* Continue on if the PDE is already valid. */
3609				pdpg->wire_count--;
3610				KASSERT(pdpg->wire_count > 0,
3611				    ("pmap_object_init_pt: missing reference "
3612				    "to page directory page, va: 0x%lx", addr));
3613			}
3614			addr += NBPDR;
3615		}
3616		PMAP_UNLOCK(pmap);
3617	}
3618}
3619
3620/*
3621 *	Routine:	pmap_change_wiring
3622 *	Function:	Change the wiring attribute for a map/virtual-address
3623 *			pair.
3624 *	In/out conditions:
3625 *			The mapping must already exist in the pmap.
3626 */
3627void
3628pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3629{
3630	pd_entry_t *pde;
3631	pt_entry_t *pte;
3632	boolean_t are_queues_locked;
3633
3634	are_queues_locked = FALSE;
3635
3636	/*
3637	 * Wiring is not a hardware characteristic, so there is no need
3638	 * to invalidate the TLB.
3639	 */
3640retry:
3641	PMAP_LOCK(pmap);
3642	pde = pmap_pde(pmap, va);
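	/*
	 * A 2MB mapping must be demoted before the wired attribute of a
	 * single 4KB page within it can be changed.
	 */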
3643	if ((*pde & PG_PS) != 0) {
3644		if (!wired != ((*pde & PG_W) == 0)) {
3645			if (!are_queues_locked) {
3646				are_queues_locked = TRUE;
3647				if (!mtx_trylock(&vm_page_queue_mtx)) {
3648					PMAP_UNLOCK(pmap);
3649					vm_page_lock_queues();
3650					goto retry;
3651				}
3652			}
3653			if (!pmap_demote_pde(pmap, pde, va))
3654				panic("pmap_change_wiring: demotion failed");
3655		} else
3656			goto out;
3657	}
3658	pte = pmap_pde_to_pte(pde, va);
3659	if (wired && (*pte & PG_W) == 0) {
3660		pmap->pm_stats.wired_count++;
3661		atomic_set_long(pte, PG_W);
3662	} else if (!wired && (*pte & PG_W) != 0) {
3663		pmap->pm_stats.wired_count--;
3664		atomic_clear_long(pte, PG_W);
3665	}
3666out:
3667	if (are_queues_locked)
3668		vm_page_unlock_queues();
3669	PMAP_UNLOCK(pmap);
3670}
3671
3672/*
3673 *	Copy the range specified by src_addr/len
3674 *	from the source map to the range dst_addr/len
3675 *	in the destination map.
3676 *
3677 *	This routine is only advisory and need not do anything.
3678 */
3679
3680void
3681pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3682    vm_offset_t src_addr)
3683{
3684	vm_page_t   free;
3685	vm_offset_t addr;
3686	vm_offset_t end_addr = src_addr + len;
3687	vm_offset_t va_next;
3688
3689	if (dst_addr != src_addr)
3690		return;
3691
3692	vm_page_lock_queues();
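	/*
	 * Acquire the two pmap locks in a fixed (ascending address) order
	 * so that concurrent pmap_copy() calls in opposite directions
	 * cannot deadlock.
	 */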
3693	if (dst_pmap < src_pmap) {
3694		PMAP_LOCK(dst_pmap);
3695		PMAP_LOCK(src_pmap);
3696	} else {
3697		PMAP_LOCK(src_pmap);
3698		PMAP_LOCK(dst_pmap);
3699	}
3700	for (addr = src_addr; addr < end_addr; addr = va_next) {
3701		pt_entry_t *src_pte, *dst_pte;
3702		vm_page_t dstmpde, dstmpte, srcmpte;
3703		pml4_entry_t *pml4e;
3704		pdp_entry_t *pdpe;
3705		pd_entry_t srcptepaddr, *pde;
3706
3707		KASSERT(addr < UPT_MIN_ADDRESS,
3708		    ("pmap_copy: invalid to pmap_copy page tables"));
3709
3710		pml4e = pmap_pml4e(src_pmap, addr);
3711		if ((*pml4e & PG_V) == 0) {
3712			va_next = (addr + NBPML4) & ~PML4MASK;
3713			if (va_next < addr)
3714				va_next = end_addr;
3715			continue;
3716		}
3717
3718		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
3719		if ((*pdpe & PG_V) == 0) {
3720			va_next = (addr + NBPDP) & ~PDPMASK;
3721			if (va_next < addr)
3722				va_next = end_addr;
3723			continue;
3724		}
3725
3726		va_next = (addr + NBPDR) & ~PDRMASK;
3727		if (va_next < addr)
3728			va_next = end_addr;
3729
3730		pde = pmap_pdpe_to_pde(pdpe, addr);
3731		srcptepaddr = *pde;
3732		if (srcptepaddr == 0)
3733			continue;
3734
3735		if (srcptepaddr & PG_PS) {
3736			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
3737			if (dstmpde == NULL)
3738				break;
3739			pde = (pd_entry_t *)
3740			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
3741			pde = &pde[pmap_pde_index(addr)];
3742			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
3743			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3744			    PG_PS_FRAME))) {
3745				*pde = srcptepaddr & ~PG_W;
3746				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
3747			} else
3748				dstmpde->wire_count--;
3749			continue;
3750		}
3751
3752		srcptepaddr &= PG_FRAME;
3753		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
3754		KASSERT(srcmpte->wire_count > 0,
3755		    ("pmap_copy: source page table page is unused"));
3756
3757		if (va_next > end_addr)
3758			va_next = end_addr;
3759
3760		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
3761		src_pte = &src_pte[pmap_pte_index(addr)];
3762		dstmpte = NULL;
3763		while (addr < va_next) {
3764			pt_entry_t ptetemp;
3765			ptetemp = *src_pte;
3766			/*
3767			 * We only virtual-copy managed pages.
3768			 */
3769			if ((ptetemp & PG_MANAGED) != 0) {
3770				if (dstmpte != NULL &&
3771				    dstmpte->pindex == pmap_pde_pindex(addr))
3772					dstmpte->wire_count++;
3773				else if ((dstmpte = pmap_allocpte(dst_pmap,
3774				    addr, M_NOWAIT)) == NULL)
3775					goto out;
3776				dst_pte = (pt_entry_t *)
3777				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
3778				dst_pte = &dst_pte[pmap_pte_index(addr)];
3779				if (*dst_pte == 0 &&
3780				    pmap_try_insert_pv_entry(dst_pmap, addr,
3781				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3782					/*
3783					 * Clear the wired, modified, and
3784					 * accessed (referenced) bits
3785					 * during the copy.
3786					 */
3787					*dst_pte = ptetemp & ~(PG_W | PG_M |
3788					    PG_A);
3789					pmap_resident_count_inc(dst_pmap, 1);
3790				} else {
3791					free = NULL;
3792					if (pmap_unwire_pte_hold(dst_pmap,
3793					    addr, dstmpte, &free)) {
3794						pmap_invalidate_page(dst_pmap,
3795						    addr);
3796						pmap_free_zero_pages(free);
3797					}
3798					goto out;
3799				}
3800				if (dstmpte->wire_count >= srcmpte->wire_count)
3801					break;
3802			}
3803			addr += PAGE_SIZE;
3804			src_pte++;
3805		}
3806	}
3807out:
3808	vm_page_unlock_queues();
3809	PMAP_UNLOCK(src_pmap);
3810	PMAP_UNLOCK(dst_pmap);
3811}
3812
3813/*
3814 *	pmap_zero_page zeros the specified hardware page through its
3815 *	direct map address, using pagezero().
3816 */
3817void
3818pmap_zero_page(vm_page_t m)
3819{
3820	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3821
3822	pagezero((void *)va);
3823}
3824
3825/*
3826 *	pmap_zero_page_area zeros the specified area of a hardware page
3827 *	through its direct map address, using pagezero() or bzero().
3828 *
3829 *	off and size may not cover an area beyond a single hardware page.
3830 */
3831void
3832pmap_zero_page_area(vm_page_t m, int off, int size)
3833{
3834	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3835
3836	if (off == 0 && size == PAGE_SIZE)
3837		pagezero((void *)va);
3838	else
3839		bzero((char *)va + off, size);
3840}
3841
3842/*
3843 *	pmap_zero_page_idle zeros the specified hardware page through its
3844 *	direct map address, using pagezero().  This
3845 *	is intended to be called from the vm_pagezero process only and
3846 *	outside of Giant.
3847 */
3848void
3849pmap_zero_page_idle(vm_page_t m)
3850{
3851	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3852
3853	pagezero((void *)va);
3854}
3855
3856/*
3857 *	pmap_copy_page copies the specified (machine independent)
3858 *	page by accessing the source and destination pages through
3859 *	their direct map addresses and using pagecopy(), one machine
3860 *	dependent page at a time.
3861 */
3862void
3863pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3864{
3865	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3866	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3867
3868	pagecopy((void *)src, (void *)dst);
3869}
3870
3871/*
3872 * Returns TRUE if the given pmap holds one of the first
3873 * 16 pv entries linked to this page.  This limit may be
3874 * raised or lowered in the future; it is only necessary
3875 * that TRUE be returned for a small subset of pmaps for
3876 * proper page aging.
3877 */
3878boolean_t
3879pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3880{
3881	struct md_page *pvh;
3882	pv_entry_t pv;
3883	int loops = 0;
3884
3885	if (m->flags & PG_FICTITIOUS)
3886		return (FALSE);
3887
3888	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3889	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3890		if (PV_PMAP(pv) == pmap) {
3891			return (TRUE);
3892		}
3893		loops++;
3894		if (loops >= 16)
3895			break;
3896	}
3897	if (loops < 16) {
3898		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3899		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3900			if (PV_PMAP(pv) == pmap)
3901				return (TRUE);
3902			loops++;
3903			if (loops >= 16)
3904				break;
3905		}
3906	}
3907	return (FALSE);
3908}
3909
3910/*
3911 *	pmap_page_wired_mappings:
3912 *
3913 *	Return the number of managed mappings to the given physical page
3914 *	that are wired.
3915 */
3916int
3917pmap_page_wired_mappings(vm_page_t m)
3918{
3919	int count;
3920
3921	count = 0;
3922	if ((m->flags & PG_FICTITIOUS) != 0)
3923		return (count);
3924	count = pmap_pvh_wired_mappings(&m->md, count);
3925	return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
3926}
3927
3928/*
3929 *	pmap_pvh_wired_mappings:
3930 *
3931 *	Return the updated number "count" of managed mappings that are wired.
3932 */
3933static int
3934pmap_pvh_wired_mappings(struct md_page *pvh, int count)
3935{
3936	pmap_t pmap;
3937	pt_entry_t *pte;
3938	pv_entry_t pv;
3939
3940	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3941	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3942		pmap = PV_PMAP(pv);
3943		PMAP_LOCK(pmap);
3944		pte = pmap_pte(pmap, pv->pv_va);
3945		if ((*pte & PG_W) != 0)
3946			count++;
3947		PMAP_UNLOCK(pmap);
3948	}
3949	return (count);
3950}
3951
3952/*
3953 * Returns TRUE if the given page is mapped individually or as part of
3954 * a 2mpage.  Otherwise, returns FALSE.
3955 */
3956boolean_t
3957pmap_page_is_mapped(vm_page_t m)
3958{
3959	struct md_page *pvh;
3960
3961	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3962		return (FALSE);
3963	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3964	if (TAILQ_EMPTY(&m->md.pv_list)) {
3965		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3966		return (!TAILQ_EMPTY(&pvh->pv_list));
3967	} else
3968		return (TRUE);
3969}
3970
3971/*
3972 * Remove all pages from the specified address space;
3973 * this aids process exit speeds.  Also, this code is
3974 * special-cased for the current process only, but
3975 * can have the more generic (and slightly slower)
3976 * mode enabled.  This is much faster than pmap_remove
3977 * in the case of running down an entire address space.
3978 */
3979void
3980pmap_remove_pages(pmap_t pmap)
3981{
3982	pd_entry_t ptepde;
3983	pt_entry_t *pte, tpte;
3984	vm_page_t free = NULL;
3985	vm_page_t m, mpte, mt;
3986	pv_entry_t pv;
3987	struct md_page *pvh;
3988	struct pv_chunk *pc, *npc;
3989	int field, idx;
3990	int64_t bit;
3991	uint64_t inuse, bitmask;
3992	int allfree;
3993
3994	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3995		printf("warning: pmap_remove_pages called with non-current pmap\n");
3996		return;
3997	}
3998	vm_page_lock_queues();
3999	PMAP_LOCK(pmap);
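	/*
	 * Rather than walking each page's pv list, iterate over this
	 * pmap's pv chunks directly: every clear bit in pc_map identifies
	 * an allocated pv entry, and "field * 64 + bit" recovers its
	 * index within the chunk.
	 */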
4000	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4001		allfree = 1;
4002		for (field = 0; field < _NPCM; field++) {
4003			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4004			while (inuse != 0) {
4005				bit = bsfq(inuse);
4006				bitmask = 1UL << bit;
4007				idx = field * 64 + bit;
4008				pv = &pc->pc_pventry[idx];
4009				inuse &= ~bitmask;
4010
4011				pte = pmap_pdpe(pmap, pv->pv_va);
4012				ptepde = *pte;
4013				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
4014				tpte = *pte;
4015				if ((tpte & (PG_PS | PG_V)) == PG_V) {
4016					ptepde = tpte;
4017					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
4018					    PG_FRAME);
4019					pte = &pte[pmap_pte_index(pv->pv_va)];
4020					tpte = *pte & ~PG_PTE_PAT;
4021				}
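				/*
				 * PG_PTE_PAT was cleared from "tpte" above
				 * because it occupies the same bit position
				 * as PG_PS; leaving it set would make the
				 * PG_PS tests below treat a 4KB mapping as
				 * a 2MB page.
				 */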
4022				if ((tpte & PG_V) == 0)
4023					panic("bad pte");
4024
4025/*
4026 * We cannot remove wired pages from a process' mapping at this time
4027 */
4028				if (tpte & PG_W) {
4029					allfree = 0;
4030					continue;
4031				}
4032
4033				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4034				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4035				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4036				    m, (uintmax_t)m->phys_addr,
4037				    (uintmax_t)tpte));
4038
4039				KASSERT(m < &vm_page_array[vm_page_array_size],
4040					("pmap_remove_pages: bad tpte %#jx",
4041					(uintmax_t)tpte));
4042
4043				pte_clear(pte);
4044
4045				/*
4046				 * Update the vm_page_t clean/reference bits.
4047				 */
4048				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4049					if ((tpte & PG_PS) != 0) {
4050						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4051							vm_page_dirty(mt);
4052					} else
4053						vm_page_dirty(m);
4054				}
4055
4056				/* Mark free */
4057				PV_STAT(pv_entry_frees++);
4058				PV_STAT(pv_entry_spare++);
4059				pv_entry_count--;
4060				pc->pc_map[field] |= bitmask;
4061				if ((tpte & PG_PS) != 0) {
4062					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
4063					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4064					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4065					if (TAILQ_EMPTY(&pvh->pv_list)) {
4066						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4067							if (TAILQ_EMPTY(&mt->md.pv_list))
4068								vm_page_flag_clear(mt, PG_WRITEABLE);
4069					}
4070					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4071					if (mpte != NULL) {
4072						pmap_remove_pt_page(pmap, mpte);
4073						pmap_resident_count_dec(pmap, 1);
4074						KASSERT(mpte->wire_count == NPTEPG,
4075						    ("pmap_remove_pages: pte page wire count error"));
4076						mpte->wire_count = 0;
4077						pmap_add_delayed_free_list(mpte, &free, FALSE);
4078						atomic_subtract_int(&cnt.v_wire_count, 1);
4079					}
4080				} else {
4081					pmap_resident_count_dec(pmap, 1);
4082					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4083					if (TAILQ_EMPTY(&m->md.pv_list)) {
4084						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4085						if (TAILQ_EMPTY(&pvh->pv_list))
4086							vm_page_flag_clear(m, PG_WRITEABLE);
4087					}
4088				}
4089				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4090			}
4091		}
4092		if (allfree) {
4093			PV_STAT(pv_entry_spare -= _NPCPV);
4094			PV_STAT(pc_chunk_count--);
4095			PV_STAT(pc_chunk_frees++);
4096			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4097			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
4098			dump_drop_page(m->phys_addr);
4099			vm_page_unwire(m, 0);
4100			vm_page_free(m);
4101		}
4102	}
4103	pmap_invalidate_all(pmap);
4104	vm_page_unlock_queues();
4105	PMAP_UNLOCK(pmap);
4106	pmap_free_zero_pages(free);
4107}
4108
4109/*
4110 *	pmap_is_modified:
4111 *
4112 *	Return whether or not the specified physical page was modified
4113 *	in any physical maps.
4114 */
4115boolean_t
4116pmap_is_modified(vm_page_t m)
4117{
4118
4119	if (m->flags & PG_FICTITIOUS)
4120		return (FALSE);
4121	if (pmap_is_modified_pvh(&m->md))
4122		return (TRUE);
4123	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4124}
4125
4126/*
4127 * Returns TRUE if any of the given mappings were used to modify
4128 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4129 * mappings are supported.
4130 */
4131static boolean_t
4132pmap_is_modified_pvh(struct md_page *pvh)
4133{
4134	pv_entry_t pv;
4135	pt_entry_t *pte;
4136	pmap_t pmap;
4137	boolean_t rv;
4138
4139	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4140	rv = FALSE;
4141	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4142		pmap = PV_PMAP(pv);
4143		PMAP_LOCK(pmap);
4144		pte = pmap_pte(pmap, pv->pv_va);
4145		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4146		PMAP_UNLOCK(pmap);
4147		if (rv)
4148			break;
4149	}
4150	return (rv);
4151}
4152
4153/*
4154 *	pmap_is_prefaultable:
4155 *
4156 *	Return whether or not the specified virtual address is eligible
4157 *	for prefault.
4158 */
4159boolean_t
4160pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4161{
4162	pd_entry_t *pde;
4163	pt_entry_t *pte;
4164	boolean_t rv;
4165
4166	rv = FALSE;
4167	PMAP_LOCK(pmap);
4168	pde = pmap_pde(pmap, addr);
4169	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
4170		pte = pmap_pde_to_pte(pde, addr);
4171		rv = (*pte & PG_V) == 0;
4172	}
4173	PMAP_UNLOCK(pmap);
4174	return (rv);
4175}
4176
4177/*
4178 *	pmap_is_referenced:
4179 *
4180 *	Return whether or not the specified physical page was referenced
4181 *	in any physical maps.
4182 */
4183boolean_t
4184pmap_is_referenced(vm_page_t m)
4185{
4186
4187	if (m->flags & PG_FICTITIOUS)
4188		return (FALSE);
4189	if (pmap_is_referenced_pvh(&m->md))
4190		return (TRUE);
4191	return (pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4192}
4193
4194/*
4195 * Returns TRUE if any of the given mappings were referenced and FALSE
4196 * otherwise.  Both page and 2mpage mappings are supported.
4197 */
4198static boolean_t
4199pmap_is_referenced_pvh(struct md_page *pvh)
4200{
4201	pv_entry_t pv;
4202	pt_entry_t *pte;
4203	pmap_t pmap;
4204	boolean_t rv;
4205
4206	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4207	rv = FALSE;
4208	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4209		pmap = PV_PMAP(pv);
4210		PMAP_LOCK(pmap);
4211		pte = pmap_pte(pmap, pv->pv_va);
4212		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4213		PMAP_UNLOCK(pmap);
4214		if (rv)
4215			break;
4216	}
4217	return (rv);
4218}
4219
4220/*
4221 * Clear the write and modified bits in each of the given page's mappings.
4222 */
4223void
4224pmap_remove_write(vm_page_t m)
4225{
4226	struct md_page *pvh;
4227	pmap_t pmap;
4228	pv_entry_t next_pv, pv;
4229	pd_entry_t *pde;
4230	pt_entry_t oldpte, *pte;
4231	vm_offset_t va;
4232
4233	if ((m->flags & PG_FICTITIOUS) != 0 ||
4234	    (m->flags & PG_WRITEABLE) == 0)
4235		return;
4236	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4237	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4238	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4239		pmap = PV_PMAP(pv);
4240		PMAP_LOCK(pmap);
4241		va = pv->pv_va;
4242		pde = pmap_pde(pmap, va);
4243		if ((*pde & PG_RW) != 0)
4244			(void)pmap_demote_pde(pmap, pde, va);
4245		PMAP_UNLOCK(pmap);
4246	}
4247	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4248		pmap = PV_PMAP(pv);
4249		PMAP_LOCK(pmap);
4250		pde = pmap_pde(pmap, pv->pv_va);
4251		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4252		    " a 2mpage in page %p's pv list", m));
4253		pte = pmap_pde_to_pte(pde, pv->pv_va);
4254retry:
4255		oldpte = *pte;
4256		if (oldpte & PG_RW) {
4257			if (!atomic_cmpset_long(pte, oldpte, oldpte &
4258			    ~(PG_RW | PG_M)))
4259				goto retry;
4260			if ((oldpte & PG_M) != 0)
4261				vm_page_dirty(m);
4262			pmap_invalidate_page(pmap, pv->pv_va);
4263		}
4264		PMAP_UNLOCK(pmap);
4265	}
4266	vm_page_flag_clear(m, PG_WRITEABLE);
4267}
4268
4269/*
4270 *	pmap_ts_referenced:
4271 *
4272 *	Return a count of reference bits for a page, clearing those bits.
4273 *	It is not necessary for every reference bit to be cleared, but it
4274 *	is necessary that 0 only be returned when there are truly no
4275 *	reference bits set.
4276 *
4277 *	XXX: The exact number of bits to check and clear is a matter that
4278 *	should be tested and standardized at some point in the future for
4279 *	optimal aging of shared pages.
4280 */
4281int
4282pmap_ts_referenced(vm_page_t m)
4283{
4284	struct md_page *pvh;
4285	pv_entry_t pv, pvf, pvn;
4286	pmap_t pmap;
4287	pd_entry_t oldpde, *pde;
4288	pt_entry_t *pte;
4289	vm_offset_t va;
4290	int rtval = 0;
4291
4292	if (m->flags & PG_FICTITIOUS)
4293		return (rtval);
4294	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4295	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
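	/*
	 * Both loops below stop early once more than four reference bits
	 * have been found and cleared.
	 */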
4296	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4297		pmap = PV_PMAP(pv);
4298		PMAP_LOCK(pmap);
4299		va = pv->pv_va;
4300		pde = pmap_pde(pmap, va);
4301		oldpde = *pde;
4302		if ((oldpde & PG_A) != 0) {
4303			if (pmap_demote_pde(pmap, pde, va)) {
4304				if ((oldpde & PG_W) == 0) {
4305					/*
4306					 * Remove the mapping to a single page
4307					 * so that a subsequent access may
4308					 * repromote.  Since the underlying
4309					 * page table page is fully populated,
4310					 * this removal never frees a page
4311					 * table page.
4312					 */
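					/*
					 * The offset of "m" within the 2MB
					 * physical frame equals its offset
					 * within the 2MB virtual region, so
					 * this adjustment yields the VA that
					 * maps "m" itself.
					 */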
4313					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4314					    PG_PS_FRAME);
4315					pmap_remove_page(pmap, va, pde, NULL);
4316					rtval++;
4317					if (rtval > 4) {
4318						PMAP_UNLOCK(pmap);
4319						return (rtval);
4320					}
4321				}
4322			}
4323		}
4324		PMAP_UNLOCK(pmap);
4325	}
4326	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4327		pvf = pv;
4328		do {
4329			pvn = TAILQ_NEXT(pv, pv_list);
4330			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4331			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4332			pmap = PV_PMAP(pv);
4333			PMAP_LOCK(pmap);
4334			pde = pmap_pde(pmap, pv->pv_va);
4335			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4336			    " found a 2mpage in page %p's pv list", m));
4337			pte = pmap_pde_to_pte(pde, pv->pv_va);
4338			if ((*pte & PG_A) != 0) {
4339				atomic_clear_long(pte, PG_A);
4340				pmap_invalidate_page(pmap, pv->pv_va);
4341				rtval++;
4342				if (rtval > 4)
4343					pvn = NULL;
4344			}
4345			PMAP_UNLOCK(pmap);
4346		} while ((pv = pvn) != NULL && pv != pvf);
4347	}
4348	return (rtval);
4349}
4350
4351/*
4352 *	Clear the modify bits on the specified physical page.
4353 */
4354void
4355pmap_clear_modify(vm_page_t m)
4356{
4357	struct md_page *pvh;
4358	pmap_t pmap;
4359	pv_entry_t next_pv, pv;
4360	pd_entry_t oldpde, *pde;
4361	pt_entry_t oldpte, *pte;
4362	vm_offset_t va;
4363
4364	if ((m->flags & PG_FICTITIOUS) != 0)
4365		return;
4366	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4367	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4368	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4369		pmap = PV_PMAP(pv);
4370		PMAP_LOCK(pmap);
4371		va = pv->pv_va;
4372		pde = pmap_pde(pmap, va);
4373		oldpde = *pde;
4374		if ((oldpde & PG_RW) != 0) {
4375			if (pmap_demote_pde(pmap, pde, va)) {
4376				if ((oldpde & PG_W) == 0) {
4377					/*
4378					 * Write protect the mapping to a
4379					 * single page so that a subsequent
4380					 * write access may repromote.
4381					 */
4382					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4383					    PG_PS_FRAME);
4384					pte = pmap_pde_to_pte(pde, va);
4385					oldpte = *pte;
4386					if ((oldpte & PG_V) != 0) {
4387						while (!atomic_cmpset_long(pte,
4388						    oldpte,
4389						    oldpte & ~(PG_M | PG_RW)))
4390							oldpte = *pte;
4391						vm_page_dirty(m);
4392						pmap_invalidate_page(pmap, va);
4393					}
4394				}
4395			}
4396		}
4397		PMAP_UNLOCK(pmap);
4398	}
4399	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4400		pmap = PV_PMAP(pv);
4401		PMAP_LOCK(pmap);
4402		pde = pmap_pde(pmap, pv->pv_va);
4403		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4404		    " a 2mpage in page %p's pv list", m));
4405		pte = pmap_pde_to_pte(pde, pv->pv_va);
4406		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4407			atomic_clear_long(pte, PG_M);
4408			pmap_invalidate_page(pmap, pv->pv_va);
4409		}
4410		PMAP_UNLOCK(pmap);
4411	}
4412}
4413
4414/*
4415 *	pmap_clear_reference:
4416 *
4417 *	Clear the reference bit on the specified physical page.
4418 */
4419void
4420pmap_clear_reference(vm_page_t m)
4421{
4422	struct md_page *pvh;
4423	pmap_t pmap;
4424	pv_entry_t next_pv, pv;
4425	pd_entry_t oldpde, *pde;
4426	pt_entry_t *pte;
4427	vm_offset_t va;
4428
4429	if ((m->flags & PG_FICTITIOUS) != 0)
4430		return;
4431	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4432	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4433	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4434		pmap = PV_PMAP(pv);
4435		PMAP_LOCK(pmap);
4436		va = pv->pv_va;
4437		pde = pmap_pde(pmap, va);
4438		oldpde = *pde;
4439		if ((oldpde & PG_A) != 0) {
4440			if (pmap_demote_pde(pmap, pde, va)) {
4441				/*
4442				 * Remove the mapping to a single page so
4443				 * that a subsequent access may repromote.
4444				 * Since the underlying page table page is
4445				 * fully populated, this removal never frees
4446				 * a page table page.
4447				 */
4448				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4449				    PG_PS_FRAME);
4450				pmap_remove_page(pmap, va, pde, NULL);
4451			}
4452		}
4453		PMAP_UNLOCK(pmap);
4454	}
4455	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4456		pmap = PV_PMAP(pv);
4457		PMAP_LOCK(pmap);
4458		pde = pmap_pde(pmap, pv->pv_va);
4459		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4460		    " a 2mpage in page %p's pv list", m));
4461		pte = pmap_pde_to_pte(pde, pv->pv_va);
4462		if (*pte & PG_A) {
4463			atomic_clear_long(pte, PG_A);
4464			pmap_invalidate_page(pmap, pv->pv_va);
4465		}
4466		PMAP_UNLOCK(pmap);
4467	}
4468}
4469
4470/*
4471 * Miscellaneous support routines follow
4472 */
4473
4474/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4475static __inline void
4476pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4477{
4478	u_int opte, npte;
4479
4480	/*
4481	 * The cache mode bits are all in the low 32 bits of the
4482	 * PTE, so we can just spin on updating the low 32 bits.
4483	 */
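	/*
	 * If the compare-and-set fails because another CPU changed the
	 * low word concurrently, the new value is recomputed from the
	 * fresh contents and the update is retried.  The loop also ends
	 * without writing if the entry already has the requested bits.
	 */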
4484	do {
4485		opte = *(u_int *)pte;
4486		npte = opte & ~PG_PTE_CACHE;
4487		npte |= cache_bits;
4488	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4489}
4490
4491/* Adjust the cache mode for a 2MB page mapped via a PDE. */
4492static __inline void
4493pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4494{
4495	u_int opde, npde;
4496
4497	/*
4498	 * The cache mode bits are all in the low 32 bits of the
4499	 * PDE, so we can just spin on updating the low 32 bits.
4500	 */
4501	do {
4502		opde = *(u_int *)pde;
4503		npde = opde & ~PG_PDE_CACHE;
4504		npde |= cache_bits;
4505	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4506}
4507
4508/*
4509 * Map a set of physical memory pages into the kernel virtual
4510 * address space. Return a pointer to where it is mapped. This
4511 * routine is intended to be used for mapping device memory,
4512 * NOT real memory.
4513 */
4514void *
4515pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4516{
4517	vm_offset_t va, offset;
4518	vm_size_t tmpsize;
4519
4520	/*
4521	 * If the specified range of physical addresses fits within the direct
4522	 * map window, use the direct map.
4523	 */
4524	if (pa < dmaplimit && pa + size < dmaplimit) {
4525		va = PHYS_TO_DMAP(pa);
4526		if (!pmap_change_attr(va, size, mode))
4527			return ((void *)va);
4528	}
4529	offset = pa & PAGE_MASK;
4530	size = roundup(offset + size, PAGE_SIZE);
4531	va = kmem_alloc_nofault(kernel_map, size);
4532	if (!va)
4533		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4534	pa = trunc_page(pa);
4535	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4536		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4537	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4538	pmap_invalidate_cache_range(va, va + tmpsize);
4539	return ((void *)(va + offset));
4540}
4541
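/*
 * Convenience wrappers: pmap_mapdev() maps device memory uncacheable,
 * while pmap_mapbios() maps firmware/BIOS ranges write-back; both go
 * through pmap_mapdev_attr().
 */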
4542void *
4543pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4544{
4545
4546	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4547}
4548
4549void *
4550pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4551{
4552
4553	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4554}
4555
4556void
4557pmap_unmapdev(vm_offset_t va, vm_size_t size)
4558{
4559	vm_offset_t base, offset, tmpva;
4560
4561	/* If we gave out a direct map region in pmap_mapdev(), do nothing. */
4562	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
4563		return;
4564	base = trunc_page(va);
4565	offset = va & PAGE_MASK;
4566	size = roundup(offset + size, PAGE_SIZE);
4567	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4568		pmap_kremove(tmpva);
4569	pmap_invalidate_range(kernel_pmap, va, tmpva);
4570	kmem_free(kernel_map, base, size);
4571}
4572
4573/*
4574 * Tries to demote a 1GB page mapping.
4575 */
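/*
 * Demotion replaces the single 1GB mapping with 512 equivalent 2MB
 * mappings: a page directory page is allocated, each of its NPDEPG
 * entries is initialized from the old PDPE (stepping the physical
 * address by NBPDR), and only then is the PDPE rewritten to point at
 * the new page directory page.
 */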
4576static boolean_t
4577pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
4578{
4579	pdp_entry_t newpdpe, oldpdpe;
4580	pd_entry_t *firstpde, newpde, *pde;
4581	vm_paddr_t mpdepa;
4582	vm_page_t mpde;
4583
4584	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4585	oldpdpe = *pdpe;
4586	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
4587	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
4588	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
4589	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4590		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
4591		    " in pmap %p", va, pmap);
4592		return (FALSE);
4593	}
4594	mpdepa = VM_PAGE_TO_PHYS(mpde);
4595	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
4596	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
4597	KASSERT((oldpdpe & PG_A) != 0,
4598	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
4599	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
4600	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
4601	newpde = oldpdpe;
4602
4603	/*
4604	 * Initialize the page directory page.
4605	 */
4606	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
4607		*pde = newpde;
4608		newpde += NBPDR;
4609	}
4610
4611	/*
4612	 * Demote the mapping.
4613	 */
4614	*pdpe = newpdpe;
4615
4616	/*
4617	 * Invalidate a stale recursive mapping of the page directory page.
4618	 */
4619	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
4620
4621	pmap_pdpe_demotions++;
4622	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
4623	    " in pmap %p", va, pmap);
4624	return (TRUE);
4625}
4626
4627/*
4628 * Sets the memory attribute for the specified page.
4629 */
4630void
4631pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4632{
4633
4634	m->md.pat_mode = ma;
4635
4636	/*
4637	 * If "m" is a normal page, update its direct mapping.  This update
4638	 * can be relied upon to perform any cache operations that are
4639	 * required for data coherence.
4640	 */
4641	if ((m->flags & PG_FICTITIOUS) == 0 &&
4642	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4643	    m->md.pat_mode))
4644		panic("memory attribute change on the direct map failed");
4645}
4646
4647/*
4648 * Changes the specified virtual address range's memory type to that given by
4649 * the parameter "mode".  The specified virtual address range must be
4650 * completely contained within either the direct map or the kernel map.  If
4651 * the virtual address range is contained within the kernel map, then the
4652 * memory type for each of the corresponding ranges of the direct map is also
4653 * changed.  (The corresponding ranges of the direct map are those ranges that
4654 * map the same physical pages as the specified virtual address range.)  These
4655 * changes to the direct map are necessary because Intel describes the
4656 * behavior of their processors as "undefined" if two or more mappings to the
4657 * same physical page have different memory types.
4658 *
4659 * Returns zero if the change completed successfully, and either EINVAL or
4660 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4661 * of the virtual address range was not mapped, and ENOMEM is returned if
4662 * there was insufficient memory available to complete the change.  In the
4663 * latter case, the memory type may have been changed on some part of the
4664 * virtual address range or the direct map.
4665 */
4666int
4667pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4668{
4669	int error;
4670
4671	PMAP_LOCK(kernel_pmap);
4672	error = pmap_change_attr_locked(va, size, mode);
4673	PMAP_UNLOCK(kernel_pmap);
4674	return (error);
4675}
4676
4677static int
4678pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4679{
4680	vm_offset_t base, offset, tmpva;
4681	vm_paddr_t pa_start, pa_end;
4682	pdp_entry_t *pdpe;
4683	pd_entry_t *pde;
4684	pt_entry_t *pte;
4685	int cache_bits_pte, cache_bits_pde, error;
4686	boolean_t changed;
4687
4688	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4689	base = trunc_page(va);
4690	offset = va & PAGE_MASK;
4691	size = roundup(offset + size, PAGE_SIZE);
4692
4693	/*
4694	 * Only supported on kernel virtual addresses, including the direct
4695	 * map but excluding the recursive map.
4696	 */
4697	if (base < DMAP_MIN_ADDRESS)
4698		return (EINVAL);
4699
4700	cache_bits_pde = pmap_cache_bits(mode, 1);
4701	cache_bits_pte = pmap_cache_bits(mode, 0);
4702	changed = FALSE;
4703
4704	/*
4705	 * Pages that aren't mapped aren't supported.  Also break down 1GB pages
4706	 * into 2MB pages and 2MB pages into 4KB pages if required.
4707	 */
4708	for (tmpva = base; tmpva < base + size; ) {
4709		pdpe = pmap_pdpe(kernel_pmap, tmpva);
4710		if (*pdpe == 0)
4711			return (EINVAL);
4712		if (*pdpe & PG_PS) {
4713			/*
4714			 * If the current 1GB page already has the required
4715			 * memory type, then we need not demote this page. Just
4716			 * increment tmpva to the next 1GB page frame.
4717			 */
4718			if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
4719				tmpva = trunc_1gpage(tmpva) + NBPDP;
4720				continue;
4721			}
4722
4723			/*
4724			 * If the current offset aligns with a 1GB page frame
4725			 * and there is at least 1GB left within the range, then
4726			 * we need not break down this page into 2MB pages.
4727			 */
4728			if ((tmpva & PDPMASK) == 0 &&
4729			    tmpva + PDPMASK < base + size) {
4730				tmpva += NBPDP;
4731				continue;
4732			}
4733			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
4734				return (ENOMEM);
4735		}
4736		pde = pmap_pdpe_to_pde(pdpe, tmpva);
4737		if (*pde == 0)
4738			return (EINVAL);
4739		if (*pde & PG_PS) {
4740			/*
4741			 * If the current 2MB page already has the required
4742			 * memory type, then we need not demote this page. Just
4743			 * increment tmpva to the next 2MB page frame.
4744			 */
4745			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4746				tmpva = trunc_2mpage(tmpva) + NBPDR;
4747				continue;
4748			}
4749
4750			/*
4751			 * If the current offset aligns with a 2MB page frame
4752			 * and there is at least 2MB left within the range, then
4753			 * we need not break down this page into 4KB pages.
4754			 */
4755			if ((tmpva & PDRMASK) == 0 &&
4756			    tmpva + PDRMASK < base + size) {
4757				tmpva += NBPDR;
4758				continue;
4759			}
4760			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
4761				return (ENOMEM);
4762		}
4763		pte = pmap_pde_to_pte(pde, tmpva);
4764		if (*pte == 0)
4765			return (EINVAL);
4766		tmpva += PAGE_SIZE;
4767	}
4768	error = 0;
4769
4770	/*
4771	 * Ok, all the pages exist, so run through them updating their
4772	 * cache mode if required.
4773	 */
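	/*
	 * For kernel map addresses, physically contiguous pages are
	 * coalesced into runs (pa_start .. pa_end) and the same change is
	 * applied recursively to the corresponding direct map range, so
	 * that no physical page is left with two mappings of different
	 * memory types.
	 */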
4774	pa_start = pa_end = 0;
4775	for (tmpva = base; tmpva < base + size; ) {
4776		pdpe = pmap_pdpe(kernel_pmap, tmpva);
4777		if (*pdpe & PG_PS) {
4778			if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
4779				pmap_pde_attr(pdpe, cache_bits_pde);
4780				changed = TRUE;
4781			}
4782			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
4783				if (pa_start == pa_end) {
4784					/* Start physical address run. */
4785					pa_start = *pdpe & PG_PS_FRAME;
4786					pa_end = pa_start + NBPDP;
4787				} else if (pa_end == (*pdpe & PG_PS_FRAME))
4788					pa_end += NBPDP;
4789				else {
4790					/* Run ended, update direct map. */
4791					error = pmap_change_attr_locked(
4792					    PHYS_TO_DMAP(pa_start),
4793					    pa_end - pa_start, mode);
4794					if (error != 0)
4795						break;
4796					/* Start physical address run. */
4797					pa_start = *pdpe & PG_PS_FRAME;
4798					pa_end = pa_start + NBPDP;
4799				}
4800			}
4801			tmpva = trunc_1gpage(tmpva) + NBPDP;
4802			continue;
4803		}
4804		pde = pmap_pdpe_to_pde(pdpe, tmpva);
4805		if (*pde & PG_PS) {
4806			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4807				pmap_pde_attr(pde, cache_bits_pde);
4808				changed = TRUE;
4809			}
4810			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
4811				if (pa_start == pa_end) {
4812					/* Start physical address run. */
4813					pa_start = *pde & PG_PS_FRAME;
4814					pa_end = pa_start + NBPDR;
4815				} else if (pa_end == (*pde & PG_PS_FRAME))
4816					pa_end += NBPDR;
4817				else {
4818					/* Run ended, update direct map. */
4819					error = pmap_change_attr_locked(
4820					    PHYS_TO_DMAP(pa_start),
4821					    pa_end - pa_start, mode);
4822					if (error != 0)
4823						break;
4824					/* Start physical address run. */
4825					pa_start = *pde & PG_PS_FRAME;
4826					pa_end = pa_start + NBPDR;
4827				}
4828			}
4829			tmpva = trunc_2mpage(tmpva) + NBPDR;
4830		} else {
4831			pte = pmap_pde_to_pte(pde, tmpva);
4832			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4833				pmap_pte_attr(pte, cache_bits_pte);
4834				changed = TRUE;
4835			}
4836			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
4837				if (pa_start == pa_end) {
4838					/* Start physical address run. */
4839					pa_start = *pte & PG_FRAME;
4840					pa_end = pa_start + PAGE_SIZE;
4841				} else if (pa_end == (*pte & PG_FRAME))
4842					pa_end += PAGE_SIZE;
4843				else {
4844					/* Run ended, update direct map. */
4845					error = pmap_change_attr_locked(
4846					    PHYS_TO_DMAP(pa_start),
4847					    pa_end - pa_start, mode);
4848					if (error != 0)
4849						break;
4850					/* Start physical address run. */
4851					pa_start = *pte & PG_FRAME;
4852					pa_end = pa_start + PAGE_SIZE;
4853				}
4854			}
4855			tmpva += PAGE_SIZE;
4856		}
4857	}
4858	if (error == 0 && pa_start != pa_end)
4859		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
4860		    pa_end - pa_start, mode);
4861
4862	/*
4863	 * Flush CPU caches if required, so that no data remains cached with
4864	 * the old memory type.
4865	 */
4866	if (changed) {
4867		pmap_invalidate_range(kernel_pmap, base, tmpva);
4868		pmap_invalidate_cache_range(base, tmpva);
4869	}
4870	return (error);
4871}
4872
4873/*
4874 * Perform the pmap work for mincore().
4875 */
4876int
4877pmap_mincore(pmap_t pmap, vm_offset_t addr)
4878{
4879	pd_entry_t *pdep;
4880	pt_entry_t pte;
4881	vm_paddr_t pa;
4882	vm_page_t m;
4883	int val = 0;
4884
4885	PMAP_LOCK(pmap);
4886	pdep = pmap_pde(pmap, addr);
4887	if (pdep != NULL && (*pdep & PG_V)) {
4888		if (*pdep & PG_PS) {
4889			pte = *pdep;
4890			val = MINCORE_SUPER;
4891			/* Compute the physical address of the 4KB page. */
4892			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4893			    PG_FRAME;
4894		} else {
4895			pte = *pmap_pde_to_pte(pdep, addr);
4896			pa = pte & PG_FRAME;
4897		}
4898	} else {
4899		pte = 0;
4900		pa = 0;
4901	}
4902	PMAP_UNLOCK(pmap);
4903
4904	if (pte != 0) {
4905		val |= MINCORE_INCORE;
4906		if ((pte & PG_MANAGED) == 0)
4907			return (val);
4908
4909		m = PHYS_TO_VM_PAGE(pa);
4910
4911		/*
4912		 * Modified by us
4913		 */
4914		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4915			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4916		else {
4917			/*
4918			 * Modified by someone else
4919			 */
4920			vm_page_lock_queues();
4921			if (m->dirty || pmap_is_modified(m))
4922				val |= MINCORE_MODIFIED_OTHER;
4923			vm_page_unlock_queues();
4924		}
4925		/*
4926		 * Referenced by us
4927		 */
4928		if (pte & PG_A)
4929			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4930		else {
4931			/*
4932			 * Referenced by someone else
4933			 */
4934			vm_page_lock_queues();
4935			if ((m->flags & PG_REFERENCED) ||
4936			    pmap_is_referenced(m))
4937				val |= MINCORE_REFERENCED_OTHER;
4938			vm_page_unlock_queues();
4939		}
4940	}
4941	return (val);
4942}
4943
4944void
4945pmap_activate(struct thread *td)
4946{
4947	pmap_t	pmap, oldpmap;
4948	u_int64_t  cr3;
4949
4950	critical_enter();
4951	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4952	oldpmap = PCPU_GET(curpmap);
4953#ifdef SMP
3954	if (oldpmap)	/* XXX FIXME */
3955		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3956	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3957#else
3958	if (oldpmap)	/* XXX FIXME */
3959		oldpmap->pm_active &= ~PCPU_GET(cpumask);
3960	pmap->pm_active |= PCPU_GET(cpumask);
4961#endif
4962	cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
4963	td->td_pcb->pcb_cr3 = cr3;
4964	load_cr3(cr3);
4965	critical_exit();
4966}
4967
4968void
4969pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
4970{
4971}
4972
4973/*
4974 *	Increase the starting virtual address of the given mapping if a
4975 *	different alignment might result in more superpage mappings.
4976 */
4977void
4978pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4979    vm_offset_t *addr, vm_size_t size)
4980{
4981	vm_offset_t superpage_offset;
4982
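	/*
	 * The adjustment below makes the low PDRMASK bits of "*addr"
	 * match the superpage offset of the mapping within its object,
	 * so that physical pages and virtual addresses share the same
	 * 2MB alignment and the resulting mappings are eligible for
	 * promotion.
	 */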
4983	if (size < NBPDR)
4984		return;
4985	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4986		offset += ptoa(object->pg_color);
4987	superpage_offset = offset & PDRMASK;
4988	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4989	    (*addr & PDRMASK) == superpage_offset)
4990		return;
4991	if ((*addr & PDRMASK) < superpage_offset)
4992		*addr = (*addr & ~PDRMASK) + superpage_offset;
4993	else
4994		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4995}
4996