1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 201751 2010-01-07 17:34:45Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 * this module may delay invalidation or protection-reduction
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_cpu.h"
107#include "opt_pmap.h"
108#include "opt_msgbuf.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/sf_buf.h>
123#include <sys/sx.h>
124#include <sys/vmmeter.h>
125#include <sys/sched.h>
126#include <sys/sysctl.h>
127#ifdef SMP
128#include <sys/smp.h>
129#endif
130
131#include <vm/vm.h>
132#include <vm/vm_param.h>
133#include <vm/vm_kern.h>
134#include <vm/vm_page.h>
135#include <vm/vm_map.h>
136#include <vm/vm_object.h>
137#include <vm/vm_extern.h>
138#include <vm/vm_pageout.h>
139#include <vm/vm_pager.h>
140#include <vm/vm_reserv.h>
141#include <vm/uma.h>
142
143#include <machine/cpu.h>
144#include <machine/cputypes.h>
145#include <machine/md_var.h>
146#include <machine/pcb.h>
147#include <machine/specialreg.h>
148#ifdef SMP
149#include <machine/smp.h>
150#endif
151
152#ifdef XBOX
153#include <machine/xbox.h>
154#endif
155
156#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
157#define CPU_ENABLE_SSE
158#endif
159
160#ifndef PMAP_SHPGPERPROC
161#define PMAP_SHPGPERPROC 200
162#endif
163
164#if !defined(DIAGNOSTIC)
165#define PMAP_INLINE	__gnu89_inline
166#else
167#define PMAP_INLINE
168#endif
169
170#define PV_STATS
171#ifdef PV_STATS
172#define PV_STAT(x)	do { x ; } while (0)
173#else
174#define PV_STAT(x)	do { } while (0)
175#endif
176
177#define	pa_index(pa)	((pa) >> PDRSHIFT)
178#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
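/*
 * Illustrative sketch (added commentary, not from the original source):
 * pa_index() groups physical addresses by superpage, so every 4 KB page
 * within the same 2/4 MB superpage shares one pv_table slot.  With non-PAE
 * 4 MB superpages (PDRSHIFT == 22) and a hypothetical pa of 0x00c35000:
 *
 *	struct md_page *pvh = pa_to_pvh(pa);
 *
 * evaluates to &pv_table[0x00c35000 >> 22], i.e. &pv_table[3].
 */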
179
180/*
181 * Get PDEs and PTEs for user/kernel address space
182 */
183#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
184#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
185
186#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
187#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
188#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
189#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
190#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
191
192#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
193    atomic_clear_int((u_int *)(pte), PG_W))
194#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
195
196struct pmap kernel_pmap_store;
197LIST_HEAD(pmaplist, pmap);
198static struct pmaplist allpmaps;
199static struct mtx allpmaps_lock;
200
201vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
202vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
203int pgeflag = 0;		/* PG_G or-in */
204int pseflag = 0;		/* PG_PS or-in */
205
206static int nkpt;
207vm_offset_t kernel_vm_end;
208extern u_int32_t KERNend;
209
210#ifdef PAE
211pt_entry_t pg_nx;
212static uma_zone_t pdptzone;
213#endif
214
215static int pat_works = 0;		/* Is page attribute table sane? */
216
217SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
218
219static int pg_ps_enabled;
220SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
221    "Are large page mappings enabled?");
222
223/*
224 * Data for the pv entry allocation mechanism
225 */
226static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
227static struct md_page *pv_table;
228static int shpgperproc = PMAP_SHPGPERPROC;
229
230struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
231int pv_maxchunks;			/* How many chunks we have KVA for */
232vm_offset_t pv_vafree;			/* freelist stored in the PTE */
233
234/*
235 * All those kernel PT submaps that BSD is so fond of
236 */
237struct sysmaps {
238	struct	mtx lock;
239	pt_entry_t *CMAP1;
240	pt_entry_t *CMAP2;
241	caddr_t	CADDR1;
242	caddr_t	CADDR2;
243};
244static struct sysmaps sysmaps_pcpu[MAXCPU];
245pt_entry_t *CMAP1 = 0;
246static pt_entry_t *CMAP3;
247caddr_t CADDR1 = 0, ptvmmap = 0;
248static caddr_t CADDR3;
249struct msgbuf *msgbufp = 0;
250
251/*
252 * Crashdump maps.
253 */
254static caddr_t crashdumpmap;
255
256static pt_entry_t *PMAP1 = 0, *PMAP2;
257static pt_entry_t *PADDR1 = 0, *PADDR2;
258#ifdef SMP
259static int PMAP1cpu;
260static int PMAP1changedcpu;
261SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
262	   &PMAP1changedcpu, 0,
263	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
264#endif
265static int PMAP1changed;
266SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
267	   &PMAP1changed, 0,
268	   "Number of times pmap_pte_quick changed PMAP1");
269static int PMAP1unchanged;
270SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
271	   &PMAP1unchanged, 0,
272	   "Number of times pmap_pte_quick didn't change PMAP1");
273static struct mtx PMAP2mutex;
274
275static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
276static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
277static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
278static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
279static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
280static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
281static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
282		    vm_offset_t va);
283static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
284
285static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
286static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
287    vm_prot_t prot);
288static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
289    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
290static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
291static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
292static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
293static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
294static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
295static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
296static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
297static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
298    vm_prot_t prot);
299static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
300static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
301    vm_page_t *free);
302static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
303    vm_page_t *free);
304static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
305static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
306    vm_page_t *free);
307static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
308					vm_offset_t va);
309static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
310static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
311    vm_page_t m);
312
313static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
314
315static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
316static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
317static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
318static void pmap_pte_release(pt_entry_t *pte);
319static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
320static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
321#ifdef PAE
322static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
323#endif
324static void pmap_set_pg(void);
325
326CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
327CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
328
329/*
330 * If you get an error here, then you set KVA_PAGES wrong! See the
331 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
332 * a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
333 */
334CTASSERT(KERNBASE % (1 << 24) == 0);
335
336/*
337 * Move the kernel virtual free pointer to the next
338 * 4MB.  This is used to help improve performance
339 * by using a large (4MB) page for much of the kernel
340 * (.text, .data, .bss)
341 */
342static vm_offset_t
343pmap_kmem_choose(vm_offset_t addr)
344{
345	vm_offset_t newaddr = addr;
346
347#ifndef DISABLE_PSE
348	if (cpu_feature & CPUID_PSE)
349		newaddr = (addr + PDRMASK) & ~PDRMASK;
350#endif
351	return newaddr;
352}
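/*
 * Worked example (added commentary, assuming a non-PAE configuration where
 * PDRMASK is 4 MB - 1): an input address of 0xc0531000 is rounded up to the
 * next superpage boundary when PSE is available:
 *
 *	(0xc0531000 + 0x003fffff) & ~0x003fffff == 0xc0800000
 *
 * so the first free kernel virtual address starts on a 4 MB boundary.
 */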
353
354/*
355 *	Bootstrap the system enough to run with virtual memory.
356 *
357 *	On the i386 this is called after mapping has already been enabled
358 *	and just syncs the pmap module with what has already been done.
359 *	[We can't call it easily with mapping off since the kernel is not
360 *	mapped with PA == VA, hence we would have to relocate every address
361 *	from the linked base (virtual) address "KERNBASE" to the actual
362 *	(physical) address starting relative to 0]
363 */
364void
365pmap_bootstrap(vm_paddr_t firstaddr)
366{
367	vm_offset_t va;
368	pt_entry_t *pte, *unused;
369	struct sysmaps *sysmaps;
370	int i;
371
372	/*
373	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
374	 * large. It should instead be correctly calculated in locore.s and
375	 * not based on 'first' (which is a physical address, not a virtual
376	 * address, for the start of unused physical memory). The kernel
377	 * page tables are NOT double mapped and thus should not be included
378	 * in this calculation.
379	 */
380	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
381	virtual_avail = pmap_kmem_choose(virtual_avail);
382
383	virtual_end = VM_MAX_KERNEL_ADDRESS;
384
385	/*
386	 * Initialize the kernel pmap (which is statically allocated).
387	 */
388	PMAP_LOCK_INIT(kernel_pmap);
389	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
390#ifdef PAE
391	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
392#endif
393	kernel_pmap->pm_root = NULL;
394	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
395	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
396	LIST_INIT(&allpmaps);
397	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
398	mtx_lock_spin(&allpmaps_lock);
399	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
400	mtx_unlock_spin(&allpmaps_lock);
401	nkpt = NKPT;
402
403	/*
404	 * Reserve some special page table entries/VA space for temporary
405	 * mapping of pages.
406	 */
407#define	SYSMAP(c, p, v, n)	\
408	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
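	/*
	 * Illustrative expansion (added commentary, not from the original
	 * source): the invocation SYSMAP(caddr_t, CMAP1, CADDR1, 1) below
	 * expands to
	 *
	 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
	 *
	 * i.e. it hands out one page of KVA starting at 'va' and records the
	 * corresponding kernel PTE slot in CMAP1 for later pte_store() calls.
	 */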
409
410	va = virtual_avail;
411	pte = vtopte(va);
412
413	/*
414	 * CMAP1/CMAP2 are used for zeroing and copying pages.
415	 * CMAP3 is used for the idle process page zeroing.
416	 */
417	for (i = 0; i < MAXCPU; i++) {
418		sysmaps = &sysmaps_pcpu[i];
419		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
420		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
421		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
422	}
423	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
424	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
425	*CMAP3 = 0;
426
427	/*
428	 * Crashdump maps.
429	 */
430	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
431
432	/*
433	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
434	 */
435	SYSMAP(caddr_t, unused, ptvmmap, 1)
436
437	/*
438	 * msgbufp is used to map the system message buffer.
439	 */
440	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
441
442	/*
443 * PMAP1/PADDR1 and PMAP2/PADDR2 are used by pmap_pte_quick and pmap_pte,
444 * respectively.
444	 */
445	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
446	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
447
448	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
449
450	virtual_avail = va;
451
452	*CMAP1 = 0;
453
454	/*
455	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
456	 * physical memory region that is used by the ACPI wakeup code.  This
457	 * mapping must not have PG_G set.
458	 */
459#ifdef XBOX
460	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
461	 * an early stage, we cannot yet neatly map video memory ... :-(
462	 * Better fixes are very welcome! */
463	if (!arch_i386_is_xbox)
464#endif
465	for (i = 1; i < NKPT; i++)
466		PTD[i] = 0;
467
468	/* Initialize the PAT MSR if present. */
469	pmap_init_pat();
470
471	/* Turn on PG_G on kernel page(s) */
472	pmap_set_pg();
473}
474
475/*
476 * Setup the PAT MSR.
477 */
478void
479pmap_init_pat(void)
480{
481	uint64_t pat_msr;
482	char *sysenv;
483	static int pat_tested = 0;
484
485	/* Bail if this CPU doesn't implement PAT. */
486	if (!(cpu_feature & CPUID_PAT))
487		return;
488
489	/*
490	 * Due to some Intel errata, we can only safely use the lower 4
491	 * PAT entries.
492	 *
493	 *   Intel Pentium III Processor Specification Update
494	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
495	 * or Mode C Paging)
496	 *
497	 *   Intel Pentium IV  Processor Specification Update
498	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
499	 *
500	 * Some Apple Macs based on nVidia chipsets cannot enter ACPI mode
501 * via SMI# when we use the upper 4 PAT entries, for unknown reasons.
502	 */
503	if (!pat_tested) {
504		if (cpu_vendor_id != CPU_VENDOR_INTEL ||
505		    (CPUID_TO_FAMILY(cpu_id) == 6 &&
506		    CPUID_TO_MODEL(cpu_id) >= 0xe)) {
507			pat_works = 1;
508			sysenv = getenv("smbios.system.product");
509			if (sysenv != NULL) {
510				if (strncmp(sysenv, "MacBook5,1", 10) == 0 ||
511				    strncmp(sysenv, "MacBookPro5,5", 13) == 0 ||
512				    strncmp(sysenv, "Macmini3,1", 10) == 0)
513					pat_works = 0;
514				freeenv(sysenv);
515			}
516		}
517		pat_tested = 1;
518	}
519
520	/* Initialize default PAT entries. */
521	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
522	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
523	    PAT_VALUE(2, PAT_UNCACHED) |
524	    PAT_VALUE(3, PAT_UNCACHEABLE) |
525	    PAT_VALUE(4, PAT_WRITE_BACK) |
526	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
527	    PAT_VALUE(6, PAT_UNCACHED) |
528	    PAT_VALUE(7, PAT_UNCACHEABLE);
529
530	if (pat_works) {
531		/*
532		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
533		 * Program 4 and 5 as WP and WC.
534		 * Leave 6 and 7 as UC- and UC.
535		 */
536		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
537		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
538		    PAT_VALUE(5, PAT_WRITE_COMBINING);
539	} else {
540		/*
541		 * Just replace PAT Index 2 with WC instead of UC-.
542		 */
543		pat_msr &= ~PAT_MASK(2);
544		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
545	}
546	wrmsr(MSR_PAT, pat_msr);
547}
548
549/*
550 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
551 */
552static void
553pmap_set_pg(void)
554{
555	pd_entry_t pdir;
556	pt_entry_t *pte;
557	vm_offset_t va, endva;
558	int i;
559
560	if (pgeflag == 0)
561		return;
562
563	i = KERNLOAD/NBPDR;
564	endva = KERNBASE + KERNend;
565
566	if (pseflag) {
567		va = KERNBASE + KERNLOAD;
568		while (va  < endva) {
569			pdir = kernel_pmap->pm_pdir[KPTDI+i];
570			pdir |= pgeflag;
571			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
572			invltlb();	/* Play it safe, invltlb() every time */
573			i++;
574			va += NBPDR;
575		}
576	} else {
577		va = (vm_offset_t)btext;
578		while (va < endva) {
579			pte = vtopte(va);
580			if (*pte)
581				*pte |= pgeflag;
582			invltlb();	/* Play it safe, invltlb() every time */
583			va += PAGE_SIZE;
584		}
585	}
586}
587
588/*
589 * Initialize a vm_page's machine-dependent fields.
590 */
591void
592pmap_page_init(vm_page_t m)
593{
594
595	TAILQ_INIT(&m->md.pv_list);
596	m->md.pat_mode = PAT_WRITE_BACK;
597}
598
599#ifdef PAE
600static void *
601pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
602{
603
604	/* Inform UMA that this allocator uses kernel_map/object. */
605	*flags = UMA_SLAB_KERNEL;
606	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
607	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
608}
609#endif
610
611/*
612 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
613 * Requirements:
614 *  - Must deal with pages in order to ensure that none of the PG_* bits
615 *    are ever set, PG_V in particular.
616 *  - Assumes we can write to ptes without pte_store() atomic ops, even
617 *    on PAE systems.  This should be ok.
618 *  - Assumes nothing will ever test these addresses for 0 to indicate
619 *    no mapping instead of correctly checking PG_V.
620 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
621 * Because PG_V is never set, there can be no mappings to invalidate.
622 */
623static vm_offset_t
624pmap_ptelist_alloc(vm_offset_t *head)
625{
626	pt_entry_t *pte;
627	vm_offset_t va;
628
629	va = *head;
630	if (va == 0)
631		return (va);	/* Out of memory */
632	pte = vtopte(va);
633	*head = *pte;
634	if (*head & PG_V)
635		panic("pmap_ptelist_alloc: va with PG_V set!");
636	*pte = 0;
637	return (va);
638}
639
640static void
641pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
642{
643	pt_entry_t *pte;
644
645	if (va & PG_V)
646		panic("pmap_ptelist_free: freeing va with PG_V set!");
647	pte = vtopte(va);
648	*pte = *head;		/* virtual! PG_V is 0 though */
649	*head = va;
650}
651
652static void
653pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
654{
655	int i;
656	vm_offset_t va;
657
658	*head = 0;
659	for (i = npages - 1; i >= 0; i--) {
660		va = (vm_offset_t)base + i * PAGE_SIZE;
661		pmap_ptelist_free(head, va);
662	}
663}
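/*
 * Illustrative usage sketch (added commentary; the callers live elsewhere in
 * this file): after pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks),
 * the head holds the lowest free VA, each free VA's kernel PTE slot stores
 * the next free VA, and 0 terminates the list.  A caller then obtains and
 * returns page-sized pieces of KVA like so:
 *
 *	vm_offset_t va = pmap_ptelist_alloc(&pv_vafree);
 *	if (va == 0)
 *		return (NULL);			(freelist exhausted)
 *	...					(map a pv chunk page at va)
 *	pmap_ptelist_free(&pv_vafree, va);	(when the chunk is released)
 */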
664
665
666/*
667 *	Initialize the pmap module.
668 *	Called by vm_init, to initialize any structures that the pmap
669 *	system needs to map virtual memory.
670 */
671void
672pmap_init(void)
673{
674	vm_page_t mpte;
675	vm_size_t s;
676	int i, pv_npg;
677
678	/*
679	 * Initialize the vm page array entries for the kernel pmap's
680	 * page table pages.
681	 */
682	for (i = 0; i < nkpt; i++) {
683		mpte = PHYS_TO_VM_PAGE(PTD[i + KPTDI] & PG_FRAME);
684		KASSERT(mpte >= vm_page_array &&
685		    mpte < &vm_page_array[vm_page_array_size],
686		    ("pmap_init: page table page is out of range"));
687		mpte->pindex = i + KPTDI;
688		mpte->phys_addr = PTD[i + KPTDI] & PG_FRAME;
689	}
690
691	/*
692	 * Initialize the address space (zone) for the pv entries.  Set a
693	 * high water mark so that the system can recover from excessive
694	 * numbers of pv entries.
695	 */
696	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
697	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
698	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
699	pv_entry_max = roundup(pv_entry_max, _NPCPV);
700	pv_entry_high_water = 9 * (pv_entry_max / 10);
701
702	/*
703	 * Are large page mappings enabled?
704	 */
705	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
706	if (pg_ps_enabled) {
707		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
708		    ("pmap_init: can't assign to pagesizes[1]"));
709		pagesizes[1] = NBPDR;
710	}
711
712	/*
713	 * Calculate the size of the pv head table for superpages.
714	 */
715	for (i = 0; phys_avail[i + 1]; i += 2);
716	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
717
718	/*
719	 * Allocate memory for the pv head table for superpages.
720	 */
721	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
722	s = round_page(s);
723	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
724	for (i = 0; i < pv_npg; i++)
725		TAILQ_INIT(&pv_table[i].pv_list);
726
727	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
728	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
729	    PAGE_SIZE * pv_maxchunks);
730	if (pv_chunkbase == NULL)
731		panic("pmap_init: not enough kvm for pv chunks");
732	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
733#ifdef PAE
734	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
735	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
736	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
737	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
738#endif
739}
740
741
742SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
743	"Max number of PV entries");
744SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
745	"Page share factor per proc");
746
747SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
748    "2/4MB page mapping counters");
749
750static u_long pmap_pde_demotions;
751SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
752    &pmap_pde_demotions, 0, "2/4MB page demotions");
753
754static u_long pmap_pde_mappings;
755SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
756    &pmap_pde_mappings, 0, "2/4MB page mappings");
757
758static u_long pmap_pde_p_failures;
759SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
760    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
761
762static u_long pmap_pde_promotions;
763SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
764    &pmap_pde_promotions, 0, "2/4MB page promotions");
765
766/***************************************************
767 * Low level helper routines.....
768 ***************************************************/
769
770/*
771 * Determine the appropriate bits to set in a PTE or PDE for a specified
772 * caching mode.
773 */
774int
775pmap_cache_bits(int mode, boolean_t is_pde)
776{
777	int pat_flag, pat_index, cache_bits;
778
779	/* The PAT bit is different for PTE's and PDE's. */
780	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
781
782	/* If we don't support PAT, map extended modes to older ones. */
783	if (!(cpu_feature & CPUID_PAT)) {
784		switch (mode) {
785		case PAT_UNCACHEABLE:
786		case PAT_WRITE_THROUGH:
787		case PAT_WRITE_BACK:
788			break;
789		case PAT_UNCACHED:
790		case PAT_WRITE_COMBINING:
791		case PAT_WRITE_PROTECTED:
792			mode = PAT_UNCACHEABLE;
793			break;
794		}
795	}
796
797	/* Map the caching mode to a PAT index. */
798	if (pat_works) {
799		switch (mode) {
800		case PAT_UNCACHEABLE:
801			pat_index = 3;
802			break;
803		case PAT_WRITE_THROUGH:
804			pat_index = 1;
805			break;
806		case PAT_WRITE_BACK:
807			pat_index = 0;
808			break;
809		case PAT_UNCACHED:
810			pat_index = 2;
811			break;
812		case PAT_WRITE_COMBINING:
813			pat_index = 5;
814			break;
815		case PAT_WRITE_PROTECTED:
816			pat_index = 4;
817			break;
818		default:
819			panic("Unknown caching mode %d\n", mode);
820		}
821	} else {
822		switch (mode) {
823		case PAT_UNCACHED:
824		case PAT_UNCACHEABLE:
825		case PAT_WRITE_PROTECTED:
826			pat_index = 3;
827			break;
828		case PAT_WRITE_THROUGH:
829			pat_index = 1;
830			break;
831		case PAT_WRITE_BACK:
832			pat_index = 0;
833			break;
834		case PAT_WRITE_COMBINING:
835			pat_index = 2;
836			break;
837		default:
838			panic("Unknown caching mode %d\n", mode);
839		}
840	}
841
842	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
843	cache_bits = 0;
844	if (pat_index & 0x4)
845		cache_bits |= pat_flag;
846	if (pat_index & 0x2)
847		cache_bits |= PG_NC_PCD;
848	if (pat_index & 0x1)
849		cache_bits |= PG_NC_PWT;
850	return (cache_bits);
851}
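/*
 * Worked example (added commentary): with pat_works != 0, a request for
 * PAT_WRITE_COMBINING maps to PAT index 5 (binary 101), so
 *
 *	pmap_cache_bits(PAT_WRITE_COMBINING, 0) == PG_PTE_PAT | PG_NC_PWT
 *
 * while the same request for a PDE yields PG_PDE_PAT | PG_NC_PWT, matching
 * the WC entry that pmap_init_pat() programmed at index 5.
 */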
852#ifdef SMP
853/*
854 * For SMP, these functions have to use the IPI mechanism for coherence.
855 *
856 * N.B.: Before calling any of the following TLB invalidation functions,
857 * the calling processor must ensure that all stores updating a non-
858 * kernel page table are globally performed.  Otherwise, another
859 * processor could cache an old, pre-update entry without being
860 * invalidated.  This can happen one of two ways: (1) The pmap becomes
861 * active on another processor after its pm_active field is checked by
862 * one of the following functions but before a store updating the page
863 * table is globally performed. (2) The pmap becomes active on another
864 * processor before its pm_active field is checked but due to
865 * speculative loads one of the following functions still reads the
866 * pmap as inactive on the other processor.
867 *
868 * The kernel page table is exempt because its pm_active field is
869 * immutable.  The kernel page table is always active on every
870 * processor.
871 */
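/*
 * Illustrative ordering sketch (added commentary, drawn from the callers in
 * this file): an update of a user page table entry performs the store first
 * and only then requests the shootdown:
 *
 *	pte_store(pte, newpte);
 *	pmap_invalidate_page(pmap, va);
 */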
872void
873pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
874{
875	u_int cpumask;
876	u_int other_cpus;
877
878	sched_pin();
879	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
880		invlpg(va);
881		smp_invlpg(va);
882	} else {
883		cpumask = PCPU_GET(cpumask);
884		other_cpus = PCPU_GET(other_cpus);
885		if (pmap->pm_active & cpumask)
886			invlpg(va);
887		if (pmap->pm_active & other_cpus)
888			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
889	}
890	sched_unpin();
891}
892
893void
894pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
895{
896	u_int cpumask;
897	u_int other_cpus;
898	vm_offset_t addr;
899
900	sched_pin();
901	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
902		for (addr = sva; addr < eva; addr += PAGE_SIZE)
903			invlpg(addr);
904		smp_invlpg_range(sva, eva);
905	} else {
906		cpumask = PCPU_GET(cpumask);
907		other_cpus = PCPU_GET(other_cpus);
908		if (pmap->pm_active & cpumask)
909			for (addr = sva; addr < eva; addr += PAGE_SIZE)
910				invlpg(addr);
911		if (pmap->pm_active & other_cpus)
912			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
913			    sva, eva);
914	}
915	sched_unpin();
916}
917
918void
919pmap_invalidate_all(pmap_t pmap)
920{
921	u_int cpumask;
922	u_int other_cpus;
923
924	sched_pin();
925	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
926		invltlb();
927		smp_invltlb();
928	} else {
929		cpumask = PCPU_GET(cpumask);
930		other_cpus = PCPU_GET(other_cpus);
931		if (pmap->pm_active & cpumask)
932			invltlb();
933		if (pmap->pm_active & other_cpus)
934			smp_masked_invltlb(pmap->pm_active & other_cpus);
935	}
936	sched_unpin();
937}
938
939void
940pmap_invalidate_cache(void)
941{
942
943	sched_pin();
944	wbinvd();
945	smp_cache_flush();
946	sched_unpin();
947}
948#else /* !SMP */
949/*
950 * Normal, non-SMP, 486+ invalidation functions.
951 * We inline these within pmap.c for speed.
952 */
953PMAP_INLINE void
954pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
955{
956
957	if (pmap == kernel_pmap || pmap->pm_active)
958		invlpg(va);
959}
960
961PMAP_INLINE void
962pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
963{
964	vm_offset_t addr;
965
966	if (pmap == kernel_pmap || pmap->pm_active)
967		for (addr = sva; addr < eva; addr += PAGE_SIZE)
968			invlpg(addr);
969}
970
971PMAP_INLINE void
972pmap_invalidate_all(pmap_t pmap)
973{
974
975	if (pmap == kernel_pmap || pmap->pm_active)
976		invltlb();
977}
978
979PMAP_INLINE void
980pmap_invalidate_cache(void)
981{
982
983	wbinvd();
984}
985#endif /* !SMP */
986
987void
988pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
989{
990
991	KASSERT((sva & PAGE_MASK) == 0,
992	    ("pmap_invalidate_cache_range: sva not page-aligned"));
993	KASSERT((eva & PAGE_MASK) == 0,
994	    ("pmap_invalidate_cache_range: eva not page-aligned"));
995
996	if (cpu_feature & CPUID_SS)
997		; /* If "Self Snoop" is supported, do nothing. */
998	else if (cpu_feature & CPUID_CLFSH) {
999
1000		/*
1001		 * Otherwise, do per-cache line flush.  Use the mfence
1002		 * instruction to ensure that previous stores are
1003		 * included in the write-back.  The processor
1004		 * propagates the flush to other processors in the cache
1005		 * coherence domain.
1006		 */
1007		mfence();
1008		for (; sva < eva; sva += cpu_clflush_line_size)
1009			clflush(sva);
1010		mfence();
1011	} else {
1012
1013		/*
1014		 * No targeted cache flush methods are supported by the CPU,
1015		 * so globally invalidate the cache as a last resort.
1016		 */
1017		pmap_invalidate_cache();
1018	}
1019}
1020
1021/*
1022 * Are we current address space or kernel?  N.B. We return FALSE when
1023 * a pmap's page table is in use because a kernel thread is borrowing
1024 * it.  The borrowed page table can change spontaneously, making any
1025 * dependence on its continued use subject to a race condition.
1026 */
1027static __inline int
1028pmap_is_current(pmap_t pmap)
1029{
1030
1031	return (pmap == kernel_pmap ||
1032		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1033	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1034}
1035
1036/*
1037 * If the given pmap is not the current or kernel pmap, the returned pte must
1038 * be released by passing it to pmap_pte_release().
1039 */
1040pt_entry_t *
1041pmap_pte(pmap_t pmap, vm_offset_t va)
1042{
1043	pd_entry_t newpf;
1044	pd_entry_t *pde;
1045
1046	pde = pmap_pde(pmap, va);
1047	if (*pde & PG_PS)
1048		return (pde);
1049	if (*pde != 0) {
1050		/* are we current address space or kernel? */
1051		if (pmap_is_current(pmap))
1052			return (vtopte(va));
1053		mtx_lock(&PMAP2mutex);
1054		newpf = *pde & PG_FRAME;
1055		if ((*PMAP2 & PG_FRAME) != newpf) {
1056			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1057			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1058		}
1059		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1060	}
1061	return (0);
1062}
1063
1064/*
1065 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1066 * being NULL.
1067 */
1068static __inline void
1069pmap_pte_release(pt_entry_t *pte)
1070{
1071
1072	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1073		mtx_unlock(&PMAP2mutex);
1074}
1075
1076static __inline void
1077invlcaddr(void *caddr)
1078{
1079
1080	invlpg((u_int)caddr);
1081}
1082
1083/*
1084 * Super fast pmap_pte routine best used when scanning
1085 * the pv lists.  This eliminates many coarse-grained
1086 * invltlb calls.  Note that many of the pv list
1087 * scans are across different pmaps.  It is very wasteful
1088 * to do an entire invltlb for checking a single mapping.
1089 *
1090 * If the given pmap is not the current pmap, vm_page_queue_mtx
1091 * must be held and curthread pinned to a CPU.
1092 */
1093static pt_entry_t *
1094pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1095{
1096	pd_entry_t newpf;
1097	pd_entry_t *pde;
1098
1099	pde = pmap_pde(pmap, va);
1100	if (*pde & PG_PS)
1101		return (pde);
1102	if (*pde != 0) {
1103		/* are we current address space or kernel? */
1104		if (pmap_is_current(pmap))
1105			return (vtopte(va));
1106		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1107		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1108		newpf = *pde & PG_FRAME;
1109		if ((*PMAP1 & PG_FRAME) != newpf) {
1110			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1111#ifdef SMP
1112			PMAP1cpu = PCPU_GET(cpuid);
1113#endif
1114			invlcaddr(PADDR1);
1115			PMAP1changed++;
1116		} else
1117#ifdef SMP
1118		if (PMAP1cpu != PCPU_GET(cpuid)) {
1119			PMAP1cpu = PCPU_GET(cpuid);
1120			invlcaddr(PADDR1);
1121			PMAP1changedcpu++;
1122		} else
1123#endif
1124			PMAP1unchanged++;
1125		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1126	}
1127	return (0);
1128}
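/*
 * Illustrative usage sketch (added commentary): callers that may pass a
 * non-current pmap satisfy the locking and pinning contract stated above,
 * for example:
 *
 *	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 *	sched_pin();
 *	pte = pmap_pte_quick(pmap, va);
 *	if (pte != NULL)
 *		...				(examine or update *pte)
 *	sched_unpin();
 */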
1129
1130/*
1131 *	Routine:	pmap_extract
1132 *	Function:
1133 *		Extract the physical page address associated
1134 *		with the given map/virtual_address pair.
1135 */
1136vm_paddr_t
1137pmap_extract(pmap_t pmap, vm_offset_t va)
1138{
1139	vm_paddr_t rtval;
1140	pt_entry_t *pte;
1141	pd_entry_t pde;
1142
1143	rtval = 0;
1144	PMAP_LOCK(pmap);
1145	pde = pmap->pm_pdir[va >> PDRSHIFT];
1146	if (pde != 0) {
1147		if ((pde & PG_PS) != 0)
1148			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1149		else {
1150			pte = pmap_pte(pmap, va);
1151			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1152			pmap_pte_release(pte);
1153		}
1154	}
1155	PMAP_UNLOCK(pmap);
1156	return (rtval);
1157}
1158
1159/*
1160 *	Routine:	pmap_extract_and_hold
1161 *	Function:
1162 *		Atomically extract and hold the physical page
1163 *		with the given pmap and virtual address pair
1164 *		if that mapping permits the given protection.
1165 */
1166vm_page_t
1167pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1168{
1169	pd_entry_t pde;
1170	pt_entry_t pte;
1171	vm_page_t m;
1172
1173	m = NULL;
1174	vm_page_lock_queues();
1175	PMAP_LOCK(pmap);
1176	pde = *pmap_pde(pmap, va);
1177	if (pde != 0) {
1178		if (pde & PG_PS) {
1179			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1180				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1181				    (va & PDRMASK));
1182				vm_page_hold(m);
1183			}
1184		} else {
1185			sched_pin();
1186			pte = *pmap_pte_quick(pmap, va);
1187			if (pte != 0 &&
1188			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1189				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1190				vm_page_hold(m);
1191			}
1192			sched_unpin();
1193		}
1194	}
1195	vm_page_unlock_queues();
1196	PMAP_UNLOCK(pmap);
1197	return (m);
1198}
1199
1200/***************************************************
1201 * Low level mapping routines.....
1202 ***************************************************/
1203
1204/*
1205 * Add a wired page to the kva.
1206 * Note: not SMP coherent.
1207 */
1208PMAP_INLINE void
1209pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1210{
1211	pt_entry_t *pte;
1212
1213	pte = vtopte(va);
1214	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1215}
1216
1217static __inline void
1218pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1219{
1220	pt_entry_t *pte;
1221
1222	pte = vtopte(va);
1223	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1224}
1225
1226/*
1227 * Remove a page from the kernel pagetables.
1228 * Note: not SMP coherent.
1229 */
1230PMAP_INLINE void
1231pmap_kremove(vm_offset_t va)
1232{
1233	pt_entry_t *pte;
1234
1235	pte = vtopte(va);
1236	pte_clear(pte);
1237}
1238
1239/*
1240 *	Used to map a range of physical addresses into kernel
1241 *	virtual address space.
1242 *
1243 *	The value passed in '*virt' is a suggested virtual address for
1244 *	the mapping. Architectures which can support a direct-mapped
1245 *	physical to virtual region can return the appropriate address
1246 *	within that region, leaving '*virt' unchanged. Other
1247 *	architectures should map the pages starting at '*virt' and
1248 *	update '*virt' with the first usable address after the mapped
1249 *	region.
1250 */
1251vm_offset_t
1252pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1253{
1254	vm_offset_t va, sva;
1255
1256	va = sva = *virt;
1257	while (start < end) {
1258		pmap_kenter(va, start);
1259		va += PAGE_SIZE;
1260		start += PAGE_SIZE;
1261	}
1262	pmap_invalidate_range(kernel_pmap, sva, va);
1263	*virt = va;
1264	return (sva);
1265}
1266
1267
1268/*
1269 * Add a list of wired pages to the kva.
1270 * This routine is only used for temporary
1271 * kernel mappings that do not need to have
1272 * page modification or references recorded.
1273 * Note that old mappings are simply written
1274 * over.  The page *must* be wired.
1275 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1276 */
1277void
1278pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1279{
1280	pt_entry_t *endpte, oldpte, *pte;
1281
1282	oldpte = 0;
1283	pte = vtopte(sva);
1284	endpte = pte + count;
1285	while (pte < endpte) {
1286		oldpte |= *pte;
1287		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag |
1288		    pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
1289		pte++;
1290		ma++;
1291	}
1292	if ((oldpte & PG_V) != 0)
1293		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1294		    PAGE_SIZE);
1295}
1296
1297/*
1298 * This routine tears out page mappings from the
1299 * kernel -- it is meant only for temporary mappings.
1300 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1301 */
1302void
1303pmap_qremove(vm_offset_t sva, int count)
1304{
1305	vm_offset_t va;
1306
1307	va = sva;
1308	while (count-- > 0) {
1309		pmap_kremove(va);
1310		va += PAGE_SIZE;
1311	}
1312	pmap_invalidate_range(kernel_pmap, sva, va);
1313}
1314
1315/***************************************************
1316 * Page table page management routines.....
1317 ***************************************************/
1318static __inline void
1319pmap_free_zero_pages(vm_page_t free)
1320{
1321	vm_page_t m;
1322
1323	while (free != NULL) {
1324		m = free;
1325		free = m->right;
1326		/* Preserve the page's PG_ZERO setting. */
1327		vm_page_free_toq(m);
1328	}
1329}
1330
1331/*
1332 * Schedule the specified unused page table page to be freed.  Specifically,
1333 * add the page to the specified list of pages that will be released to the
1334 * physical memory manager after the TLB has been updated.
1335 */
1336static __inline void
1337pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1338{
1339
1340	if (set_PG_ZERO)
1341		m->flags |= PG_ZERO;
1342	else
1343		m->flags &= ~PG_ZERO;
1344	m->right = *free;
1345	*free = m;
1346}
1347
1348/*
1349 * Inserts the specified page table page into the specified pmap's collection
1350 * of idle page table pages.  Each of a pmap's page table pages is responsible
1351 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1352 * ordered by this virtual address range.
1353 */
1354static void
1355pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1356{
1357	vm_page_t root;
1358
1359	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1360	root = pmap->pm_root;
1361	if (root == NULL) {
1362		mpte->left = NULL;
1363		mpte->right = NULL;
1364	} else {
1365		root = vm_page_splay(mpte->pindex, root);
1366		if (mpte->pindex < root->pindex) {
1367			mpte->left = root->left;
1368			mpte->right = root;
1369			root->left = NULL;
1370		} else if (mpte->pindex == root->pindex)
1371			panic("pmap_insert_pt_page: pindex already inserted");
1372		else {
1373			mpte->right = root->right;
1374			mpte->left = root;
1375			root->right = NULL;
1376		}
1377	}
1378	pmap->pm_root = mpte;
1379}
1380
1381/*
1382 * Looks for a page table page mapping the specified virtual address in the
1383 * specified pmap's collection of idle page table pages.  Returns NULL if there
1384 * is no page table page corresponding to the specified virtual address.
1385 */
1386static vm_page_t
1387pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1388{
1389	vm_page_t mpte;
1390	vm_pindex_t pindex = va >> PDRSHIFT;
1391
1392	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1393	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1394		mpte = vm_page_splay(pindex, mpte);
1395		if ((pmap->pm_root = mpte)->pindex != pindex)
1396			mpte = NULL;
1397	}
1398	return (mpte);
1399}
1400
1401/*
1402 * Removes the specified page table page from the specified pmap's collection
1403 * of idle page table pages.  The specified page table page must be a member of
1404 * the pmap's collection.
1405 */
1406static void
1407pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1408{
1409	vm_page_t root;
1410
1411	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1412	if (mpte != pmap->pm_root)
1413		vm_page_splay(mpte->pindex, pmap->pm_root);
1414	if (mpte->left == NULL)
1415		root = mpte->right;
1416	else {
1417		root = vm_page_splay(mpte->pindex, mpte->left);
1418		root->right = mpte->right;
1419	}
1420	pmap->pm_root = root;
1421}
1422
1423/*
1424 * This routine decrements the page table page's wire count, and if
1425 * the count drops to zero, schedules the page table page to be freed.
1426 */
1427static __inline int
1428pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1429{
1430
1431	--m->wire_count;
1432	if (m->wire_count == 0)
1433		return _pmap_unwire_pte_hold(pmap, m, free);
1434	else
1435		return 0;
1436}
1437
1438static int
1439_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1440{
1441	vm_offset_t pteva;
1442
1443	/*
1444	 * unmap the page table page
1445	 */
1446	pmap->pm_pdir[m->pindex] = 0;
1447	--pmap->pm_stats.resident_count;
1448
1449	/*
1450	 * This is a release store so that the ordinary store unmapping
1451	 * the page table page is globally performed before TLB shoot-
1452	 * down is begun.
1453	 */
1454	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1455
1456	/*
1457	 * Do an invltlb to make the invalidated mapping
1458	 * take effect immediately.
1459	 */
1460	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1461	pmap_invalidate_page(pmap, pteva);
1462
1463	/*
1464	 * Put page on a list so that it is released after
1465	 * *ALL* TLB shootdown is done
1466	 */
1467	pmap_add_delayed_free_list(m, free, TRUE);
1468
1469	return 1;
1470}
1471
1472/*
1473 * After removing a page table entry, this routine is used to
1474 * conditionally free the page, and manage the hold/wire counts.
1475 */
1476static int
1477pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1478{
1479	pd_entry_t ptepde;
1480	vm_page_t mpte;
1481
1482	if (va >= VM_MAXUSER_ADDRESS)
1483		return 0;
1484	ptepde = *pmap_pde(pmap, va);
1485	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1486	return pmap_unwire_pte_hold(pmap, mpte, free);
1487}
1488
1489void
1490pmap_pinit0(pmap_t pmap)
1491{
1492
1493	PMAP_LOCK_INIT(pmap);
1494	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1495#ifdef PAE
1496	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1497#endif
1498	pmap->pm_root = NULL;
1499	pmap->pm_active = 0;
1500	PCPU_SET(curpmap, pmap);
1501	TAILQ_INIT(&pmap->pm_pvchunk);
1502	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1503	mtx_lock_spin(&allpmaps_lock);
1504	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1505	mtx_unlock_spin(&allpmaps_lock);
1506}
1507
1508/*
1509 * Initialize a preallocated and zeroed pmap structure,
1510 * such as one in a vmspace structure.
1511 */
1512int
1513pmap_pinit(pmap_t pmap)
1514{
1515	vm_page_t m, ptdpg[NPGPTD];
1516	vm_paddr_t pa;
1517	static int color;
1518	int i;
1519
1520	PMAP_LOCK_INIT(pmap);
1521
1522	/*
1523	 * No need to allocate page table space yet but we do need a valid
1524	 * page directory table.
1525	 */
1526	if (pmap->pm_pdir == NULL) {
1527		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1528		    NBPTD);
1529
1530		if (pmap->pm_pdir == NULL) {
1531			PMAP_LOCK_DESTROY(pmap);
1532			return (0);
1533		}
1534#ifdef PAE
1535		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1536		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1537		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1538		    ("pmap_pinit: pdpt misaligned"));
1539		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1540		    ("pmap_pinit: pdpt above 4g"));
1541#endif
1542		pmap->pm_root = NULL;
1543	}
1544	KASSERT(pmap->pm_root == NULL,
1545	    ("pmap_pinit: pmap has reserved page table page(s)"));
1546
1547	/*
1548	 * allocate the page directory page(s)
1549	 */
1550	for (i = 0; i < NPGPTD;) {
1551		m = vm_page_alloc(NULL, color++,
1552		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1553		    VM_ALLOC_ZERO);
1554		if (m == NULL)
1555			VM_WAIT;
1556		else {
1557			ptdpg[i++] = m;
1558		}
1559	}
1560
1561	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1562
1563	for (i = 0; i < NPGPTD; i++) {
1564		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1565			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1566	}
1567
1568	mtx_lock_spin(&allpmaps_lock);
1569	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1570	mtx_unlock_spin(&allpmaps_lock);
1571	/* Wire in kernel global address entries. */
1572	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1573
1574	/* install self-referential address mapping entry(s) */
1575	for (i = 0; i < NPGPTD; i++) {
1576		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1577		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1578#ifdef PAE
1579		pmap->pm_pdpt[i] = pa | PG_V;
1580#endif
1581	}
1582
1583	pmap->pm_active = 0;
1584	TAILQ_INIT(&pmap->pm_pvchunk);
1585	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1586
1587	return (1);
1588}
1589
1590/*
1591 * This routine allocates and installs a page table page when the
1592 * needed page table page is not already mapped.
1593 */
1594static vm_page_t
1595_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1596{
1597	vm_paddr_t ptepa;
1598	vm_page_t m;
1599
1600	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1601	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1602	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1603
1604	/*
1605	 * Allocate a page table page.
1606	 */
1607	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1608	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1609		if (flags & M_WAITOK) {
1610			PMAP_UNLOCK(pmap);
1611			vm_page_unlock_queues();
1612			VM_WAIT;
1613			vm_page_lock_queues();
1614			PMAP_LOCK(pmap);
1615		}
1616
1617		/*
1618		 * Indicate the need to retry.  While waiting, the page table
1619		 * page may have been allocated.
1620		 */
1621		return (NULL);
1622	}
1623	if ((m->flags & PG_ZERO) == 0)
1624		pmap_zero_page(m);
1625
1626	/*
1627	 * Map the pagetable page into the process address space, if
1628	 * it isn't already there.
1629	 */
1630
1631	pmap->pm_stats.resident_count++;
1632
1633	ptepa = VM_PAGE_TO_PHYS(m);
1634	pmap->pm_pdir[ptepindex] =
1635		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1636
1637	return m;
1638}
1639
1640static vm_page_t
1641pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1642{
1643	unsigned ptepindex;
1644	pd_entry_t ptepa;
1645	vm_page_t m;
1646
1647	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1648	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1649	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1650
1651	/*
1652	 * Calculate pagetable page index
1653	 */
1654	ptepindex = va >> PDRSHIFT;
1655retry:
1656	/*
1657	 * Get the page directory entry
1658	 */
1659	ptepa = pmap->pm_pdir[ptepindex];
1660
1661	/*
1662	 * This supports switching from a 4MB page to a
1663	 * normal 4K page.
1664	 */
1665	if (ptepa & PG_PS) {
1666		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1667		ptepa = pmap->pm_pdir[ptepindex];
1668	}
1669
1670	/*
1671	 * If the page table page is mapped, we just increment the
1672	 * hold count, and activate it.
1673	 */
1674	if (ptepa) {
1675		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1676		m->wire_count++;
1677	} else {
1678		/*
1679		 * Here if the pte page isn't mapped, or if it has
1680		 * been deallocated.
1681		 */
1682		m = _pmap_allocpte(pmap, ptepindex, flags);
1683		if (m == NULL && (flags & M_WAITOK))
1684			goto retry;
1685	}
1686	return (m);
1687}
1688
1689
1690/***************************************************
1691 * Pmap allocation/deallocation routines.
1692 ***************************************************/
1693
1694#ifdef SMP
1695/*
1696 * Deal with an SMP shootdown of other users of the pmap that we are
1697 * trying to dispose of.  This can be a bit hairy.
1698 */
1699static cpumask_t *lazymask;
1700static u_int lazyptd;
1701static volatile u_int lazywait;
1702
1703void pmap_lazyfix_action(void);
1704
1705void
1706pmap_lazyfix_action(void)
1707{
1708	cpumask_t mymask = PCPU_GET(cpumask);
1709
1710#ifdef COUNT_IPIS
1711	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1712#endif
1713	if (rcr3() == lazyptd)
1714		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1715	atomic_clear_int(lazymask, mymask);
1716	atomic_store_rel_int(&lazywait, 1);
1717}
1718
1719static void
1720pmap_lazyfix_self(cpumask_t mymask)
1721{
1722
1723	if (rcr3() == lazyptd)
1724		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1725	atomic_clear_int(lazymask, mymask);
1726}
1727
1728
1729static void
1730pmap_lazyfix(pmap_t pmap)
1731{
1732	cpumask_t mymask, mask;
1733	u_int spins;
1734
1735	while ((mask = pmap->pm_active) != 0) {
1736		spins = 50000000;
1737		mask = mask & -mask;	/* Find least significant set bit */
1738		mtx_lock_spin(&smp_ipi_mtx);
1739#ifdef PAE
1740		lazyptd = vtophys(pmap->pm_pdpt);
1741#else
1742		lazyptd = vtophys(pmap->pm_pdir);
1743#endif
1744		mymask = PCPU_GET(cpumask);
1745		if (mask == mymask) {
1746			lazymask = &pmap->pm_active;
1747			pmap_lazyfix_self(mymask);
1748		} else {
1749			atomic_store_rel_int((u_int *)&lazymask,
1750			    (u_int)&pmap->pm_active);
1751			atomic_store_rel_int(&lazywait, 0);
1752			ipi_selected(mask, IPI_LAZYPMAP);
1753			while (lazywait == 0) {
1754				ia32_pause();
1755				if (--spins == 0)
1756					break;
1757			}
1758		}
1759		mtx_unlock_spin(&smp_ipi_mtx);
1760		if (spins == 0)
1761			printf("pmap_lazyfix: spun for 50000000\n");
1762	}
1763}
1764
1765#else	/* SMP */
1766
1767/*
1768 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1769 * unlikely to have to even execute this code, including the fact
1770 * that the cleanup is deferred until the parent does a wait(2), which
1771 * means that another userland process has run.
1772 */
1773static void
1774pmap_lazyfix(pmap_t pmap)
1775{
1776	u_int cr3;
1777
1778	cr3 = vtophys(pmap->pm_pdir);
1779	if (cr3 == rcr3()) {
1780		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1781		pmap->pm_active &= ~(PCPU_GET(cpumask));
1782	}
1783}
1784#endif	/* SMP */
1785
1786/*
1787 * Release any resources held by the given physical map.
1788 * Called when a pmap initialized by pmap_pinit is being released.
1789 * Should only be called if the map contains no valid mappings.
1790 */
1791void
1792pmap_release(pmap_t pmap)
1793{
1794	vm_page_t m, ptdpg[NPGPTD];
1795	int i;
1796
1797	KASSERT(pmap->pm_stats.resident_count == 0,
1798	    ("pmap_release: pmap resident count %ld != 0",
1799	    pmap->pm_stats.resident_count));
1800	KASSERT(pmap->pm_root == NULL,
1801	    ("pmap_release: pmap has reserved page table page(s)"));
1802
1803	pmap_lazyfix(pmap);
1804	mtx_lock_spin(&allpmaps_lock);
1805	LIST_REMOVE(pmap, pm_list);
1806	mtx_unlock_spin(&allpmaps_lock);
1807
1808	for (i = 0; i < NPGPTD; i++)
1809		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1810		    PG_FRAME);
1811
1812	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1813	    sizeof(*pmap->pm_pdir));
1814
1815	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1816
1817	for (i = 0; i < NPGPTD; i++) {
1818		m = ptdpg[i];
1819#ifdef PAE
1820		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1821		    ("pmap_release: got wrong ptd page"));
1822#endif
1823		m->wire_count--;
1824		atomic_subtract_int(&cnt.v_wire_count, 1);
1825		vm_page_free_zero(m);
1826	}
1827	PMAP_LOCK_DESTROY(pmap);
1828}
1829
1830static int
1831kvm_size(SYSCTL_HANDLER_ARGS)
1832{
1833	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1834
1835	return sysctl_handle_long(oidp, &ksize, 0, req);
1836}
1837SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1838    0, 0, kvm_size, "IU", "Size of KVM");
1839
1840static int
1841kvm_free(SYSCTL_HANDLER_ARGS)
1842{
1843	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1844
1845	return sysctl_handle_long(oidp, &kfree, 0, req);
1846}
1847SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1848    0, 0, kvm_free, "IU", "Amount of KVM free");
1849
1850/*
1851 * grow the number of kernel page table entries, if needed
1852 */
1853void
1854pmap_growkernel(vm_offset_t addr)
1855{
1856	struct pmap *pmap;
1857	vm_paddr_t ptppaddr;
1858	vm_page_t nkpg;
1859	pd_entry_t newpdir;
1860	pt_entry_t *pde;
1861
1862	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1863	if (kernel_vm_end == 0) {
1864		kernel_vm_end = KERNBASE;
1865		nkpt = 0;
1866		while (pdir_pde(PTD, kernel_vm_end)) {
1867			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1868			nkpt++;
1869			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1870				kernel_vm_end = kernel_map->max_offset;
1871				break;
1872			}
1873		}
1874	}
1875	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1876	if (addr - 1 >= kernel_map->max_offset)
1877		addr = kernel_map->max_offset;
1878	while (kernel_vm_end < addr) {
1879		if (pdir_pde(PTD, kernel_vm_end)) {
1880			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1881			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1882				kernel_vm_end = kernel_map->max_offset;
1883				break;
1884			}
1885			continue;
1886		}
1887
1888		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
1889		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1890		    VM_ALLOC_ZERO);
1891		if (nkpg == NULL)
1892			panic("pmap_growkernel: no memory to grow kernel");
1893
1894		nkpt++;
1895
1896		if ((nkpg->flags & PG_ZERO) == 0)
1897			pmap_zero_page(nkpg);
1898		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1899		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1900		pdir_pde(PTD, kernel_vm_end) = newpdir;
1901
1902		mtx_lock_spin(&allpmaps_lock);
1903		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1904			pde = pmap_pde(pmap, kernel_vm_end);
1905			pde_store(pde, newpdir);
1906		}
1907		mtx_unlock_spin(&allpmaps_lock);
1908		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1909		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1910			kernel_vm_end = kernel_map->max_offset;
1911			break;
1912		}
1913	}
1914}
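/*
 * A minimal sketch (not compiled with the kernel) of the rounding done by
 * pmap_growkernel() above: the requested address is rounded up to the next
 * PDE boundary before page table pages are allocated.  It assumes the
 * non-PAE values PAGE_SIZE == 4096 and NPTEPG == 1024, i.e. 4MB of KVA per
 * PDE, and the "EX_"/"ex_" names are invented for the example.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SIZE	4096UL
#define EX_NPTEPG	1024UL
#define EX_NBPDE	(EX_PAGE_SIZE * EX_NPTEPG)	/* 4MB of KVA per PDE */
#define ex_roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	unsigned long addr = 0xc1234567UL;

	/* Round up to the next 4MB boundary, as pmap_growkernel() does. */
	printf("%#lx rounds up to %#lx\n", addr, ex_roundup2(addr, EX_NBPDE));
	return (0);
}
#endif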
1915
1916
1917/***************************************************
1918 * page management routines.
1919 ***************************************************/
1920
1921CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1922CTASSERT(_NPCM == 11);
1923
1924static __inline struct pv_chunk *
1925pv_to_chunk(pv_entry_t pv)
1926{
1927
1928	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1929}
1930
1931#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1932
1933#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
1934#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
1935
1936static uint32_t pc_freemask[11] = {
1937	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1938	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1939	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1940	PC_FREE0_9, PC_FREE10
1941};
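/*
 * Worked example of the free-mask layout, assuming the usual i386 value
 * _NPCPV == 336: a chunk holds 336 pv entries and therefore needs 336 free
 * bits.  Ten full 32-bit words cover 10 * 32 = 320 entries (PC_FREE0_9),
 * and the eleventh word covers the remaining 336 - 320 = 16 entries, which
 * is why PC_FREE10 only has its low 16 bits set.
 */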
1942
1943SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1944	"Current number of pv entries");
1945
1946#ifdef PV_STATS
1947static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1948
1949SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1950	"Current number of pv entry chunks");
1951SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1952	"Current number of pv entry chunks allocated");
1953SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1954	"Current number of pv entry chunks frees");
1955SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1956	"Number of times tried to get a chunk page but failed.");
1957
1958static long pv_entry_frees, pv_entry_allocs;
1959static int pv_entry_spare;
1960
1961SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1962	"Current number of pv entry frees");
1963SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1964	"Current number of pv entry allocs");
1965SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1966	"Current number of spare pv entries");
1967
1968static int pmap_collect_inactive, pmap_collect_active;
1969
1970SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1971	"Current number times pmap_collect called on inactive queue");
1972SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1973	"Current number times pmap_collect called on active queue");
1974#endif
1975
1976/*
1977 * We are in a serious low memory condition.  Resort to
1978 * drastic measures to free some pages so we can allocate
1979 * another pv entry chunk.  This is normally called to
1980 * unmap inactive pages, and if necessary, active pages.
1981 */
1982static void
1983pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1984{
1985	struct md_page *pvh;
1986	pd_entry_t *pde;
1987	pmap_t pmap;
1988	pt_entry_t *pte, tpte;
1989	pv_entry_t next_pv, pv;
1990	vm_offset_t va;
1991	vm_page_t m, free;
1992
1993	sched_pin();
1994	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1995		if (m->hold_count || m->busy)
1996			continue;
1997		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1998			va = pv->pv_va;
1999			pmap = PV_PMAP(pv);
2000			/* Avoid deadlock and lock recursion. */
2001			if (pmap > locked_pmap)
2002				PMAP_LOCK(pmap);
2003			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2004				continue;
2005			pmap->pm_stats.resident_count--;
2006			pde = pmap_pde(pmap, va);
2007			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2008			    " a 4mpage in page %p's pv list", m));
2009			pte = pmap_pte_quick(pmap, va);
2010			tpte = pte_load_clear(pte);
2011			KASSERT((tpte & PG_W) == 0,
2012			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
2013			if (tpte & PG_A)
2014				vm_page_flag_set(m, PG_REFERENCED);
2015			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2016				vm_page_dirty(m);
2017			free = NULL;
2018			pmap_unuse_pt(pmap, va, &free);
2019			pmap_invalidate_page(pmap, va);
2020			pmap_free_zero_pages(free);
2021			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2022			if (TAILQ_EMPTY(&m->md.pv_list)) {
2023				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2024				if (TAILQ_EMPTY(&pvh->pv_list))
2025					vm_page_flag_clear(m, PG_WRITEABLE);
2026			}
2027			free_pv_entry(pmap, pv);
2028			if (pmap != locked_pmap)
2029				PMAP_UNLOCK(pmap);
2030		}
2031	}
2032	sched_unpin();
2033}
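/*
 * A minimal sketch (not compiled with the kernel) of the address-ordered
 * locking rule used in pmap_collect() above, recast with pthread mutexes.
 * All of the "ex_" names are invented for the example.
 */
#if 0
#include <pthread.h>

struct ex_pmap {
	pthread_mutex_t	lock;
};

/* Lock "other" while "held" is already locked, without risking deadlock. */
static int
ex_lock_second(struct ex_pmap *held, struct ex_pmap *other)
{

	if (other == held)
		return (1);		/* same lock, nothing more to do */
	if (other > held) {
		/* Higher address: consistent lock order, safe to block. */
		pthread_mutex_lock(&other->lock);
		return (1);
	}
	/* Lower address: blocking could deadlock, so only try. */
	return (pthread_mutex_trylock(&other->lock) == 0);
}
#endif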
2034
2035
2036/*
2037 * free the pv_entry back to the free list
2038 */
2039static void
2040free_pv_entry(pmap_t pmap, pv_entry_t pv)
2041{
2042	vm_page_t m;
2043	struct pv_chunk *pc;
2044	int idx, field, bit;
2045
2046	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2047	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2048	PV_STAT(pv_entry_frees++);
2049	PV_STAT(pv_entry_spare++);
2050	pv_entry_count--;
2051	pc = pv_to_chunk(pv);
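	/*
	 * Index arithmetic example: if the entry's index within the chunk
	 * is 40 (idx == 40), then field == 40 / 32 == 1 and
	 * bit == 40 % 32 == 8, so bit 8 of pc_map[1] is set below to mark
	 * the slot free again.
	 */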
2052	idx = pv - &pc->pc_pventry[0];
2053	field = idx / 32;
2054	bit = idx % 32;
2055	pc->pc_map[field] |= 1ul << bit;
2056	/* move to head of list */
2057	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2058	for (idx = 0; idx < _NPCM; idx++)
2059		if (pc->pc_map[idx] != pc_freemask[idx]) {
2060			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2061			return;
2062		}
2063	PV_STAT(pv_entry_spare -= _NPCPV);
2064	PV_STAT(pc_chunk_count--);
2065	PV_STAT(pc_chunk_frees++);
2066	/* entire chunk is free, return it */
2067	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2068	pmap_qremove((vm_offset_t)pc, 1);
2069	vm_page_unwire(m, 0);
2070	vm_page_free(m);
2071	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2072}
2073
2074/*
2075 * get a new pv_entry, allocating a block from the system
2076 * when needed.
2077 */
2078static pv_entry_t
2079get_pv_entry(pmap_t pmap, int try)
2080{
2081	static const struct timeval printinterval = { 60, 0 };
2082	static struct timeval lastprint;
2083	static vm_pindex_t colour;
2084	struct vpgqueues *pq;
2085	int bit, field;
2086	pv_entry_t pv;
2087	struct pv_chunk *pc;
2088	vm_page_t m;
2089
2090	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2091	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2092	PV_STAT(pv_entry_allocs++);
2093	pv_entry_count++;
2094	if (pv_entry_count > pv_entry_high_water)
2095		if (ratecheck(&lastprint, &printinterval))
2096			printf("Approaching the limit on PV entries, consider "
2097			    "increasing either the vm.pmap.shpgperproc or the "
2098			    "vm.pmap.pv_entry_max tunable.\n");
2099	pq = NULL;
2100retry:
2101	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2102	if (pc != NULL) {
2103		for (field = 0; field < _NPCM; field++) {
2104			if (pc->pc_map[field]) {
2105				bit = bsfl(pc->pc_map[field]);
2106				break;
2107			}
2108		}
2109		if (field < _NPCM) {
2110			pv = &pc->pc_pventry[field * 32 + bit];
2111			pc->pc_map[field] &= ~(1ul << bit);
2112			/* If this was the last free entry, move the chunk to the tail */
2113			for (field = 0; field < _NPCM; field++)
2114				if (pc->pc_map[field] != 0) {
2115					PV_STAT(pv_entry_spare--);
2116					return (pv);	/* not full, return */
2117				}
2118			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2119			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2120			PV_STAT(pv_entry_spare--);
2121			return (pv);
2122		}
2123	}
2124	/*
2125	 * Access to the ptelist "pv_vafree" is synchronized by the page
2126	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2127	 * remain non-empty until pmap_ptelist_alloc() completes.
2128	 */
2129	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2130	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2131	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2132		if (try) {
2133			pv_entry_count--;
2134			PV_STAT(pc_chunk_tryfail++);
2135			return (NULL);
2136		}
2137		/*
2138		 * Reclaim pv entries: At first, destroy mappings to
2139		 * inactive pages.  After that, if a pv chunk entry
2140		 * is still needed, destroy mappings to active pages.
2141		 */
2142		if (pq == NULL) {
2143			PV_STAT(pmap_collect_inactive++);
2144			pq = &vm_page_queues[PQ_INACTIVE];
2145		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2146			PV_STAT(pmap_collect_active++);
2147			pq = &vm_page_queues[PQ_ACTIVE];
2148		} else
2149			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2150		pmap_collect(pmap, pq);
2151		goto retry;
2152	}
2153	PV_STAT(pc_chunk_count++);
2154	PV_STAT(pc_chunk_allocs++);
2155	colour++;
2156	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2157	pmap_qenter((vm_offset_t)pc, &m, 1);
2158	pc->pc_pmap = pmap;
2159	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2160	for (field = 1; field < _NPCM; field++)
2161		pc->pc_map[field] = pc_freemask[field];
2162	pv = &pc->pc_pventry[0];
2163	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2164	PV_STAT(pv_entry_spare += _NPCPV - 1);
2165	return (pv);
2166}
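/*
 * A minimal sketch (not compiled with the kernel) of the bitmap scan used
 * by get_pv_entry() above, with a compiler builtin standing in for bsfl().
 * The "EX_"/"ex_" names are invented for the example.
 */
#if 0
#include <stdint.h>

#define EX_NFIELDS	11

/* Return the index of a free slot and clear its bit, or -1 if none left. */
static int
ex_alloc_slot(uint32_t map[EX_NFIELDS])
{
	int bit, field;

	for (field = 0; field < EX_NFIELDS; field++) {
		if (map[field] != 0) {
			bit = __builtin_ctz(map[field]);  /* lowest set bit */
			map[field] &= ~(1u << bit);
			return (field * 32 + bit);
		}
	}
	return (-1);
}
#endif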
2167
2168static __inline pv_entry_t
2169pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2170{
2171	pv_entry_t pv;
2172
2173	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2174	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2175		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2176			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2177			break;
2178		}
2179	}
2180	return (pv);
2181}
2182
2183static void
2184pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2185{
2186	struct md_page *pvh;
2187	pv_entry_t pv;
2188	vm_offset_t va_last;
2189	vm_page_t m;
2190
2191	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2192	KASSERT((pa & PDRMASK) == 0,
2193	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2194
2195	/*
2196	 * Transfer the 4mpage's pv entry for this mapping to the first
2197	 * page's pv list.
2198	 */
2199	pvh = pa_to_pvh(pa);
2200	va = trunc_4mpage(va);
2201	pv = pmap_pvh_remove(pvh, pmap, va);
2202	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2203	m = PHYS_TO_VM_PAGE(pa);
2204	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2205	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2206	va_last = va + NBPDR - PAGE_SIZE;
2207	do {
2208		m++;
2209		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2210		    ("pmap_pv_demote_pde: page %p is not managed", m));
2211		va += PAGE_SIZE;
2212		pmap_insert_entry(pmap, va, m);
2213	} while (va < va_last);
2214}
2215
2216static void
2217pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2218{
2219	struct md_page *pvh;
2220	pv_entry_t pv;
2221	vm_offset_t va_last;
2222	vm_page_t m;
2223
2224	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2225	KASSERT((pa & PDRMASK) == 0,
2226	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2227
2228	/*
2229	 * Transfer the first page's pv entry for this mapping to the
2230	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2231	 * to get_pv_entry(), a transfer avoids the possibility that
2232	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2233	 * removes one of the mappings that is being promoted.
2234	 */
2235	m = PHYS_TO_VM_PAGE(pa);
2236	va = trunc_4mpage(va);
2237	pv = pmap_pvh_remove(&m->md, pmap, va);
2238	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2239	pvh = pa_to_pvh(pa);
2240	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2241	/* Free the remaining NPTEPG - 1 pv entries. */
2242	va_last = va + NBPDR - PAGE_SIZE;
2243	do {
2244		m++;
2245		va += PAGE_SIZE;
2246		pmap_pvh_free(&m->md, pmap, va);
2247	} while (va < va_last);
2248}
2249
2250static void
2251pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2252{
2253	pv_entry_t pv;
2254
2255	pv = pmap_pvh_remove(pvh, pmap, va);
2256	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2257	free_pv_entry(pmap, pv);
2258}
2259
2260static void
2261pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2262{
2263	struct md_page *pvh;
2264
2265	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2266	pmap_pvh_free(&m->md, pmap, va);
2267	if (TAILQ_EMPTY(&m->md.pv_list)) {
2268		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2269		if (TAILQ_EMPTY(&pvh->pv_list))
2270			vm_page_flag_clear(m, PG_WRITEABLE);
2271	}
2272}
2273
2274/*
2275 * Create a pv entry for page at pa for
2276 * (pmap, va).
2277 */
2278static void
2279pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2280{
2281	pv_entry_t pv;
2282
2283	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2284	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2285	pv = get_pv_entry(pmap, FALSE);
2286	pv->pv_va = va;
2287	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2288}
2289
2290/*
2291 * Conditionally create a pv entry.
2292 */
2293static boolean_t
2294pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2295{
2296	pv_entry_t pv;
2297
2298	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2299	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2300	if (pv_entry_count < pv_entry_high_water &&
2301	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2302		pv->pv_va = va;
2303		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2304		return (TRUE);
2305	} else
2306		return (FALSE);
2307}
2308
2309/*
2310 * Create the pv entries for each of the pages within a superpage.
2311 */
2312static boolean_t
2313pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2314{
2315	struct md_page *pvh;
2316	pv_entry_t pv;
2317
2318	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2319	if (pv_entry_count < pv_entry_high_water &&
2320	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2321		pv->pv_va = va;
2322		pvh = pa_to_pvh(pa);
2323		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2324		return (TRUE);
2325	} else
2326		return (FALSE);
2327}
2328
2329/*
2330 * Fills a page table page with mappings to consecutive physical pages.
2331 */
2332static void
2333pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2334{
2335	pt_entry_t *pte;
2336
2337	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2338		*pte = newpte;
2339		newpte += PAGE_SIZE;
2340	}
2341}
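/*
 * Usage example (assuming the non-PAE value NPTEPG == 1024): if "newpte"
 * is 0x00400000 | PG_RW | PG_V, the loop above stores 0x00400000,
 * 0x00401000, ..., 0x007ff000 with the same flag bits, i.e. one 4KB
 * mapping for each page of the 4MB region starting at 0x00400000.
 */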
2342
2343/*
2344 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2345 * 2- or 4MB page mapping is invalidated.
2346 */
2347static boolean_t
2348pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2349{
2350	pd_entry_t newpde, oldpde;
2351	pmap_t allpmaps_entry;
2352	pt_entry_t *firstpte, newpte;
2353	vm_paddr_t mptepa;
2354	vm_page_t free, mpte;
2355
2356	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2357	oldpde = *pde;
2358	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2359	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2360	mpte = pmap_lookup_pt_page(pmap, va);
2361	if (mpte != NULL)
2362		pmap_remove_pt_page(pmap, mpte);
2363	else {
2364		KASSERT((oldpde & PG_W) == 0,
2365		    ("pmap_demote_pde: page table page for a wired mapping"
2366		    " is missing"));
2367
2368		/*
2369		 * Invalidate the 2- or 4MB page mapping and return
2370		 * "failure" if the mapping was never accessed or the
2371		 * allocation of the new page table page fails.
2372		 */
2373		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2374		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2375		    VM_ALLOC_WIRED)) == NULL) {
2376			free = NULL;
2377			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2378			pmap_invalidate_page(pmap, trunc_4mpage(va));
2379			pmap_free_zero_pages(free);
2380			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2381			    " in pmap %p", va, pmap);
2382			return (FALSE);
2383		}
2384		if (va < VM_MAXUSER_ADDRESS)
2385			pmap->pm_stats.resident_count++;
2386	}
2387	mptepa = VM_PAGE_TO_PHYS(mpte);
2388
2389	/*
2390	 * Temporarily map the page table page (mpte) into the kernel's
2391	 * address space at either PADDR1 or PADDR2.
2392	 */
2393	if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2394		if ((*PMAP1 & PG_FRAME) != mptepa) {
2395			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2396#ifdef SMP
2397			PMAP1cpu = PCPU_GET(cpuid);
2398#endif
2399			invlcaddr(PADDR1);
2400			PMAP1changed++;
2401		} else
2402#ifdef SMP
2403		if (PMAP1cpu != PCPU_GET(cpuid)) {
2404			PMAP1cpu = PCPU_GET(cpuid);
2405			invlcaddr(PADDR1);
2406			PMAP1changedcpu++;
2407		} else
2408#endif
2409			PMAP1unchanged++;
2410		firstpte = PADDR1;
2411	} else {
2412		mtx_lock(&PMAP2mutex);
2413		if ((*PMAP2 & PG_FRAME) != mptepa) {
2414			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2415			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2416		}
2417		firstpte = PADDR2;
2418	}
2419	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2420	KASSERT((oldpde & PG_A) != 0,
2421	    ("pmap_demote_pde: oldpde is missing PG_A"));
2422	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2423	    ("pmap_demote_pde: oldpde is missing PG_M"));
2424	newpte = oldpde & ~PG_PS;
2425	if ((newpte & PG_PDE_PAT) != 0)
2426		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2427
2428	/*
2429	 * If the page table page is new, initialize it.
2430	 */
2431	if (mpte->wire_count == 1) {
2432		mpte->wire_count = NPTEPG;
2433		pmap_fill_ptp(firstpte, newpte);
2434	}
2435	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2436	    ("pmap_demote_pde: firstpte and newpte map different physical"
2437	    " addresses"));
2438
2439	/*
2440	 * If the mapping has changed attributes, update the page table
2441	 * entries.
2442	 */
2443	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2444		pmap_fill_ptp(firstpte, newpte);
2445
2446	/*
2447	 * Demote the mapping.  This pmap is locked.  The old PDE has
2448	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2449	 * set.  Thus, there is no danger of a race with another
2450	 * processor changing the setting of PG_A and/or PG_M between
2451	 * the read above and the store below.
2452	 */
2453	if (pmap == kernel_pmap) {
2454		/*
2455		 * A harmless race exists between this loop and the bcopy()
2456		 * in pmap_pinit() that initializes the kernel segment of
2457		 * the new page table.  Specifically, that bcopy() may copy
2458		 * the new PDE from the PTD, which is first in allpmaps, to
2459		 * the new page table before this loop updates that new
2460		 * page table.
2461		 */
2462		mtx_lock_spin(&allpmaps_lock);
2463		LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) {
2464			pde = pmap_pde(allpmaps_entry, va);
2465			KASSERT(*pde == newpde || (*pde & PG_PTE_PROMOTE) ==
2466			    (oldpde & PG_PTE_PROMOTE),
2467			    ("pmap_demote_pde: pde was %#jx, expected %#jx",
2468			    (uintmax_t)*pde, (uintmax_t)oldpde));
2469			pde_store(pde, newpde);
2470		}
2471		mtx_unlock_spin(&allpmaps_lock);
2472	} else
2473		pde_store(pde, newpde);
2474	if (firstpte == PADDR2)
2475		mtx_unlock(&PMAP2mutex);
2476
2477	/*
2478	 * Invalidate the recursive mapping of the page table page.
2479	 */
2480	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2481
2482	/*
2483	 * Demote the pv entry.  This depends on the earlier demotion
2484	 * of the mapping.  Specifically, the (re)creation of a per-
2485	 * page pv entry might trigger the execution of pmap_collect(),
2486	 * which might reclaim a newly (re)created per-page pv entry
2487	 * and destroy the associated mapping.  In order to destroy
2488	 * the mapping, the PDE must have already changed from mapping
2489	 * the 2mpage to referencing the page table page.
2490	 */
2491	if ((oldpde & PG_MANAGED) != 0)
2492		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2493
2494	pmap_pde_demotions++;
2495	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2496	    " in pmap %p", va, pmap);
2497	return (TRUE);
2498}
2499
2500/*
2501 * pmap_remove_pde: unmap a 2- or 4MB superpage from a process address space
2502 */
2503static void
2504pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2505    vm_page_t *free)
2506{
2507	struct md_page *pvh;
2508	pd_entry_t oldpde;
2509	vm_offset_t eva, va;
2510	vm_page_t m, mpte;
2511
2512	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2513	KASSERT((sva & PDRMASK) == 0,
2514	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2515	oldpde = pte_load_clear(pdq);
2516	if (oldpde & PG_W)
2517		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2518
2519	/*
2520	 * Machines that don't support invlpg also don't support
2521	 * PG_G.
2522	 */
2523	if (oldpde & PG_G)
2524		pmap_invalidate_page(kernel_pmap, sva);
2525	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2526	if (oldpde & PG_MANAGED) {
2527		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2528		pmap_pvh_free(pvh, pmap, sva);
2529		eva = sva + NBPDR;
2530		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2531		    va < eva; va += PAGE_SIZE, m++) {
2532			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2533				vm_page_dirty(m);
2534			if (oldpde & PG_A)
2535				vm_page_flag_set(m, PG_REFERENCED);
2536			if (TAILQ_EMPTY(&m->md.pv_list) &&
2537			    TAILQ_EMPTY(&pvh->pv_list))
2538				vm_page_flag_clear(m, PG_WRITEABLE);
2539		}
2540	}
2541	if (pmap == kernel_pmap) {
2542		if (!pmap_demote_pde(pmap, pdq, sva))
2543			panic("pmap_remove_pde: failed demotion");
2544	} else {
2545		mpte = pmap_lookup_pt_page(pmap, sva);
2546		if (mpte != NULL) {
2547			pmap_remove_pt_page(pmap, mpte);
2548			pmap->pm_stats.resident_count--;
2549			KASSERT(mpte->wire_count == NPTEPG,
2550			    ("pmap_remove_pde: pte page wire count error"));
2551			mpte->wire_count = 0;
2552			pmap_add_delayed_free_list(mpte, free, FALSE);
2553			atomic_subtract_int(&cnt.v_wire_count, 1);
2554		}
2555	}
2556}
2557
2558/*
2559 * pmap_remove_pte: unmap a single 4KB page from a process address space
2560 */
2561static int
2562pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2563{
2564	pt_entry_t oldpte;
2565	vm_page_t m;
2566
2567	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2568	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2569	oldpte = pte_load_clear(ptq);
2570	if (oldpte & PG_W)
2571		pmap->pm_stats.wired_count -= 1;
2572	/*
2573	 * Machines that don't support invlpg also don't support
2574	 * PG_G.
2575	 */
2576	if (oldpte & PG_G)
2577		pmap_invalidate_page(kernel_pmap, va);
2578	pmap->pm_stats.resident_count -= 1;
2579	if (oldpte & PG_MANAGED) {
2580		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2581		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2582			vm_page_dirty(m);
2583		if (oldpte & PG_A)
2584			vm_page_flag_set(m, PG_REFERENCED);
2585		pmap_remove_entry(pmap, m, va);
2586	}
2587	return (pmap_unuse_pt(pmap, va, free));
2588}
2589
2590/*
2591 * Remove a single page from a process address space
2592 */
2593static void
2594pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2595{
2596	pt_entry_t *pte;
2597
2598	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2599	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2600	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2601	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2602		return;
2603	pmap_remove_pte(pmap, pte, va, free);
2604	pmap_invalidate_page(pmap, va);
2605}
2606
2607/*
2608 *	Remove the given range of addresses from the specified map.
2609 *
2610 *	It is assumed that the start and end are properly
2611 *	rounded to the page size.
2612 */
2613void
2614pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2615{
2616	vm_offset_t pdnxt;
2617	pd_entry_t ptpaddr;
2618	pt_entry_t *pte;
2619	vm_page_t free = NULL;
2620	int anyvalid;
2621
2622	/*
2623	 * Perform an unsynchronized read.  This is, however, safe.
2624	 */
2625	if (pmap->pm_stats.resident_count == 0)
2626		return;
2627
2628	anyvalid = 0;
2629
2630	vm_page_lock_queues();
2631	sched_pin();
2632	PMAP_LOCK(pmap);
2633
2634	/*
2635	 * Special handling for removing a single page: a very
2636	 * common operation for which we can short-circuit some
2637	 * code.
2638	 */
2639	if ((sva + PAGE_SIZE == eva) &&
2640	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2641		pmap_remove_page(pmap, sva, &free);
2642		goto out;
2643	}
2644
2645	for (; sva < eva; sva = pdnxt) {
2646		unsigned pdirindex;
2647
2648		/*
2649		 * Calculate index for next page table.
2650		 */
2651		pdnxt = (sva + NBPDR) & ~PDRMASK;
2652		if (pdnxt < sva)
2653			pdnxt = eva;
2654		if (pmap->pm_stats.resident_count == 0)
2655			break;
2656
2657		pdirindex = sva >> PDRSHIFT;
2658		ptpaddr = pmap->pm_pdir[pdirindex];
2659
2660		/*
2661		 * Weed out invalid mappings. Note: we assume that the page
2662		 * directory table is always allocated, and in kernel virtual.
2663		 */
2664		if (ptpaddr == 0)
2665			continue;
2666
2667		/*
2668		 * Check for large page.
2669		 */
2670		if ((ptpaddr & PG_PS) != 0) {
2671			/*
2672			 * Are we removing the entire large page?  If not,
2673			 * demote the mapping and fall through.
2674			 */
2675			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2676				/*
2677				 * The TLB entry for a PG_G mapping is
2678				 * invalidated by pmap_remove_pde().
2679				 */
2680				if ((ptpaddr & PG_G) == 0)
2681					anyvalid = 1;
2682				pmap_remove_pde(pmap,
2683				    &pmap->pm_pdir[pdirindex], sva, &free);
2684				continue;
2685			} else if (!pmap_demote_pde(pmap,
2686			    &pmap->pm_pdir[pdirindex], sva)) {
2687				/* The large page mapping was destroyed. */
2688				continue;
2689			}
2690		}
2691
2692		/*
2693		 * Limit our scan to either the end of the va represented
2694		 * by the current page table page, or to the end of the
2695		 * range being removed.
2696		 */
2697		if (pdnxt > eva)
2698			pdnxt = eva;
2699
2700		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2701		    sva += PAGE_SIZE) {
2702			if (*pte == 0)
2703				continue;
2704
2705			/*
2706			 * The TLB entry for a PG_G mapping is invalidated
2707			 * by pmap_remove_pte().
2708			 */
2709			if ((*pte & PG_G) == 0)
2710				anyvalid = 1;
2711			if (pmap_remove_pte(pmap, pte, sva, &free))
2712				break;
2713		}
2714	}
2715out:
2716	sched_unpin();
2717	if (anyvalid)
2718		pmap_invalidate_all(pmap);
2719	vm_page_unlock_queues();
2720	PMAP_UNLOCK(pmap);
2721	pmap_free_zero_pages(free);
2722}
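/*
 * A minimal sketch (not compiled with the kernel) of why the "pdnxt < sva"
 * test in pmap_remove() above is needed: for a range in the last PDE's
 * worth of the 32-bit address space, "sva + NBPDR" wraps around to a small
 * value.  It assumes a 32-bit vm_offset_t and the non-PAE NBPDR of 4MB;
 * the "EX_" names are invented for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_NBPDR	0x400000u		/* 4MB */
#define EX_PDRMASK	(EX_NBPDR - 1)

int
main(void)
{
	uint32_t sva = 0xffd00000u;		/* inside the last 4MB PDE */
	uint32_t pdnxt = (sva + EX_NBPDR) & ~EX_PDRMASK;

	/* The addition wrapped past 2^32, so pdnxt is now below sva. */
	printf("sva %#x -> pdnxt %#x, wrapped: %s\n",
	    sva, pdnxt, pdnxt < sva ? "yes" : "no");
	return (0);
}
#endif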
2723
2724/*
2725 *	Routine:	pmap_remove_all
2726 *	Function:
2727 *		Removes this physical page from
2728 *		all physical maps in which it resides.
2729 *		Reflects back modify bits to the pager.
2730 *
2731 *	Notes:
2732 *		Original versions of this routine were very
2733 *		inefficient because they iteratively called
2734 *		pmap_remove (slow...)
2735 */
2736
2737void
2738pmap_remove_all(vm_page_t m)
2739{
2740	struct md_page *pvh;
2741	pv_entry_t pv;
2742	pmap_t pmap;
2743	pt_entry_t *pte, tpte;
2744	pd_entry_t *pde;
2745	vm_offset_t va;
2746	vm_page_t free;
2747
2748	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2749	    ("pmap_remove_all: page %p is fictitious", m));
2750	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2751	sched_pin();
2752	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2753	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2754		va = pv->pv_va;
2755		pmap = PV_PMAP(pv);
2756		PMAP_LOCK(pmap);
2757		pde = pmap_pde(pmap, va);
2758		(void)pmap_demote_pde(pmap, pde, va);
2759		PMAP_UNLOCK(pmap);
2760	}
2761	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2762		pmap = PV_PMAP(pv);
2763		PMAP_LOCK(pmap);
2764		pmap->pm_stats.resident_count--;
2765		pde = pmap_pde(pmap, pv->pv_va);
2766		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2767		    " a 4mpage in page %p's pv list", m));
2768		pte = pmap_pte_quick(pmap, pv->pv_va);
2769		tpte = pte_load_clear(pte);
2770		if (tpte & PG_W)
2771			pmap->pm_stats.wired_count--;
2772		if (tpte & PG_A)
2773			vm_page_flag_set(m, PG_REFERENCED);
2774
2775		/*
2776		 * Update the vm_page_t clean and reference bits.
2777		 */
2778		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2779			vm_page_dirty(m);
2780		free = NULL;
2781		pmap_unuse_pt(pmap, pv->pv_va, &free);
2782		pmap_invalidate_page(pmap, pv->pv_va);
2783		pmap_free_zero_pages(free);
2784		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2785		free_pv_entry(pmap, pv);
2786		PMAP_UNLOCK(pmap);
2787	}
2788	vm_page_flag_clear(m, PG_WRITEABLE);
2789	sched_unpin();
2790}
2791
2792/*
2793 * pmap_protect_pde: do the things to protect a 4mpage in a process
2794 */
2795static boolean_t
2796pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2797{
2798	pd_entry_t newpde, oldpde;
2799	vm_offset_t eva, va;
2800	vm_page_t m;
2801	boolean_t anychanged;
2802
2803	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2804	KASSERT((sva & PDRMASK) == 0,
2805	    ("pmap_protect_pde: sva is not 4mpage aligned"));
2806	anychanged = FALSE;
2807retry:
2808	oldpde = newpde = *pde;
2809	if (oldpde & PG_MANAGED) {
2810		eva = sva + NBPDR;
2811		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2812		    va < eva; va += PAGE_SIZE, m++) {
2813			/*
2814			 * In contrast to the analogous operation on a 4KB page
2815			 * mapping, the mapping's PG_A flag is not cleared and
2816			 * the page's PG_REFERENCED flag is not set.  The
2817			 * reason is that pmap_demote_pde() expects that a 2/4MB
2818			 * page mapping with a stored page table page has PG_A
2819			 * set.
2820			 */
2821			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2822				vm_page_dirty(m);
2823		}
2824	}
2825	if ((prot & VM_PROT_WRITE) == 0)
2826		newpde &= ~(PG_RW | PG_M);
2827#ifdef PAE
2828	if ((prot & VM_PROT_EXECUTE) == 0)
2829		newpde |= pg_nx;
2830#endif
2831	if (newpde != oldpde) {
2832		if (!pde_cmpset(pde, oldpde, newpde))
2833			goto retry;
2834		if (oldpde & PG_G)
2835			pmap_invalidate_page(pmap, sva);
2836		else
2837			anychanged = TRUE;
2838	}
2839	return (anychanged);
2840}
2841
2842/*
2843 *	Set the physical protection on the
2844 *	specified range of this map as requested.
2845 */
2846void
2847pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2848{
2849	vm_offset_t pdnxt;
2850	pd_entry_t ptpaddr;
2851	pt_entry_t *pte;
2852	int anychanged;
2853
2854	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2855		pmap_remove(pmap, sva, eva);
2856		return;
2857	}
2858
2859#ifdef PAE
2860	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2861	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2862		return;
2863#else
2864	if (prot & VM_PROT_WRITE)
2865		return;
2866#endif
2867
2868	anychanged = 0;
2869
2870	vm_page_lock_queues();
2871	sched_pin();
2872	PMAP_LOCK(pmap);
2873	for (; sva < eva; sva = pdnxt) {
2874		pt_entry_t obits, pbits;
2875		unsigned pdirindex;
2876
2877		pdnxt = (sva + NBPDR) & ~PDRMASK;
2878		if (pdnxt < sva)
2879			pdnxt = eva;
2880
2881		pdirindex = sva >> PDRSHIFT;
2882		ptpaddr = pmap->pm_pdir[pdirindex];
2883
2884		/*
2885		 * Weed out invalid mappings. Note: we assume that the page
2886		 * directory table is always allocated, and in kernel virtual.
2887		 */
2888		if (ptpaddr == 0)
2889			continue;
2890
2891		/*
2892		 * Check for large page.
2893		 */
2894		if ((ptpaddr & PG_PS) != 0) {
2895			/*
2896			 * Are we protecting the entire large page?  If not,
2897			 * demote the mapping and fall through.
2898			 */
2899			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2900				/*
2901				 * The TLB entry for a PG_G mapping is
2902				 * invalidated by pmap_protect_pde().
2903				 */
2904				if (pmap_protect_pde(pmap,
2905				    &pmap->pm_pdir[pdirindex], sva, prot))
2906					anychanged = 1;
2907				continue;
2908			} else if (!pmap_demote_pde(pmap,
2909			    &pmap->pm_pdir[pdirindex], sva)) {
2910				/* The large page mapping was destroyed. */
2911				continue;
2912			}
2913		}
2914
2915		if (pdnxt > eva)
2916			pdnxt = eva;
2917
2918		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2919		    sva += PAGE_SIZE) {
2920			vm_page_t m;
2921
2922retry:
2923			/*
2924			 * Regardless of whether a pte is 32 or 64 bits in
2925			 * size, PG_RW, PG_A, and PG_M are among the least
2926			 * significant 32 bits.
2927			 */
2928			obits = pbits = *pte;
2929			if ((pbits & PG_V) == 0)
2930				continue;
2931			if (pbits & PG_MANAGED) {
2932				m = NULL;
2933				if (pbits & PG_A) {
2934					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2935					vm_page_flag_set(m, PG_REFERENCED);
2936					pbits &= ~PG_A;
2937				}
2938				if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2939					if (m == NULL)
2940						m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2941					vm_page_dirty(m);
2942				}
2943			}
2944
2945			if ((prot & VM_PROT_WRITE) == 0)
2946				pbits &= ~(PG_RW | PG_M);
2947#ifdef PAE
2948			if ((prot & VM_PROT_EXECUTE) == 0)
2949				pbits |= pg_nx;
2950#endif
2951
2952			if (pbits != obits) {
2953#ifdef PAE
2954				if (!atomic_cmpset_64(pte, obits, pbits))
2955					goto retry;
2956#else
2957				if (!atomic_cmpset_int((u_int *)pte, obits,
2958				    pbits))
2959					goto retry;
2960#endif
2961				if (obits & PG_G)
2962					pmap_invalidate_page(pmap, sva);
2963				else
2964					anychanged = 1;
2965			}
2966		}
2967	}
2968	sched_unpin();
2969	if (anychanged)
2970		pmap_invalidate_all(pmap);
2971	vm_page_unlock_queues();
2972	PMAP_UNLOCK(pmap);
2973}
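/*
 * A minimal sketch (not compiled with the kernel) of the read/modify/
 * compare-and-swap retry loop that pmap_protect() above uses to clear
 * PG_RW and PG_M, recast with a C11 atomic.  The "EX_"/"ex_" names and
 * the flag values are given only for the example.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

#define EX_PG_RW	0x002u
#define EX_PG_M		0x040u

static void
ex_clear_rw(_Atomic uint32_t *pte)
{
	uint32_t obits, pbits;

	do {
		obits = atomic_load(pte);
		pbits = obits & ~(EX_PG_RW | EX_PG_M);
		if (pbits == obits)
			return;		/* nothing to change */
	} while (!atomic_compare_exchange_weak(pte, &obits, pbits));
}
#endif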
2974
2975/*
2976 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
2977 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
2978 * For promotion to occur, two conditions must be met: (1) the 4KB page
2979 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
2980 * mappings must have identical characteristics.
2981 *
2982 * Managed (PG_MANAGED) mappings within the kernel address space are not
2983 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
2984 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
2985 * pmap.
2986 */
2987static void
2988pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2989{
2990	pd_entry_t newpde;
2991	pmap_t allpmaps_entry;
2992	pt_entry_t *firstpte, oldpte, pa, *pte;
2993	vm_offset_t oldpteva;
2994	vm_page_t mpte;
2995
2996	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2997
2998	/*
2999	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3000	 * either invalid, unused, or does not map the first 4KB physical page
3001	 * within a 2- or 4MB page.
3002	 */
3003	firstpte = vtopte(trunc_4mpage(va));
3004setpde:
3005	newpde = *firstpte;
3006	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3007		pmap_pde_p_failures++;
3008		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3009		    " in pmap %p", va, pmap);
3010		return;
3011	}
3012	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3013		pmap_pde_p_failures++;
3014		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3015		    " in pmap %p", va, pmap);
3016		return;
3017	}
3018	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3019		/*
3020		 * When PG_M is already clear, PG_RW can be cleared without
3021		 * a TLB invalidation.
3022		 */
3023		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3024		    ~PG_RW))
3025			goto setpde;
3026		newpde &= ~PG_RW;
3027	}
3028
3029	/*
3030	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3031	 * PTE maps an unexpected 4KB physical page or does not have identical
3032	 * characteristics to the first PTE.
3033	 */
3034	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3035	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3036setpte:
3037		oldpte = *pte;
3038		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3039			pmap_pde_p_failures++;
3040			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3041			    " in pmap %p", va, pmap);
3042			return;
3043		}
3044		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3045			/*
3046			 * When PG_M is already clear, PG_RW can be cleared
3047			 * without a TLB invalidation.
3048			 */
3049			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3050			    oldpte & ~PG_RW))
3051				goto setpte;
3052			oldpte &= ~PG_RW;
3053			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3054			    (va & ~PDRMASK);
3055			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3056			    " in pmap %p", oldpteva, pmap);
3057		}
3058		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3059			pmap_pde_p_failures++;
3060			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3061			    " in pmap %p", va, pmap);
3062			return;
3063		}
3064		pa -= PAGE_SIZE;
3065	}
3066
3067	/*
3068	 * Save the page table page in its current state until the PDE
3069	 * mapping the superpage is demoted by pmap_demote_pde() or
3070	 * destroyed by pmap_remove_pde().
3071	 */
3072	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3073	KASSERT(mpte >= vm_page_array &&
3074	    mpte < &vm_page_array[vm_page_array_size],
3075	    ("pmap_promote_pde: page table page is out of range"));
3076	KASSERT(mpte->pindex == va >> PDRSHIFT,
3077	    ("pmap_promote_pde: page table page's pindex is wrong"));
3078	pmap_insert_pt_page(pmap, mpte);
3079
3080	/*
3081	 * Promote the pv entries.
3082	 */
3083	if ((newpde & PG_MANAGED) != 0)
3084		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3085
3086	/*
3087	 * Propagate the PAT index to its proper position.
3088	 */
3089	if ((newpde & PG_PTE_PAT) != 0)
3090		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3091
3092	/*
3093	 * Map the superpage.
3094	 */
3095	if (pmap == kernel_pmap) {
3096		mtx_lock_spin(&allpmaps_lock);
3097		LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) {
3098			pde = pmap_pde(allpmaps_entry, va);
3099			pde_store(pde, PG_PS | newpde);
3100		}
3101		mtx_unlock_spin(&allpmaps_lock);
3102	} else
3103		pde_store(pde, PG_PS | newpde);
3104
3105	pmap_pde_promotions++;
3106	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3107	    " in pmap %p", va, pmap);
3108}
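/*
 * A minimal sketch (not compiled with the kernel) of the two promotion
 * conditions that pmap_promote_pde() above checks: the 4KB mappings must
 * cover aligned, physically contiguous memory and must agree in their
 * attribute bits.  It assumes the non-PAE value NPTEPG == 1024 and uses
 * simplified masks; the "EX_"/"ex_" names are invented for the example.
 */
#if 0
#include <stdint.h>

#define EX_NPTEPG	1024u
#define EX_PAGE_SIZE	4096u
#define EX_FRAME	0xfffff000u	/* physical frame bits */
#define EX_ATTRS	0x00000fffu	/* everything else */

static int
ex_promotable(const uint32_t pte[EX_NPTEPG])
{
	uint32_t attrs, expect_pa;
	unsigned int i;

	expect_pa = pte[0] & EX_FRAME;
	attrs = pte[0] & EX_ATTRS;
	if ((expect_pa & (EX_NPTEPG * EX_PAGE_SIZE - 1)) != 0)
		return (0);		/* not superpage aligned */
	for (i = 0; i < EX_NPTEPG; i++) {
		if ((pte[i] & EX_FRAME) != expect_pa + i * EX_PAGE_SIZE)
			return (0);	/* not physically contiguous */
		if ((pte[i] & EX_ATTRS) != attrs)
			return (0);	/* attributes differ */
	}
	return (1);
}
#endif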
3109
3110/*
3111 *	Insert the given physical page (p) at
3112 *	the specified virtual address (v) in the
3113 *	target physical map with the protection requested.
3114 *
3115 *	If specified, the page will be wired down, meaning
3116 *	that the related pte cannot be reclaimed.
3117 *
3118 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3119 *	or lose information.  That is, this routine must actually
3120 *	insert this page into the given map NOW.
3121 */
3122void
3123pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3124    vm_prot_t prot, boolean_t wired)
3125{
3126	vm_paddr_t pa;
3127	pd_entry_t *pde;
3128	pt_entry_t *pte;
3129	vm_paddr_t opa;
3130	pt_entry_t origpte, newpte;
3131	vm_page_t mpte, om;
3132	boolean_t invlva;
3133
3134	va = trunc_page(va);
3135	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3136	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3137	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va));
3138
3139	mpte = NULL;
3140
3141	vm_page_lock_queues();
3142	PMAP_LOCK(pmap);
3143	sched_pin();
3144
3145	/*
3146	 * In the case that a page table page is not
3147	 * resident, we are creating it here.
3148	 */
3149	if (va < VM_MAXUSER_ADDRESS) {
3150		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3151	}
3152
3153	pde = pmap_pde(pmap, va);
3154	if ((*pde & PG_PS) != 0)
3155		panic("pmap_enter: attempted pmap_enter on 4MB page");
3156	pte = pmap_pte_quick(pmap, va);
3157
3158	/*
3159	 * Page directory entry is not valid; we need a new PT page.
3160	 */
3161	if (pte == NULL) {
3162		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3163			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3164	}
3165
3166	pa = VM_PAGE_TO_PHYS(m);
3167	om = NULL;
3168	origpte = *pte;
3169	opa = origpte & PG_FRAME;
3170
3171	/*
3172	 * Mapping has not changed, must be protection or wiring change.
3173	 */
3174	if (origpte && (opa == pa)) {
3175		/*
3176		 * Wiring change, just update stats. We don't worry about
3177		 * wiring PT pages as they remain resident as long as there
3178		 * are valid mappings in them. Hence, if a user page is wired,
3179		 * the PT page will be also.
3180		 */
3181		if (wired && ((origpte & PG_W) == 0))
3182			pmap->pm_stats.wired_count++;
3183		else if (!wired && (origpte & PG_W))
3184			pmap->pm_stats.wired_count--;
3185
3186		/*
3187		 * Remove extra pte reference
3188		 */
3189		if (mpte)
3190			mpte->wire_count--;
3191
3192		/*
3193		 * We might be turning off write access to the page,
3194		 * so we go ahead and sense modify status.
3195		 */
3196		if (origpte & PG_MANAGED) {
3197			om = m;
3198			pa |= PG_MANAGED;
3199		}
3200		goto validate;
3201	}
3202	/*
3203	 * Mapping has changed, invalidate old range and fall through to
3204	 * handle validating new mapping.
3205	 */
3206	if (opa) {
3207		if (origpte & PG_W)
3208			pmap->pm_stats.wired_count--;
3209		if (origpte & PG_MANAGED) {
3210			om = PHYS_TO_VM_PAGE(opa);
3211			pmap_remove_entry(pmap, om, va);
3212		}
3213		if (mpte != NULL) {
3214			mpte->wire_count--;
3215			KASSERT(mpte->wire_count > 0,
3216			    ("pmap_enter: missing reference to page table page,"
3217			     " va: 0x%x", va));
3218		}
3219	} else
3220		pmap->pm_stats.resident_count++;
3221
3222	/*
3223	 * Enter on the PV list if part of our managed memory.
3224	 */
3225	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3226		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3227		    ("pmap_enter: managed mapping within the clean submap"));
3228		pmap_insert_entry(pmap, va, m);
3229		pa |= PG_MANAGED;
3230	}
3231
3232	/*
3233	 * Increment counters
3234	 */
3235	if (wired)
3236		pmap->pm_stats.wired_count++;
3237
3238validate:
3239	/*
3240	 * Now validate mapping with desired protection/wiring.
3241	 */
3242	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3243	if ((prot & VM_PROT_WRITE) != 0) {
3244		newpte |= PG_RW;
3245		vm_page_flag_set(m, PG_WRITEABLE);
3246	}
3247#ifdef PAE
3248	if ((prot & VM_PROT_EXECUTE) == 0)
3249		newpte |= pg_nx;
3250#endif
3251	if (wired)
3252		newpte |= PG_W;
3253	if (va < VM_MAXUSER_ADDRESS)
3254		newpte |= PG_U;
3255	if (pmap == kernel_pmap)
3256		newpte |= pgeflag;
3257
3258	/*
3259	 * if the mapping or permission bits are different, we need
3260	 * to update the pte.
3261	 */
3262	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3263		newpte |= PG_A;
3264		if ((access & VM_PROT_WRITE) != 0)
3265			newpte |= PG_M;
3266		if (origpte & PG_V) {
3267			invlva = FALSE;
3268			origpte = pte_load_store(pte, newpte);
3269			if (origpte & PG_A) {
3270				if (origpte & PG_MANAGED)
3271					vm_page_flag_set(om, PG_REFERENCED);
3272				if (opa != VM_PAGE_TO_PHYS(m))
3273					invlva = TRUE;
3274#ifdef PAE
3275				if ((origpte & PG_NX) == 0 &&
3276				    (newpte & PG_NX) != 0)
3277					invlva = TRUE;
3278#endif
3279			}
3280			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3281				if ((origpte & PG_MANAGED) != 0)
3282					vm_page_dirty(om);
3283				if ((prot & VM_PROT_WRITE) == 0)
3284					invlva = TRUE;
3285			}
3286			if (invlva)
3287				pmap_invalidate_page(pmap, va);
3288		} else
3289			pte_store(pte, newpte);
3290	}
3291
3292	/*
3293	 * If both the page table page and the reservation are fully
3294	 * populated, then attempt promotion.
3295	 */
3296	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3297	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3298		pmap_promote_pde(pmap, pde, va);
3299
3300	sched_unpin();
3301	vm_page_unlock_queues();
3302	PMAP_UNLOCK(pmap);
3303}
3304
3305/*
3306 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3307 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3308 * blocking, (2) a mapping already exists at the specified virtual address, or
3309 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3310 */
3311static boolean_t
3312pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3313{
3314	pd_entry_t *pde, newpde;
3315
3316	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3317	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3318	pde = pmap_pde(pmap, va);
3319	if (*pde != 0) {
3320		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3321		    " in pmap %p", va, pmap);
3322		return (FALSE);
3323	}
3324	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3325	    PG_PS | PG_V;
3326	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3327		newpde |= PG_MANAGED;
3328
3329		/*
3330		 * Abort this mapping if its PV entry could not be created.
3331		 */
3332		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3333			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3334			    " in pmap %p", va, pmap);
3335			return (FALSE);
3336		}
3337	}
3338#ifdef PAE
3339	if ((prot & VM_PROT_EXECUTE) == 0)
3340		newpde |= pg_nx;
3341#endif
3342	if (va < VM_MAXUSER_ADDRESS)
3343		newpde |= PG_U;
3344
3345	/*
3346	 * Increment counters.
3347	 */
3348	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3349
3350	/*
3351	 * Map the superpage.
3352	 */
3353	pde_store(pde, newpde);
3354
3355	pmap_pde_mappings++;
3356	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3357	    " in pmap %p", va, pmap);
3358	return (TRUE);
3359}
3360
3361/*
3362 * Maps a sequence of resident pages belonging to the same object.
3363 * The sequence begins with the given page m_start.  This page is
3364 * mapped at the given virtual address start.  Each subsequent page is
3365 * mapped at a virtual address that is offset from start by the same
3366 * amount as the page is offset from m_start within the object.  The
3367 * last page in the sequence is the page with the largest offset from
3368 * m_start that can be mapped at a virtual address less than the given
3369 * virtual address end.  Not every virtual page between start and end
3370 * is mapped; only those for which a resident page exists with the
3371 * corresponding offset from m_start are mapped.
3372 */
3373void
3374pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3375    vm_page_t m_start, vm_prot_t prot)
3376{
3377	vm_offset_t va;
3378	vm_page_t m, mpte;
3379	vm_pindex_t diff, psize;
3380
3381	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3382	psize = atop(end - start);
3383	mpte = NULL;
3384	m = m_start;
3385	PMAP_LOCK(pmap);
3386	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3387		va = start + ptoa(diff);
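		/*
		 * Use a 2/4MB mapping only when the virtual and physical
		 * addresses are both superpage aligned, a whole superpage
		 * fits below "end", superpages are enabled, and the
		 * reservation is fully populated; otherwise fall back to a
		 * 4KB mapping.
		 */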
3388		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3389		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3390		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3391		    pmap_enter_pde(pmap, va, m, prot))
3392			m = &m[NBPDR / PAGE_SIZE - 1];
3393		else
3394			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3395			    mpte);
3396		m = TAILQ_NEXT(m, listq);
3397	}
3398 	PMAP_UNLOCK(pmap);
3399}
3400
3401/*
3402 * This code makes some *MAJOR* assumptions:
3403 * 1. The current pmap and the given pmap exist.
3404 * 2. Not wired.
3405 * 3. Read access.
3406 * 4. No page table pages.
3407 * but it is *MUCH* faster than pmap_enter...
3408 */
3409
3410void
3411pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3412{
3413
3414	PMAP_LOCK(pmap);
3415	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3416	PMAP_UNLOCK(pmap);
3417}
3418
3419static vm_page_t
3420pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3421    vm_prot_t prot, vm_page_t mpte)
3422{
3423	pt_entry_t *pte;
3424	vm_paddr_t pa;
3425	vm_page_t free;
3426
3427	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3428	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3429	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3430	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3431	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3432
3433	/*
3434	 * In the case that a page table page is not
3435	 * resident, we are creating it here.
3436	 */
3437	if (va < VM_MAXUSER_ADDRESS) {
3438		unsigned ptepindex;
3439		pd_entry_t ptepa;
3440
3441		/*
3442		 * Calculate pagetable page index
3443		 */
3444		ptepindex = va >> PDRSHIFT;
3445		if (mpte && (mpte->pindex == ptepindex)) {
3446			mpte->wire_count++;
3447		} else {
3448			/*
3449			 * Get the page directory entry
3450			 */
3451			ptepa = pmap->pm_pdir[ptepindex];
3452
3453			/*
3454			 * If the page table page is mapped, we just increment
3455			 * the hold count, and activate it.
3456			 */
3457			if (ptepa) {
3458				if (ptepa & PG_PS)
3459					return (NULL);
3460				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3461				mpte->wire_count++;
3462			} else {
3463				mpte = _pmap_allocpte(pmap, ptepindex,
3464				    M_NOWAIT);
3465				if (mpte == NULL)
3466					return (mpte);
3467			}
3468		}
3469	} else {
3470		mpte = NULL;
3471	}
3472
3473	/*
3474	 * This call to vtopte makes the assumption that we are
3475	 * entering the page into the current pmap.  In order to support
3476	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3477	 * But that isn't as quick as vtopte.
3478	 */
3479	pte = vtopte(va);
3480	if (*pte) {
3481		if (mpte != NULL) {
3482			mpte->wire_count--;
3483			mpte = NULL;
3484		}
3485		return (mpte);
3486	}
3487
3488	/*
3489	 * Enter on the PV list if part of our managed memory.
3490	 */
3491	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3492	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3493		if (mpte != NULL) {
3494			free = NULL;
3495			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3496				pmap_invalidate_page(pmap, va);
3497				pmap_free_zero_pages(free);
3498			}
3499
3500			mpte = NULL;
3501		}
3502		return (mpte);
3503	}
3504
3505	/*
3506	 * Increment counters
3507	 */
3508	pmap->pm_stats.resident_count++;
3509
3510	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3511#ifdef PAE
3512	if ((prot & VM_PROT_EXECUTE) == 0)
3513		pa |= pg_nx;
3514#endif
3515
3516	/*
3517	 * Now validate mapping with RO protection
3518	 */
3519	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3520		pte_store(pte, pa | PG_V | PG_U);
3521	else
3522		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3523	return (mpte);
3524}
3525
3526/*
3527 * Make a temporary mapping for a physical address.  This is only intended
3528 * to be used for panic dumps.
3529 */
3530void *
3531pmap_kenter_temporary(vm_paddr_t pa, int i)
3532{
3533	vm_offset_t va;
3534
3535	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3536	pmap_kenter(va, pa);
3537	invlpg(va);
3538	return ((void *)crashdumpmap);
3539}
3540
3541/*
3542 * This code maps large physical mmap regions into the
3543 * processor address space.  Note that some shortcuts
3544 * are taken, but the code works.
3545 */
3546void
3547pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3548    vm_pindex_t pindex, vm_size_t size)
3549{
3550	pd_entry_t *pde;
3551	vm_paddr_t pa, ptepa;
3552	vm_page_t p;
3553	int pat_mode;
3554
3555	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3556	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3557	    ("pmap_object_init_pt: non-device object"));
3558	if (pseflag &&
3559	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3560		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3561			return;
3562		p = vm_page_lookup(object, pindex);
3563		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3564		    ("pmap_object_init_pt: invalid page %p", p));
3565		pat_mode = p->md.pat_mode;
3566
3567		/*
3568		 * Abort the mapping if the first page is not physically
3569		 * aligned to a 2/4MB page boundary.
3570		 */
3571		ptepa = VM_PAGE_TO_PHYS(p);
3572		if (ptepa & (NBPDR - 1))
3573			return;
3574
3575		/*
3576		 * Skip the first page.  Abort the mapping if the rest of
3577		 * the pages are not physically contiguous or have differing
3578		 * memory attributes.
3579		 */
3580		p = TAILQ_NEXT(p, listq);
3581		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3582		    pa += PAGE_SIZE) {
3583			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3584			    ("pmap_object_init_pt: invalid page %p", p));
3585			if (pa != VM_PAGE_TO_PHYS(p) ||
3586			    pat_mode != p->md.pat_mode)
3587				return;
3588			p = TAILQ_NEXT(p, listq);
3589		}
3590
3591		/*
3592		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3593		 * "size" is a multiple of 2/4M, adding the PAT setting to
3594		 * "pa" will not affect the termination of this loop.
3595		 */
3596		PMAP_LOCK(pmap);
3597		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3598		    size; pa += NBPDR) {
3599			pde = pmap_pde(pmap, addr);
3600			if (*pde == 0) {
3601				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3602				    PG_U | PG_RW | PG_V);
3603				pmap->pm_stats.resident_count += NBPDR /
3604				    PAGE_SIZE;
3605				pmap_pde_mappings++;
3606			}
3607			/* Else continue on if the PDE is already valid. */
3608			addr += NBPDR;
3609		}
3610		PMAP_UNLOCK(pmap);
3611	}
3612}
3613
3614/*
3615 *	Routine:	pmap_change_wiring
3616 *	Function:	Change the wiring attribute for a map/virtual-address
3617 *			pair.
3618 *	In/out conditions:
3619 *			The mapping must already exist in the pmap.
3620 */
3621void
3622pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3623{
3624	pd_entry_t *pde;
3625	pt_entry_t *pte;
3626	boolean_t are_queues_locked;
3627
3628	are_queues_locked = FALSE;
3629retry:
3630	PMAP_LOCK(pmap);
3631	pde = pmap_pde(pmap, va);
3632	if ((*pde & PG_PS) != 0) {
3633		if (!wired != ((*pde & PG_W) == 0)) {
3634			if (!are_queues_locked) {
3635				are_queues_locked = TRUE;
3636				if (!mtx_trylock(&vm_page_queue_mtx)) {
3637					PMAP_UNLOCK(pmap);
3638					vm_page_lock_queues();
3639					goto retry;
3640				}
3641			}
3642			if (!pmap_demote_pde(pmap, pde, va))
3643				panic("pmap_change_wiring: demotion failed");
3644		} else
3645			goto out;
3646	}
3647	pte = pmap_pte(pmap, va);
3648
3649	if (wired && !pmap_pte_w(pte))
3650		pmap->pm_stats.wired_count++;
3651	else if (!wired && pmap_pte_w(pte))
3652		pmap->pm_stats.wired_count--;
3653
3654	/*
3655	 * Wiring is not a hardware characteristic so there is no need to
3656	 * invalidate TLB.
3657	 */
3658	pmap_pte_set_w(pte, wired);
3659	pmap_pte_release(pte);
3660out:
3661	if (are_queues_locked)
3662		vm_page_unlock_queues();
3663	PMAP_UNLOCK(pmap);
3664}
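/*
 * A minimal sketch (not compiled with the kernel) of the drop-and-retry
 * pattern used in pmap_change_wiring() above when the page queues lock is
 * needed while the pmap lock is already held: try the second lock without
 * blocking, and on failure release the first lock, block on the second
 * one, and reacquire the first.  Recast with pthread mutexes; the "ex_"
 * names are invented for the example.
 */
#if 0
#include <pthread.h>

/*
 * "outer" must, by convention, be acquired before "inner".  This helper
 * acquires both when the caller naturally starts with "inner".
 */
static void
ex_lock_both(pthread_mutex_t *outer, pthread_mutex_t *inner)
{

	pthread_mutex_lock(inner);
	if (pthread_mutex_trylock(outer) == 0)
		return;			/* got both without blocking */
	/* Cannot block on "outer" while holding "inner": back off. */
	pthread_mutex_unlock(inner);
	pthread_mutex_lock(outer);
	pthread_mutex_lock(inner);
}
#endif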
3665
3666
3667
3668/*
3669 *	Copy the range specified by src_addr/len
3670 *	from the source map to the range dst_addr/len
3671 *	in the destination map.
3672 *
3673 *	This routine is only advisory and need not do anything.
3674 */
3675
3676void
3677pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3678    vm_offset_t src_addr)
3679{
3680	vm_page_t   free;
3681	vm_offset_t addr;
3682	vm_offset_t end_addr = src_addr + len;
3683	vm_offset_t pdnxt;
3684
3685	if (dst_addr != src_addr)
3686		return;
3687
3688	if (!pmap_is_current(src_pmap))
3689		return;
3690
3691	vm_page_lock_queues();
3692	if (dst_pmap < src_pmap) {
3693		PMAP_LOCK(dst_pmap);
3694		PMAP_LOCK(src_pmap);
3695	} else {
3696		PMAP_LOCK(src_pmap);
3697		PMAP_LOCK(dst_pmap);
3698	}
3699	sched_pin();
3700	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3701		pt_entry_t *src_pte, *dst_pte;
3702		vm_page_t dstmpte, srcmpte;
3703		pd_entry_t srcptepaddr;
3704		unsigned ptepindex;
3705
3706		KASSERT(addr < UPT_MIN_ADDRESS,
3707		    ("pmap_copy: invalid to pmap_copy page tables"));
3708
3709		pdnxt = (addr + NBPDR) & ~PDRMASK;
3710		if (pdnxt < addr)
3711			pdnxt = end_addr;
3712		ptepindex = addr >> PDRSHIFT;
3713
3714		srcptepaddr = src_pmap->pm_pdir[ptepindex];
3715		if (srcptepaddr == 0)
3716			continue;
3717
3718		if (srcptepaddr & PG_PS) {
3719			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3720			    ((srcptepaddr & PG_MANAGED) == 0 ||
3721			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3722			    PG_PS_FRAME))) {
3723				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3724				    ~PG_W;
3725				dst_pmap->pm_stats.resident_count +=
3726				    NBPDR / PAGE_SIZE;
3727			}
3728			continue;
3729		}
3730
3731		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3732		KASSERT(srcmpte->wire_count > 0,
3733		    ("pmap_copy: source page table page is unused"));
3734
3735		if (pdnxt > end_addr)
3736			pdnxt = end_addr;
3737
3738		src_pte = vtopte(addr);
3739		while (addr < pdnxt) {
3740			pt_entry_t ptetemp;
3741			ptetemp = *src_pte;
3742			/*
3743			 * We only copy mappings of managed pages.
3744			 */
3745			if ((ptetemp & PG_MANAGED) != 0) {
3746				dstmpte = pmap_allocpte(dst_pmap, addr,
3747				    M_NOWAIT);
3748				if (dstmpte == NULL)
3749					goto out;
3750				dst_pte = pmap_pte_quick(dst_pmap, addr);
3751				if (*dst_pte == 0 &&
3752				    pmap_try_insert_pv_entry(dst_pmap, addr,
3753				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3754					/*
3755					 * Clear the wired, modified, and
3756					 * accessed (referenced) bits
3757					 * during the copy.
3758					 */
3759					*dst_pte = ptetemp & ~(PG_W | PG_M |
3760					    PG_A);
3761					dst_pmap->pm_stats.resident_count++;
3762	 			} else {
3763					free = NULL;
3764					if (pmap_unwire_pte_hold(dst_pmap,
3765					    dstmpte, &free)) {
3766						pmap_invalidate_page(dst_pmap,
3767						    addr);
3768						pmap_free_zero_pages(free);
3769					}
3770					goto out;
3771				}
3772				if (dstmpte->wire_count >= srcmpte->wire_count)
3773					break;
3774			}
3775			addr += PAGE_SIZE;
3776			src_pte++;
3777		}
3778	}
3779out:
3780	sched_unpin();
3781	vm_page_unlock_queues();
3782	PMAP_UNLOCK(src_pmap);
3783	PMAP_UNLOCK(dst_pmap);
3784}
3785
3786static __inline void
3787pagezero(void *page)
3788{
3789#if defined(I686_CPU)
3790	if (cpu_class == CPUCLASS_686) {
3791#if defined(CPU_ENABLE_SSE)
3792		if (cpu_feature & CPUID_SSE2)
3793			sse2_pagezero(page);
3794		else
3795#endif
3796			i686_pagezero(page);
3797	} else
3798#endif
3799		bzero(page, PAGE_SIZE);
3800}
3801
3802/*
3803 *	pmap_zero_page zeros the specified hardware page by mapping
3804 *	the page into KVM and using bzero to clear its contents.
3805 */
3806void
3807pmap_zero_page(vm_page_t m)
3808{
3809	struct sysmaps *sysmaps;
3810
3811	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3812	mtx_lock(&sysmaps->lock);
3813	if (*sysmaps->CMAP2)
3814		panic("pmap_zero_page: CMAP2 busy");
3815	sched_pin();
3816	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3817	    pmap_cache_bits(m->md.pat_mode, 0);
3818	invlcaddr(sysmaps->CADDR2);
3819	pagezero(sysmaps->CADDR2);
3820	*sysmaps->CMAP2 = 0;
3821	sched_unpin();
3822	mtx_unlock(&sysmaps->lock);
3823}
3824
3825/*
3826 *	pmap_zero_page_area zeros the specified hardware page by mapping
3827 *	the page into KVM and using bzero to clear its contents.
3828 *
3829 *	off and size may not cover an area beyond a single hardware page.
3830 */
3831void
3832pmap_zero_page_area(vm_page_t m, int off, int size)
3833{
3834	struct sysmaps *sysmaps;
3835
3836	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3837	mtx_lock(&sysmaps->lock);
3838	if (*sysmaps->CMAP2)
3839		panic("pmap_zero_page_area: CMAP2 busy");
3840	sched_pin();
3841	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3842	    pmap_cache_bits(m->md.pat_mode, 0);
3843	invlcaddr(sysmaps->CADDR2);
3844	if (off == 0 && size == PAGE_SIZE)
3845		pagezero(sysmaps->CADDR2);
3846	else
3847		bzero((char *)sysmaps->CADDR2 + off, size);
3848	*sysmaps->CMAP2 = 0;
3849	sched_unpin();
3850	mtx_unlock(&sysmaps->lock);
3851}
3852
3853/*
3854 *	pmap_zero_page_idle zeros the specified hardware page by mapping
3855 *	the page into KVM and using bzero to clear its contents.  This
3856 *	is intended to be called from the vm_pagezero process only and
3857 *	outside of Giant.
3858 */
3859void
3860pmap_zero_page_idle(vm_page_t m)
3861{
3862
3863	if (*CMAP3)
3864		panic("pmap_zero_page_idle: CMAP3 busy");
3865	sched_pin();
3866	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3867	    pmap_cache_bits(m->md.pat_mode, 0);
3868	invlcaddr(CADDR3);
3869	pagezero(CADDR3);
3870	*CMAP3 = 0;
3871	sched_unpin();
3872}
3873
3874/*
3875 *	pmap_copy_page copies the specified (machine independent)
3876 *	page by mapping the page into virtual memory and using
3877 *	bcopy to copy the page, one machine dependent page at a
3878 *	time.
3879 */
3880void
3881pmap_copy_page(vm_page_t src, vm_page_t dst)
3882{
3883	struct sysmaps *sysmaps;
3884
3885	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3886	mtx_lock(&sysmaps->lock);
3887	if (*sysmaps->CMAP1)
3888		panic("pmap_copy_page: CMAP1 busy");
3889	if (*sysmaps->CMAP2)
3890		panic("pmap_copy_page: CMAP2 busy");
3891	sched_pin();
3892	invlpg((u_int)sysmaps->CADDR1);
3893	invlpg((u_int)sysmaps->CADDR2);
3894	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
3895	    pmap_cache_bits(src->md.pat_mode, 0);
3896	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
3897	    pmap_cache_bits(dst->md.pat_mode, 0);
3898	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
3899	*sysmaps->CMAP1 = 0;
3900	*sysmaps->CMAP2 = 0;
3901	sched_unpin();
3902	mtx_unlock(&sysmaps->lock);
3903}
3904
3905/*
3906 * Returns true if the pmap's pv is one of the first
3907 * 16 pvs linked to from this page.  This count may
3908 * be changed upwards or downwards in the future; it
3909 * is only necessary that true be returned for a small
3910 * subset of pmaps for proper page aging.
3911 */
3912boolean_t
3913pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3914{
3915	struct md_page *pvh;
3916	pv_entry_t pv;
3917	int loops = 0;
3918
3919	if (m->flags & PG_FICTITIOUS)
		return (FALSE);
3921
3922	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3923	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (PV_PMAP(pv) == pmap)
			return (TRUE);
3927		loops++;
3928		if (loops >= 16)
3929			break;
3930	}
3931	if (loops < 16) {
3932		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3933		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3934			if (PV_PMAP(pv) == pmap)
3935				return (TRUE);
3936			loops++;
3937			if (loops >= 16)
3938				break;
3939		}
3940	}
3941	return (FALSE);
3942}
3943
3944/*
3945 *	pmap_page_wired_mappings:
3946 *
3947 *	Return the number of managed mappings to the given physical page
3948 *	that are wired.
3949 */
3950int
3951pmap_page_wired_mappings(vm_page_t m)
3952{
3953	int count;
3954
3955	count = 0;
3956	if ((m->flags & PG_FICTITIOUS) != 0)
3957		return (count);
3958	count = pmap_pvh_wired_mappings(&m->md, count);
3959	return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
3960}
3961
3962/*
3963 *	pmap_pvh_wired_mappings:
3964 *
3965 *	Return the updated number "count" of managed mappings that are wired.
3966 */
3967static int
3968pmap_pvh_wired_mappings(struct md_page *pvh, int count)
3969{
3970	pmap_t pmap;
3971	pt_entry_t *pte;
3972	pv_entry_t pv;
3973
3974	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3975	sched_pin();
3976	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3977		pmap = PV_PMAP(pv);
3978		PMAP_LOCK(pmap);
3979		pte = pmap_pte_quick(pmap, pv->pv_va);
3980		if ((*pte & PG_W) != 0)
3981			count++;
3982		PMAP_UNLOCK(pmap);
3983	}
3984	sched_unpin();
3985	return (count);
3986}
3987
3988/*
3989 * Returns TRUE if the given page is mapped individually or as part of
3990 * a 4mpage.  Otherwise, returns FALSE.
3991 */
3992boolean_t
3993pmap_page_is_mapped(vm_page_t m)
3994{
3995	struct md_page *pvh;
3996
3997	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3998		return (FALSE);
3999	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4000	if (TAILQ_EMPTY(&m->md.pv_list)) {
4001		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4002		return (!TAILQ_EMPTY(&pvh->pv_list));
4003	} else
4004		return (TRUE);
4005}
4006
4007/*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  Also, this code is special cased for the current process
 * only, but can have the more generic (and slightly slower) mode enabled.
 * This is much faster than pmap_remove in the case of running down an
 * entire address space.
4014 */
4015void
4016pmap_remove_pages(pmap_t pmap)
4017{
4018	pt_entry_t *pte, tpte;
4019	vm_page_t free = NULL;
4020	vm_page_t m, mpte, mt;
4021	pv_entry_t pv;
4022	struct md_page *pvh;
4023	struct pv_chunk *pc, *npc;
4024	int field, idx;
4025	int32_t bit;
4026	uint32_t inuse, bitmask;
4027	int allfree;
4028
4029	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
4030		printf("warning: pmap_remove_pages called with non-current pmap\n");
4031		return;
4032	}
4033	vm_page_lock_queues();
4034	PMAP_LOCK(pmap);
4035	sched_pin();
4036	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4037		allfree = 1;
4038		for (field = 0; field < _NPCM; field++) {
4039			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4040			while (inuse != 0) {
4041				bit = bsfl(inuse);
4042				bitmask = 1UL << bit;
4043				idx = field * 32 + bit;
4044				pv = &pc->pc_pventry[idx];
4045				inuse &= ~bitmask;
4046
4047				pte = pmap_pde(pmap, pv->pv_va);
4048				tpte = *pte;
4049				if ((tpte & PG_PS) == 0) {
4050					pte = vtopte(pv->pv_va);
4051					tpte = *pte & ~PG_PTE_PAT;
4052				}
4053
4054				if (tpte == 0) {
4055					printf(
4056					    "TPTE at %p  IS ZERO @ VA %08x\n",
4057					    pte, pv->pv_va);
4058					panic("bad pte");
4059				}
4060
4061/*
4062 * We cannot remove wired pages from a process' mapping at this time
4063 */
4064				if (tpte & PG_W) {
4065					allfree = 0;
4066					continue;
4067				}
4068
4069				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4070				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4071				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4072				    m, (uintmax_t)m->phys_addr,
4073				    (uintmax_t)tpte));
4074
4075				KASSERT(m < &vm_page_array[vm_page_array_size],
4076					("pmap_remove_pages: bad tpte %#jx",
4077					(uintmax_t)tpte));
4078
4079				pte_clear(pte);
4080
4081				/*
4082				 * Update the vm_page_t clean/reference bits.
4083				 */
4084				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4085					if ((tpte & PG_PS) != 0) {
4086						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4087							vm_page_dirty(mt);
4088					} else
4089						vm_page_dirty(m);
4090				}
4091
4092				/* Mark free */
4093				PV_STAT(pv_entry_frees++);
4094				PV_STAT(pv_entry_spare++);
4095				pv_entry_count--;
4096				pc->pc_map[field] |= bitmask;
4097				if ((tpte & PG_PS) != 0) {
4098					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4099					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4100					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4101					if (TAILQ_EMPTY(&pvh->pv_list)) {
4102						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4103							if (TAILQ_EMPTY(&mt->md.pv_list))
4104								vm_page_flag_clear(mt, PG_WRITEABLE);
4105					}
4106					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4107					if (mpte != NULL) {
4108						pmap_remove_pt_page(pmap, mpte);
4109						pmap->pm_stats.resident_count--;
4110						KASSERT(mpte->wire_count == NPTEPG,
4111						    ("pmap_remove_pages: pte page wire count error"));
4112						mpte->wire_count = 0;
4113						pmap_add_delayed_free_list(mpte, &free, FALSE);
4114						atomic_subtract_int(&cnt.v_wire_count, 1);
4115					}
4116				} else {
4117					pmap->pm_stats.resident_count--;
4118					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4119					if (TAILQ_EMPTY(&m->md.pv_list)) {
4120						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4121						if (TAILQ_EMPTY(&pvh->pv_list))
4122							vm_page_flag_clear(m, PG_WRITEABLE);
4123					}
4124					pmap_unuse_pt(pmap, pv->pv_va, &free);
4125				}
4126			}
4127		}
4128		if (allfree) {
4129			PV_STAT(pv_entry_spare -= _NPCPV);
4130			PV_STAT(pc_chunk_count--);
4131			PV_STAT(pc_chunk_frees++);
4132			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4133			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4134			pmap_qremove((vm_offset_t)pc, 1);
4135			vm_page_unwire(m, 0);
4136			vm_page_free(m);
4137			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4138		}
4139	}
4140	sched_unpin();
4141	pmap_invalidate_all(pmap);
4142	vm_page_unlock_queues();
4143	PMAP_UNLOCK(pmap);
4144	pmap_free_zero_pages(free);
4145}
4146
4147/*
4148 *	pmap_is_modified:
4149 *
4150 *	Return whether or not the specified physical page was modified
4151 *	in any physical maps.
4152 */
4153boolean_t
4154pmap_is_modified(vm_page_t m)
4155{
4156
4157	if (m->flags & PG_FICTITIOUS)
4158		return (FALSE);
4159	if (pmap_is_modified_pvh(&m->md))
4160		return (TRUE);
4161	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4162}
4163
4164/*
4165 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both 4KB and 2/4MB page
 * mappings are supported.
4168 */
4169static boolean_t
4170pmap_is_modified_pvh(struct md_page *pvh)
4171{
4172	pv_entry_t pv;
4173	pt_entry_t *pte;
4174	pmap_t pmap;
4175	boolean_t rv;
4176
4177	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4178	rv = FALSE;
4179	sched_pin();
4180	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4181		pmap = PV_PMAP(pv);
4182		PMAP_LOCK(pmap);
4183		pte = pmap_pte_quick(pmap, pv->pv_va);
4184		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4185		PMAP_UNLOCK(pmap);
4186		if (rv)
4187			break;
4188	}
4189	sched_unpin();
4190	return (rv);
4191}
4192
4193/*
4194 *	pmap_is_prefaultable:
4195 *
4196 *	Return whether or not the specified virtual address is elgible
4197 *	for prefault.
4198 */
4199boolean_t
4200pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4201{
4202	pd_entry_t *pde;
4203	pt_entry_t *pte;
4204	boolean_t rv;
4205
4206	rv = FALSE;
4207	PMAP_LOCK(pmap);
4208	pde = pmap_pde(pmap, addr);
4209	if (*pde != 0 && (*pde & PG_PS) == 0) {
4210		pte = vtopte(addr);
4211		rv = *pte == 0;
4212	}
4213	PMAP_UNLOCK(pmap);
4214	return (rv);
4215}
4216
4217/*
4218 * Clear the write and modified bits in each of the given page's mappings.
4219 */
4220void
4221pmap_remove_write(vm_page_t m)
4222{
4223	struct md_page *pvh;
4224	pv_entry_t next_pv, pv;
4225	pmap_t pmap;
4226	pd_entry_t *pde;
4227	pt_entry_t oldpte, *pte;
4228	vm_offset_t va;
4229
4230	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4231	if ((m->flags & PG_FICTITIOUS) != 0 ||
4232	    (m->flags & PG_WRITEABLE) == 0)
4233		return;
4234	sched_pin();
4235	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4236	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4237		va = pv->pv_va;
4238		pmap = PV_PMAP(pv);
4239		PMAP_LOCK(pmap);
4240		pde = pmap_pde(pmap, va);
4241		if ((*pde & PG_RW) != 0)
4242			(void)pmap_demote_pde(pmap, pde, va);
4243		PMAP_UNLOCK(pmap);
4244	}
4245	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4246		pmap = PV_PMAP(pv);
4247		PMAP_LOCK(pmap);
4248		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 4mpage in page %p's pv list", m));
4251		pte = pmap_pte_quick(pmap, pv->pv_va);
4252retry:
4253		oldpte = *pte;
4254		if ((oldpte & PG_RW) != 0) {
4255			/*
4256			 * Regardless of whether a pte is 32 or 64 bits
4257			 * in size, PG_RW and PG_M are among the least
4258			 * significant 32 bits.
4259			 */
4260			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4261			    oldpte & ~(PG_RW | PG_M)))
4262				goto retry;
4263			if ((oldpte & PG_M) != 0)
4264				vm_page_dirty(m);
4265			pmap_invalidate_page(pmap, pv->pv_va);
4266		}
4267		PMAP_UNLOCK(pmap);
4268	}
4269	vm_page_flag_clear(m, PG_WRITEABLE);
4270	sched_unpin();
4271}
4272
4273/*
4274 *	pmap_ts_referenced:
4275 *
4276 *	Return a count of reference bits for a page, clearing those bits.
4277 *	It is not necessary for every reference bit to be cleared, but it
4278 *	is necessary that 0 only be returned when there are truly no
4279 *	reference bits set.
4280 *
4281 *	XXX: The exact number of bits to check and clear is a matter that
4282 *	should be tested and standardized at some point in the future for
4283 *	optimal aging of shared pages.
4284 */
4285int
4286pmap_ts_referenced(vm_page_t m)
4287{
4288	struct md_page *pvh;
4289	pv_entry_t pv, pvf, pvn;
4290	pmap_t pmap;
4291	pd_entry_t oldpde, *pde;
4292	pt_entry_t *pte;
4293	vm_offset_t va;
4294	int rtval = 0;
4295
4296	if (m->flags & PG_FICTITIOUS)
4297		return (rtval);
4298	sched_pin();
4299	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4300	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4301	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4302		va = pv->pv_va;
4303		pmap = PV_PMAP(pv);
4304		PMAP_LOCK(pmap);
4305		pde = pmap_pde(pmap, va);
4306		oldpde = *pde;
4307		if ((oldpde & PG_A) != 0) {
4308			if (pmap_demote_pde(pmap, pde, va)) {
4309				if ((oldpde & PG_W) == 0) {
4310					/*
4311					 * Remove the mapping to a single page
4312					 * so that a subsequent access may
4313					 * repromote.  Since the underlying
4314					 * page table page is fully populated,
4315					 * this removal never frees a page
4316					 * table page.
4317					 */
4318					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4319					    PG_PS_FRAME);
4320					pmap_remove_page(pmap, va, NULL);
4321					rtval++;
4322					if (rtval > 4) {
4323						PMAP_UNLOCK(pmap);
4324						return (rtval);
4325					}
4326				}
4327			}
4328		}
4329		PMAP_UNLOCK(pmap);
4330	}
4331	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4332		pvf = pv;
4333		do {
4334			pvn = TAILQ_NEXT(pv, pv_list);
4335			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4336			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4337			pmap = PV_PMAP(pv);
4338			PMAP_LOCK(pmap);
4339			pde = pmap_pde(pmap, pv->pv_va);
4340			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4341			    " found a 4mpage in page %p's pv list", m));
4342			pte = pmap_pte_quick(pmap, pv->pv_va);
4343			if ((*pte & PG_A) != 0) {
4344				atomic_clear_int((u_int *)pte, PG_A);
4345				pmap_invalidate_page(pmap, pv->pv_va);
4346				rtval++;
4347				if (rtval > 4)
4348					pvn = NULL;
4349			}
4350			PMAP_UNLOCK(pmap);
4351		} while ((pv = pvn) != NULL && pv != pvf);
4352	}
4353	sched_unpin();
4354	return (rtval);
4355}
4356
4357/*
4358 *	Clear the modify bits on the specified physical page.
4359 */
4360void
4361pmap_clear_modify(vm_page_t m)
4362{
4363	struct md_page *pvh;
4364	pv_entry_t next_pv, pv;
4365	pmap_t pmap;
4366	pd_entry_t oldpde, *pde;
4367	pt_entry_t oldpte, *pte;
4368	vm_offset_t va;
4369
4370	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4371	if ((m->flags & PG_FICTITIOUS) != 0)
4372		return;
4373	sched_pin();
4374	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4375	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4376		va = pv->pv_va;
4377		pmap = PV_PMAP(pv);
4378		PMAP_LOCK(pmap);
4379		pde = pmap_pde(pmap, va);
4380		oldpde = *pde;
4381		if ((oldpde & PG_RW) != 0) {
4382			if (pmap_demote_pde(pmap, pde, va)) {
4383				if ((oldpde & PG_W) == 0) {
4384					/*
4385					 * Write protect the mapping to a
4386					 * single page so that a subsequent
4387					 * write access may repromote.
4388					 */
4389					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4390					    PG_PS_FRAME);
4391					pte = pmap_pte_quick(pmap, va);
4392					oldpte = *pte;
4393					if ((oldpte & PG_V) != 0) {
4394						/*
4395						 * Regardless of whether a pte is 32 or 64 bits
4396						 * in size, PG_RW and PG_M are among the least
4397						 * significant 32 bits.
4398						 */
4399						while (!atomic_cmpset_int((u_int *)pte,
4400						    oldpte,
4401						    oldpte & ~(PG_M | PG_RW)))
4402							oldpte = *pte;
4403						vm_page_dirty(m);
4404						pmap_invalidate_page(pmap, va);
4405					}
4406				}
4407			}
4408		}
4409		PMAP_UNLOCK(pmap);
4410	}
4411	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4412		pmap = PV_PMAP(pv);
4413		PMAP_LOCK(pmap);
4414		pde = pmap_pde(pmap, pv->pv_va);
4415		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4416		    " a 4mpage in page %p's pv list", m));
4417		pte = pmap_pte_quick(pmap, pv->pv_va);
4418		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4419			/*
4420			 * Regardless of whether a pte is 32 or 64 bits
4421			 * in size, PG_M is among the least significant
4422			 * 32 bits.
4423			 */
4424			atomic_clear_int((u_int *)pte, PG_M);
4425			pmap_invalidate_page(pmap, pv->pv_va);
4426		}
4427		PMAP_UNLOCK(pmap);
4428	}
4429	sched_unpin();
4430}
4431
4432/*
4433 *	pmap_clear_reference:
4434 *
4435 *	Clear the reference bit on the specified physical page.
4436 */
4437void
4438pmap_clear_reference(vm_page_t m)
4439{
4440	struct md_page *pvh;
4441	pv_entry_t next_pv, pv;
4442	pmap_t pmap;
4443	pd_entry_t oldpde, *pde;
4444	pt_entry_t *pte;
4445	vm_offset_t va;
4446
4447	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4448	if ((m->flags & PG_FICTITIOUS) != 0)
4449		return;
4450	sched_pin();
4451	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4452	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4453		va = pv->pv_va;
4454		pmap = PV_PMAP(pv);
4455		PMAP_LOCK(pmap);
4456		pde = pmap_pde(pmap, va);
4457		oldpde = *pde;
4458		if ((oldpde & PG_A) != 0) {
4459			if (pmap_demote_pde(pmap, pde, va)) {
4460				/*
4461				 * Remove the mapping to a single page so
4462				 * that a subsequent access may repromote.
4463				 * Since the underlying page table page is
4464				 * fully populated, this removal never frees
4465				 * a page table page.
4466				 */
4467				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4468				    PG_PS_FRAME);
4469				pmap_remove_page(pmap, va, NULL);
4470			}
4471		}
4472		PMAP_UNLOCK(pmap);
4473	}
4474	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4475		pmap = PV_PMAP(pv);
4476		PMAP_LOCK(pmap);
4477		pde = pmap_pde(pmap, pv->pv_va);
4478		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4479		    " a 4mpage in page %p's pv list", m));
4480		pte = pmap_pte_quick(pmap, pv->pv_va);
4481		if ((*pte & PG_A) != 0) {
4482			/*
4483			 * Regardless of whether a pte is 32 or 64 bits
4484			 * in size, PG_A is among the least significant
4485			 * 32 bits.
4486			 */
4487			atomic_clear_int((u_int *)pte, PG_A);
4488			pmap_invalidate_page(pmap, pv->pv_va);
4489		}
4490		PMAP_UNLOCK(pmap);
4491	}
4492	sched_unpin();
4493}
4494
4495/*
4496 * Miscellaneous support routines follow
4497 */
4498
4499/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4500static __inline void
4501pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4502{
4503	u_int opte, npte;
4504
4505	/*
4506	 * The cache mode bits are all in the low 32-bits of the
4507	 * PTE, so we can just spin on updating the low 32-bits.
4508	 */
4509	do {
4510		opte = *(u_int *)pte;
4511		npte = opte & ~PG_PTE_CACHE;
4512		npte |= cache_bits;
4513	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4514}
4515
4516/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4517static __inline void
4518pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4519{
4520	u_int opde, npde;
4521
4522	/*
4523	 * The cache mode bits are all in the low 32-bits of the
4524	 * PDE, so we can just spin on updating the low 32-bits.
4525	 */
4526	do {
4527		opde = *(u_int *)pde;
4528		npde = opde & ~PG_PDE_CACHE;
4529		npde |= cache_bits;
4530	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4531}
4532
4533/*
4534 * Map a set of physical memory pages into the kernel virtual
4535 * address space. Return a pointer to where it is mapped. This
4536 * routine is intended to be used for mapping device memory,
4537 * NOT real memory.
4538 */
4539void *
4540pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4541{
4542	vm_offset_t va, offset;
4543	vm_size_t tmpsize;
4544
4545	offset = pa & PAGE_MASK;
4546	size = roundup(offset + size, PAGE_SIZE);
4547	pa = pa & PG_FRAME;
4548
4549	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4550		va = KERNBASE + pa;
4551	else
4552		va = kmem_alloc_nofault(kernel_map, size);
4553	if (!va)
4554		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4555
4556	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4557		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4558	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4559	pmap_invalidate_cache_range(va, va + size);
4560	return ((void *)(va + offset));
4561}
4562
4563void *
4564pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4565{
4566
4567	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4568}
4569
4570void *
4571pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4572{
4573
4574	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4575}
4576
4577void
4578pmap_unmapdev(vm_offset_t va, vm_size_t size)
4579{
4580	vm_offset_t base, offset, tmpva;
4581
4582	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4583		return;
4584	base = trunc_page(va);
4585	offset = va & PAGE_MASK;
4586	size = roundup(offset + size, PAGE_SIZE);
4587	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4588		pmap_kremove(tmpva);
4589	pmap_invalidate_range(kernel_pmap, va, tmpva);
4590	kmem_free(kernel_map, base, size);
4591}
4592
4593/*
4594 * Sets the memory attribute for the specified page.
4595 */
4596void
4597pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4598{
4599	struct sysmaps *sysmaps;
4600	vm_offset_t sva, eva;
4601
4602	m->md.pat_mode = ma;
4603	if ((m->flags & PG_FICTITIOUS) != 0)
4604		return;
4605
4606	/*
4607	 * If "m" is a normal page, flush it from the cache.
4608	 * See pmap_invalidate_cache_range().
4609	 *
	 * First, try to find an existing mapping of the page by an sf
	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
	 * flushes the cache.
4613	 */
4614	if (sf_buf_invalidate_cache(m))
4615		return;
4616
4617	/*
	 * If the page is not mapped by an sf buffer and the CPU does not
	 * support self-snoop, map the page transiently and do the
	 * invalidation.  In the worst case, the whole cache is flushed by
4621	 * pmap_invalidate_cache_range().
4622	 */
4623	if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) {
4624		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4625		mtx_lock(&sysmaps->lock);
4626		if (*sysmaps->CMAP2)
4627			panic("pmap_page_set_memattr: CMAP2 busy");
4628		sched_pin();
4629		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4630		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4631		invlcaddr(sysmaps->CADDR2);
4632		sva = (vm_offset_t)sysmaps->CADDR2;
4633		eva = sva + PAGE_SIZE;
4634	} else
4635		sva = eva = 0; /* gcc */
4636	pmap_invalidate_cache_range(sva, eva);
4637	if (sva != 0) {
4638		*sysmaps->CMAP2 = 0;
4639		sched_unpin();
4640		mtx_unlock(&sysmaps->lock);
4641	}
4642}
4643
4644/*
4645 * Changes the specified virtual address range's memory type to that given by
4646 * the parameter "mode".  The specified virtual address range must be
 * completely contained within the kernel map.
4648 *
4649 * Returns zero if the change completed successfully, and either EINVAL or
4650 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4651 * of the virtual address range was not mapped, and ENOMEM is returned if
4652 * there was insufficient memory available to complete the change.
4653 */
4654int
4655pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4656{
4657	vm_offset_t base, offset, tmpva;
4658	pd_entry_t *pde;
4659	pt_entry_t *pte;
4660	int cache_bits_pte, cache_bits_pde;
4661	boolean_t changed;
4662
4663	base = trunc_page(va);
4664	offset = va & PAGE_MASK;
4665	size = roundup(offset + size, PAGE_SIZE);
4666
4667	/*
4668	 * Only supported on kernel virtual addresses above the recursive map.
4669	 */
4670	if (base < VM_MIN_KERNEL_ADDRESS)
4671		return (EINVAL);
4672
4673	cache_bits_pde = pmap_cache_bits(mode, 1);
4674	cache_bits_pte = pmap_cache_bits(mode, 0);
4675	changed = FALSE;
4676
4677	/*
4678	 * Pages that aren't mapped aren't supported.  Also break down
4679	 * 2/4MB pages into 4KB pages if required.
4680	 */
4681	PMAP_LOCK(kernel_pmap);
4682	for (tmpva = base; tmpva < base + size; ) {
4683		pde = pmap_pde(kernel_pmap, tmpva);
4684		if (*pde == 0) {
4685			PMAP_UNLOCK(kernel_pmap);
4686			return (EINVAL);
4687		}
4688		if (*pde & PG_PS) {
4689			/*
4690			 * If the current 2/4MB page already has
4691			 * the required memory type, then we need not
4692			 * demote this page.  Just increment tmpva to
4693			 * the next 2/4MB page frame.
4694			 */
4695			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4696				tmpva = trunc_4mpage(tmpva) + NBPDR;
4697				continue;
4698			}
4699
4700			/*
4701			 * If the current offset aligns with a 2/4MB
4702			 * page frame and there is at least 2/4MB left
4703			 * within the range, then we need not break
4704			 * down this page into 4KB pages.
4705			 */
4706			if ((tmpva & PDRMASK) == 0 &&
4707			    tmpva + PDRMASK < base + size) {
4708				tmpva += NBPDR;
4709				continue;
4710			}
4711			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
4712				PMAP_UNLOCK(kernel_pmap);
4713				return (ENOMEM);
4714			}
4715		}
4716		pte = vtopte(tmpva);
4717		if (*pte == 0) {
4718			PMAP_UNLOCK(kernel_pmap);
4719			return (EINVAL);
4720		}
4721		tmpva += PAGE_SIZE;
4722	}
4723	PMAP_UNLOCK(kernel_pmap);
4724
4725	/*
4726	 * Ok, all the pages exist, so run through them updating their
4727	 * cache mode if required.
4728	 */
4729	for (tmpva = base; tmpva < base + size; ) {
4730		pde = pmap_pde(kernel_pmap, tmpva);
4731		if (*pde & PG_PS) {
4732			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4733				pmap_pde_attr(pde, cache_bits_pde);
4734				changed = TRUE;
4735			}
4736			tmpva = trunc_4mpage(tmpva) + NBPDR;
4737		} else {
4738			pte = vtopte(tmpva);
4739			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4740				pmap_pte_attr(pte, cache_bits_pte);
4741				changed = TRUE;
4742			}
4743			tmpva += PAGE_SIZE;
4744		}
4745	}
4746
4747	/*
	 * Flush the CPU caches to ensure that no data remains cached
	 * under the old memory type.
4750	 */
4751	if (changed) {
4752		pmap_invalidate_range(kernel_pmap, base, tmpva);
4753		pmap_invalidate_cache_range(base, tmpva);
4754	}
4755	return (0);
4756}
4757
4758/*
4759 * perform the pmap work for mincore
4760 */
4761int
4762pmap_mincore(pmap_t pmap, vm_offset_t addr)
4763{
4764	pd_entry_t *pdep;
4765	pt_entry_t *ptep, pte;
4766	vm_paddr_t pa;
4767	vm_page_t m;
4768	int val = 0;
4769
4770	PMAP_LOCK(pmap);
4771	pdep = pmap_pde(pmap, addr);
4772	if (*pdep != 0) {
4773		if (*pdep & PG_PS) {
4774			pte = *pdep;
4775			val = MINCORE_SUPER;
4776			/* Compute the physical address of the 4KB page. */
4777			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4778			    PG_FRAME;
4779		} else {
4780			ptep = pmap_pte(pmap, addr);
4781			pte = *ptep;
4782			pmap_pte_release(ptep);
4783			pa = pte & PG_FRAME;
4784		}
4785	} else {
4786		pte = 0;
4787		pa = 0;
4788	}
4789	PMAP_UNLOCK(pmap);
4790
4791	if (pte != 0) {
4792		val |= MINCORE_INCORE;
4793		if ((pte & PG_MANAGED) == 0)
			return (val);
4795
4796		m = PHYS_TO_VM_PAGE(pa);
4797
4798		/*
4799		 * Modified by us
4800		 */
4801		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4802			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4803		else {
4804			/*
4805			 * Modified by someone else
4806			 */
4807			vm_page_lock_queues();
4808			if (m->dirty || pmap_is_modified(m))
4809				val |= MINCORE_MODIFIED_OTHER;
4810			vm_page_unlock_queues();
4811		}
4812		/*
4813		 * Referenced by us
4814		 */
4815		if (pte & PG_A)
4816			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4817		else {
4818			/*
4819			 * Referenced by someone else
4820			 */
4821			vm_page_lock_queues();
4822			if ((m->flags & PG_REFERENCED) ||
4823			    pmap_ts_referenced(m)) {
4824				val |= MINCORE_REFERENCED_OTHER;
4825				vm_page_flag_set(m, PG_REFERENCED);
4826			}
4827			vm_page_unlock_queues();
4828		}
4829	}
	return (val);
4831}
4832
4833void
4834pmap_activate(struct thread *td)
4835{
4836	pmap_t	pmap, oldpmap;
4837	u_int32_t  cr3;
4838
4839	critical_enter();
4840	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4841	oldpmap = PCPU_GET(curpmap);
4842#if defined(SMP)
4843	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4844	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4845#else
4846	oldpmap->pm_active &= ~1;
4847	pmap->pm_active |= 1;
4848#endif
4849#ifdef PAE
4850	cr3 = vtophys(pmap->pm_pdpt);
4851#else
4852	cr3 = vtophys(pmap->pm_pdir);
4853#endif
4854	/*
4855	 * pmap_activate is for the current thread on the current cpu
4856	 */
4857	td->td_pcb->pcb_cr3 = cr3;
4858	load_cr3(cr3);
4859	PCPU_SET(curpmap, pmap);
4860	critical_exit();
4861}
4862
4863void
4864pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
4865{
4866}
4867
4868/*
4869 *	Increase the starting virtual address of the given mapping if a
4870 *	different alignment might result in more superpage mappings.
4871 */
4872void
4873pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4874    vm_offset_t *addr, vm_size_t size)
4875{
4876	vm_offset_t superpage_offset;
4877
4878	if (size < NBPDR)
4879		return;
4880	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4881		offset += ptoa(object->pg_color);
4882	superpage_offset = offset & PDRMASK;
4883	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4884	    (*addr & PDRMASK) == superpage_offset)
4885		return;
4886	if ((*addr & PDRMASK) < superpage_offset)
4887		*addr = (*addr & ~PDRMASK) + superpage_offset;
4888	else
4889		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4890}
4891
4892
4893#if defined(PMAP_DEBUG)
int
pmap_pid_dump(int pid)
4895{
4896	pmap_t pmap;
4897	struct proc *p;
4898	int npte = 0;
4899	int index;
4900
4901	sx_slock(&allproc_lock);
4902	FOREACH_PROC_IN_SYSTEM(p) {
4903		if (p->p_pid != pid)
4904			continue;
4905
4906		if (p->p_vmspace) {
4907			int i,j;
4908			index = 0;
4909			pmap = vmspace_pmap(p->p_vmspace);
4910			for (i = 0; i < NPDEPTD; i++) {
4911				pd_entry_t *pde;
4912				pt_entry_t *pte;
4913				vm_offset_t base = i << PDRSHIFT;
4914
4915				pde = &pmap->pm_pdir[i];
4916				if (pde && pmap_pde_v(pde)) {
4917					for (j = 0; j < NPTEPG; j++) {
4918						vm_offset_t va = base + (j << PAGE_SHIFT);
4919						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
4920							if (index) {
4921								index = 0;
4922								printf("\n");
4923							}
4924							sx_sunlock(&allproc_lock);
						return (npte);
4926						}
4927						pte = pmap_pte(pmap, va);
4928						if (pte && pmap_pte_v(pte)) {
4929							pt_entry_t pa;
4930							vm_page_t m;
4931							pa = *pte;
4932							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
4933							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
4934								va, pa, m->hold_count, m->wire_count, m->flags);
4935							npte++;
4936							index++;
4937							if (index >= 2) {
4938								index = 0;
4939								printf("\n");
4940							} else {
4941								printf(" ");
4942							}
4943						}
4944					}
4945				}
4946			}
4947		}
4948	}
4949	sx_sunlock(&allproc_lock);
	return (npte);
4951}
4952#endif
4953
4954#if defined(DEBUG)
4955
4956static void	pads(pmap_t pm);
void		pmap_pvdump(vm_paddr_t pa);
4958
/* print address space of pmap */
4960static void
4961pads(pmap_t pm)
4962{
4963	int i, j;
	vm_offset_t va;
4965	pt_entry_t *ptep;
4966
4967	if (pm == kernel_pmap)
4968		return;
4969	for (i = 0; i < NPDEPTD; i++)
4970		if (pm->pm_pdir[i])
4971			for (j = 0; j < NPTEPG; j++) {
4972				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
4973				if (pm == kernel_pmap && va < KERNBASE)
4974					continue;
4975				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
4976					continue;
4977				ptep = pmap_pte(pm, va);
4978				if (pmap_pte_v(ptep))
4979					printf("%x:%x ", va, *ptep);
4980			};
4981
4982}
4983
4984void
4985pmap_pvdump(vm_paddr_t pa)
4986{
4987	pv_entry_t pv;
4988	pmap_t pmap;
4989	vm_page_t m;
4990
4991	printf("pa %x", pa);
4992	m = PHYS_TO_VM_PAGE(pa);
4993	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4994		pmap = PV_PMAP(pv);
4995		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
4996		pads(pmap);
4997	}
4998	printf(" ");
4999}
5000#endif
5001