pmap.c revision 205778
1207753Smm/*-
2207753Smm * Copyright (c) 1991 Regents of the University of California.
3207753Smm * All rights reserved.
4207753Smm * Copyright (c) 1994 John S. Dyson
5207753Smm * All rights reserved.
6207753Smm * Copyright (c) 1994 David Greenman
7207753Smm * All rights reserved.
8207753Smm * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9207753Smm * All rights reserved.
10207753Smm *
11207753Smm * This code is derived from software contributed to Berkeley by
12207753Smm * the Systems Programming Group of the University of Utah Computer
13207753Smm * Science Department and William Jolitz of UUNET Technologies Inc.
14207753Smm *
15207753Smm * Redistribution and use in source and binary forms, with or without
16207753Smm * modification, are permitted provided that the following conditions
17207753Smm * are met:
18207753Smm * 1. Redistributions of source code must retain the above copyright
19207753Smm *    notice, this list of conditions and the following disclaimer.
20207753Smm * 2. Redistributions in binary form must reproduce the above copyright
21207753Smm *    notice, this list of conditions and the following disclaimer in the
22207753Smm *    documentation and/or other materials provided with the distribution.
23207753Smm * 3. All advertising materials mentioning features or use of this software
24207753Smm *    must display the following acknowledgement:
25207753Smm *	This product includes software developed by the University of
26207753Smm *	California, Berkeley and its contributors.
27207753Smm * 4. Neither the name of the University nor the names of its contributors
28207753Smm *    may be used to endorse or promote products derived from this software
29207753Smm *    without specific prior written permission.
30207753Smm *
31207753Smm * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32207753Smm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33207753Smm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34207753Smm * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35207753Smm * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36207753Smm * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37207753Smm * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38207753Smm * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39207753Smm * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40207753Smm * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41207753Smm * SUCH DAMAGE.
42207753Smm *
43207753Smm *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44207753Smm */
45207753Smm/*-
46207753Smm * Copyright (c) 2003 Networks Associates Technology, Inc.
47207753Smm * All rights reserved.
48207753Smm *
49207753Smm * This software was developed for the FreeBSD Project by Jake Burkholder,
50207753Smm * Safeport Network Services, and Network Associates Laboratories, the
51207753Smm * Security Research Division of Network Associates, Inc. under
52207753Smm * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53207753Smm * CHATS research program.
54207753Smm *
55207753Smm * Redistribution and use in source and binary forms, with or without
56207753Smm * modification, are permitted provided that the following conditions
57207753Smm * are met:
58207753Smm * 1. Redistributions of source code must retain the above copyright
59207753Smm *    notice, this list of conditions and the following disclaimer.
60207753Smm * 2. Redistributions in binary form must reproduce the above copyright
61207753Smm *    notice, this list of conditions and the following disclaimer in the
62207753Smm *    documentation and/or other materials provided with the distribution.
63207753Smm *
64207753Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65207753Smm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66207753Smm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67207753Smm * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 205778 2010-03-27 23:53:47Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidate or reduced protection
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_cpu.h"
107#include "opt_pmap.h"
108#include "opt_msgbuf.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/sf_buf.h>
123#include <sys/sx.h>
124#include <sys/vmmeter.h>
125#include <sys/sched.h>
126#include <sys/sysctl.h>
127#ifdef SMP
128#include <sys/smp.h>
129#endif
130
131#include <vm/vm.h>
132#include <vm/vm_param.h>
133#include <vm/vm_kern.h>
134#include <vm/vm_page.h>
135#include <vm/vm_map.h>
136#include <vm/vm_object.h>
137#include <vm/vm_extern.h>
138#include <vm/vm_pageout.h>
139#include <vm/vm_pager.h>
140#include <vm/vm_reserv.h>
141#include <vm/uma.h>
142
143#include <machine/cpu.h>
144#include <machine/cputypes.h>
145#include <machine/md_var.h>
146#include <machine/pcb.h>
147#include <machine/specialreg.h>
148#ifdef SMP
149#include <machine/smp.h>
150#endif
151
152#ifdef XBOX
153#include <machine/xbox.h>
154#endif
155
156#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
157#define CPU_ENABLE_SSE
158#endif
159
160#ifndef PMAP_SHPGPERPROC
161#define PMAP_SHPGPERPROC 200
162#endif
163
164#if !defined(DIAGNOSTIC)
165#ifdef __GNUC_GNU_INLINE__
166#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
167#else
168#define PMAP_INLINE	extern inline
169#endif
170#else
171#define PMAP_INLINE
172#endif
173
174#define PV_STATS
175#ifdef PV_STATS
176#define PV_STAT(x)	do { x ; } while (0)
177#else
178#define PV_STAT(x)	do { } while (0)
179#endif
180
181#define	pa_index(pa)	((pa) >> PDRSHIFT)
182#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
183
184/*
185 * Get PDEs and PTEs for user/kernel address space
186 */
187#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
188#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
189
190#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
191#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
192#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
193#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
194#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
195
196#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
197    atomic_clear_int((u_int *)(pte), PG_W))
198#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
199
200struct pmap kernel_pmap_store;
201LIST_HEAD(pmaplist, pmap);
202static struct pmaplist allpmaps;
203static struct mtx allpmaps_lock;
204
205vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
206vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
207int pgeflag = 0;		/* PG_G or-in */
208int pseflag = 0;		/* PG_PS or-in */
209
210static int nkpt = NKPT;
211vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
212extern u_int32_t KERNend;
213extern u_int32_t KPTphys;
214
215#ifdef PAE
216pt_entry_t pg_nx;
217static uma_zone_t pdptzone;
218#endif
219
220static int pat_works = 0;		/* Is page attribute table sane? */
221
222SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
223
224static int pg_ps_enabled = 1;
225SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
226    "Are large page mappings enabled?");
227
228/*
229 * Data for the pv entry allocation mechanism
230 */
231static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
232static struct md_page *pv_table;
233static int shpgperproc = PMAP_SHPGPERPROC;
234
235struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
236int pv_maxchunks;			/* How many chunks we have KVA for */
237vm_offset_t pv_vafree;			/* freelist stored in the PTE */
238
239/*
240 * All those kernel PT submaps that BSD is so fond of
241 */
242struct sysmaps {
243	struct	mtx lock;
244	pt_entry_t *CMAP1;
245	pt_entry_t *CMAP2;
246	caddr_t	CADDR1;
247	caddr_t	CADDR2;
248};
249static struct sysmaps sysmaps_pcpu[MAXCPU];
250pt_entry_t *CMAP1 = 0, *KPTmap;
251static pt_entry_t *CMAP3;
252static pd_entry_t *KPTD;
253caddr_t CADDR1 = 0, ptvmmap = 0;
254static caddr_t CADDR3;
255struct msgbuf *msgbufp = 0;
256
257/*
258 * Crashdump maps.
259 */
260static caddr_t crashdumpmap;
261
262static pt_entry_t *PMAP1 = 0, *PMAP2;
263static pt_entry_t *PADDR1 = 0, *PADDR2;
264#ifdef SMP
265static int PMAP1cpu;
266static int PMAP1changedcpu;
267SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
268	   &PMAP1changedcpu, 0,
269	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
270#endif
271static int PMAP1changed;
272SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
273	   &PMAP1changed, 0,
274	   "Number of times pmap_pte_quick changed PMAP1");
275static int PMAP1unchanged;
276SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
277	   &PMAP1unchanged, 0,
278	   "Number of times pmap_pte_quick didn't change PMAP1");
279static struct mtx PMAP2mutex;
280
281static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
282static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
283static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
284static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
285static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
286static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
287static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
288		    vm_offset_t va);
289static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
290
291static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
292static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
293    vm_prot_t prot);
294static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
295    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
296static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
297static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
298static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
299static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
300static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
301static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
302static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
303static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
304static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
305    vm_prot_t prot);
306static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
307static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
308    vm_page_t *free);
309static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
310    vm_page_t *free);
311static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
312static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
313    vm_page_t *free);
314static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
315					vm_offset_t va);
316static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
317static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
318    vm_page_t m);
319static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
320    pd_entry_t newpde);
321static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
322
323static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
324
325static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
326static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
327static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
328static void pmap_pte_release(pt_entry_t *pte);
329static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
330#ifdef PAE
331static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
332#endif
333static void pmap_set_pg(void);
334
335CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
336CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
337
338/*
339 * If you get an error here, then you set KVA_PAGES wrong! See the
340 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
341 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
342 */
343CTASSERT(KERNBASE % (1 << 24) == 0);
344
345/*
346 *	Bootstrap the system enough to run with virtual memory.
347 *
348 *	On the i386 this is called after mapping has already been enabled
349 *	and just syncs the pmap module with what has already been done.
350 *	[We can't call it easily with mapping off since the kernel is not
351 *	mapped with PA == VA, hence we would have to relocate every address
352 *	from the linked base (virtual) address "KERNBASE" to the actual
353 *	(physical) address starting relative to 0]
354 */
355void
356pmap_bootstrap(vm_paddr_t firstaddr)
357{
358	vm_offset_t va;
359	pt_entry_t *pte, *unused;
360	struct sysmaps *sysmaps;
361	int i;
362
363	/*
364	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
365	 * large. It should instead be correctly calculated in locore.s and
366	 * not based on 'first' (which is a physical address, not a virtual
367	 * address, for the start of unused physical memory). The kernel
368	 * page tables are NOT double mapped and thus should not be included
369	 * in this calculation.
370	 */
371	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
372
373	virtual_end = VM_MAX_KERNEL_ADDRESS;
374
375	/*
376	 * Initialize the kernel pmap (which is statically allocated).
377	 */
378	PMAP_LOCK_INIT(kernel_pmap);
379	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
380#ifdef PAE
381	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
382#endif
383	kernel_pmap->pm_root = NULL;
384	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
385	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
386	LIST_INIT(&allpmaps);
387
388	/*
389	 * Request a spin mutex so that changes to allpmaps cannot be
390	 * preempted by smp_rendezvous_cpus().  Otherwise,
391	 * pmap_update_pde_kernel() could access allpmaps while it is
392	 * being changed.
393	 */
394	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
395	mtx_lock_spin(&allpmaps_lock);
396	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
397	mtx_unlock_spin(&allpmaps_lock);
398
399	/*
400	 * Reserve some special page table entries/VA space for temporary
401	 * mapping of pages.
402	 */
403#define	SYSMAP(c, p, v, n)	\
404	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
405
406	va = virtual_avail;
407	pte = vtopte(va);
408
409	/*
410	 * CMAP1/CMAP2 are used for zeroing and copying pages.
411	 * CMAP3 is used for the idle process page zeroing.
412	 */
413	for (i = 0; i < MAXCPU; i++) {
414		sysmaps = &sysmaps_pcpu[i];
415		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
416		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
417		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
418	}
419	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
420	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
421
422	/*
423	 * Crashdump maps.
424	 */
425	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
426
427	/*
428	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
429	 */
430	SYSMAP(caddr_t, unused, ptvmmap, 1)
431
432	/*
433	 * msgbufp is used to map the system message buffer.
434	 */
435	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
436
437	/*
438	 * KPTmap is used by pmap_kextract().
439	 */
440	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
441
442	for (i = 0; i < NKPT; i++)
443		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
444
445	/*
446	 * Adjust the start of the KPTD and KPTmap so that the implementation
447	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
448	 */
449	KPTD -= KPTDI;
450	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
451
452	/*
453	 * ptemap is used for pmap_pte_quick
454	 */
455	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
456	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
457
458	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
459
460	virtual_avail = va;
461
462	/*
463	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
464	 * physical memory region that is used by the ACPI wakeup code.  This
465	 * mapping must not have PG_G set.
466	 */
467#ifdef XBOX
468	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
469	 * an early stadium, we cannot yet neatly map video memory ... :-(
470	 * Better fixes are very welcome! */
471	if (!arch_i386_is_xbox)
472#endif
473	for (i = 1; i < NKPT; i++)
474		PTD[i] = 0;
475
476	/* Initialize the PAT MSR if present. */
477	pmap_init_pat();
478
479	/* Turn on PG_G on kernel page(s) */
480	pmap_set_pg();
481}
482
483/*
484 * Setup the PAT MSR.
485 */
486void
487pmap_init_pat(void)
488{
489	uint64_t pat_msr;
490	char *sysenv;
491	static int pat_tested = 0;
492
493	/* Bail if this CPU doesn't implement PAT. */
494	if (!(cpu_feature & CPUID_PAT))
495		return;
496
497	/*
498	 * Due to some Intel errata, we can only safely use the lower 4
499	 * PAT entries.
500	 *
501	 *   Intel Pentium III Processor Specification Update
502	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
503	 * or Mode C Paging)
504	 *
505	 *   Intel Pentium IV  Processor Specification Update
506	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
507	 *
508	 * Some Apple Macs based on nVidia chipsets cannot enter ACPI mode
509	 * via SMI# when we use upper 4 PAT entries for unknown reason.
510	 */
511	if (!pat_tested) {
512		if (cpu_vendor_id != CPU_VENDOR_INTEL ||
513		    (CPUID_TO_FAMILY(cpu_id) == 6 &&
514		    CPUID_TO_MODEL(cpu_id) >= 0xe)) {
515			pat_works = 1;
516			sysenv = getenv("smbios.system.product");
517			if (sysenv != NULL) {
518				if (strncmp(sysenv, "MacBook5,1", 10) == 0 ||
519				    strncmp(sysenv, "MacBookPro5,5", 13) == 0 ||
520				    strncmp(sysenv, "Macmini3,1", 10) == 0)
521					pat_works = 0;
522				freeenv(sysenv);
523			}
524		}
525		pat_tested = 1;
526	}
527
528	/* Initialize default PAT entries. */
529	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
530	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
531	    PAT_VALUE(2, PAT_UNCACHED) |
532	    PAT_VALUE(3, PAT_UNCACHEABLE) |
533	    PAT_VALUE(4, PAT_WRITE_BACK) |
534	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
535	    PAT_VALUE(6, PAT_UNCACHED) |
536	    PAT_VALUE(7, PAT_UNCACHEABLE);
537
538	if (pat_works) {
539		/*
540		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
541		 * Program 4 and 5 as WP and WC.
542		 * Leave 6 and 7 as UC- and UC.
543		 */
544		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
545		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
546		    PAT_VALUE(5, PAT_WRITE_COMBINING);
547	} else {
548		/*
549		 * Just replace PAT Index 2 with WC instead of UC-.
550		 */
551		pat_msr &= ~PAT_MASK(2);
552		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
553	}
554	wrmsr(MSR_PAT, pat_msr);
555}
556
557/*
558 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
559 */
560static void
561pmap_set_pg(void)
562{
563	pt_entry_t *pte;
564	vm_offset_t va, endva;
565
566	if (pgeflag == 0)
567		return;
568
569	endva = KERNBASE + KERNend;
570
571	if (pseflag) {
572		va = KERNBASE + KERNLOAD;
573		while (va  < endva) {
574			pdir_pde(PTD, va) |= pgeflag;
575			invltlb();	/* Play it safe, invltlb() every time */
576			va += NBPDR;
577		}
578	} else {
579		va = (vm_offset_t)btext;
580		while (va < endva) {
581			pte = vtopte(va);
582			if (*pte)
583				*pte |= pgeflag;
584			invltlb();	/* Play it safe, invltlb() every time */
585			va += PAGE_SIZE;
586		}
587	}
588}
589
590/*
591 * Initialize a vm_page's machine-dependent fields.
592 */
593void
594pmap_page_init(vm_page_t m)
595{
596
597	TAILQ_INIT(&m->md.pv_list);
598	m->md.pat_mode = PAT_WRITE_BACK;
599}
600
601#ifdef PAE
602static void *
603pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
604{
605
606	/* Inform UMA that this allocator uses kernel_map/object. */
607	*flags = UMA_SLAB_KERNEL;
608	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
609	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
610}
611#endif
612
613/*
614 * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
615 * Requirements:
616 *  - Must deal with pages in order to ensure that none of the PG_* bits
617 *    are ever set, PG_V in particular.
618 *  - Assumes we can write to ptes without pte_store() atomic ops, even
619 *    on PAE systems.  This should be ok.
620 *  - Assumes nothing will ever test these addresses for 0 to indicate
621 *    no mapping instead of correctly checking PG_V.
622 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
623 * Because PG_V is never set, there can be no mappings to invalidate.
624 */
625static vm_offset_t
626pmap_ptelist_alloc(vm_offset_t *head)
627{
628	pt_entry_t *pte;
629	vm_offset_t va;
630
631	va = *head;
632	if (va == 0)
633		return (va);	/* Out of memory */
634	pte = vtopte(va);
635	*head = *pte;
636	if (*head & PG_V)
637		panic("pmap_ptelist_alloc: va with PG_V set!");
638	*pte = 0;
639	return (va);
640}
641
642static void
643pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
644{
645	pt_entry_t *pte;
646
647	if (va & PG_V)
648		panic("pmap_ptelist_free: freeing va with PG_V set!");
649	pte = vtopte(va);
650	*pte = *head;		/* virtual! PG_V is 0 though */
651	*head = va;
652}
653
654static void
655pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
656{
657	int i;
658	vm_offset_t va;
659
660	*head = 0;
661	for (i = npages - 1; i >= 0; i--) {
662		va = (vm_offset_t)base + i * PAGE_SIZE;
663		pmap_ptelist_free(head, va);
664	}
665}
666
667
668/*
669 *	Initialize the pmap module.
670 *	Called by vm_init, to initialize any structures that the pmap
671 *	system needs to map virtual memory.
672 */
673void
674pmap_init(void)
675{
676	vm_page_t mpte;
677	vm_size_t s;
678	int i, pv_npg;
679
680	/*
681	 * Initialize the vm page array entries for the kernel pmap's
682	 * page table pages.
683	 */
684	for (i = 0; i < NKPT; i++) {
685		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
686		KASSERT(mpte >= vm_page_array &&
687		    mpte < &vm_page_array[vm_page_array_size],
688		    ("pmap_init: page table page is out of range"));
689		mpte->pindex = i + KPTDI;
690		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
691	}
692
693	/*
694	 * Initialize the address space (zone) for the pv entries.  Set a
695	 * high water mark so that the system can recover from excessive
696	 * numbers of pv entries.
697	 */
698	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
699	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
700	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
701	pv_entry_max = roundup(pv_entry_max, _NPCPV);
702	pv_entry_high_water = 9 * (pv_entry_max / 10);
703
704	/*
705	 * If the kernel is running in a virtual machine on an AMD Family 10h
706	 * processor, then it must assume that MCA is enabled by the virtual
707	 * machine monitor.
708	 */
709	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
710	    CPUID_TO_FAMILY(cpu_id) == 0x10)
711		workaround_erratum383 = 1;
712
713	/*
714	 * Are large page mappings supported and enabled?
715	 */
716	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
717	if (pseflag == 0)
718		pg_ps_enabled = 0;
719	else if (pg_ps_enabled) {
720		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
721		    ("pmap_init: can't assign to pagesizes[1]"));
722		pagesizes[1] = NBPDR;
723	}
724
725	/*
726	 * Calculate the size of the pv head table for superpages.
727	 */
728	for (i = 0; phys_avail[i + 1]; i += 2);
729	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
730
731	/*
732	 * Allocate memory for the pv head table for superpages.
733	 */
734	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
735	s = round_page(s);
736	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
737	for (i = 0; i < pv_npg; i++)
738		TAILQ_INIT(&pv_table[i].pv_list);
739
740	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
741	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
742	    PAGE_SIZE * pv_maxchunks);
743	if (pv_chunkbase == NULL)
744		panic("pmap_init: not enough kvm for pv chunks");
745	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
746#ifdef PAE
747	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
748	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
749	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
750	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
751#endif
752}
753
754
755SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
756	"Max number of PV entries");
757SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
758	"Page share factor per proc");
759
760SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
761    "2/4MB page mapping counters");
762
763static u_long pmap_pde_demotions;
764SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
765    &pmap_pde_demotions, 0, "2/4MB page demotions");
766
767static u_long pmap_pde_mappings;
768SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
769    &pmap_pde_mappings, 0, "2/4MB page mappings");
770
771static u_long pmap_pde_p_failures;
772SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
773    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
774
775static u_long pmap_pde_promotions;
776SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
777    &pmap_pde_promotions, 0, "2/4MB page promotions");
778
779/***************************************************
780 * Low level helper routines.....
781 ***************************************************/
782
783/*
784 * Determine the appropriate bits to set in a PTE or PDE for a specified
785 * caching mode.
786 */
787int
788pmap_cache_bits(int mode, boolean_t is_pde)
789{
790	int pat_flag, pat_index, cache_bits;
791
792	/* The PAT bit is different for PTE's and PDE's. */
793	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
794
795	/* If we don't support PAT, map extended modes to older ones. */
796	if (!(cpu_feature & CPUID_PAT)) {
797		switch (mode) {
798		case PAT_UNCACHEABLE:
799		case PAT_WRITE_THROUGH:
800		case PAT_WRITE_BACK:
801			break;
802		case PAT_UNCACHED:
803		case PAT_WRITE_COMBINING:
804		case PAT_WRITE_PROTECTED:
805			mode = PAT_UNCACHEABLE;
806			break;
807		}
808	}
809
810	/* Map the caching mode to a PAT index. */
811	if (pat_works) {
812		switch (mode) {
813		case PAT_UNCACHEABLE:
814			pat_index = 3;
815			break;
816		case PAT_WRITE_THROUGH:
817			pat_index = 1;
818			break;
819		case PAT_WRITE_BACK:
820			pat_index = 0;
821			break;
822		case PAT_UNCACHED:
823			pat_index = 2;
824			break;
825		case PAT_WRITE_COMBINING:
826			pat_index = 5;
827			break;
828		case PAT_WRITE_PROTECTED:
829			pat_index = 4;
830			break;
831		default:
832			panic("Unknown caching mode %d\n", mode);
833		}
834	} else {
835		switch (mode) {
836		case PAT_UNCACHED:
837		case PAT_UNCACHEABLE:
838		case PAT_WRITE_PROTECTED:
839			pat_index = 3;
840			break;
841		case PAT_WRITE_THROUGH:
842			pat_index = 1;
843			break;
844		case PAT_WRITE_BACK:
845			pat_index = 0;
846			break;
847		case PAT_WRITE_COMBINING:
848			pat_index = 2;
849			break;
850		default:
851			panic("Unknown caching mode %d\n", mode);
852		}
853	}
854
855	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
856	cache_bits = 0;
857	if (pat_index & 0x4)
858		cache_bits |= pat_flag;
859	if (pat_index & 0x2)
860		cache_bits |= PG_NC_PCD;
861	if (pat_index & 0x1)
862		cache_bits |= PG_NC_PWT;
863	return (cache_bits);
864}
865
866/*
867 * The caller is responsible for maintaining TLB consistency.
868 */
869static void
870pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
871{
872	pd_entry_t *pde;
873	pmap_t pmap;
874	boolean_t PTD_updated;
875
876	PTD_updated = FALSE;
877	mtx_lock_spin(&allpmaps_lock);
878	LIST_FOREACH(pmap, &allpmaps, pm_list) {
879		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
880		    PG_FRAME))
881			PTD_updated = TRUE;
882		pde = pmap_pde(pmap, va);
883		pde_store(pde, newpde);
884	}
885	mtx_unlock_spin(&allpmaps_lock);
886	KASSERT(PTD_updated,
887	    ("pmap_kenter_pde: current page table is not in allpmaps"));
888}
889
890/*
891 * After changing the page size for the specified virtual address in the page
892 * table, flush the corresponding entries from the processor's TLB.  Only the
893 * calling processor's TLB is affected.
894 *
895 * The calling thread must be pinned to a processor.
896 */
897static void
898pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
899{
900	u_long cr4;
901
902	if ((newpde & PG_PS) == 0)
903		/* Demotion: flush a specific 2MB page mapping. */
904		invlpg(va);
905	else if ((newpde & PG_G) == 0)
906		/*
907		 * Promotion: flush every 4KB page mapping from the TLB
908		 * because there are too many to flush individually.
909		 */
910		invltlb();
911	else {
912		/*
913		 * Promotion: flush every 4KB page mapping from the TLB,
914		 * including any global (PG_G) mappings.
915		 */
916		cr4 = rcr4();
917		load_cr4(cr4 & ~CR4_PGE);
918		/*
919		 * Although preemption at this point could be detrimental to
920		 * performance, it would not lead to an error.  PG_G is simply
921		 * ignored if CR4.PGE is clear.  Moreover, in case this block
922		 * is re-entered, the load_cr4() either above or below will
923		 * modify CR4.PGE flushing the TLB.
924		 */
925		load_cr4(cr4 | CR4_PGE);
926	}
927}
928#ifdef SMP
929/*
930 * For SMP, these functions have to use the IPI mechanism for coherence.
931 *
932 * N.B.: Before calling any of the following TLB invalidation functions,
933 * the calling processor must ensure that all stores updating a non-
934 * kernel page table are globally performed.  Otherwise, another
935 * processor could cache an old, pre-update entry without being
936 * invalidated.  This can happen one of two ways: (1) The pmap becomes
937 * active on another processor after its pm_active field is checked by
938 * one of the following functions but before a store updating the page
939 * table is globally performed. (2) The pmap becomes active on another
940 * processor before its pm_active field is checked but due to
941 * speculative loads one of the following functions stills reads the
942 * pmap as inactive on the other processor.
943 *
944 * The kernel page table is exempt because its pm_active field is
945 * immutable.  The kernel page table is always active on every
946 * processor.
947 */
948void
949pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
950{
951	u_int cpumask;
952	u_int other_cpus;
953
954	sched_pin();
955	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
956		invlpg(va);
957		smp_invlpg(va);
958	} else {
959		cpumask = PCPU_GET(cpumask);
960		other_cpus = PCPU_GET(other_cpus);
961		if (pmap->pm_active & cpumask)
962			invlpg(va);
963		if (pmap->pm_active & other_cpus)
964			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
965	}
966	sched_unpin();
967}
968
969void
970pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
971{
972	u_int cpumask;
973	u_int other_cpus;
974	vm_offset_t addr;
975
976	sched_pin();
977	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
978		for (addr = sva; addr < eva; addr += PAGE_SIZE)
979			invlpg(addr);
980		smp_invlpg_range(sva, eva);
981	} else {
982		cpumask = PCPU_GET(cpumask);
983		other_cpus = PCPU_GET(other_cpus);
984		if (pmap->pm_active & cpumask)
985			for (addr = sva; addr < eva; addr += PAGE_SIZE)
986				invlpg(addr);
987		if (pmap->pm_active & other_cpus)
988			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
989			    sva, eva);
990	}
991	sched_unpin();
992}
993
994void
995pmap_invalidate_all(pmap_t pmap)
996{
997	u_int cpumask;
998	u_int other_cpus;
999
1000	sched_pin();
1001	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
1002		invltlb();
1003		smp_invltlb();
1004	} else {
1005		cpumask = PCPU_GET(cpumask);
1006		other_cpus = PCPU_GET(other_cpus);
1007		if (pmap->pm_active & cpumask)
1008			invltlb();
1009		if (pmap->pm_active & other_cpus)
1010			smp_masked_invltlb(pmap->pm_active & other_cpus);
1011	}
1012	sched_unpin();
1013}
1014
1015void
1016pmap_invalidate_cache(void)
1017{
1018
1019	sched_pin();
1020	wbinvd();
1021	smp_cache_flush();
1022	sched_unpin();
1023}
1024
1025struct pde_action {
1026	cpumask_t store;	/* processor that updates the PDE */
1027	cpumask_t invalidate;	/* processors that invalidate their TLB */
1028	vm_offset_t va;
1029	pd_entry_t *pde;
1030	pd_entry_t newpde;
1031};
1032
1033static void
1034pmap_update_pde_kernel(void *arg)
1035{
1036	struct pde_action *act = arg;
1037	pd_entry_t *pde;
1038	pmap_t pmap;
1039
1040	if (act->store == PCPU_GET(cpumask))
1041		/*
1042		 * Elsewhere, this operation requires allpmaps_lock for
1043		 * synchronization.  Here, it does not because it is being
1044		 * performed in the context of an all_cpus rendezvous.
1045		 */
1046		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1047			pde = pmap_pde(pmap, act->va);
1048			pde_store(pde, act->newpde);
1049		}
1050}
1051
1052static void
1053pmap_update_pde_user(void *arg)
1054{
1055	struct pde_action *act = arg;
1056
1057	if (act->store == PCPU_GET(cpumask))
1058		pde_store(act->pde, act->newpde);
1059}
1060
1061static void
1062pmap_update_pde_teardown(void *arg)
1063{
1064	struct pde_action *act = arg;
1065
1066	if ((act->invalidate & PCPU_GET(cpumask)) != 0)
1067		pmap_update_pde_invalidate(act->va, act->newpde);
1068}
1069
1070/*
1071 * Change the page size for the specified virtual address in a way that
1072 * prevents any possibility of the TLB ever having two entries that map the
1073 * same virtual address using different page sizes.  This is the recommended
1074 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1075 * machine check exception for a TLB state that is improperly diagnosed as a
1076 * hardware error.
1077 */
1078static void
1079pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1080{
1081	struct pde_action act;
1082	cpumask_t active, cpumask;
1083
1084	sched_pin();
1085	cpumask = PCPU_GET(cpumask);
1086	if (pmap == kernel_pmap)
1087		active = all_cpus;
1088	else
1089		active = pmap->pm_active;
1090	if ((active & PCPU_GET(other_cpus)) != 0) {
1091		act.store = cpumask;
1092		act.invalidate = active;
1093		act.va = va;
1094		act.pde = pde;
1095		act.newpde = newpde;
1096		smp_rendezvous_cpus(cpumask | active,
1097		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1098		    pmap_update_pde_kernel : pmap_update_pde_user,
1099		    pmap_update_pde_teardown, &act);
1100	} else {
1101		if (pmap == kernel_pmap)
1102			pmap_kenter_pde(va, newpde);
1103		else
1104			pde_store(pde, newpde);
1105		if ((active & cpumask) != 0)
1106			pmap_update_pde_invalidate(va, newpde);
1107	}
1108	sched_unpin();
1109}
1110#else /* !SMP */
1111/*
1112 * Normal, non-SMP, 486+ invalidation functions.
1113 * We inline these within pmap.c for speed.
1114 */
1115PMAP_INLINE void
1116pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1117{
1118
1119	if (pmap == kernel_pmap || pmap->pm_active)
1120		invlpg(va);
1121}
1122
1123PMAP_INLINE void
1124pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1125{
1126	vm_offset_t addr;
1127
1128	if (pmap == kernel_pmap || pmap->pm_active)
1129		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1130			invlpg(addr);
1131}
1132
1133PMAP_INLINE void
1134pmap_invalidate_all(pmap_t pmap)
1135{
1136
1137	if (pmap == kernel_pmap || pmap->pm_active)
1138		invltlb();
1139}
1140
1141PMAP_INLINE void
1142pmap_invalidate_cache(void)
1143{
1144
1145	wbinvd();
1146}
1147
1148static void
1149pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1150{
1151
1152	if (pmap == kernel_pmap)
1153		pmap_kenter_pde(va, newpde);
1154	else
1155		pde_store(pde, newpde);
1156	if (pmap == kernel_pmap || pmap->pm_active)
1157		pmap_update_pde_invalidate(va, newpde);
1158}
1159#endif /* !SMP */
1160
1161void
1162pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1163{
1164
1165	KASSERT((sva & PAGE_MASK) == 0,
1166	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1167	KASSERT((eva & PAGE_MASK) == 0,
1168	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1169
1170	if (cpu_feature & CPUID_SS)
1171		; /* If "Self Snoop" is supported, do nothing. */
1172	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1173		 eva - sva < 2 * 1024 * 1024) {
1174
1175		/*
1176		 * Otherwise, do per-cache line flush.  Use the mfence
1177		 * instruction to insure that previous stores are
1178		 * included in the write-back.  The processor
1179		 * propagates flush to other processors in the cache
1180		 * coherence domain.
1181		 */
1182		mfence();
1183		for (; sva < eva; sva += cpu_clflush_line_size)
1184			clflush(sva);
1185		mfence();
1186	} else {
1187
1188		/*
1189		 * No targeted cache flush methods are supported by CPU,
1190		 * or the supplied range is bigger than 2MB.
1191		 * Globally invalidate cache.
1192		 */
1193		pmap_invalidate_cache();
1194	}
1195}
1196
1197/*
1198 * Are we current address space or kernel?  N.B. We return FALSE when
1199 * a pmap's page table is in use because a kernel thread is borrowing
1200 * it.  The borrowed page table can change spontaneously, making any
1201 * dependence on its continued use subject to a race condition.
1202 */
1203static __inline int
1204pmap_is_current(pmap_t pmap)
1205{
1206
1207	return (pmap == kernel_pmap ||
1208		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1209	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1210}
1211
1212/*
1213 * If the given pmap is not the current or kernel pmap, the returned pte must
1214 * be released by passing it to pmap_pte_release().
1215 */
1216pt_entry_t *
1217pmap_pte(pmap_t pmap, vm_offset_t va)
1218{
1219	pd_entry_t newpf;
1220	pd_entry_t *pde;
1221
1222	pde = pmap_pde(pmap, va);
1223	if (*pde & PG_PS)
1224		return (pde);
1225	if (*pde != 0) {
1226		/* are we current address space or kernel? */
1227		if (pmap_is_current(pmap))
1228			return (vtopte(va));
1229		mtx_lock(&PMAP2mutex);
1230		newpf = *pde & PG_FRAME;
1231		if ((*PMAP2 & PG_FRAME) != newpf) {
1232			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1233			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1234		}
1235		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1236	}
1237	return (0);
1238}
1239
1240/*
1241 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1242 * being NULL.
1243 */
1244static __inline void
1245pmap_pte_release(pt_entry_t *pte)
1246{
1247
1248	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1249		mtx_unlock(&PMAP2mutex);
1250}
1251
1252static __inline void
1253invlcaddr(void *caddr)
1254{
1255
1256	invlpg((u_int)caddr);
1257}
1258
1259/*
1260 * Super fast pmap_pte routine best used when scanning
1261 * the pv lists.  This eliminates many coarse-grained
1262 * invltlb calls.  Note that many of the pv list
1263 * scans are across different pmaps.  It is very wasteful
1264 * to do an entire invltlb for checking a single mapping.
1265 *
1266 * If the given pmap is not the current pmap, vm_page_queue_mtx
1267 * must be held and curthread pinned to a CPU.
1268 */
1269static pt_entry_t *
1270pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1271{
1272	pd_entry_t newpf;
1273	pd_entry_t *pde;
1274
1275	pde = pmap_pde(pmap, va);
1276	if (*pde & PG_PS)
1277		return (pde);
1278	if (*pde != 0) {
1279		/* are we current address space or kernel? */
1280		if (pmap_is_current(pmap))
1281			return (vtopte(va));
1282		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1283		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1284		newpf = *pde & PG_FRAME;
1285		if ((*PMAP1 & PG_FRAME) != newpf) {
1286			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1287#ifdef SMP
1288			PMAP1cpu = PCPU_GET(cpuid);
1289#endif
1290			invlcaddr(PADDR1);
1291			PMAP1changed++;
1292		} else
1293#ifdef SMP
1294		if (PMAP1cpu != PCPU_GET(cpuid)) {
1295			PMAP1cpu = PCPU_GET(cpuid);
1296			invlcaddr(PADDR1);
1297			PMAP1changedcpu++;
1298		} else
1299#endif
1300			PMAP1unchanged++;
1301		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1302	}
1303	return (0);
1304}
1305
1306/*
1307 *	Routine:	pmap_extract
1308 *	Function:
1309 *		Extract the physical page address associated
1310 *		with the given map/virtual_address pair.
1311 */
1312vm_paddr_t
1313pmap_extract(pmap_t pmap, vm_offset_t va)
1314{
1315	vm_paddr_t rtval;
1316	pt_entry_t *pte;
1317	pd_entry_t pde;
1318
1319	rtval = 0;
1320	PMAP_LOCK(pmap);
1321	pde = pmap->pm_pdir[va >> PDRSHIFT];
1322	if (pde != 0) {
1323		if ((pde & PG_PS) != 0)
1324			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1325		else {
1326			pte = pmap_pte(pmap, va);
1327			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1328			pmap_pte_release(pte);
1329		}
1330	}
1331	PMAP_UNLOCK(pmap);
1332	return (rtval);
1333}
1334
1335/*
1336 *	Routine:	pmap_extract_and_hold
1337 *	Function:
1338 *		Atomically extract and hold the physical page
1339 *		with the given pmap and virtual address pair
1340 *		if that mapping permits the given protection.
1341 */
1342vm_page_t
1343pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1344{
1345	pd_entry_t pde;
1346	pt_entry_t pte;
1347	vm_page_t m;
1348
1349	m = NULL;
1350	vm_page_lock_queues();
1351	PMAP_LOCK(pmap);
1352	pde = *pmap_pde(pmap, va);
1353	if (pde != 0) {
1354		if (pde & PG_PS) {
1355			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1356				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1357				    (va & PDRMASK));
1358				vm_page_hold(m);
1359			}
1360		} else {
1361			sched_pin();
1362			pte = *pmap_pte_quick(pmap, va);
1363			if (pte != 0 &&
1364			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1365				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1366				vm_page_hold(m);
1367			}
1368			sched_unpin();
1369		}
1370	}
1371	vm_page_unlock_queues();
1372	PMAP_UNLOCK(pmap);
1373	return (m);
1374}
1375
1376/***************************************************
1377 * Low level mapping routines.....
1378 ***************************************************/
1379
1380/*
1381 * Add a wired page to the kva.
1382 * Note: not SMP coherent.
1383 */
1384PMAP_INLINE void
1385pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1386{
1387	pt_entry_t *pte;
1388
1389	pte = vtopte(va);
1390	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1391}
1392
1393static __inline void
1394pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1395{
1396	pt_entry_t *pte;
1397
1398	pte = vtopte(va);
1399	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1400}
1401
1402/*
1403 * Remove a page from the kernel pagetables.
1404 * Note: not SMP coherent.
1405 */
1406PMAP_INLINE void
1407pmap_kremove(vm_offset_t va)
1408{
1409	pt_entry_t *pte;
1410
1411	pte = vtopte(va);
1412	pte_clear(pte);
1413}
1414
1415/*
1416 *	Used to map a range of physical addresses into kernel
1417 *	virtual address space.
1418 *
1419 *	The value passed in '*virt' is a suggested virtual address for
1420 *	the mapping. Architectures which can support a direct-mapped
1421 *	physical to virtual region can return the appropriate address
1422 *	within that region, leaving '*virt' unchanged. Other
1423 *	architectures should map the pages starting at '*virt' and
1424 *	update '*virt' with the first usable address after the mapped
1425 *	region.
1426 */
1427vm_offset_t
1428pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1429{
1430	vm_offset_t va, sva;
1431
1432	va = sva = *virt;
1433	while (start < end) {
1434		pmap_kenter(va, start);
1435		va += PAGE_SIZE;
1436		start += PAGE_SIZE;
1437	}
1438	pmap_invalidate_range(kernel_pmap, sva, va);
1439	*virt = va;
1440	return (sva);
1441}
1442
1443
1444/*
1445 * Add a list of wired pages to the kva
1446 * this routine is only used for temporary
1447 * kernel mappings that do not need to have
1448 * page modification or references recorded.
1449 * Note that old mappings are simply written
1450 * over.  The page *must* be wired.
1451 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1452 */
1453void
1454pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1455{
1456	pt_entry_t *endpte, oldpte, *pte;
1457
1458	oldpte = 0;
1459	pte = vtopte(sva);
1460	endpte = pte + count;
1461	while (pte < endpte) {
1462		oldpte |= *pte;
1463		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag |
1464		    pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
1465		pte++;
1466		ma++;
1467	}
1468	if ((oldpte & PG_V) != 0)
1469		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1470		    PAGE_SIZE);
1471}
1472
1473/*
1474 * This routine tears out page mappings from the
1475 * kernel -- it is meant only for temporary mappings.
1476 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1477 */
1478void
1479pmap_qremove(vm_offset_t sva, int count)
1480{
1481	vm_offset_t va;
1482
1483	va = sva;
1484	while (count-- > 0) {
1485		pmap_kremove(va);
1486		va += PAGE_SIZE;
1487	}
1488	pmap_invalidate_range(kernel_pmap, sva, va);
1489}
1490
1491/***************************************************
1492 * Page table page management routines.....
1493 ***************************************************/
1494static __inline void
1495pmap_free_zero_pages(vm_page_t free)
1496{
1497	vm_page_t m;
1498
1499	while (free != NULL) {
1500		m = free;
1501		free = m->right;
1502		/* Preserve the page's PG_ZERO setting. */
1503		vm_page_free_toq(m);
1504	}
1505}
1506
1507/*
1508 * Schedule the specified unused page table page to be freed.  Specifically,
1509 * add the page to the specified list of pages that will be released to the
1510 * physical memory manager after the TLB has been updated.
1511 */
1512static __inline void
1513pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1514{
1515
1516	if (set_PG_ZERO)
1517		m->flags |= PG_ZERO;
1518	else
1519		m->flags &= ~PG_ZERO;
1520	m->right = *free;
1521	*free = m;
1522}
1523
1524/*
1525 * Inserts the specified page table page into the specified pmap's collection
1526 * of idle page table pages.  Each of a pmap's page table pages is responsible
1527 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1528 * ordered by this virtual address range.
1529 */
1530static void
1531pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1532{
1533	vm_page_t root;
1534
1535	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1536	root = pmap->pm_root;
1537	if (root == NULL) {
1538		mpte->left = NULL;
1539		mpte->right = NULL;
1540	} else {
1541		root = vm_page_splay(mpte->pindex, root);
1542		if (mpte->pindex < root->pindex) {
1543			mpte->left = root->left;
1544			mpte->right = root;
1545			root->left = NULL;
1546		} else if (mpte->pindex == root->pindex)
1547			panic("pmap_insert_pt_page: pindex already inserted");
1548		else {
1549			mpte->right = root->right;
1550			mpte->left = root;
1551			root->right = NULL;
1552		}
1553	}
1554	pmap->pm_root = mpte;
1555}
1556
1557/*
1558 * Looks for a page table page mapping the specified virtual address in the
1559 * specified pmap's collection of idle page table pages.  Returns NULL if there
1560 * is no page table page corresponding to the specified virtual address.
1561 */
1562static vm_page_t
1563pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1564{
1565	vm_page_t mpte;
1566	vm_pindex_t pindex = va >> PDRSHIFT;
1567
1568	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1569	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1570		mpte = vm_page_splay(pindex, mpte);
1571		if ((pmap->pm_root = mpte)->pindex != pindex)
1572			mpte = NULL;
1573	}
1574	return (mpte);
1575}
1576
1577/*
1578 * Removes the specified page table page from the specified pmap's collection
1579 * of idle page table pages.  The specified page table page must be a member of
1580 * the pmap's collection.
1581 */
1582static void
1583pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1584{
1585	vm_page_t root;
1586
1587	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1588	if (mpte != pmap->pm_root)
1589		vm_page_splay(mpte->pindex, pmap->pm_root);
1590	if (mpte->left == NULL)
1591		root = mpte->right;
1592	else {
1593		root = vm_page_splay(mpte->pindex, mpte->left);
1594		root->right = mpte->right;
1595	}
1596	pmap->pm_root = root;
1597}
1598
1599/*
1600 * This routine unholds page table pages, and if the hold count
1601 * drops to zero, then it decrements the wire count.
1602 */
1603static __inline int
1604pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1605{
1606
1607	--m->wire_count;
1608	if (m->wire_count == 0)
1609		return _pmap_unwire_pte_hold(pmap, m, free);
1610	else
1611		return 0;
1612}
1613
1614static int
1615_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1616{
1617	vm_offset_t pteva;
1618
1619	/*
1620	 * unmap the page table page
1621	 */
1622	pmap->pm_pdir[m->pindex] = 0;
1623	--pmap->pm_stats.resident_count;
1624
1625	/*
1626	 * This is a release store so that the ordinary store unmapping
1627	 * the page table page is globally performed before TLB shoot-
1628	 * down is begun.
1629	 */
1630	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1631
1632	/*
1633	 * Do an invltlb to make the invalidated mapping
1634	 * take effect immediately.
1635	 */
1636	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1637	pmap_invalidate_page(pmap, pteva);
1638
1639	/*
1640	 * Put page on a list so that it is released after
1641	 * *ALL* TLB shootdown is done
1642	 */
1643	pmap_add_delayed_free_list(m, free, TRUE);
1644
1645	return 1;
1646}
1647
1648/*
1649 * After removing a page table entry, this routine is used to
1650 * conditionally free the page, and manage the hold/wire counts.
1651 */
1652static int
1653pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1654{
1655	pd_entry_t ptepde;
1656	vm_page_t mpte;
1657
1658	if (va >= VM_MAXUSER_ADDRESS)
1659		return 0;
1660	ptepde = *pmap_pde(pmap, va);
1661	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1662	return pmap_unwire_pte_hold(pmap, mpte, free);
1663}
1664
1665void
1666pmap_pinit0(pmap_t pmap)
1667{
1668
1669	PMAP_LOCK_INIT(pmap);
1670	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1671#ifdef PAE
1672	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1673#endif
1674	pmap->pm_root = NULL;
1675	pmap->pm_active = 0;
1676	PCPU_SET(curpmap, pmap);
1677	TAILQ_INIT(&pmap->pm_pvchunk);
1678	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1679	mtx_lock_spin(&allpmaps_lock);
1680	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1681	mtx_unlock_spin(&allpmaps_lock);
1682}
1683
1684/*
1685 * Initialize a preallocated and zeroed pmap structure,
1686 * such as one in a vmspace structure.
1687 */
1688int
1689pmap_pinit(pmap_t pmap)
1690{
1691	vm_page_t m, ptdpg[NPGPTD];
1692	vm_paddr_t pa;
1693	static int color;
1694	int i;
1695
1696	PMAP_LOCK_INIT(pmap);
1697
1698	/*
1699	 * No need to allocate page table space yet but we do need a valid
1700	 * page directory table.
1701	 */
1702	if (pmap->pm_pdir == NULL) {
1703		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1704		    NBPTD);
1705
1706		if (pmap->pm_pdir == NULL) {
1707			PMAP_LOCK_DESTROY(pmap);
1708			return (0);
1709		}
1710#ifdef PAE
1711		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1712		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1713		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1714		    ("pmap_pinit: pdpt misaligned"));
1715		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1716		    ("pmap_pinit: pdpt above 4g"));
1717#endif
1718		pmap->pm_root = NULL;
1719	}
1720	KASSERT(pmap->pm_root == NULL,
1721	    ("pmap_pinit: pmap has reserved page table page(s)"));
1722
1723	/*
1724	 * allocate the page directory page(s)
1725	 */
1726	for (i = 0; i < NPGPTD;) {
1727		m = vm_page_alloc(NULL, color++,
1728		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1729		    VM_ALLOC_ZERO);
1730		if (m == NULL)
1731			VM_WAIT;
1732		else {
1733			ptdpg[i++] = m;
1734		}
1735	}
1736
1737	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1738
1739	for (i = 0; i < NPGPTD; i++) {
1740		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1741			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1742	}
1743
1744	mtx_lock_spin(&allpmaps_lock);
1745	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1746	mtx_unlock_spin(&allpmaps_lock);
1747	/* Wire in kernel global address entries. */
1748	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1749
1750	/* install self-referential address mapping entry(s) */
1751	for (i = 0; i < NPGPTD; i++) {
1752		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1753		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1754#ifdef PAE
1755		pmap->pm_pdpt[i] = pa | PG_V;
1756#endif
1757	}
1758
1759	pmap->pm_active = 0;
1760	TAILQ_INIT(&pmap->pm_pvchunk);
1761	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1762
1763	return (1);
1764}
1765
1766/*
1767 * this routine is called if the page table page is not
1768 * mapped correctly.
1769 */
1770static vm_page_t
1771_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1772{
1773	vm_paddr_t ptepa;
1774	vm_page_t m;
1775
1776	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1777	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1778	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1779
1780	/*
1781	 * Allocate a page table page.
1782	 */
1783	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1784	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1785		if (flags & M_WAITOK) {
1786			PMAP_UNLOCK(pmap);
1787			vm_page_unlock_queues();
1788			VM_WAIT;
1789			vm_page_lock_queues();
1790			PMAP_LOCK(pmap);
1791		}
1792
1793		/*
1794		 * Indicate the need to retry.  While waiting, the page table
1795		 * page may have been allocated.
1796		 */
1797		return (NULL);
1798	}
1799	if ((m->flags & PG_ZERO) == 0)
1800		pmap_zero_page(m);
1801
1802	/*
1803	 * Map the pagetable page into the process address space, if
1804	 * it isn't already there.
1805	 */
1806
1807	pmap->pm_stats.resident_count++;
1808
1809	ptepa = VM_PAGE_TO_PHYS(m);
1810	pmap->pm_pdir[ptepindex] =
1811		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1812
1813	return m;
1814}
1815
1816static vm_page_t
1817pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1818{
1819	unsigned ptepindex;
1820	pd_entry_t ptepa;
1821	vm_page_t m;
1822
1823	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1824	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1825	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1826
1827	/*
1828	 * Calculate pagetable page index
1829	 */
1830	ptepindex = va >> PDRSHIFT;
1831retry:
1832	/*
1833	 * Get the page directory entry
1834	 */
1835	ptepa = pmap->pm_pdir[ptepindex];
1836
1837	/*
1838	 * This supports switching from a 4MB page to a
1839	 * normal 4K page.
1840	 */
1841	if (ptepa & PG_PS) {
1842		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1843		ptepa = pmap->pm_pdir[ptepindex];
1844	}
1845
1846	/*
1847	 * If the page table page is mapped, we just increment the
1848	 * hold count, and activate it.
1849	 */
1850	if (ptepa) {
1851		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1852		m->wire_count++;
1853	} else {
1854		/*
1855		 * Here if the pte page isn't mapped, or if it has
1856		 * been deallocated.
1857		 */
1858		m = _pmap_allocpte(pmap, ptepindex, flags);
1859		if (m == NULL && (flags & M_WAITOK))
1860			goto retry;
1861	}
1862	return (m);
1863}
1864
1865
1866/***************************************************
1867* Pmap allocation/deallocation routines.
1868 ***************************************************/
1869
1870#ifdef SMP
1871/*
1872 * Deal with a SMP shootdown of other users of the pmap that we are
1873 * trying to dispose of.  This can be a bit hairy.
1874 */
1875static cpumask_t *lazymask;
1876static u_int lazyptd;
1877static volatile u_int lazywait;
1878
1879void pmap_lazyfix_action(void);
1880
1881void
1882pmap_lazyfix_action(void)
1883{
1884	cpumask_t mymask = PCPU_GET(cpumask);
1885
1886#ifdef COUNT_IPIS
1887	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1888#endif
1889	if (rcr3() == lazyptd)
1890		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1891	atomic_clear_int(lazymask, mymask);
1892	atomic_store_rel_int(&lazywait, 1);
1893}
1894
1895static void
1896pmap_lazyfix_self(cpumask_t mymask)
1897{
1898
1899	if (rcr3() == lazyptd)
1900		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1901	atomic_clear_int(lazymask, mymask);
1902}
1903
1904
1905static void
1906pmap_lazyfix(pmap_t pmap)
1907{
1908	cpumask_t mymask, mask;
1909	u_int spins;
1910
1911	while ((mask = pmap->pm_active) != 0) {
1912		spins = 50000000;
1913		mask = mask & -mask;	/* Find least significant set bit */
1914		mtx_lock_spin(&smp_ipi_mtx);
1915#ifdef PAE
1916		lazyptd = vtophys(pmap->pm_pdpt);
1917#else
1918		lazyptd = vtophys(pmap->pm_pdir);
1919#endif
1920		mymask = PCPU_GET(cpumask);
1921		if (mask == mymask) {
1922			lazymask = &pmap->pm_active;
1923			pmap_lazyfix_self(mymask);
1924		} else {
1925			atomic_store_rel_int((u_int *)&lazymask,
1926			    (u_int)&pmap->pm_active);
1927			atomic_store_rel_int(&lazywait, 0);
1928			ipi_selected(mask, IPI_LAZYPMAP);
1929			while (lazywait == 0) {
1930				ia32_pause();
1931				if (--spins == 0)
1932					break;
1933			}
1934		}
1935		mtx_unlock_spin(&smp_ipi_mtx);
1936		if (spins == 0)
1937			printf("pmap_lazyfix: spun for 50000000\n");
1938	}
1939}
1940
1941#else	/* SMP */
1942
1943/*
1944 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1945 * unlikely to have to even execute this code, including the fact
1946 * that the cleanup is deferred until the parent does a wait(2), which
1947 * means that another userland process has run.
1948 */
1949static void
1950pmap_lazyfix(pmap_t pmap)
1951{
1952	u_int cr3;
1953
1954	cr3 = vtophys(pmap->pm_pdir);
1955	if (cr3 == rcr3()) {
1956		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1957		pmap->pm_active &= ~(PCPU_GET(cpumask));
1958	}
1959}
1960#endif	/* SMP */
1961
1962/*
1963 * Release any resources held by the given physical map.
1964 * Called when a pmap initialized by pmap_pinit is being released.
1965 * Should only be called if the map contains no valid mappings.
1966 */
1967void
1968pmap_release(pmap_t pmap)
1969{
1970	vm_page_t m, ptdpg[NPGPTD];
1971	int i;
1972
1973	KASSERT(pmap->pm_stats.resident_count == 0,
1974	    ("pmap_release: pmap resident count %ld != 0",
1975	    pmap->pm_stats.resident_count));
1976	KASSERT(pmap->pm_root == NULL,
1977	    ("pmap_release: pmap has reserved page table page(s)"));
1978
1979	pmap_lazyfix(pmap);
1980	mtx_lock_spin(&allpmaps_lock);
1981	LIST_REMOVE(pmap, pm_list);
1982	mtx_unlock_spin(&allpmaps_lock);
1983
1984	for (i = 0; i < NPGPTD; i++)
1985		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1986		    PG_FRAME);
1987
1988	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1989	    sizeof(*pmap->pm_pdir));
1990
1991	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1992
1993	for (i = 0; i < NPGPTD; i++) {
1994		m = ptdpg[i];
1995#ifdef PAE
1996		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1997		    ("pmap_release: got wrong ptd page"));
1998#endif
1999		m->wire_count--;
2000		atomic_subtract_int(&cnt.v_wire_count, 1);
2001		vm_page_free_zero(m);
2002	}
2003	PMAP_LOCK_DESTROY(pmap);
2004}
2005
2006static int
2007kvm_size(SYSCTL_HANDLER_ARGS)
2008{
2009	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2010
2011	return sysctl_handle_long(oidp, &ksize, 0, req);
2012}
2013SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2014    0, 0, kvm_size, "IU", "Size of KVM");
2015
2016static int
2017kvm_free(SYSCTL_HANDLER_ARGS)
2018{
2019	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2020
2021	return sysctl_handle_long(oidp, &kfree, 0, req);
2022}
2023SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2024    0, 0, kvm_free, "IU", "Amount of KVM free");
2025
2026/*
2027 * grow the number of kernel page table entries, if needed
2028 */
2029void
2030pmap_growkernel(vm_offset_t addr)
2031{
2032	vm_paddr_t ptppaddr;
2033	vm_page_t nkpg;
2034	pd_entry_t newpdir;
2035
2036	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2037	addr = roundup2(addr, NBPDR);
2038	if (addr - 1 >= kernel_map->max_offset)
2039		addr = kernel_map->max_offset;
2040	while (kernel_vm_end < addr) {
2041		if (pdir_pde(PTD, kernel_vm_end)) {
2042			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2043			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2044				kernel_vm_end = kernel_map->max_offset;
2045				break;
2046			}
2047			continue;
2048		}
2049
2050		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2051		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2052		    VM_ALLOC_ZERO);
2053		if (nkpg == NULL)
2054			panic("pmap_growkernel: no memory to grow kernel");
2055
2056		nkpt++;
2057
2058		if ((nkpg->flags & PG_ZERO) == 0)
2059			pmap_zero_page(nkpg);
2060		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2061		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2062		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2063
2064		pmap_kenter_pde(kernel_vm_end, newpdir);
2065		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2066		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2067			kernel_vm_end = kernel_map->max_offset;
2068			break;
2069		}
2070	}
2071}
2072
2073
2074/***************************************************
2075 * page management routines.
2076 ***************************************************/
2077
2078CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2079CTASSERT(_NPCM == 11);
2080
2081static __inline struct pv_chunk *
2082pv_to_chunk(pv_entry_t pv)
2083{
2084
2085	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
2086}
2087
2088#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2089
2090#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2091#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2092
2093static uint32_t pc_freemask[11] = {
2094	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2095	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2096	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2097	PC_FREE0_9, PC_FREE10
2098};
2099
2100SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2101	"Current number of pv entries");
2102
2103#ifdef PV_STATS
2104static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2105
2106SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2107	"Current number of pv entry chunks");
2108SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2109	"Current number of pv entry chunks allocated");
2110SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2111	"Current number of pv entry chunks frees");
2112SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2113	"Number of times tried to get a chunk page but failed.");
2114
2115static long pv_entry_frees, pv_entry_allocs;
2116static int pv_entry_spare;
2117
2118SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2119	"Current number of pv entry frees");
2120SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2121	"Current number of pv entry allocs");
2122SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2123	"Current number of spare pv entries");
2124
2125static int pmap_collect_inactive, pmap_collect_active;
2126
2127SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
2128	"Current number times pmap_collect called on inactive queue");
2129SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
2130	"Current number times pmap_collect called on active queue");
2131#endif
2132
2133/*
2134 * We are in a serious low memory condition.  Resort to
2135 * drastic measures to free some pages so we can allocate
2136 * another pv entry chunk.  This is normally called to
2137 * unmap inactive pages, and if necessary, active pages.
2138 */
2139static void
2140pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
2141{
2142	struct md_page *pvh;
2143	pd_entry_t *pde;
2144	pmap_t pmap;
2145	pt_entry_t *pte, tpte;
2146	pv_entry_t next_pv, pv;
2147	vm_offset_t va;
2148	vm_page_t m, free;
2149
2150	sched_pin();
2151	TAILQ_FOREACH(m, &vpq->pl, pageq) {
2152		if (m->hold_count || m->busy)
2153			continue;
2154		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
2155			va = pv->pv_va;
2156			pmap = PV_PMAP(pv);
2157			/* Avoid deadlock and lock recursion. */
2158			if (pmap > locked_pmap)
2159				PMAP_LOCK(pmap);
2160			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2161				continue;
2162			pmap->pm_stats.resident_count--;
2163			pde = pmap_pde(pmap, va);
2164			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2165			    " a 4mpage in page %p's pv list", m));
2166			pte = pmap_pte_quick(pmap, va);
2167			tpte = pte_load_clear(pte);
2168			KASSERT((tpte & PG_W) == 0,
2169			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
2170			if (tpte & PG_A)
2171				vm_page_flag_set(m, PG_REFERENCED);
2172			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2173				vm_page_dirty(m);
2174			free = NULL;
2175			pmap_unuse_pt(pmap, va, &free);
2176			pmap_invalidate_page(pmap, va);
2177			pmap_free_zero_pages(free);
2178			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2179			if (TAILQ_EMPTY(&m->md.pv_list)) {
2180				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2181				if (TAILQ_EMPTY(&pvh->pv_list))
2182					vm_page_flag_clear(m, PG_WRITEABLE);
2183			}
2184			free_pv_entry(pmap, pv);
2185			if (pmap != locked_pmap)
2186				PMAP_UNLOCK(pmap);
2187		}
2188	}
2189	sched_unpin();
2190}
2191
2192
2193/*
2194 * free the pv_entry back to the free list
2195 */
2196static void
2197free_pv_entry(pmap_t pmap, pv_entry_t pv)
2198{
2199	vm_page_t m;
2200	struct pv_chunk *pc;
2201	int idx, field, bit;
2202
2203	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2204	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2205	PV_STAT(pv_entry_frees++);
2206	PV_STAT(pv_entry_spare++);
2207	pv_entry_count--;
2208	pc = pv_to_chunk(pv);
2209	idx = pv - &pc->pc_pventry[0];
2210	field = idx / 32;
2211	bit = idx % 32;
2212	pc->pc_map[field] |= 1ul << bit;
2213	/* move to head of list */
2214	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2215	for (idx = 0; idx < _NPCM; idx++)
2216		if (pc->pc_map[idx] != pc_freemask[idx]) {
2217			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2218			return;
2219		}
2220	PV_STAT(pv_entry_spare -= _NPCPV);
2221	PV_STAT(pc_chunk_count--);
2222	PV_STAT(pc_chunk_frees++);
2223	/* entire chunk is free, return it */
2224	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2225	pmap_qremove((vm_offset_t)pc, 1);
2226	vm_page_unwire(m, 0);
2227	vm_page_free(m);
2228	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2229}
2230
2231/*
2232 * get a new pv_entry, allocating a block from the system
2233 * when needed.
2234 */
2235static pv_entry_t
2236get_pv_entry(pmap_t pmap, int try)
2237{
2238	static const struct timeval printinterval = { 60, 0 };
2239	static struct timeval lastprint;
2240	static vm_pindex_t colour;
2241	struct vpgqueues *pq;
2242	int bit, field;
2243	pv_entry_t pv;
2244	struct pv_chunk *pc;
2245	vm_page_t m;
2246
2247	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2248	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2249	PV_STAT(pv_entry_allocs++);
2250	pv_entry_count++;
2251	if (pv_entry_count > pv_entry_high_water)
2252		if (ratecheck(&lastprint, &printinterval))
2253			printf("Approaching the limit on PV entries, consider "
2254			    "increasing either the vm.pmap.shpgperproc or the "
2255			    "vm.pmap.pv_entry_max tunable.\n");
2256	pq = NULL;
2257retry:
2258	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2259	if (pc != NULL) {
2260		for (field = 0; field < _NPCM; field++) {
2261			if (pc->pc_map[field]) {
2262				bit = bsfl(pc->pc_map[field]);
2263				break;
2264			}
2265		}
2266		if (field < _NPCM) {
2267			pv = &pc->pc_pventry[field * 32 + bit];
2268			pc->pc_map[field] &= ~(1ul << bit);
2269			/* If this was the last item, move it to tail */
2270			for (field = 0; field < _NPCM; field++)
2271				if (pc->pc_map[field] != 0) {
2272					PV_STAT(pv_entry_spare--);
2273					return (pv);	/* not full, return */
2274				}
2275			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2276			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2277			PV_STAT(pv_entry_spare--);
2278			return (pv);
2279		}
2280	}
2281	/*
2282	 * Access to the ptelist "pv_vafree" is synchronized by the page
2283	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2284	 * remain non-empty until pmap_ptelist_alloc() completes.
2285	 */
2286	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2287	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2288	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2289		if (try) {
2290			pv_entry_count--;
2291			PV_STAT(pc_chunk_tryfail++);
2292			return (NULL);
2293		}
2294		/*
2295		 * Reclaim pv entries: At first, destroy mappings to
2296		 * inactive pages.  After that, if a pv chunk entry
2297		 * is still needed, destroy mappings to active pages.
2298		 */
2299		if (pq == NULL) {
2300			PV_STAT(pmap_collect_inactive++);
2301			pq = &vm_page_queues[PQ_INACTIVE];
2302		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2303			PV_STAT(pmap_collect_active++);
2304			pq = &vm_page_queues[PQ_ACTIVE];
2305		} else
2306			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2307		pmap_collect(pmap, pq);
2308		goto retry;
2309	}
2310	PV_STAT(pc_chunk_count++);
2311	PV_STAT(pc_chunk_allocs++);
2312	colour++;
2313	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2314	pmap_qenter((vm_offset_t)pc, &m, 1);
2315	pc->pc_pmap = pmap;
2316	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2317	for (field = 1; field < _NPCM; field++)
2318		pc->pc_map[field] = pc_freemask[field];
2319	pv = &pc->pc_pventry[0];
2320	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2321	PV_STAT(pv_entry_spare += _NPCPV - 1);
2322	return (pv);
2323}
2324
2325static __inline pv_entry_t
2326pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2327{
2328	pv_entry_t pv;
2329
2330	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2331	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2332		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2333			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2334			break;
2335		}
2336	}
2337	return (pv);
2338}
2339
2340static void
2341pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2342{
2343	struct md_page *pvh;
2344	pv_entry_t pv;
2345	vm_offset_t va_last;
2346	vm_page_t m;
2347
2348	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2349	KASSERT((pa & PDRMASK) == 0,
2350	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2351
2352	/*
2353	 * Transfer the 4mpage's pv entry for this mapping to the first
2354	 * page's pv list.
2355	 */
2356	pvh = pa_to_pvh(pa);
2357	va = trunc_4mpage(va);
2358	pv = pmap_pvh_remove(pvh, pmap, va);
2359	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2360	m = PHYS_TO_VM_PAGE(pa);
2361	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2362	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2363	va_last = va + NBPDR - PAGE_SIZE;
2364	do {
2365		m++;
2366		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2367		    ("pmap_pv_demote_pde: page %p is not managed", m));
2368		va += PAGE_SIZE;
2369		pmap_insert_entry(pmap, va, m);
2370	} while (va < va_last);
2371}
2372
2373static void
2374pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2375{
2376	struct md_page *pvh;
2377	pv_entry_t pv;
2378	vm_offset_t va_last;
2379	vm_page_t m;
2380
2381	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2382	KASSERT((pa & PDRMASK) == 0,
2383	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2384
2385	/*
2386	 * Transfer the first page's pv entry for this mapping to the
2387	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2388	 * to get_pv_entry(), a transfer avoids the possibility that
2389	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2390	 * removes one of the mappings that is being promoted.
2391	 */
2392	m = PHYS_TO_VM_PAGE(pa);
2393	va = trunc_4mpage(va);
2394	pv = pmap_pvh_remove(&m->md, pmap, va);
2395	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2396	pvh = pa_to_pvh(pa);
2397	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2398	/* Free the remaining NPTEPG - 1 pv entries. */
2399	va_last = va + NBPDR - PAGE_SIZE;
2400	do {
2401		m++;
2402		va += PAGE_SIZE;
2403		pmap_pvh_free(&m->md, pmap, va);
2404	} while (va < va_last);
2405}
2406
2407static void
2408pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2409{
2410	pv_entry_t pv;
2411
2412	pv = pmap_pvh_remove(pvh, pmap, va);
2413	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2414	free_pv_entry(pmap, pv);
2415}
2416
2417static void
2418pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2419{
2420	struct md_page *pvh;
2421
2422	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2423	pmap_pvh_free(&m->md, pmap, va);
2424	if (TAILQ_EMPTY(&m->md.pv_list)) {
2425		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2426		if (TAILQ_EMPTY(&pvh->pv_list))
2427			vm_page_flag_clear(m, PG_WRITEABLE);
2428	}
2429}
2430
2431/*
2432 * Create a pv entry for page at pa for
2433 * (pmap, va).
2434 */
2435static void
2436pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2437{
2438	pv_entry_t pv;
2439
2440	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2441	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2442	pv = get_pv_entry(pmap, FALSE);
2443	pv->pv_va = va;
2444	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2445}
2446
2447/*
2448 * Conditionally create a pv entry.
2449 */
2450static boolean_t
2451pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2452{
2453	pv_entry_t pv;
2454
2455	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2456	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2457	if (pv_entry_count < pv_entry_high_water &&
2458	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2459		pv->pv_va = va;
2460		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2461		return (TRUE);
2462	} else
2463		return (FALSE);
2464}
2465
2466/*
2467 * Create the pv entries for each of the pages within a superpage.
2468 */
2469static boolean_t
2470pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2471{
2472	struct md_page *pvh;
2473	pv_entry_t pv;
2474
2475	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2476	if (pv_entry_count < pv_entry_high_water &&
2477	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2478		pv->pv_va = va;
2479		pvh = pa_to_pvh(pa);
2480		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2481		return (TRUE);
2482	} else
2483		return (FALSE);
2484}
2485
2486/*
2487 * Fills a page table page with mappings to consecutive physical pages.
2488 */
2489static void
2490pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2491{
2492	pt_entry_t *pte;
2493
2494	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2495		*pte = newpte;
2496		newpte += PAGE_SIZE;
2497	}
2498}
2499
2500/*
2501 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2502 * 2- or 4MB page mapping is invalidated.
2503 */
2504static boolean_t
2505pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2506{
2507	pd_entry_t newpde, oldpde;
2508	pt_entry_t *firstpte, newpte;
2509	vm_paddr_t mptepa;
2510	vm_page_t free, mpte;
2511
2512	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2513	oldpde = *pde;
2514	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2515	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2516	mpte = pmap_lookup_pt_page(pmap, va);
2517	if (mpte != NULL)
2518		pmap_remove_pt_page(pmap, mpte);
2519	else {
2520		KASSERT((oldpde & PG_W) == 0,
2521		    ("pmap_demote_pde: page table page for a wired mapping"
2522		    " is missing"));
2523
2524		/*
2525		 * Invalidate the 2- or 4MB page mapping and return
2526		 * "failure" if the mapping was never accessed or the
2527		 * allocation of the new page table page fails.
2528		 */
2529		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2530		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2531		    VM_ALLOC_WIRED)) == NULL) {
2532			free = NULL;
2533			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2534			pmap_invalidate_page(pmap, trunc_4mpage(va));
2535			pmap_free_zero_pages(free);
2536			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2537			    " in pmap %p", va, pmap);
2538			return (FALSE);
2539		}
2540		if (va < VM_MAXUSER_ADDRESS)
2541			pmap->pm_stats.resident_count++;
2542	}
2543	mptepa = VM_PAGE_TO_PHYS(mpte);
2544
2545	/*
2546	 * If the page mapping is in the kernel's address space, then the
2547	 * KPTmap can provide access to the page table page.  Otherwise,
2548	 * temporarily map the page table page (mpte) into the kernel's
2549	 * address space at either PADDR1 or PADDR2.
2550	 */
2551	if (va >= KERNBASE)
2552		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2553	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2554		if ((*PMAP1 & PG_FRAME) != mptepa) {
2555			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2556#ifdef SMP
2557			PMAP1cpu = PCPU_GET(cpuid);
2558#endif
2559			invlcaddr(PADDR1);
2560			PMAP1changed++;
2561		} else
2562#ifdef SMP
2563		if (PMAP1cpu != PCPU_GET(cpuid)) {
2564			PMAP1cpu = PCPU_GET(cpuid);
2565			invlcaddr(PADDR1);
2566			PMAP1changedcpu++;
2567		} else
2568#endif
2569			PMAP1unchanged++;
2570		firstpte = PADDR1;
2571	} else {
2572		mtx_lock(&PMAP2mutex);
2573		if ((*PMAP2 & PG_FRAME) != mptepa) {
2574			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2575			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2576		}
2577		firstpte = PADDR2;
2578	}
2579	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2580	KASSERT((oldpde & PG_A) != 0,
2581	    ("pmap_demote_pde: oldpde is missing PG_A"));
2582	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2583	    ("pmap_demote_pde: oldpde is missing PG_M"));
2584	newpte = oldpde & ~PG_PS;
2585	if ((newpte & PG_PDE_PAT) != 0)
2586		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2587
2588	/*
2589	 * If the page table page is new, initialize it.
2590	 */
2591	if (mpte->wire_count == 1) {
2592		mpte->wire_count = NPTEPG;
2593		pmap_fill_ptp(firstpte, newpte);
2594	}
2595	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2596	    ("pmap_demote_pde: firstpte and newpte map different physical"
2597	    " addresses"));
2598
2599	/*
2600	 * If the mapping has changed attributes, update the page table
2601	 * entries.
2602	 */
2603	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2604		pmap_fill_ptp(firstpte, newpte);
2605
2606	/*
2607	 * Demote the mapping.  This pmap is locked.  The old PDE has
2608	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2609	 * set.  Thus, there is no danger of a race with another
2610	 * processor changing the setting of PG_A and/or PG_M between
2611	 * the read above and the store below.
2612	 */
2613	if (workaround_erratum383)
2614		pmap_update_pde(pmap, va, pde, newpde);
2615	else if (pmap == kernel_pmap)
2616		pmap_kenter_pde(va, newpde);
2617	else
2618		pde_store(pde, newpde);
2619	if (firstpte == PADDR2)
2620		mtx_unlock(&PMAP2mutex);
2621
2622	/*
2623	 * Invalidate the recursive mapping of the page table page.
2624	 */
2625	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2626
2627	/*
2628	 * Demote the pv entry.  This depends on the earlier demotion
2629	 * of the mapping.  Specifically, the (re)creation of a per-
2630	 * page pv entry might trigger the execution of pmap_collect(),
2631	 * which might reclaim a newly (re)created per-page pv entry
2632	 * and destroy the associated mapping.  In order to destroy
2633	 * the mapping, the PDE must have already changed from mapping
2634	 * the 2mpage to referencing the page table page.
2635	 */
2636	if ((oldpde & PG_MANAGED) != 0)
2637		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2638
2639	pmap_pde_demotions++;
2640	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2641	    " in pmap %p", va, pmap);
2642	return (TRUE);
2643}
2644
2645/*
2646 * pmap_remove_pde: do the things to unmap a superpage in a process
2647 */
2648static void
2649pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2650    vm_page_t *free)
2651{
2652	struct md_page *pvh;
2653	pd_entry_t oldpde;
2654	vm_offset_t eva, va;
2655	vm_page_t m, mpte;
2656
2657	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2658	KASSERT((sva & PDRMASK) == 0,
2659	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2660	oldpde = pte_load_clear(pdq);
2661	if (oldpde & PG_W)
2662		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2663
2664	/*
2665	 * Machines that don't support invlpg, also don't support
2666	 * PG_G.
2667	 */
2668	if (oldpde & PG_G)
2669		pmap_invalidate_page(kernel_pmap, sva);
2670	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2671	if (oldpde & PG_MANAGED) {
2672		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2673		pmap_pvh_free(pvh, pmap, sva);
2674		eva = sva + NBPDR;
2675		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2676		    va < eva; va += PAGE_SIZE, m++) {
2677			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2678				vm_page_dirty(m);
2679			if (oldpde & PG_A)
2680				vm_page_flag_set(m, PG_REFERENCED);
2681			if (TAILQ_EMPTY(&m->md.pv_list) &&
2682			    TAILQ_EMPTY(&pvh->pv_list))
2683				vm_page_flag_clear(m, PG_WRITEABLE);
2684		}
2685	}
2686	if (pmap == kernel_pmap) {
2687		if (!pmap_demote_pde(pmap, pdq, sva))
2688			panic("pmap_remove_pde: failed demotion");
2689	} else {
2690		mpte = pmap_lookup_pt_page(pmap, sva);
2691		if (mpte != NULL) {
2692			pmap_remove_pt_page(pmap, mpte);
2693			pmap->pm_stats.resident_count--;
2694			KASSERT(mpte->wire_count == NPTEPG,
2695			    ("pmap_remove_pde: pte page wire count error"));
2696			mpte->wire_count = 0;
2697			pmap_add_delayed_free_list(mpte, free, FALSE);
2698			atomic_subtract_int(&cnt.v_wire_count, 1);
2699		}
2700	}
2701}
2702
2703/*
2704 * pmap_remove_pte: do the things to unmap a page in a process
2705 */
2706static int
2707pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2708{
2709	pt_entry_t oldpte;
2710	vm_page_t m;
2711
2712	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2713	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2714	oldpte = pte_load_clear(ptq);
2715	if (oldpte & PG_W)
2716		pmap->pm_stats.wired_count -= 1;
2717	/*
2718	 * Machines that don't support invlpg, also don't support
2719	 * PG_G.
2720	 */
2721	if (oldpte & PG_G)
2722		pmap_invalidate_page(kernel_pmap, va);
2723	pmap->pm_stats.resident_count -= 1;
2724	if (oldpte & PG_MANAGED) {
2725		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2726		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2727			vm_page_dirty(m);
2728		if (oldpte & PG_A)
2729			vm_page_flag_set(m, PG_REFERENCED);
2730		pmap_remove_entry(pmap, m, va);
2731	}
2732	return (pmap_unuse_pt(pmap, va, free));
2733}
2734
2735/*
2736 * Remove a single page from a process address space
2737 */
2738static void
2739pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2740{
2741	pt_entry_t *pte;
2742
2743	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2744	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2745	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2746	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2747		return;
2748	pmap_remove_pte(pmap, pte, va, free);
2749	pmap_invalidate_page(pmap, va);
2750}
2751
2752/*
2753 *	Remove the given range of addresses from the specified map.
2754 *
2755 *	It is assumed that the start and end are properly
2756 *	rounded to the page size.
2757 */
2758void
2759pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2760{
2761	vm_offset_t pdnxt;
2762	pd_entry_t ptpaddr;
2763	pt_entry_t *pte;
2764	vm_page_t free = NULL;
2765	int anyvalid;
2766
2767	/*
2768	 * Perform an unsynchronized read.  This is, however, safe.
2769	 */
2770	if (pmap->pm_stats.resident_count == 0)
2771		return;
2772
2773	anyvalid = 0;
2774
2775	vm_page_lock_queues();
2776	sched_pin();
2777	PMAP_LOCK(pmap);
2778
2779	/*
2780	 * special handling of removing one page.  a very
2781	 * common operation and easy to short circuit some
2782	 * code.
2783	 */
2784	if ((sva + PAGE_SIZE == eva) &&
2785	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2786		pmap_remove_page(pmap, sva, &free);
2787		goto out;
2788	}
2789
2790	for (; sva < eva; sva = pdnxt) {
2791		unsigned pdirindex;
2792
2793		/*
2794		 * Calculate index for next page table.
2795		 */
2796		pdnxt = (sva + NBPDR) & ~PDRMASK;
2797		if (pdnxt < sva)
2798			pdnxt = eva;
2799		if (pmap->pm_stats.resident_count == 0)
2800			break;
2801
2802		pdirindex = sva >> PDRSHIFT;
2803		ptpaddr = pmap->pm_pdir[pdirindex];
2804
2805		/*
2806		 * Weed out invalid mappings. Note: we assume that the page
2807		 * directory table is always allocated, and in kernel virtual.
2808		 */
2809		if (ptpaddr == 0)
2810			continue;
2811
2812		/*
2813		 * Check for large page.
2814		 */
2815		if ((ptpaddr & PG_PS) != 0) {
2816			/*
2817			 * Are we removing the entire large page?  If not,
2818			 * demote the mapping and fall through.
2819			 */
2820			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2821				/*
2822				 * The TLB entry for a PG_G mapping is
2823				 * invalidated by pmap_remove_pde().
2824				 */
2825				if ((ptpaddr & PG_G) == 0)
2826					anyvalid = 1;
2827				pmap_remove_pde(pmap,
2828				    &pmap->pm_pdir[pdirindex], sva, &free);
2829				continue;
2830			} else if (!pmap_demote_pde(pmap,
2831			    &pmap->pm_pdir[pdirindex], sva)) {
2832				/* The large page mapping was destroyed. */
2833				continue;
2834			}
2835		}
2836
2837		/*
2838		 * Limit our scan to either the end of the va represented
2839		 * by the current page table page, or to the end of the
2840		 * range being removed.
2841		 */
2842		if (pdnxt > eva)
2843			pdnxt = eva;
2844
2845		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2846		    sva += PAGE_SIZE) {
2847			if (*pte == 0)
2848				continue;
2849
2850			/*
2851			 * The TLB entry for a PG_G mapping is invalidated
2852			 * by pmap_remove_pte().
2853			 */
2854			if ((*pte & PG_G) == 0)
2855				anyvalid = 1;
2856			if (pmap_remove_pte(pmap, pte, sva, &free))
2857				break;
2858		}
2859	}
2860out:
2861	sched_unpin();
2862	if (anyvalid)
2863		pmap_invalidate_all(pmap);
2864	vm_page_unlock_queues();
2865	PMAP_UNLOCK(pmap);
2866	pmap_free_zero_pages(free);
2867}
2868
2869/*
2870 *	Routine:	pmap_remove_all
2871 *	Function:
2872 *		Removes this physical page from
2873 *		all physical maps in which it resides.
2874 *		Reflects back modify bits to the pager.
2875 *
2876 *	Notes:
2877 *		Original versions of this routine were very
2878 *		inefficient because they iteratively called
2879 *		pmap_remove (slow...)
2880 */
2881
2882void
2883pmap_remove_all(vm_page_t m)
2884{
2885	struct md_page *pvh;
2886	pv_entry_t pv;
2887	pmap_t pmap;
2888	pt_entry_t *pte, tpte;
2889	pd_entry_t *pde;
2890	vm_offset_t va;
2891	vm_page_t free;
2892
2893	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2894	    ("pmap_remove_all: page %p is fictitious", m));
2895	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2896	sched_pin();
2897	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2898	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2899		va = pv->pv_va;
2900		pmap = PV_PMAP(pv);
2901		PMAP_LOCK(pmap);
2902		pde = pmap_pde(pmap, va);
2903		(void)pmap_demote_pde(pmap, pde, va);
2904		PMAP_UNLOCK(pmap);
2905	}
2906	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2907		pmap = PV_PMAP(pv);
2908		PMAP_LOCK(pmap);
2909		pmap->pm_stats.resident_count--;
2910		pde = pmap_pde(pmap, pv->pv_va);
2911		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2912		    " a 4mpage in page %p's pv list", m));
2913		pte = pmap_pte_quick(pmap, pv->pv_va);
2914		tpte = pte_load_clear(pte);
2915		if (tpte & PG_W)
2916			pmap->pm_stats.wired_count--;
2917		if (tpte & PG_A)
2918			vm_page_flag_set(m, PG_REFERENCED);
2919
2920		/*
2921		 * Update the vm_page_t clean and reference bits.
2922		 */
2923		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2924			vm_page_dirty(m);
2925		free = NULL;
2926		pmap_unuse_pt(pmap, pv->pv_va, &free);
2927		pmap_invalidate_page(pmap, pv->pv_va);
2928		pmap_free_zero_pages(free);
2929		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2930		free_pv_entry(pmap, pv);
2931		PMAP_UNLOCK(pmap);
2932	}
2933	vm_page_flag_clear(m, PG_WRITEABLE);
2934	sched_unpin();
2935}
2936
2937/*
2938 * pmap_protect_pde: do the things to protect a 4mpage in a process
2939 */
2940static boolean_t
2941pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2942{
2943	pd_entry_t newpde, oldpde;
2944	vm_offset_t eva, va;
2945	vm_page_t m;
2946	boolean_t anychanged;
2947
2948	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2949	KASSERT((sva & PDRMASK) == 0,
2950	    ("pmap_protect_pde: sva is not 4mpage aligned"));
2951	anychanged = FALSE;
2952retry:
2953	oldpde = newpde = *pde;
2954	if (oldpde & PG_MANAGED) {
2955		eva = sva + NBPDR;
2956		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2957		    va < eva; va += PAGE_SIZE, m++) {
2958			/*
2959			 * In contrast to the analogous operation on a 4KB page
2960			 * mapping, the mapping's PG_A flag is not cleared and
2961			 * the page's PG_REFERENCED flag is not set.  The
2962			 * reason is that pmap_demote_pde() expects that a 2/4MB
2963			 * page mapping with a stored page table page has PG_A
2964			 * set.
2965			 */
2966			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2967				vm_page_dirty(m);
2968		}
2969	}
2970	if ((prot & VM_PROT_WRITE) == 0)
2971		newpde &= ~(PG_RW | PG_M);
2972#ifdef PAE
2973	if ((prot & VM_PROT_EXECUTE) == 0)
2974		newpde |= pg_nx;
2975#endif
2976	if (newpde != oldpde) {
2977		if (!pde_cmpset(pde, oldpde, newpde))
2978			goto retry;
2979		if (oldpde & PG_G)
2980			pmap_invalidate_page(pmap, sva);
2981		else
2982			anychanged = TRUE;
2983	}
2984	return (anychanged);
2985}
2986
2987/*
2988 *	Set the physical protection on the
2989 *	specified range of this map as requested.
2990 */
2991void
2992pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2993{
2994	vm_offset_t pdnxt;
2995	pd_entry_t ptpaddr;
2996	pt_entry_t *pte;
2997	int anychanged;
2998
2999	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3000		pmap_remove(pmap, sva, eva);
3001		return;
3002	}
3003
3004#ifdef PAE
3005	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3006	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3007		return;
3008#else
3009	if (prot & VM_PROT_WRITE)
3010		return;
3011#endif
3012
3013	anychanged = 0;
3014
3015	vm_page_lock_queues();
3016	sched_pin();
3017	PMAP_LOCK(pmap);
3018	for (; sva < eva; sva = pdnxt) {
3019		pt_entry_t obits, pbits;
3020		unsigned pdirindex;
3021
3022		pdnxt = (sva + NBPDR) & ~PDRMASK;
3023		if (pdnxt < sva)
3024			pdnxt = eva;
3025
3026		pdirindex = sva >> PDRSHIFT;
3027		ptpaddr = pmap->pm_pdir[pdirindex];
3028
3029		/*
3030		 * Weed out invalid mappings. Note: we assume that the page
3031		 * directory table is always allocated, and in kernel virtual.
3032		 */
3033		if (ptpaddr == 0)
3034			continue;
3035
3036		/*
3037		 * Check for large page.
3038		 */
3039		if ((ptpaddr & PG_PS) != 0) {
3040			/*
3041			 * Are we protecting the entire large page?  If not,
3042			 * demote the mapping and fall through.
3043			 */
3044			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3045				/*
3046				 * The TLB entry for a PG_G mapping is
3047				 * invalidated by pmap_protect_pde().
3048				 */
3049				if (pmap_protect_pde(pmap,
3050				    &pmap->pm_pdir[pdirindex], sva, prot))
3051					anychanged = 1;
3052				continue;
3053			} else if (!pmap_demote_pde(pmap,
3054			    &pmap->pm_pdir[pdirindex], sva)) {
3055				/* The large page mapping was destroyed. */
3056				continue;
3057			}
3058		}
3059
3060		if (pdnxt > eva)
3061			pdnxt = eva;
3062
3063		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3064		    sva += PAGE_SIZE) {
3065			vm_page_t m;
3066
3067retry:
3068			/*
3069			 * Regardless of whether a pte is 32 or 64 bits in
3070			 * size, PG_RW, PG_A, and PG_M are among the least
3071			 * significant 32 bits.
3072			 */
3073			obits = pbits = *pte;
3074			if ((pbits & PG_V) == 0)
3075				continue;
3076			if (pbits & PG_MANAGED) {
3077				m = NULL;
3078				if (pbits & PG_A) {
3079					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3080					vm_page_flag_set(m, PG_REFERENCED);
3081					pbits &= ~PG_A;
3082				}
3083				if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3084					if (m == NULL)
3085						m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3086					vm_page_dirty(m);
3087				}
3088			}
3089
3090			if ((prot & VM_PROT_WRITE) == 0)
3091				pbits &= ~(PG_RW | PG_M);
3092#ifdef PAE
3093			if ((prot & VM_PROT_EXECUTE) == 0)
3094				pbits |= pg_nx;
3095#endif
3096
3097			if (pbits != obits) {
3098#ifdef PAE
3099				if (!atomic_cmpset_64(pte, obits, pbits))
3100					goto retry;
3101#else
3102				if (!atomic_cmpset_int((u_int *)pte, obits,
3103				    pbits))
3104					goto retry;
3105#endif
3106				if (obits & PG_G)
3107					pmap_invalidate_page(pmap, sva);
3108				else
3109					anychanged = 1;
3110			}
3111		}
3112	}
3113	sched_unpin();
3114	if (anychanged)
3115		pmap_invalidate_all(pmap);
3116	vm_page_unlock_queues();
3117	PMAP_UNLOCK(pmap);
3118}
3119
3120/*
3121 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
3122 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3123 * For promotion to occur, two conditions must be met: (1) the 4KB page
3124 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3125 * mappings must have identical characteristics.
3126 *
3127 * Managed (PG_MANAGED) mappings within the kernel address space are not
3128 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3129 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3130 * pmap.
3131 */
3132static void
3133pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3134{
3135	pd_entry_t newpde;
3136	pt_entry_t *firstpte, oldpte, pa, *pte;
3137	vm_offset_t oldpteva;
3138	vm_page_t mpte;
3139
3140	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3141
3142	/*
3143	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3144	 * either invalid, unused, or does not map the first 4KB physical page
3145	 * within a 2- or 4MB page.
3146	 */
3147	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3148setpde:
3149	newpde = *firstpte;
3150	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3151		pmap_pde_p_failures++;
3152		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3153		    " in pmap %p", va, pmap);
3154		return;
3155	}
3156	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3157		pmap_pde_p_failures++;
3158		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3159		    " in pmap %p", va, pmap);
3160		return;
3161	}
3162	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3163		/*
3164		 * When PG_M is already clear, PG_RW can be cleared without
3165		 * a TLB invalidation.
3166		 */
3167		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3168		    ~PG_RW))
3169			goto setpde;
3170		newpde &= ~PG_RW;
3171	}
3172
3173	/*
3174	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3175	 * PTE maps an unexpected 4KB physical page or does not have identical
3176	 * characteristics to the first PTE.
3177	 */
3178	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3179	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3180setpte:
3181		oldpte = *pte;
3182		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3183			pmap_pde_p_failures++;
3184			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3185			    " in pmap %p", va, pmap);
3186			return;
3187		}
3188		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3189			/*
3190			 * When PG_M is already clear, PG_RW can be cleared
3191			 * without a TLB invalidation.
3192			 */
3193			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3194			    oldpte & ~PG_RW))
3195				goto setpte;
3196			oldpte &= ~PG_RW;
3197			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3198			    (va & ~PDRMASK);
3199			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3200			    " in pmap %p", oldpteva, pmap);
3201		}
3202		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3203			pmap_pde_p_failures++;
3204			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3205			    " in pmap %p", va, pmap);
3206			return;
3207		}
3208		pa -= PAGE_SIZE;
3209	}
3210
3211	/*
3212	 * Save the page table page in its current state until the PDE
3213	 * mapping the superpage is demoted by pmap_demote_pde() or
3214	 * destroyed by pmap_remove_pde().
3215	 */
3216	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3217	KASSERT(mpte >= vm_page_array &&
3218	    mpte < &vm_page_array[vm_page_array_size],
3219	    ("pmap_promote_pde: page table page is out of range"));
3220	KASSERT(mpte->pindex == va >> PDRSHIFT,
3221	    ("pmap_promote_pde: page table page's pindex is wrong"));
3222	pmap_insert_pt_page(pmap, mpte);
3223
3224	/*
3225	 * Promote the pv entries.
3226	 */
3227	if ((newpde & PG_MANAGED) != 0)
3228		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3229
3230	/*
3231	 * Propagate the PAT index to its proper position.
3232	 */
3233	if ((newpde & PG_PTE_PAT) != 0)
3234		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3235
3236	/*
3237	 * Map the superpage.
3238	 */
3239	if (workaround_erratum383)
3240		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3241	else if (pmap == kernel_pmap)
3242		pmap_kenter_pde(va, PG_PS | newpde);
3243	else
3244		pde_store(pde, PG_PS | newpde);
3245
3246	pmap_pde_promotions++;
3247	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3248	    " in pmap %p", va, pmap);
3249}
3250
3251/*
3252 *	Insert the given physical page (p) at
3253 *	the specified virtual address (v) in the
3254 *	target physical map with the protection requested.
3255 *
3256 *	If specified, the page will be wired down, meaning
3257 *	that the related pte can not be reclaimed.
3258 *
3259 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3260 *	or lose information.  That is, this routine must actually
3261 *	insert this page into the given map NOW.
3262 */
3263void
3264pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3265    vm_prot_t prot, boolean_t wired)
3266{
3267	vm_paddr_t pa;
3268	pd_entry_t *pde;
3269	pt_entry_t *pte;
3270	vm_paddr_t opa;
3271	pt_entry_t origpte, newpte;
3272	vm_page_t mpte, om;
3273	boolean_t invlva;
3274
3275	va = trunc_page(va);
3276	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3277	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3278	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va));
3279
3280	mpte = NULL;
3281
3282	vm_page_lock_queues();
3283	PMAP_LOCK(pmap);
3284	sched_pin();
3285
3286	/*
3287	 * In the case that a page table page is not
3288	 * resident, we are creating it here.
3289	 */
3290	if (va < VM_MAXUSER_ADDRESS) {
3291		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3292	}
3293
3294	pde = pmap_pde(pmap, va);
3295	if ((*pde & PG_PS) != 0)
3296		panic("pmap_enter: attempted pmap_enter on 4MB page");
3297	pte = pmap_pte_quick(pmap, va);
3298
3299	/*
3300	 * Page Directory table entry not valid, we need a new PT page
3301	 */
3302	if (pte == NULL) {
3303		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3304			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3305	}
3306
3307	pa = VM_PAGE_TO_PHYS(m);
3308	om = NULL;
3309	origpte = *pte;
3310	opa = origpte & PG_FRAME;
3311
3312	/*
3313	 * Mapping has not changed, must be protection or wiring change.
3314	 */
3315	if (origpte && (opa == pa)) {
3316		/*
3317		 * Wiring change, just update stats. We don't worry about
3318		 * wiring PT pages as they remain resident as long as there
3319		 * are valid mappings in them. Hence, if a user page is wired,
3320		 * the PT page will be also.
3321		 */
3322		if (wired && ((origpte & PG_W) == 0))
3323			pmap->pm_stats.wired_count++;
3324		else if (!wired && (origpte & PG_W))
3325			pmap->pm_stats.wired_count--;
3326
3327		/*
3328		 * Remove extra pte reference
3329		 */
3330		if (mpte)
3331			mpte->wire_count--;
3332
3333		/*
3334		 * We might be turning off write access to the page,
3335		 * so we go ahead and sense modify status.
3336		 */
3337		if (origpte & PG_MANAGED) {
3338			om = m;
3339			pa |= PG_MANAGED;
3340		}
3341		goto validate;
3342	}
3343	/*
3344	 * Mapping has changed, invalidate old range and fall through to
3345	 * handle validating new mapping.
3346	 */
3347	if (opa) {
3348		if (origpte & PG_W)
3349			pmap->pm_stats.wired_count--;
3350		if (origpte & PG_MANAGED) {
3351			om = PHYS_TO_VM_PAGE(opa);
3352			pmap_remove_entry(pmap, om, va);
3353		}
3354		if (mpte != NULL) {
3355			mpte->wire_count--;
3356			KASSERT(mpte->wire_count > 0,
3357			    ("pmap_enter: missing reference to page table page,"
3358			     " va: 0x%x", va));
3359		}
3360	} else
3361		pmap->pm_stats.resident_count++;
3362
3363	/*
3364	 * Enter on the PV list if part of our managed memory.
3365	 */
3366	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3367		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3368		    ("pmap_enter: managed mapping within the clean submap"));
3369		pmap_insert_entry(pmap, va, m);
3370		pa |= PG_MANAGED;
3371	}
3372
3373	/*
3374	 * Increment counters
3375	 */
3376	if (wired)
3377		pmap->pm_stats.wired_count++;
3378
3379validate:
3380	/*
3381	 * Now validate mapping with desired protection/wiring.
3382	 */
3383	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3384	if ((prot & VM_PROT_WRITE) != 0) {
3385		newpte |= PG_RW;
3386		vm_page_flag_set(m, PG_WRITEABLE);
3387	}
3388#ifdef PAE
3389	if ((prot & VM_PROT_EXECUTE) == 0)
3390		newpte |= pg_nx;
3391#endif
3392	if (wired)
3393		newpte |= PG_W;
3394	if (va < VM_MAXUSER_ADDRESS)
3395		newpte |= PG_U;
3396	if (pmap == kernel_pmap)
3397		newpte |= pgeflag;
3398
3399	/*
3400	 * if the mapping or permission bits are different, we need
3401	 * to update the pte.
3402	 */
3403	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3404		newpte |= PG_A;
3405		if ((access & VM_PROT_WRITE) != 0)
3406			newpte |= PG_M;
3407		if (origpte & PG_V) {
3408			invlva = FALSE;
3409			origpte = pte_load_store(pte, newpte);
3410			if (origpte & PG_A) {
3411				if (origpte & PG_MANAGED)
3412					vm_page_flag_set(om, PG_REFERENCED);
3413				if (opa != VM_PAGE_TO_PHYS(m))
3414					invlva = TRUE;
3415#ifdef PAE
3416				if ((origpte & PG_NX) == 0 &&
3417				    (newpte & PG_NX) != 0)
3418					invlva = TRUE;
3419#endif
3420			}
3421			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3422				if ((origpte & PG_MANAGED) != 0)
3423					vm_page_dirty(om);
3424				if ((prot & VM_PROT_WRITE) == 0)
3425					invlva = TRUE;
3426			}
3427			if (invlva)
3428				pmap_invalidate_page(pmap, va);
3429		} else
3430			pte_store(pte, newpte);
3431	}
3432
3433	/*
3434	 * If both the page table page and the reservation are fully
3435	 * populated, then attempt promotion.
3436	 */
3437	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3438	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3439		pmap_promote_pde(pmap, pde, va);
3440
3441	sched_unpin();
3442	vm_page_unlock_queues();
3443	PMAP_UNLOCK(pmap);
3444}
3445
3446/*
3447 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3448 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3449 * blocking, (2) a mapping already exists at the specified virtual address, or
3450 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3451 */
3452static boolean_t
3453pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3454{
3455	pd_entry_t *pde, newpde;
3456
3457	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3458	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3459	pde = pmap_pde(pmap, va);
3460	if (*pde != 0) {
3461		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3462		    " in pmap %p", va, pmap);
3463		return (FALSE);
3464	}
3465	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3466	    PG_PS | PG_V;
3467	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3468		newpde |= PG_MANAGED;
3469
3470		/*
3471		 * Abort this mapping if its PV entry could not be created.
3472		 */
3473		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3474			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3475			    " in pmap %p", va, pmap);
3476			return (FALSE);
3477		}
3478	}
3479#ifdef PAE
3480	if ((prot & VM_PROT_EXECUTE) == 0)
3481		newpde |= pg_nx;
3482#endif
3483	if (va < VM_MAXUSER_ADDRESS)
3484		newpde |= PG_U;
3485
3486	/*
3487	 * Increment counters.
3488	 */
3489	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3490
3491	/*
3492	 * Map the superpage.
3493	 */
3494	pde_store(pde, newpde);
3495
3496	pmap_pde_mappings++;
3497	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3498	    " in pmap %p", va, pmap);
3499	return (TRUE);
3500}
3501
3502/*
3503 * Maps a sequence of resident pages belonging to the same object.
3504 * The sequence begins with the given page m_start.  This page is
3505 * mapped at the given virtual address start.  Each subsequent page is
3506 * mapped at a virtual address that is offset from start by the same
3507 * amount as the page is offset from m_start within the object.  The
3508 * last page in the sequence is the page with the largest offset from
3509 * m_start that can be mapped at a virtual address less than the given
3510 * virtual address end.  Not every virtual page between start and end
3511 * is mapped; only those for which a resident page exists with the
3512 * corresponding offset from m_start are mapped.
3513 */
3514void
3515pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3516    vm_page_t m_start, vm_prot_t prot)
3517{
3518	vm_offset_t va;
3519	vm_page_t m, mpte;
3520	vm_pindex_t diff, psize;
3521
3522	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3523	psize = atop(end - start);
3524	mpte = NULL;
3525	m = m_start;
3526	PMAP_LOCK(pmap);
3527	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3528		va = start + ptoa(diff);
3529		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3530		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3531		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3532		    pmap_enter_pde(pmap, va, m, prot))
3533			m = &m[NBPDR / PAGE_SIZE - 1];
3534		else
3535			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3536			    mpte);
3537		m = TAILQ_NEXT(m, listq);
3538	}
3539 	PMAP_UNLOCK(pmap);
3540}
3541
3542/*
3543 * this code makes some *MAJOR* assumptions:
3544 * 1. Current pmap & pmap exists.
3545 * 2. Not wired.
3546 * 3. Read access.
3547 * 4. No page table pages.
3548 * but is *MUCH* faster than pmap_enter...
3549 */
3550
3551void
3552pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3553{
3554
3555	PMAP_LOCK(pmap);
3556	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3557	PMAP_UNLOCK(pmap);
3558}
3559
3560static vm_page_t
3561pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3562    vm_prot_t prot, vm_page_t mpte)
3563{
3564	pt_entry_t *pte;
3565	vm_paddr_t pa;
3566	vm_page_t free;
3567
3568	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3569	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3570	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3571	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3572	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3573
3574	/*
3575	 * In the case that a page table page is not
3576	 * resident, we are creating it here.
3577	 */
3578	if (va < VM_MAXUSER_ADDRESS) {
3579		unsigned ptepindex;
3580		pd_entry_t ptepa;
3581
3582		/*
3583		 * Calculate pagetable page index
3584		 */
3585		ptepindex = va >> PDRSHIFT;
3586		if (mpte && (mpte->pindex == ptepindex)) {
3587			mpte->wire_count++;
3588		} else {
3589			/*
3590			 * Get the page directory entry
3591			 */
3592			ptepa = pmap->pm_pdir[ptepindex];
3593
3594			/*
3595			 * If the page table page is mapped, we just increment
3596			 * the hold count, and activate it.
3597			 */
3598			if (ptepa) {
3599				if (ptepa & PG_PS)
3600					return (NULL);
3601				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3602				mpte->wire_count++;
3603			} else {
3604				mpte = _pmap_allocpte(pmap, ptepindex,
3605				    M_NOWAIT);
3606				if (mpte == NULL)
3607					return (mpte);
3608			}
3609		}
3610	} else {
3611		mpte = NULL;
3612	}
3613
3614	/*
3615	 * This call to vtopte makes the assumption that we are
3616	 * entering the page into the current pmap.  In order to support
3617	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3618	 * But that isn't as quick as vtopte.
3619	 */
3620	pte = vtopte(va);
3621	if (*pte) {
3622		if (mpte != NULL) {
3623			mpte->wire_count--;
3624			mpte = NULL;
3625		}
3626		return (mpte);
3627	}
3628
3629	/*
3630	 * Enter on the PV list if part of our managed memory.
3631	 */
3632	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3633	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3634		if (mpte != NULL) {
3635			free = NULL;
3636			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3637				pmap_invalidate_page(pmap, va);
3638				pmap_free_zero_pages(free);
3639			}
3640
3641			mpte = NULL;
3642		}
3643		return (mpte);
3644	}
3645
3646	/*
3647	 * Increment counters
3648	 */
3649	pmap->pm_stats.resident_count++;
3650
3651	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3652#ifdef PAE
3653	if ((prot & VM_PROT_EXECUTE) == 0)
3654		pa |= pg_nx;
3655#endif
3656
3657	/*
3658	 * Now validate mapping with RO protection
3659	 */
3660	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3661		pte_store(pte, pa | PG_V | PG_U);
3662	else
3663		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3664	return mpte;
3665}
3666
3667/*
3668 * Make a temporary mapping for a physical address.  This is only intended
3669 * to be used for panic dumps.
3670 */
3671void *
3672pmap_kenter_temporary(vm_paddr_t pa, int i)
3673{
3674	vm_offset_t va;
3675
3676	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3677	pmap_kenter(va, pa);
3678	invlpg(va);
3679	return ((void *)crashdumpmap);
3680}
3681
3682/*
3683 * This code maps large physical mmap regions into the
3684 * processor address space.  Note that some shortcuts
3685 * are taken, but the code works.
3686 */
3687void
3688pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3689    vm_pindex_t pindex, vm_size_t size)
3690{
3691	pd_entry_t *pde;
3692	vm_paddr_t pa, ptepa;
3693	vm_page_t p;
3694	int pat_mode;
3695
3696	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3697	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3698	    ("pmap_object_init_pt: non-device object"));
3699	if (pseflag &&
3700	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3701		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3702			return;
3703		p = vm_page_lookup(object, pindex);
3704		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3705		    ("pmap_object_init_pt: invalid page %p", p));
3706		pat_mode = p->md.pat_mode;
3707
3708		/*
3709		 * Abort the mapping if the first page is not physically
3710		 * aligned to a 2/4MB page boundary.
3711		 */
3712		ptepa = VM_PAGE_TO_PHYS(p);
3713		if (ptepa & (NBPDR - 1))
3714			return;
3715
3716		/*
3717		 * Skip the first page.  Abort the mapping if the rest of
3718		 * the pages are not physically contiguous or have differing
3719		 * memory attributes.
3720		 */
3721		p = TAILQ_NEXT(p, listq);
3722		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3723		    pa += PAGE_SIZE) {
3724			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3725			    ("pmap_object_init_pt: invalid page %p", p));
3726			if (pa != VM_PAGE_TO_PHYS(p) ||
3727			    pat_mode != p->md.pat_mode)
3728				return;
3729			p = TAILQ_NEXT(p, listq);
3730		}
3731
3732		/*
3733		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3734		 * "size" is a multiple of 2/4M, adding the PAT setting to
3735		 * "pa" will not affect the termination of this loop.
3736		 */
3737		PMAP_LOCK(pmap);
3738		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3739		    size; pa += NBPDR) {
3740			pde = pmap_pde(pmap, addr);
3741			if (*pde == 0) {
3742				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3743				    PG_U | PG_RW | PG_V);
3744				pmap->pm_stats.resident_count += NBPDR /
3745				    PAGE_SIZE;
3746				pmap_pde_mappings++;
3747			}
3748			/* Else continue on if the PDE is already valid. */
3749			addr += NBPDR;
3750		}
3751		PMAP_UNLOCK(pmap);
3752	}
3753}
3754
3755/*
3756 *	Routine:	pmap_change_wiring
3757 *	Function:	Change the wiring attribute for a map/virtual-address
3758 *			pair.
3759 *	In/out conditions:
3760 *			The mapping must already exist in the pmap.
3761 */
3762void
3763pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3764{
3765	pd_entry_t *pde;
3766	pt_entry_t *pte;
3767	boolean_t are_queues_locked;
3768
3769	are_queues_locked = FALSE;
3770retry:
3771	PMAP_LOCK(pmap);
3772	pde = pmap_pde(pmap, va);
3773	if ((*pde & PG_PS) != 0) {
3774		if (!wired != ((*pde & PG_W) == 0)) {
3775			if (!are_queues_locked) {
3776				are_queues_locked = TRUE;
3777				if (!mtx_trylock(&vm_page_queue_mtx)) {
3778					PMAP_UNLOCK(pmap);
3779					vm_page_lock_queues();
3780					goto retry;
3781				}
3782			}
3783			if (!pmap_demote_pde(pmap, pde, va))
3784				panic("pmap_change_wiring: demotion failed");
3785		} else
3786			goto out;
3787	}
3788	pte = pmap_pte(pmap, va);
3789
3790	if (wired && !pmap_pte_w(pte))
3791		pmap->pm_stats.wired_count++;
3792	else if (!wired && pmap_pte_w(pte))
3793		pmap->pm_stats.wired_count--;
3794
3795	/*
3796	 * Wiring is not a hardware characteristic so there is no need to
3797	 * invalidate TLB.
3798	 */
3799	pmap_pte_set_w(pte, wired);
3800	pmap_pte_release(pte);
3801out:
3802	if (are_queues_locked)
3803		vm_page_unlock_queues();
3804	PMAP_UNLOCK(pmap);
3805}
3806
3807
3808
3809/*
3810 *	Copy the range specified by src_addr/len
3811 *	from the source map to the range dst_addr/len
3812 *	in the destination map.
3813 *
3814 *	This routine is only advisory and need not do anything.
3815 */
3816
3817void
3818pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3819    vm_offset_t src_addr)
3820{
3821	vm_page_t   free;
3822	vm_offset_t addr;
3823	vm_offset_t end_addr = src_addr + len;
3824	vm_offset_t pdnxt;
3825
3826	if (dst_addr != src_addr)
3827		return;
3828
3829	if (!pmap_is_current(src_pmap))
3830		return;
3831
3832	vm_page_lock_queues();
3833	if (dst_pmap < src_pmap) {
3834		PMAP_LOCK(dst_pmap);
3835		PMAP_LOCK(src_pmap);
3836	} else {
3837		PMAP_LOCK(src_pmap);
3838		PMAP_LOCK(dst_pmap);
3839	}
3840	sched_pin();
3841	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3842		pt_entry_t *src_pte, *dst_pte;
3843		vm_page_t dstmpte, srcmpte;
3844		pd_entry_t srcptepaddr;
3845		unsigned ptepindex;
3846
3847		KASSERT(addr < UPT_MIN_ADDRESS,
3848		    ("pmap_copy: invalid to pmap_copy page tables"));
3849
3850		pdnxt = (addr + NBPDR) & ~PDRMASK;
3851		if (pdnxt < addr)
3852			pdnxt = end_addr;
3853		ptepindex = addr >> PDRSHIFT;
3854
3855		srcptepaddr = src_pmap->pm_pdir[ptepindex];
3856		if (srcptepaddr == 0)
3857			continue;
3858
3859		if (srcptepaddr & PG_PS) {
3860			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3861			    ((srcptepaddr & PG_MANAGED) == 0 ||
3862			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3863			    PG_PS_FRAME))) {
3864				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3865				    ~PG_W;
3866				dst_pmap->pm_stats.resident_count +=
3867				    NBPDR / PAGE_SIZE;
3868			}
3869			continue;
3870		}
3871
3872		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3873		KASSERT(srcmpte->wire_count > 0,
3874		    ("pmap_copy: source page table page is unused"));
3875
3876		if (pdnxt > end_addr)
3877			pdnxt = end_addr;
3878
3879		src_pte = vtopte(addr);
3880		while (addr < pdnxt) {
3881			pt_entry_t ptetemp;
3882			ptetemp = *src_pte;
3883			/*
3884			 * we only virtual copy managed pages
3885			 */
3886			if ((ptetemp & PG_MANAGED) != 0) {
3887				dstmpte = pmap_allocpte(dst_pmap, addr,
3888				    M_NOWAIT);
3889				if (dstmpte == NULL)
3890					goto out;
3891				dst_pte = pmap_pte_quick(dst_pmap, addr);
3892				if (*dst_pte == 0 &&
3893				    pmap_try_insert_pv_entry(dst_pmap, addr,
3894				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3895					/*
3896					 * Clear the wired, modified, and
3897					 * accessed (referenced) bits
3898					 * during the copy.
3899					 */
3900					*dst_pte = ptetemp & ~(PG_W | PG_M |
3901					    PG_A);
3902					dst_pmap->pm_stats.resident_count++;
3903	 			} else {
3904					free = NULL;
3905					if (pmap_unwire_pte_hold(dst_pmap,
3906					    dstmpte, &free)) {
3907						pmap_invalidate_page(dst_pmap,
3908						    addr);
3909						pmap_free_zero_pages(free);
3910					}
3911					goto out;
3912				}
3913				if (dstmpte->wire_count >= srcmpte->wire_count)
3914					break;
3915			}
3916			addr += PAGE_SIZE;
3917			src_pte++;
3918		}
3919	}
3920out:
3921	sched_unpin();
3922	vm_page_unlock_queues();
3923	PMAP_UNLOCK(src_pmap);
3924	PMAP_UNLOCK(dst_pmap);
3925}
3926
3927static __inline void
3928pagezero(void *page)
3929{
3930#if defined(I686_CPU)
3931	if (cpu_class == CPUCLASS_686) {
3932#if defined(CPU_ENABLE_SSE)
3933		if (cpu_feature & CPUID_SSE2)
3934			sse2_pagezero(page);
3935		else
3936#endif
3937			i686_pagezero(page);
3938	} else
3939#endif
3940		bzero(page, PAGE_SIZE);
3941}
3942
3943/*
3944 *	pmap_zero_page zeros the specified hardware page by mapping
3945 *	the page into KVM and using bzero to clear its contents.
3946 */
3947void
3948pmap_zero_page(vm_page_t m)
3949{
3950	struct sysmaps *sysmaps;
3951
3952	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3953	mtx_lock(&sysmaps->lock);
3954	if (*sysmaps->CMAP2)
3955		panic("pmap_zero_page: CMAP2 busy");
3956	sched_pin();
3957	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3958	    pmap_cache_bits(m->md.pat_mode, 0);
3959	invlcaddr(sysmaps->CADDR2);
3960	pagezero(sysmaps->CADDR2);
3961	*sysmaps->CMAP2 = 0;
3962	sched_unpin();
3963	mtx_unlock(&sysmaps->lock);
3964}
3965
3966/*
3967 *	pmap_zero_page_area zeros the specified hardware page by mapping
3968 *	the page into KVM and using bzero to clear its contents.
3969 *
3970 *	off and size may not cover an area beyond a single hardware page.
3971 */
3972void
3973pmap_zero_page_area(vm_page_t m, int off, int size)
3974{
3975	struct sysmaps *sysmaps;
3976
3977	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3978	mtx_lock(&sysmaps->lock);
3979	if (*sysmaps->CMAP2)
3980		panic("pmap_zero_page_area: CMAP2 busy");
3981	sched_pin();
3982	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3983	    pmap_cache_bits(m->md.pat_mode, 0);
3984	invlcaddr(sysmaps->CADDR2);
3985	if (off == 0 && size == PAGE_SIZE)
3986		pagezero(sysmaps->CADDR2);
3987	else
3988		bzero((char *)sysmaps->CADDR2 + off, size);
3989	*sysmaps->CMAP2 = 0;
3990	sched_unpin();
3991	mtx_unlock(&sysmaps->lock);
3992}
3993
3994/*
3995 *	pmap_zero_page_idle zeros the specified hardware page by mapping
3996 *	the page into KVM and using bzero to clear its contents.  This
3997 *	is intended to be called from the vm_pagezero process only and
3998 *	outside of Giant.
3999 */
4000void
4001pmap_zero_page_idle(vm_page_t m)
4002{
4003
4004	if (*CMAP3)
4005		panic("pmap_zero_page_idle: CMAP3 busy");
4006	sched_pin();
4007	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4008	    pmap_cache_bits(m->md.pat_mode, 0);
4009	invlcaddr(CADDR3);
4010	pagezero(CADDR3);
4011	*CMAP3 = 0;
4012	sched_unpin();
4013}
4014
4015/*
4016 *	pmap_copy_page copies the specified (machine independent)
4017 *	page by mapping the page into virtual memory and using
4018 *	bcopy to copy the page, one machine dependent page at a
4019 *	time.
4020 */
4021void
4022pmap_copy_page(vm_page_t src, vm_page_t dst)
4023{
4024	struct sysmaps *sysmaps;
4025
4026	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4027	mtx_lock(&sysmaps->lock);
4028	if (*sysmaps->CMAP1)
4029		panic("pmap_copy_page: CMAP1 busy");
4030	if (*sysmaps->CMAP2)
4031		panic("pmap_copy_page: CMAP2 busy");
4032	sched_pin();
4033	invlpg((u_int)sysmaps->CADDR1);
4034	invlpg((u_int)sysmaps->CADDR2);
4035	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4036	    pmap_cache_bits(src->md.pat_mode, 0);
4037	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4038	    pmap_cache_bits(dst->md.pat_mode, 0);
4039	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4040	*sysmaps->CMAP1 = 0;
4041	*sysmaps->CMAP2 = 0;
4042	sched_unpin();
4043	mtx_unlock(&sysmaps->lock);
4044}
4045
4046/*
4047 * Returns true if the pmap's pv is one of the first
4048 * 16 pvs linked to from this page.  This count may
4049 * be changed upwards or downwards in the future; it
4050 * is only necessary that true be returned for a small
4051 * subset of pmaps for proper page aging.
4052 */
4053boolean_t
4054pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4055{
4056	struct md_page *pvh;
4057	pv_entry_t pv;
4058	int loops = 0;
4059
4060	if (m->flags & PG_FICTITIOUS)
4061		return FALSE;
4062
4063	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4064	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4065		if (PV_PMAP(pv) == pmap) {
4066			return TRUE;
4067		}
4068		loops++;
4069		if (loops >= 16)
4070			break;
4071	}
4072	if (loops < 16) {
4073		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4074		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4075			if (PV_PMAP(pv) == pmap)
4076				return (TRUE);
4077			loops++;
4078			if (loops >= 16)
4079				break;
4080		}
4081	}
4082	return (FALSE);
4083}
4084
4085/*
4086 *	pmap_page_wired_mappings:
4087 *
4088 *	Return the number of managed mappings to the given physical page
4089 *	that are wired.
4090 */
4091int
4092pmap_page_wired_mappings(vm_page_t m)
4093{
4094	int count;
4095
4096	count = 0;
4097	if ((m->flags & PG_FICTITIOUS) != 0)
4098		return (count);
4099	count = pmap_pvh_wired_mappings(&m->md, count);
4100	return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
4101}
4102
4103/*
4104 *	pmap_pvh_wired_mappings:
4105 *
4106 *	Return the updated number "count" of managed mappings that are wired.
4107 */
4108static int
4109pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4110{
4111	pmap_t pmap;
4112	pt_entry_t *pte;
4113	pv_entry_t pv;
4114
4115	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4116	sched_pin();
4117	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4118		pmap = PV_PMAP(pv);
4119		PMAP_LOCK(pmap);
4120		pte = pmap_pte_quick(pmap, pv->pv_va);
4121		if ((*pte & PG_W) != 0)
4122			count++;
4123		PMAP_UNLOCK(pmap);
4124	}
4125	sched_unpin();
4126	return (count);
4127}
4128
4129/*
4130 * Returns TRUE if the given page is mapped individually or as part of
4131 * a 4mpage.  Otherwise, returns FALSE.
4132 */
4133boolean_t
4134pmap_page_is_mapped(vm_page_t m)
4135{
4136	struct md_page *pvh;
4137
4138	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
4139		return (FALSE);
4140	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4141	if (TAILQ_EMPTY(&m->md.pv_list)) {
4142		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4143		return (!TAILQ_EMPTY(&pvh->pv_list));
4144	} else
4145		return (TRUE);
4146}
4147
4148/*
4149 * Remove all pages from specified address space
4150 * this aids process exit speeds.  Also, this code
4151 * is special cased for current process only, but
4152 * can have the more generic (and slightly slower)
4153 * mode enabled.  This is much faster than pmap_remove
4154 * in the case of running down an entire address space.
4155 */
4156void
4157pmap_remove_pages(pmap_t pmap)
4158{
4159	pt_entry_t *pte, tpte;
4160	vm_page_t free = NULL;
4161	vm_page_t m, mpte, mt;
4162	pv_entry_t pv;
4163	struct md_page *pvh;
4164	struct pv_chunk *pc, *npc;
4165	int field, idx;
4166	int32_t bit;
4167	uint32_t inuse, bitmask;
4168	int allfree;
4169
4170	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
4171		printf("warning: pmap_remove_pages called with non-current pmap\n");
4172		return;
4173	}
4174	vm_page_lock_queues();
4175	PMAP_LOCK(pmap);
4176	sched_pin();
4177	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4178		allfree = 1;
4179		for (field = 0; field < _NPCM; field++) {
4180			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4181			while (inuse != 0) {
4182				bit = bsfl(inuse);
4183				bitmask = 1UL << bit;
4184				idx = field * 32 + bit;
4185				pv = &pc->pc_pventry[idx];
4186				inuse &= ~bitmask;
4187
4188				pte = pmap_pde(pmap, pv->pv_va);
4189				tpte = *pte;
4190				if ((tpte & PG_PS) == 0) {
4191					pte = vtopte(pv->pv_va);
4192					tpte = *pte & ~PG_PTE_PAT;
4193				}
4194
4195				if (tpte == 0) {
4196					printf(
4197					    "TPTE at %p  IS ZERO @ VA %08x\n",
4198					    pte, pv->pv_va);
4199					panic("bad pte");
4200				}
4201
4202/*
4203 * We cannot remove wired pages from a process' mapping at this time
4204 */
4205				if (tpte & PG_W) {
4206					allfree = 0;
4207					continue;
4208				}
4209
4210				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4211				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4212				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4213				    m, (uintmax_t)m->phys_addr,
4214				    (uintmax_t)tpte));
4215
4216				KASSERT(m < &vm_page_array[vm_page_array_size],
4217					("pmap_remove_pages: bad tpte %#jx",
4218					(uintmax_t)tpte));
4219
4220				pte_clear(pte);
4221
4222				/*
4223				 * Update the vm_page_t clean/reference bits.
4224				 */
4225				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4226					if ((tpte & PG_PS) != 0) {
4227						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4228							vm_page_dirty(mt);
4229					} else
4230						vm_page_dirty(m);
4231				}
4232
4233				/* Mark free */
4234				PV_STAT(pv_entry_frees++);
4235				PV_STAT(pv_entry_spare++);
4236				pv_entry_count--;
4237				pc->pc_map[field] |= bitmask;
4238				if ((tpte & PG_PS) != 0) {
4239					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4240					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4241					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4242					if (TAILQ_EMPTY(&pvh->pv_list)) {
4243						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4244							if (TAILQ_EMPTY(&mt->md.pv_list))
4245								vm_page_flag_clear(mt, PG_WRITEABLE);
4246					}
4247					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4248					if (mpte != NULL) {
4249						pmap_remove_pt_page(pmap, mpte);
4250						pmap->pm_stats.resident_count--;
4251						KASSERT(mpte->wire_count == NPTEPG,
4252						    ("pmap_remove_pages: pte page wire count error"));
4253						mpte->wire_count = 0;
4254						pmap_add_delayed_free_list(mpte, &free, FALSE);
4255						atomic_subtract_int(&cnt.v_wire_count, 1);
4256					}
4257				} else {
4258					pmap->pm_stats.resident_count--;
4259					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4260					if (TAILQ_EMPTY(&m->md.pv_list)) {
4261						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4262						if (TAILQ_EMPTY(&pvh->pv_list))
4263							vm_page_flag_clear(m, PG_WRITEABLE);
4264					}
4265					pmap_unuse_pt(pmap, pv->pv_va, &free);
4266				}
4267			}
4268		}
4269		if (allfree) {
4270			PV_STAT(pv_entry_spare -= _NPCPV);
4271			PV_STAT(pc_chunk_count--);
4272			PV_STAT(pc_chunk_frees++);
4273			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4274			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4275			pmap_qremove((vm_offset_t)pc, 1);
4276			vm_page_unwire(m, 0);
4277			vm_page_free(m);
4278			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4279		}
4280	}
4281	sched_unpin();
4282	pmap_invalidate_all(pmap);
4283	vm_page_unlock_queues();
4284	PMAP_UNLOCK(pmap);
4285	pmap_free_zero_pages(free);
4286}
4287
4288/*
4289 *	pmap_is_modified:
4290 *
4291 *	Return whether or not the specified physical page was modified
4292 *	in any physical maps.
4293 */
4294boolean_t
4295pmap_is_modified(vm_page_t m)
4296{
4297
4298	if (m->flags & PG_FICTITIOUS)
4299		return (FALSE);
4300	if (pmap_is_modified_pvh(&m->md))
4301		return (TRUE);
4302	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4303}
4304
4305/*
4306 * Returns TRUE if any of the given mappings were used to modify
4307 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4308 * mappings are supported.
4309 */
4310static boolean_t
4311pmap_is_modified_pvh(struct md_page *pvh)
4312{
4313	pv_entry_t pv;
4314	pt_entry_t *pte;
4315	pmap_t pmap;
4316	boolean_t rv;
4317
4318	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4319	rv = FALSE;
4320	sched_pin();
4321	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4322		pmap = PV_PMAP(pv);
4323		PMAP_LOCK(pmap);
4324		pte = pmap_pte_quick(pmap, pv->pv_va);
4325		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4326		PMAP_UNLOCK(pmap);
4327		if (rv)
4328			break;
4329	}
4330	sched_unpin();
4331	return (rv);
4332}
4333
4334/*
4335 *	pmap_is_prefaultable:
4336 *
4337 *	Return whether or not the specified virtual address is elgible
4338 *	for prefault.
4339 */
4340boolean_t
4341pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4342{
4343	pd_entry_t *pde;
4344	pt_entry_t *pte;
4345	boolean_t rv;
4346
4347	rv = FALSE;
4348	PMAP_LOCK(pmap);
4349	pde = pmap_pde(pmap, addr);
4350	if (*pde != 0 && (*pde & PG_PS) == 0) {
4351		pte = vtopte(addr);
4352		rv = *pte == 0;
4353	}
4354	PMAP_UNLOCK(pmap);
4355	return (rv);
4356}
4357
4358/*
4359 * Clear the write and modified bits in each of the given page's mappings.
4360 */
4361void
4362pmap_remove_write(vm_page_t m)
4363{
4364	struct md_page *pvh;
4365	pv_entry_t next_pv, pv;
4366	pmap_t pmap;
4367	pd_entry_t *pde;
4368	pt_entry_t oldpte, *pte;
4369	vm_offset_t va;
4370
4371	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4372	if ((m->flags & PG_FICTITIOUS) != 0 ||
4373	    (m->flags & PG_WRITEABLE) == 0)
4374		return;
4375	sched_pin();
4376	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4377	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4378		va = pv->pv_va;
4379		pmap = PV_PMAP(pv);
4380		PMAP_LOCK(pmap);
4381		pde = pmap_pde(pmap, va);
4382		if ((*pde & PG_RW) != 0)
4383			(void)pmap_demote_pde(pmap, pde, va);
4384		PMAP_UNLOCK(pmap);
4385	}
4386	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4387		pmap = PV_PMAP(pv);
4388		PMAP_LOCK(pmap);
4389		pde = pmap_pde(pmap, pv->pv_va);
4390		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
4391		    " a 4mpage in page %p's pv list", m));
4392		pte = pmap_pte_quick(pmap, pv->pv_va);
4393retry:
4394		oldpte = *pte;
4395		if ((oldpte & PG_RW) != 0) {
4396			/*
4397			 * Regardless of whether a pte is 32 or 64 bits
4398			 * in size, PG_RW and PG_M are among the least
4399			 * significant 32 bits.
4400			 */
4401			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4402			    oldpte & ~(PG_RW | PG_M)))
4403				goto retry;
4404			if ((oldpte & PG_M) != 0)
4405				vm_page_dirty(m);
4406			pmap_invalidate_page(pmap, pv->pv_va);
4407		}
4408		PMAP_UNLOCK(pmap);
4409	}
4410	vm_page_flag_clear(m, PG_WRITEABLE);
4411	sched_unpin();
4412}
4413
4414/*
4415 *	pmap_ts_referenced:
4416 *
4417 *	Return a count of reference bits for a page, clearing those bits.
4418 *	It is not necessary for every reference bit to be cleared, but it
4419 *	is necessary that 0 only be returned when there are truly no
4420 *	reference bits set.
4421 *
4422 *	XXX: The exact number of bits to check and clear is a matter that
4423 *	should be tested and standardized at some point in the future for
4424 *	optimal aging of shared pages.
4425 */
4426int
4427pmap_ts_referenced(vm_page_t m)
4428{
4429	struct md_page *pvh;
4430	pv_entry_t pv, pvf, pvn;
4431	pmap_t pmap;
4432	pd_entry_t oldpde, *pde;
4433	pt_entry_t *pte;
4434	vm_offset_t va;
4435	int rtval = 0;
4436
4437	if (m->flags & PG_FICTITIOUS)
4438		return (rtval);
4439	sched_pin();
4440	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4441	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4442	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4443		va = pv->pv_va;
4444		pmap = PV_PMAP(pv);
4445		PMAP_LOCK(pmap);
4446		pde = pmap_pde(pmap, va);
4447		oldpde = *pde;
4448		if ((oldpde & PG_A) != 0) {
4449			if (pmap_demote_pde(pmap, pde, va)) {
4450				if ((oldpde & PG_W) == 0) {
4451					/*
4452					 * Remove the mapping to a single page
4453					 * so that a subsequent access may
4454					 * repromote.  Since the underlying
4455					 * page table page is fully populated,
4456					 * this removal never frees a page
4457					 * table page.
4458					 */
4459					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4460					    PG_PS_FRAME);
4461					pmap_remove_page(pmap, va, NULL);
4462					rtval++;
4463					if (rtval > 4) {
4464						PMAP_UNLOCK(pmap);
4465						return (rtval);
4466					}
4467				}
4468			}
4469		}
4470		PMAP_UNLOCK(pmap);
4471	}
4472	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4473		pvf = pv;
4474		do {
4475			pvn = TAILQ_NEXT(pv, pv_list);
4476			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4477			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4478			pmap = PV_PMAP(pv);
4479			PMAP_LOCK(pmap);
4480			pde = pmap_pde(pmap, pv->pv_va);
4481			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4482			    " found a 4mpage in page %p's pv list", m));
4483			pte = pmap_pte_quick(pmap, pv->pv_va);
4484			if ((*pte & PG_A) != 0) {
4485				atomic_clear_int((u_int *)pte, PG_A);
4486				pmap_invalidate_page(pmap, pv->pv_va);
4487				rtval++;
4488				if (rtval > 4)
4489					pvn = NULL;
4490			}
4491			PMAP_UNLOCK(pmap);
4492		} while ((pv = pvn) != NULL && pv != pvf);
4493	}
4494	sched_unpin();
4495	return (rtval);
4496}
4497
4498/*
4499 *	Clear the modify bits on the specified physical page.
4500 */
4501void
4502pmap_clear_modify(vm_page_t m)
4503{
4504	struct md_page *pvh;
4505	pv_entry_t next_pv, pv;
4506	pmap_t pmap;
4507	pd_entry_t oldpde, *pde;
4508	pt_entry_t oldpte, *pte;
4509	vm_offset_t va;
4510
4511	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4512	if ((m->flags & PG_FICTITIOUS) != 0)
4513		return;
4514	sched_pin();
4515	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4516	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4517		va = pv->pv_va;
4518		pmap = PV_PMAP(pv);
4519		PMAP_LOCK(pmap);
4520		pde = pmap_pde(pmap, va);
4521		oldpde = *pde;
4522		if ((oldpde & PG_RW) != 0) {
4523			if (pmap_demote_pde(pmap, pde, va)) {
4524				if ((oldpde & PG_W) == 0) {
4525					/*
4526					 * Write protect the mapping to a
4527					 * single page so that a subsequent
4528					 * write access may repromote.
4529					 */
4530					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4531					    PG_PS_FRAME);
4532					pte = pmap_pte_quick(pmap, va);
4533					oldpte = *pte;
4534					if ((oldpte & PG_V) != 0) {
4535						/*
4536						 * Regardless of whether a pte is 32 or 64 bits
4537						 * in size, PG_RW and PG_M are among the least
4538						 * significant 32 bits.
4539						 */
4540						while (!atomic_cmpset_int((u_int *)pte,
4541						    oldpte,
4542						    oldpte & ~(PG_M | PG_RW)))
4543							oldpte = *pte;
4544						vm_page_dirty(m);
4545						pmap_invalidate_page(pmap, va);
4546					}
4547				}
4548			}
4549		}
4550		PMAP_UNLOCK(pmap);
4551	}
4552	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4553		pmap = PV_PMAP(pv);
4554		PMAP_LOCK(pmap);
4555		pde = pmap_pde(pmap, pv->pv_va);
4556		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4557		    " a 4mpage in page %p's pv list", m));
4558		pte = pmap_pte_quick(pmap, pv->pv_va);
4559		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4560			/*
4561			 * Regardless of whether a pte is 32 or 64 bits
4562			 * in size, PG_M is among the least significant
4563			 * 32 bits.
4564			 */
4565			atomic_clear_int((u_int *)pte, PG_M);
4566			pmap_invalidate_page(pmap, pv->pv_va);
4567		}
4568		PMAP_UNLOCK(pmap);
4569	}
4570	sched_unpin();
4571}
4572
4573/*
4574 *	pmap_clear_reference:
4575 *
4576 *	Clear the reference bit on the specified physical page.
4577 */
4578void
4579pmap_clear_reference(vm_page_t m)
4580{
4581	struct md_page *pvh;
4582	pv_entry_t next_pv, pv;
4583	pmap_t pmap;
4584	pd_entry_t oldpde, *pde;
4585	pt_entry_t *pte;
4586	vm_offset_t va;
4587
4588	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4589	if ((m->flags & PG_FICTITIOUS) != 0)
4590		return;
4591	sched_pin();
4592	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4593	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4594		va = pv->pv_va;
4595		pmap = PV_PMAP(pv);
4596		PMAP_LOCK(pmap);
4597		pde = pmap_pde(pmap, va);
4598		oldpde = *pde;
4599		if ((oldpde & PG_A) != 0) {
4600			if (pmap_demote_pde(pmap, pde, va)) {
4601				/*
4602				 * Remove the mapping to a single page so
4603				 * that a subsequent access may repromote.
4604				 * Since the underlying page table page is
4605				 * fully populated, this removal never frees
4606				 * a page table page.
4607				 */
4608				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4609				    PG_PS_FRAME);
4610				pmap_remove_page(pmap, va, NULL);
4611			}
4612		}
4613		PMAP_UNLOCK(pmap);
4614	}
4615	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4616		pmap = PV_PMAP(pv);
4617		PMAP_LOCK(pmap);
4618		pde = pmap_pde(pmap, pv->pv_va);
4619		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4620		    " a 4mpage in page %p's pv list", m));
4621		pte = pmap_pte_quick(pmap, pv->pv_va);
4622		if ((*pte & PG_A) != 0) {
4623			/*
4624			 * Regardless of whether a pte is 32 or 64 bits
4625			 * in size, PG_A is among the least significant
4626			 * 32 bits.
4627			 */
4628			atomic_clear_int((u_int *)pte, PG_A);
4629			pmap_invalidate_page(pmap, pv->pv_va);
4630		}
4631		PMAP_UNLOCK(pmap);
4632	}
4633	sched_unpin();
4634}
4635
4636/*
4637 * Miscellaneous support routines follow
4638 */
4639
4640/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4641static __inline void
4642pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4643{
4644	u_int opte, npte;
4645
4646	/*
4647	 * The cache mode bits are all in the low 32-bits of the
4648	 * PTE, so we can just spin on updating the low 32-bits.
4649	 */
4650	do {
4651		opte = *(u_int *)pte;
4652		npte = opte & ~PG_PTE_CACHE;
4653		npte |= cache_bits;
4654	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4655}
4656
4657/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4658static __inline void
4659pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4660{
4661	u_int opde, npde;
4662
4663	/*
4664	 * The cache mode bits are all in the low 32-bits of the
4665	 * PDE, so we can just spin on updating the low 32-bits.
4666	 */
4667	do {
4668		opde = *(u_int *)pde;
4669		npde = opde & ~PG_PDE_CACHE;
4670		npde |= cache_bits;
4671	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4672}
4673
4674/*
4675 * Map a set of physical memory pages into the kernel virtual
4676 * address space. Return a pointer to where it is mapped. This
4677 * routine is intended to be used for mapping device memory,
4678 * NOT real memory.
4679 */
4680void *
4681pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4682{
4683	vm_offset_t va, offset;
4684	vm_size_t tmpsize;
4685
4686	offset = pa & PAGE_MASK;
4687	size = roundup(offset + size, PAGE_SIZE);
4688	pa = pa & PG_FRAME;
4689
4690	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4691		va = KERNBASE + pa;
4692	else
4693		va = kmem_alloc_nofault(kernel_map, size);
4694	if (!va)
4695		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4696
4697	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4698		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4699	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4700	pmap_invalidate_cache_range(va, va + size);
4701	return ((void *)(va + offset));
4702}
4703
4704void *
4705pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4706{
4707
4708	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4709}
4710
4711void *
4712pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4713{
4714
4715	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4716}
4717
4718void
4719pmap_unmapdev(vm_offset_t va, vm_size_t size)
4720{
4721	vm_offset_t base, offset, tmpva;
4722
4723	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4724		return;
4725	base = trunc_page(va);
4726	offset = va & PAGE_MASK;
4727	size = roundup(offset + size, PAGE_SIZE);
4728	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4729		pmap_kremove(tmpva);
4730	pmap_invalidate_range(kernel_pmap, va, tmpva);
4731	kmem_free(kernel_map, base, size);
4732}
4733
4734/*
4735 * Sets the memory attribute for the specified page.
4736 */
4737void
4738pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4739{
4740	struct sysmaps *sysmaps;
4741	vm_offset_t sva, eva;
4742
4743	m->md.pat_mode = ma;
4744	if ((m->flags & PG_FICTITIOUS) != 0)
4745		return;
4746
4747	/*
4748	 * If "m" is a normal page, flush it from the cache.
4749	 * See pmap_invalidate_cache_range().
4750	 *
4751	 * First, try to find an existing mapping of the page by sf
4752	 * buffer. sf_buf_invalidate_cache() modifies mapping and
4753	 * flushes the cache.
4754	 */
4755	if (sf_buf_invalidate_cache(m))
4756		return;
4757
4758	/*
4759	 * If page is not mapped by sf buffer, but CPU does not
4760	 * support self snoop, map the page transient and do
4761	 * invalidation. In the worst case, whole cache is flushed by
4762	 * pmap_invalidate_cache_range().
4763	 */
4764	if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) {
4765		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4766		mtx_lock(&sysmaps->lock);
4767		if (*sysmaps->CMAP2)
4768			panic("pmap_page_set_memattr: CMAP2 busy");
4769		sched_pin();
4770		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4771		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4772		invlcaddr(sysmaps->CADDR2);
4773		sva = (vm_offset_t)sysmaps->CADDR2;
4774		eva = sva + PAGE_SIZE;
4775	} else
4776		sva = eva = 0; /* gcc */
4777	pmap_invalidate_cache_range(sva, eva);
4778	if (sva != 0) {
4779		*sysmaps->CMAP2 = 0;
4780		sched_unpin();
4781		mtx_unlock(&sysmaps->lock);
4782	}
4783}
4784
4785/*
4786 * Changes the specified virtual address range's memory type to that given by
4787 * the parameter "mode".  The specified virtual address range must be
4788 * completely contained within either the kernel map.
4789 *
4790 * Returns zero if the change completed successfully, and either EINVAL or
4791 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4792 * of the virtual address range was not mapped, and ENOMEM is returned if
4793 * there was insufficient memory available to complete the change.
4794 */
4795int
4796pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4797{
4798	vm_offset_t base, offset, tmpva;
4799	pd_entry_t *pde;
4800	pt_entry_t *pte;
4801	int cache_bits_pte, cache_bits_pde;
4802	boolean_t changed;
4803
4804	base = trunc_page(va);
4805	offset = va & PAGE_MASK;
4806	size = roundup(offset + size, PAGE_SIZE);
4807
4808	/*
4809	 * Only supported on kernel virtual addresses above the recursive map.
4810	 */
4811	if (base < VM_MIN_KERNEL_ADDRESS)
4812		return (EINVAL);
4813
4814	cache_bits_pde = pmap_cache_bits(mode, 1);
4815	cache_bits_pte = pmap_cache_bits(mode, 0);
4816	changed = FALSE;
4817
4818	/*
4819	 * Pages that aren't mapped aren't supported.  Also break down
4820	 * 2/4MB pages into 4KB pages if required.
4821	 */
4822	PMAP_LOCK(kernel_pmap);
4823	for (tmpva = base; tmpva < base + size; ) {
4824		pde = pmap_pde(kernel_pmap, tmpva);
4825		if (*pde == 0) {
4826			PMAP_UNLOCK(kernel_pmap);
4827			return (EINVAL);
4828		}
4829		if (*pde & PG_PS) {
4830			/*
4831			 * If the current 2/4MB page already has
4832			 * the required memory type, then we need not
4833			 * demote this page.  Just increment tmpva to
4834			 * the next 2/4MB page frame.
4835			 */
4836			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4837				tmpva = trunc_4mpage(tmpva) + NBPDR;
4838				continue;
4839			}
4840
4841			/*
4842			 * If the current offset aligns with a 2/4MB
4843			 * page frame and there is at least 2/4MB left
4844			 * within the range, then we need not break
4845			 * down this page into 4KB pages.
4846			 */
4847			if ((tmpva & PDRMASK) == 0 &&
4848			    tmpva + PDRMASK < base + size) {
4849				tmpva += NBPDR;
4850				continue;
4851			}
4852			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
4853				PMAP_UNLOCK(kernel_pmap);
4854				return (ENOMEM);
4855			}
4856		}
4857		pte = vtopte(tmpva);
4858		if (*pte == 0) {
4859			PMAP_UNLOCK(kernel_pmap);
4860			return (EINVAL);
4861		}
4862		tmpva += PAGE_SIZE;
4863	}
4864	PMAP_UNLOCK(kernel_pmap);
4865
4866	/*
4867	 * Ok, all the pages exist, so run through them updating their
4868	 * cache mode if required.
4869	 */
4870	for (tmpva = base; tmpva < base + size; ) {
4871		pde = pmap_pde(kernel_pmap, tmpva);
4872		if (*pde & PG_PS) {
4873			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4874				pmap_pde_attr(pde, cache_bits_pde);
4875				changed = TRUE;
4876			}
4877			tmpva = trunc_4mpage(tmpva) + NBPDR;
4878		} else {
4879			pte = vtopte(tmpva);
4880			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4881				pmap_pte_attr(pte, cache_bits_pte);
4882				changed = TRUE;
4883			}
4884			tmpva += PAGE_SIZE;
4885		}
4886	}
4887
4888	/*
4889	 * Flush CPU caches to make sure any data isn't cached that
4890	 * shouldn't be, etc.
4891	 */
4892	if (changed) {
4893		pmap_invalidate_range(kernel_pmap, base, tmpva);
4894		pmap_invalidate_cache_range(base, tmpva);
4895	}
4896	return (0);
4897}
4898
4899/*
4900 * perform the pmap work for mincore
4901 */
4902int
4903pmap_mincore(pmap_t pmap, vm_offset_t addr)
4904{
4905	pd_entry_t *pdep;
4906	pt_entry_t *ptep, pte;
4907	vm_paddr_t pa;
4908	vm_page_t m;
4909	int val = 0;
4910
4911	PMAP_LOCK(pmap);
4912	pdep = pmap_pde(pmap, addr);
4913	if (*pdep != 0) {
4914		if (*pdep & PG_PS) {
4915			pte = *pdep;
4916			val = MINCORE_SUPER;
4917			/* Compute the physical address of the 4KB page. */
4918			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4919			    PG_FRAME;
4920		} else {
4921			ptep = pmap_pte(pmap, addr);
4922			pte = *ptep;
4923			pmap_pte_release(ptep);
4924			pa = pte & PG_FRAME;
4925		}
4926	} else {
4927		pte = 0;
4928		pa = 0;
4929	}
4930	PMAP_UNLOCK(pmap);
4931
4932	if (pte != 0) {
4933		val |= MINCORE_INCORE;
4934		if ((pte & PG_MANAGED) == 0)
4935			return val;
4936
4937		m = PHYS_TO_VM_PAGE(pa);
4938
4939		/*
4940		 * Modified by us
4941		 */
4942		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4943			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4944		else {
4945			/*
4946			 * Modified by someone else
4947			 */
4948			vm_page_lock_queues();
4949			if (m->dirty || pmap_is_modified(m))
4950				val |= MINCORE_MODIFIED_OTHER;
4951			vm_page_unlock_queues();
4952		}
4953		/*
4954		 * Referenced by us
4955		 */
4956		if (pte & PG_A)
4957			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4958		else {
4959			/*
4960			 * Referenced by someone else
4961			 */
4962			vm_page_lock_queues();
4963			if ((m->flags & PG_REFERENCED) ||
4964			    pmap_ts_referenced(m)) {
4965				val |= MINCORE_REFERENCED_OTHER;
4966				vm_page_flag_set(m, PG_REFERENCED);
4967			}
4968			vm_page_unlock_queues();
4969		}
4970	}
4971	return val;
4972}
4973
4974void
4975pmap_activate(struct thread *td)
4976{
4977	pmap_t	pmap, oldpmap;
4978	u_int32_t  cr3;
4979
4980	critical_enter();
4981	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4982	oldpmap = PCPU_GET(curpmap);
4983#if defined(SMP)
4984	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4985	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4986#else
4987	oldpmap->pm_active &= ~1;
4988	pmap->pm_active |= 1;
4989#endif
4990#ifdef PAE
4991	cr3 = vtophys(pmap->pm_pdpt);
4992#else
4993	cr3 = vtophys(pmap->pm_pdir);
4994#endif
4995	/*
4996	 * pmap_activate is for the current thread on the current cpu
4997	 */
4998	td->td_pcb->pcb_cr3 = cr3;
4999	load_cr3(cr3);
5000	PCPU_SET(curpmap, pmap);
5001	critical_exit();
5002}
5003
5004void
5005pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5006{
5007}
5008
5009/*
5010 *	Increase the starting virtual address of the given mapping if a
5011 *	different alignment might result in more superpage mappings.
5012 */
5013void
5014pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5015    vm_offset_t *addr, vm_size_t size)
5016{
5017	vm_offset_t superpage_offset;
5018
5019	if (size < NBPDR)
5020		return;
5021	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5022		offset += ptoa(object->pg_color);
5023	superpage_offset = offset & PDRMASK;
5024	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5025	    (*addr & PDRMASK) == superpage_offset)
5026		return;
5027	if ((*addr & PDRMASK) < superpage_offset)
5028		*addr = (*addr & ~PDRMASK) + superpage_offset;
5029	else
5030		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5031}
5032
5033
5034#if defined(PMAP_DEBUG)
5035pmap_pid_dump(int pid)
5036{
5037	pmap_t pmap;
5038	struct proc *p;
5039	int npte = 0;
5040	int index;
5041
5042	sx_slock(&allproc_lock);
5043	FOREACH_PROC_IN_SYSTEM(p) {
5044		if (p->p_pid != pid)
5045			continue;
5046
5047		if (p->p_vmspace) {
5048			int i,j;
5049			index = 0;
5050			pmap = vmspace_pmap(p->p_vmspace);
5051			for (i = 0; i < NPDEPTD; i++) {
5052				pd_entry_t *pde;
5053				pt_entry_t *pte;
5054				vm_offset_t base = i << PDRSHIFT;
5055
5056				pde = &pmap->pm_pdir[i];
5057				if (pde && pmap_pde_v(pde)) {
5058					for (j = 0; j < NPTEPG; j++) {
5059						vm_offset_t va = base + (j << PAGE_SHIFT);
5060						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5061							if (index) {
5062								index = 0;
5063								printf("\n");
5064							}
5065							sx_sunlock(&allproc_lock);
5066							return npte;
5067						}
5068						pte = pmap_pte(pmap, va);
5069						if (pte && pmap_pte_v(pte)) {
5070							pt_entry_t pa;
5071							vm_page_t m;
5072							pa = *pte;
5073							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5074							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5075								va, pa, m->hold_count, m->wire_count, m->flags);
5076							npte++;
5077							index++;
5078							if (index >= 2) {
5079								index = 0;
5080								printf("\n");
5081							} else {
5082								printf(" ");
5083							}
5084						}
5085					}
5086				}
5087			}
5088		}
5089	}
5090	sx_sunlock(&allproc_lock);
5091	return npte;
5092}
5093#endif
5094
5095#if defined(DEBUG)
5096
5097static void	pads(pmap_t pm);
5098void		pmap_pvdump(vm_offset_t pa);
5099
5100/* print address space of pmap*/
5101static void
5102pads(pmap_t pm)
5103{
5104	int i, j;
5105	vm_paddr_t va;
5106	pt_entry_t *ptep;
5107
5108	if (pm == kernel_pmap)
5109		return;
5110	for (i = 0; i < NPDEPTD; i++)
5111		if (pm->pm_pdir[i])
5112			for (j = 0; j < NPTEPG; j++) {
5113				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5114				if (pm == kernel_pmap && va < KERNBASE)
5115					continue;
5116				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5117					continue;
5118				ptep = pmap_pte(pm, va);
5119				if (pmap_pte_v(ptep))
5120					printf("%x:%x ", va, *ptep);
5121			};
5122
5123}
5124
5125void
5126pmap_pvdump(vm_paddr_t pa)
5127{
5128	pv_entry_t pv;
5129	pmap_t pmap;
5130	vm_page_t m;
5131
5132	printf("pa %x", pa);
5133	m = PHYS_TO_VM_PAGE(pa);
5134	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5135		pmap = PV_PMAP(pv);
5136		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5137		pads(pmap);
5138	}
5139	printf(" ");
5140}
5141#endif
5142