1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 224746 2011-08-09 21:01:36Z kib $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidation or protection-reduction
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_cpu.h"
107#include "opt_pmap.h"
108#include "opt_smp.h"
109#include "opt_xbox.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/kernel.h>
114#include <sys/ktr.h>
115#include <sys/lock.h>
116#include <sys/malloc.h>
117#include <sys/mman.h>
118#include <sys/msgbuf.h>
119#include <sys/mutex.h>
120#include <sys/proc.h>
121#include <sys/sf_buf.h>
122#include <sys/sx.h>
123#include <sys/vmmeter.h>
124#include <sys/sched.h>
125#include <sys/sysctl.h>
126#ifdef SMP
127#include <sys/smp.h>
128#else
129#include <sys/cpuset.h>
130#endif
131
132#include <vm/vm.h>
133#include <vm/vm_param.h>
134#include <vm/vm_kern.h>
135#include <vm/vm_page.h>
136#include <vm/vm_map.h>
137#include <vm/vm_object.h>
138#include <vm/vm_extern.h>
139#include <vm/vm_pageout.h>
140#include <vm/vm_pager.h>
141#include <vm/vm_reserv.h>
142#include <vm/uma.h>
143
144#include <machine/cpu.h>
145#include <machine/cputypes.h>
146#include <machine/md_var.h>
147#include <machine/pcb.h>
148#include <machine/specialreg.h>
149#ifdef SMP
150#include <machine/smp.h>
151#endif
152
153#ifdef XBOX
154#include <machine/xbox.h>
155#endif
156
157#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
158#define CPU_ENABLE_SSE
159#endif
160
161#ifndef PMAP_SHPGPERPROC
162#define PMAP_SHPGPERPROC 200
163#endif
164
165#if !defined(DIAGNOSTIC)
166#ifdef __GNUC_GNU_INLINE__
167#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
168#else
169#define PMAP_INLINE	extern inline
170#endif
171#else
172#define PMAP_INLINE
173#endif
174
175#define PV_STATS
176#ifdef PV_STATS
177#define PV_STAT(x)	do { x ; } while (0)
178#else
179#define PV_STAT(x)	do { } while (0)
180#endif
181
182#define	pa_index(pa)	((pa) >> PDRSHIFT)
183#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
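/*
 * Illustrative sketch (not part of the original source): pa_index() groups
 * physical addresses by superpage-sized region.  Without PAE, PDRSHIFT is
 * 22, so pa = 0x0145f000 gives pa_index(pa) == 5 and pa_to_pvh(pa) ==
 * &pv_table[5], i.e. the md_page shared by every 4KB page inside the
 * sixth 4MB region.  Under PAE the shift is 21 and the regions are 2MB.
 */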
184
185/*
186 * Get PDEs and PTEs for user/kernel address space
187 */
188#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
189#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
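/*
 * Illustrative sketch (not part of the original source): without PAE a
 * 32-bit virtual address splits 10/10/12, so for va = 0xc0a01234:
 *
 *	page directory index:  va >> PDRSHIFT == 0x302
 *	page table index:      (va >> PAGE_SHIFT) & (NPTEPG - 1) == 0x201
 *	byte offset:           va & PAGE_MASK == 0x234
 *
 * pmap_pde() returns &pm_pdir[0x302]; the PTE is then located in the page
 * table page that this PDE references.  Under PAE the split is 2/9/9/12,
 * but the macros above hide that difference from their callers.
 */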
190
191#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
192#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
193#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
194#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
195#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
196
197#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
198    atomic_clear_int((u_int *)(pte), PG_W))
199#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
200
201struct pmap kernel_pmap_store;
202LIST_HEAD(pmaplist, pmap);
203static struct pmaplist allpmaps;
204static struct mtx allpmaps_lock;
205
206vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
207vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
208int pgeflag = 0;		/* PG_G or-in */
209int pseflag = 0;		/* PG_PS or-in */
210
211static int nkpt = NKPT;
212vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
213extern u_int32_t KERNend;
214extern u_int32_t KPTphys;
215
216#ifdef PAE
217pt_entry_t pg_nx;
218static uma_zone_t pdptzone;
219#endif
220
221SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
222
223static int pat_works = 1;
224SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
225    "Is page attribute table fully functional?");
226
227static int pg_ps_enabled = 1;
228SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
229    "Are large page mappings enabled?");
230
231#define	PAT_INDEX_SIZE	8
232static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
233
234/*
235 * Data for the pv entry allocation mechanism
236 */
237static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
238static struct md_page *pv_table;
239static int shpgperproc = PMAP_SHPGPERPROC;
240
241struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
242int pv_maxchunks;			/* How many chunks we have KVA for */
243vm_offset_t pv_vafree;			/* freelist stored in the PTE */
244
245/*
246 * All those kernel PT submaps that BSD is so fond of
247 */
248struct sysmaps {
249	struct	mtx lock;
250	pt_entry_t *CMAP1;
251	pt_entry_t *CMAP2;
252	caddr_t	CADDR1;
253	caddr_t	CADDR2;
254};
255static struct sysmaps sysmaps_pcpu[MAXCPU];
256pt_entry_t *CMAP1 = 0;
257static pt_entry_t *CMAP3;
258static pd_entry_t *KPTD;
259caddr_t CADDR1 = 0, ptvmmap = 0;
260static caddr_t CADDR3;
261struct msgbuf *msgbufp = 0;
262
263/*
264 * Crashdump maps.
265 */
266static caddr_t crashdumpmap;
267
268static pt_entry_t *PMAP1 = 0, *PMAP2;
269static pt_entry_t *PADDR1 = 0, *PADDR2;
270#ifdef SMP
271static int PMAP1cpu;
272static int PMAP1changedcpu;
273SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
274	   &PMAP1changedcpu, 0,
275	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
276#endif
277static int PMAP1changed;
278SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
279	   &PMAP1changed, 0,
280	   "Number of times pmap_pte_quick changed PMAP1");
281static int PMAP1unchanged;
282SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
283	   &PMAP1unchanged, 0,
284	   "Number of times pmap_pte_quick didn't change PMAP1");
285static struct mtx PMAP2mutex;
286
287static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
288static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
289static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
290static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
291static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
292static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
293static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
294		    vm_offset_t va);
295static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
296
297static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
298static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
299    vm_prot_t prot);
300static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
301    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
302static void pmap_flush_page(vm_page_t m);
303static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
304static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
305static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
306static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
307static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
308static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
309static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
310static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
311static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
312static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
313    vm_prot_t prot);
314static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
315static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
316    vm_page_t *free);
317static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
318    vm_page_t *free);
319static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
320static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
321    vm_page_t *free);
322static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
323					vm_offset_t va);
324static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
325static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
326    vm_page_t m);
327static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
328    pd_entry_t newpde);
329static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
330
331static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
332
333static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
334static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
335static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
336static void pmap_pte_release(pt_entry_t *pte);
337static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
338#ifdef PAE
339static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
340#endif
341static void pmap_set_pg(void);
342
343CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
344CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
345
346/*
347 * If you get an error here, then you set KVA_PAGES wrong! See the
348 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
349 * a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
350 */
351CTASSERT(KERNBASE % (1 << 24) == 0);
352
353/*
354 *	Bootstrap the system enough to run with virtual memory.
355 *
356 *	On the i386 this is called after mapping has already been enabled
357 *	and just syncs the pmap module with what has already been done.
358 *	[We can't call it easily with mapping off since the kernel is not
359 *	mapped with PA == VA, hence we would have to relocate every address
360 *	from the linked base (virtual) address "KERNBASE" to the actual
361 *	(physical) address starting relative to 0]
362 */
363void
364pmap_bootstrap(vm_paddr_t firstaddr)
365{
366	vm_offset_t va;
367	pt_entry_t *pte, *unused;
368	struct sysmaps *sysmaps;
369	int i;
370
371	/*
372	 * Initialize the first available kernel virtual address.  However,
373	 * using "firstaddr" may waste a few pages of the kernel virtual
374	 * address space, because locore may not have mapped every physical
375	 * page that it allocated.  Preferably, locore would provide a first
376	 * unused virtual address in addition to "firstaddr".
377	 */
378	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
379
380	virtual_end = VM_MAX_KERNEL_ADDRESS;
381
382	/*
383	 * Initialize the kernel pmap (which is statically allocated).
384	 */
385	PMAP_LOCK_INIT(kernel_pmap);
386	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
387#ifdef PAE
388	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
389#endif
390	kernel_pmap->pm_root = NULL;
391	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
392	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
393	LIST_INIT(&allpmaps);
394
395	/*
396	 * Request a spin mutex so that changes to allpmaps cannot be
397	 * preempted by smp_rendezvous_cpus().  Otherwise,
398	 * pmap_update_pde_kernel() could access allpmaps while it is
399	 * being changed.
400	 */
401	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
402	mtx_lock_spin(&allpmaps_lock);
403	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
404	mtx_unlock_spin(&allpmaps_lock);
405
406	/*
407	 * Reserve some special page table entries/VA space for temporary
408	 * mapping of pages.
409	 */
410#define	SYSMAP(c, p, v, n)	\
411	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
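	/*
	 * Illustrative expansion (not part of the original source): the
	 * invocation "SYSMAP(caddr_t, CMAP1, CADDR1, 1)" below becomes
	 *
	 *	CADDR1 = (caddr_t)va; va += (1 * PAGE_SIZE);
	 *	CMAP1 = pte; pte += 1;
	 *
	 * so each use hands out "n" pages of KVA and records the kernel
	 * PTE(s) that back them, letting the caller install a mapping later
	 * by writing through the saved PTE pointer.
	 */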
412
413	va = virtual_avail;
414	pte = vtopte(va);
415
416	/*
417	 * CMAP1/CMAP2 are used for zeroing and copying pages.
418	 * CMAP3 is used for the idle process page zeroing.
419	 */
420	for (i = 0; i < MAXCPU; i++) {
421		sysmaps = &sysmaps_pcpu[i];
422		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
423		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
424		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
425	}
426	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
427	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
428
429	/*
430	 * Crashdump maps.
431	 */
432	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
433
434	/*
435	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
436	 */
437	SYSMAP(caddr_t, unused, ptvmmap, 1)
438
439	/*
440	 * msgbufp is used to map the system message buffer.
441	 */
442	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
443
444	/*
445	 * KPTmap is used by pmap_kextract().
446	 *
447	 * KPTmap is first initialized by locore.  However, that initial
448	 * KPTmap can only support NKPT page table pages.  Here, a larger
449	 * KPTmap is created that can support KVA_PAGES page table pages.
450	 */
451	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
452
453	for (i = 0; i < NKPT; i++)
454		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
455
456	/*
457	 * Adjust the start of the KPTD and KPTmap so that the implementation
458	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
459	 */
460	KPTD -= KPTDI;
461	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
462
463	/*
464	 * ptemap is used for pmap_pte_quick
465	 */
466	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
467	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
468
469	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
470
471	virtual_avail = va;
472
473	/*
474	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
475	 * physical memory region that is used by the ACPI wakeup code.  This
476	 * mapping must not have PG_G set.
477	 */
478#ifdef XBOX
479	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
480	 * an early stage, we cannot yet neatly map video memory ... :-(
481	 * Better fixes are very welcome! */
482	if (!arch_i386_is_xbox)
483#endif
484	for (i = 1; i < NKPT; i++)
485		PTD[i] = 0;
486
487	/* Initialize the PAT MSR if present. */
488	pmap_init_pat();
489
490	/* Turn on PG_G on kernel page(s) */
491	pmap_set_pg();
492}
493
494/*
495 * Setup the PAT MSR.
496 */
497void
498pmap_init_pat(void)
499{
500	int pat_table[PAT_INDEX_SIZE];
501	uint64_t pat_msr;
502	u_long cr0, cr4;
503	int i;
504
505	/* Set default PAT index table. */
506	for (i = 0; i < PAT_INDEX_SIZE; i++)
507		pat_table[i] = -1;
508	pat_table[PAT_WRITE_BACK] = 0;
509	pat_table[PAT_WRITE_THROUGH] = 1;
510	pat_table[PAT_UNCACHEABLE] = 3;
511	pat_table[PAT_WRITE_COMBINING] = 3;
512	pat_table[PAT_WRITE_PROTECTED] = 3;
513	pat_table[PAT_UNCACHED] = 3;
514
515	/* Bail if this CPU doesn't implement PAT. */
516	if ((cpu_feature & CPUID_PAT) == 0) {
517		for (i = 0; i < PAT_INDEX_SIZE; i++)
518			pat_index[i] = pat_table[i];
519		pat_works = 0;
520		return;
521	}
522
523	/*
524	 * Due to some Intel errata, we can only safely use the lower 4
525	 * PAT entries.
526	 *
527	 *   Intel Pentium III Processor Specification Update
528	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
529	 * or Mode C Paging)
530	 *
531	 *   Intel Pentium IV  Processor Specification Update
532	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
533	 */
534	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
535	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
536		pat_works = 0;
537
538	/* Initialize default PAT entries. */
539	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
540	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
541	    PAT_VALUE(2, PAT_UNCACHED) |
542	    PAT_VALUE(3, PAT_UNCACHEABLE) |
543	    PAT_VALUE(4, PAT_WRITE_BACK) |
544	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
545	    PAT_VALUE(6, PAT_UNCACHED) |
546	    PAT_VALUE(7, PAT_UNCACHEABLE);
547
548	if (pat_works) {
549		/*
550		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
551		 * Program 5 and 6 as WP and WC.
552		 * Leave 4 and 7 as WB and UC.
553		 */
554		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
555		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
556		    PAT_VALUE(6, PAT_WRITE_COMBINING);
557		pat_table[PAT_UNCACHED] = 2;
558		pat_table[PAT_WRITE_PROTECTED] = 5;
559		pat_table[PAT_WRITE_COMBINING] = 6;
560	} else {
561		/*
562		 * Just replace PAT Index 2 with WC instead of UC-.
563		 */
564		pat_msr &= ~PAT_MASK(2);
565		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
566		pat_table[PAT_WRITE_COMBINING] = 2;
567	}
568
569	/* Disable PGE. */
570	cr4 = rcr4();
571	load_cr4(cr4 & ~CR4_PGE);
572
573	/* Disable caches (CD = 1, NW = 0). */
574	cr0 = rcr0();
575	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
576
577	/* Flushes caches and TLBs. */
578	wbinvd();
579	invltlb();
580
581	/* Update PAT and index table. */
582	wrmsr(MSR_PAT, pat_msr);
583	for (i = 0; i < PAT_INDEX_SIZE; i++)
584		pat_index[i] = pat_table[i];
585
586	/* Flush caches and TLBs again. */
587	wbinvd();
588	invltlb();
589
590	/* Restore caches and PGE. */
591	load_cr0(cr0);
592	load_cr4(cr4);
593}
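/*
 * Illustrative summary (not part of the original source) of the layout
 * programmed above, assuming the PAT MSR write takes effect:
 *
 *	PAT index:        0   1   2    3   4   5    6    7
 *	pat_works != 0:   WB  WT  UC-  UC  WB  WP   WC   UC
 *	pat_works == 0:   WB  WT  WC   UC  WB  WT   UC-  UC
 *
 * pat_index[] maps each PAT_* caching mode to one of these slots, and
 * pmap_cache_bits() later encodes the chosen slot number into the
 * PG_PTE_PAT/PG_PDE_PAT, PG_NC_PCD and PG_NC_PWT bits of an entry.
 */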
594
595/*
596 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
597 */
598static void
599pmap_set_pg(void)
600{
601	pt_entry_t *pte;
602	vm_offset_t va, endva;
603
604	if (pgeflag == 0)
605		return;
606
607	endva = KERNBASE + KERNend;
608
609	if (pseflag) {
610		va = KERNBASE + KERNLOAD;
611		while (va < endva) {
612			pdir_pde(PTD, va) |= pgeflag;
613			invltlb();	/* Play it safe, invltlb() every time */
614			va += NBPDR;
615		}
616	} else {
617		va = (vm_offset_t)btext;
618		while (va < endva) {
619			pte = vtopte(va);
620			if (*pte)
621				*pte |= pgeflag;
622			invltlb();	/* Play it safe, invltlb() every time */
623			va += PAGE_SIZE;
624		}
625	}
626}
627
628/*
629 * Initialize a vm_page's machine-dependent fields.
630 */
631void
632pmap_page_init(vm_page_t m)
633{
634
635	TAILQ_INIT(&m->md.pv_list);
636	m->md.pat_mode = PAT_WRITE_BACK;
637}
638
639#ifdef PAE
640static void *
641pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
642{
643
644	/* Inform UMA that this allocator uses kernel_map/object. */
645	*flags = UMA_SLAB_KERNEL;
646	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
647	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
648}
649#endif
650
651/*
652 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
653 * Requirements:
654 *  - Must deal with pages in order to ensure that none of the PG_* bits
655 *    are ever set, PG_V in particular.
656 *  - Assumes we can write to ptes without pte_store() atomic ops, even
657 *    on PAE systems.  This should be ok.
658 *  - Assumes nothing will ever test these addresses for 0 to indicate
659 *    no mapping instead of correctly checking PG_V.
660 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
661 * Because PG_V is never set, there can be no mappings to invalidate.
662 */
663static vm_offset_t
664pmap_ptelist_alloc(vm_offset_t *head)
665{
666	pt_entry_t *pte;
667	vm_offset_t va;
668
669	va = *head;
670	if (va == 0)
671		return (va);	/* Out of memory */
672	pte = vtopte(va);
673	*head = *pte;
674	if (*head & PG_V)
675		panic("pmap_ptelist_alloc: va with PG_V set!");
676	*pte = 0;
677	return (va);
678}
679
680static void
681pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
682{
683	pt_entry_t *pte;
684
685	if (va & PG_V)
686		panic("pmap_ptelist_free: freeing va with PG_V set!");
687	pte = vtopte(va);
688	*pte = *head;		/* virtual! PG_V is 0 though */
689	*head = va;
690}
691
692static void
693pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
694{
695	int i;
696	vm_offset_t va;
697
698	*head = 0;
699	for (i = npages - 1; i >= 0; i--) {
700		va = (vm_offset_t)base + i * PAGE_SIZE;
701		pmap_ptelist_free(head, va);
702	}
703}
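/*
 * Illustrative sketch (not part of the original source): after
 * pmap_ptelist_init(&head, base, 3) the free list looks like
 *
 *	head = base
 *	*vtopte(base)                 = base + PAGE_SIZE
 *	*vtopte(base + PAGE_SIZE)     = base + 2 * PAGE_SIZE
 *	*vtopte(base + 2 * PAGE_SIZE) = 0
 *
 * pmap_ptelist_alloc() pops "base" and advances head; pmap_ptelist_free()
 * pushes a VA back on the front.  Because the stored values are virtual
 * addresses (page aligned, so PG_V is never set), they never form valid
 * mappings and need no TLB invalidation.
 */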
704
705
706/*
707 *	Initialize the pmap module.
708 *	Called by vm_init, to initialize any structures that the pmap
709 *	system needs to map virtual memory.
710 */
711void
712pmap_init(void)
713{
714	vm_page_t mpte;
715	vm_size_t s;
716	int i, pv_npg;
717
718	/*
719	 * Initialize the vm page array entries for the kernel pmap's
720	 * page table pages.
721	 */
722	for (i = 0; i < NKPT; i++) {
723		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
724		KASSERT(mpte >= vm_page_array &&
725		    mpte < &vm_page_array[vm_page_array_size],
726		    ("pmap_init: page table page is out of range"));
727		mpte->pindex = i + KPTDI;
728		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
729	}
730
731	/*
732	 * Initialize the address space (zone) for the pv entries.  Set a
733	 * high water mark so that the system can recover from excessive
734	 * numbers of pv entries.
735	 */
736	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
737	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
738	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
739	pv_entry_max = roundup(pv_entry_max, _NPCPV);
740	pv_entry_high_water = 9 * (pv_entry_max / 10);
741
742	/*
743	 * If the kernel is running in a virtual machine on an AMD Family 10h
744	 * processor, then it must assume that MCA is enabled by the virtual
745	 * machine monitor.
746	 */
747	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
748	    CPUID_TO_FAMILY(cpu_id) == 0x10)
749		workaround_erratum383 = 1;
750
751	/*
752	 * Are large page mappings supported and enabled?
753	 */
754	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
755	if (pseflag == 0)
756		pg_ps_enabled = 0;
757	else if (pg_ps_enabled) {
758		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
759		    ("pmap_init: can't assign to pagesizes[1]"));
760		pagesizes[1] = NBPDR;
761	}
762
763	/*
764	 * Calculate the size of the pv head table for superpages.
765	 */
766	for (i = 0; phys_avail[i + 1]; i += 2);
767	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
768
769	/*
770	 * Allocate memory for the pv head table for superpages.
771	 */
772	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
773	s = round_page(s);
774	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
775	for (i = 0; i < pv_npg; i++)
776		TAILQ_INIT(&pv_table[i].pv_list);
777
778	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
779	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
780	    PAGE_SIZE * pv_maxchunks);
781	if (pv_chunkbase == NULL)
782		panic("pmap_init: not enough kvm for pv chunks");
783	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
784#ifdef PAE
785	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
786	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
787	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
788	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
789#endif
790}
791
792
793SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
794	"Max number of PV entries");
795SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
796	"Page share factor per proc");
797
798SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
799    "2/4MB page mapping counters");
800
801static u_long pmap_pde_demotions;
802SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
803    &pmap_pde_demotions, 0, "2/4MB page demotions");
804
805static u_long pmap_pde_mappings;
806SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
807    &pmap_pde_mappings, 0, "2/4MB page mappings");
808
809static u_long pmap_pde_p_failures;
810SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
811    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
812
813static u_long pmap_pde_promotions;
814SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
815    &pmap_pde_promotions, 0, "2/4MB page promotions");
816
817/***************************************************
818 * Low level helper routines.....
819 ***************************************************/
820
821/*
822 * Determine the appropriate bits to set in a PTE or PDE for a specified
823 * caching mode.
824 */
825int
826pmap_cache_bits(int mode, boolean_t is_pde)
827{
828	int cache_bits, pat_flag, pat_idx;
829
830	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
831		panic("Unknown caching mode %d\n", mode);
832
833	/* The PAT bit is different for PTE's and PDE's. */
834	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
835
836	/* Map the caching mode to a PAT index. */
837	pat_idx = pat_index[mode];
838
839	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
840	cache_bits = 0;
841	if (pat_idx & 0x4)
842		cache_bits |= pat_flag;
843	if (pat_idx & 0x2)
844		cache_bits |= PG_NC_PCD;
845	if (pat_idx & 0x1)
846		cache_bits |= PG_NC_PWT;
847	return (cache_bits);
848}
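/*
 * Illustrative example (not part of the original source): with the
 * pat_works layout programmed by pmap_init_pat(), PAT_WRITE_COMBINING is
 * assigned PAT index 6 (binary 110), so pmap_cache_bits(PAT_WRITE_COMBINING,
 * 0) returns PG_PTE_PAT | PG_NC_PCD, while pmap_cache_bits(PAT_WRITE_BACK, 0)
 * returns 0 because index 0 sets none of the three bits.  For a PDE the same
 * index is encoded with PG_PDE_PAT, since the PAT bit sits in a different
 * position in 2/4MB entries.
 */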
849
850/*
851 * The caller is responsible for maintaining TLB consistency.
852 */
853static void
854pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
855{
856	pd_entry_t *pde;
857	pmap_t pmap;
858	boolean_t PTD_updated;
859
860	PTD_updated = FALSE;
861	mtx_lock_spin(&allpmaps_lock);
862	LIST_FOREACH(pmap, &allpmaps, pm_list) {
863		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
864		    PG_FRAME))
865			PTD_updated = TRUE;
866		pde = pmap_pde(pmap, va);
867		pde_store(pde, newpde);
868	}
869	mtx_unlock_spin(&allpmaps_lock);
870	KASSERT(PTD_updated,
871	    ("pmap_kenter_pde: current page table is not in allpmaps"));
872}
873
874/*
875 * After changing the page size for the specified virtual address in the page
876 * table, flush the corresponding entries from the processor's TLB.  Only the
877 * calling processor's TLB is affected.
878 *
879 * The calling thread must be pinned to a processor.
880 */
881static void
882pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
883{
884	u_long cr4;
885
886	if ((newpde & PG_PS) == 0)
887		/* Demotion: flush a specific 2MB page mapping. */
888		invlpg(va);
889	else if ((newpde & PG_G) == 0)
890		/*
891		 * Promotion: flush every 4KB page mapping from the TLB
892		 * because there are too many to flush individually.
893		 */
894		invltlb();
895	else {
896		/*
897		 * Promotion: flush every 4KB page mapping from the TLB,
898		 * including any global (PG_G) mappings.
899		 */
900		cr4 = rcr4();
901		load_cr4(cr4 & ~CR4_PGE);
902		/*
903		 * Although preemption at this point could be detrimental to
904		 * performance, it would not lead to an error.  PG_G is simply
905		 * ignored if CR4.PGE is clear.  Moreover, in case this block
906		 * is re-entered, the load_cr4() either above or below will
907		 * modify CR4.PGE flushing the TLB.
908		 */
909		load_cr4(cr4 | CR4_PGE);
910	}
911}
912#ifdef SMP
913/*
914 * For SMP, these functions have to use the IPI mechanism for coherence.
915 *
916 * N.B.: Before calling any of the following TLB invalidation functions,
917 * the calling processor must ensure that all stores updating a non-
918 * kernel page table are globally performed.  Otherwise, another
919 * processor could cache an old, pre-update entry without being
920 * invalidated.  This can happen one of two ways: (1) The pmap becomes
921 * active on another processor after its pm_active field is checked by
922 * one of the following functions but before a store updating the page
923 * table is globally performed. (2) The pmap becomes active on another
924 * processor before its pm_active field is checked but due to
925 * speculative loads one of the following functions still reads the
926 * pmap as inactive on the other processor.
927 *
928 * The kernel page table is exempt because its pm_active field is
929 * immutable.  The kernel page table is always active on every
930 * processor.
931 */
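/*
 * One possible interleaving (an illustrative sketch, not part of the
 * original source) of race (1) above, if a page table store were allowed
 * to linger in a store buffer past the pm_active check:
 *
 *	CPU A (updates page table)       CPU B (switches to the pmap)
 *	store new PTE (not yet visible)
 *	check pm_active: B absent        set pm_active bit, load %cr3
 *	skip shootdown IPI to B          TLB caches the stale old PTE
 *
 * Making the page table store globally performed before pm_active is
 * examined closes the window: CPU B then either sees the new entry when
 * it activates the pmap or is included in the shootdown.
 */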
932void
933pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
934{
935	cpuset_t other_cpus;
936	u_int cpuid;
937
938	sched_pin();
939	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
940		invlpg(va);
941		smp_invlpg(va);
942	} else {
943		cpuid = PCPU_GET(cpuid);
944		other_cpus = all_cpus;
945		CPU_CLR(cpuid, &other_cpus);
946		if (CPU_ISSET(cpuid, &pmap->pm_active))
947			invlpg(va);
948		CPU_AND(&other_cpus, &pmap->pm_active);
949		if (!CPU_EMPTY(&other_cpus))
950			smp_masked_invlpg(other_cpus, va);
951	}
952	sched_unpin();
953}
954
955void
956pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
957{
958	cpuset_t other_cpus;
959	vm_offset_t addr;
960	u_int cpuid;
961
962	sched_pin();
963	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
964		for (addr = sva; addr < eva; addr += PAGE_SIZE)
965			invlpg(addr);
966		smp_invlpg_range(sva, eva);
967	} else {
968		cpuid = PCPU_GET(cpuid);
969		other_cpus = all_cpus;
970		CPU_CLR(cpuid, &other_cpus);
971		if (CPU_ISSET(cpuid, &pmap->pm_active))
972			for (addr = sva; addr < eva; addr += PAGE_SIZE)
973				invlpg(addr);
974		CPU_AND(&other_cpus, &pmap->pm_active);
975		if (!CPU_EMPTY(&other_cpus))
976			smp_masked_invlpg_range(other_cpus, sva, eva);
977	}
978	sched_unpin();
979}
980
981void
982pmap_invalidate_all(pmap_t pmap)
983{
984	cpuset_t other_cpus;
985	u_int cpuid;
986
987	sched_pin();
988	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
989		invltlb();
990		smp_invltlb();
991	} else {
992		cpuid = PCPU_GET(cpuid);
993		other_cpus = all_cpus;
994		CPU_CLR(cpuid, &other_cpus);
995		if (CPU_ISSET(cpuid, &pmap->pm_active))
996			invltlb();
997		CPU_AND(&other_cpus, &pmap->pm_active);
998		if (!CPU_EMPTY(&other_cpus))
999			smp_masked_invltlb(other_cpus);
1000	}
1001	sched_unpin();
1002}
1003
1004void
1005pmap_invalidate_cache(void)
1006{
1007
1008	sched_pin();
1009	wbinvd();
1010	smp_cache_flush();
1011	sched_unpin();
1012}
1013
1014struct pde_action {
1015	cpuset_t invalidate;	/* processors that invalidate their TLB */
1016	vm_offset_t va;
1017	pd_entry_t *pde;
1018	pd_entry_t newpde;
1019	u_int store;		/* processor that updates the PDE */
1020};
1021
1022static void
1023pmap_update_pde_kernel(void *arg)
1024{
1025	struct pde_action *act = arg;
1026	pd_entry_t *pde;
1027	pmap_t pmap;
1028
1029	if (act->store == PCPU_GET(cpuid)) {
1030
1031		/*
1032		 * Elsewhere, this operation requires allpmaps_lock for
1033		 * synchronization.  Here, it does not because it is being
1034		 * performed in the context of an all_cpus rendezvous.
1035		 */
1036		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1037			pde = pmap_pde(pmap, act->va);
1038			pde_store(pde, act->newpde);
1039		}
1040	}
1041}
1042
1043static void
1044pmap_update_pde_user(void *arg)
1045{
1046	struct pde_action *act = arg;
1047
1048	if (act->store == PCPU_GET(cpuid))
1049		pde_store(act->pde, act->newpde);
1050}
1051
1052static void
1053pmap_update_pde_teardown(void *arg)
1054{
1055	struct pde_action *act = arg;
1056
1057	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1058		pmap_update_pde_invalidate(act->va, act->newpde);
1059}
1060
1061/*
1062 * Change the page size for the specified virtual address in a way that
1063 * prevents any possibility of the TLB ever having two entries that map the
1064 * same virtual address using different page sizes.  This is the recommended
1065 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1066 * machine check exception for a TLB state that is improperly diagnosed as a
1067 * hardware error.
1068 */
1069static void
1070pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1071{
1072	struct pde_action act;
1073	cpuset_t active, other_cpus;
1074	u_int cpuid;
1075
1076	sched_pin();
1077	cpuid = PCPU_GET(cpuid);
1078	other_cpus = all_cpus;
1079	CPU_CLR(cpuid, &other_cpus);
1080	if (pmap == kernel_pmap)
1081		active = all_cpus;
1082	else
1083		active = pmap->pm_active;
1084	if (CPU_OVERLAP(&active, &other_cpus)) {
1085		act.store = cpuid;
1086		act.invalidate = active;
1087		act.va = va;
1088		act.pde = pde;
1089		act.newpde = newpde;
1090		CPU_SET(cpuid, &active);
1091		smp_rendezvous_cpus(active,
1092		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1093		    pmap_update_pde_kernel : pmap_update_pde_user,
1094		    pmap_update_pde_teardown, &act);
1095	} else {
1096		if (pmap == kernel_pmap)
1097			pmap_kenter_pde(va, newpde);
1098		else
1099			pde_store(pde, newpde);
1100		if (CPU_ISSET(cpuid, &active))
1101			pmap_update_pde_invalidate(va, newpde);
1102	}
1103	sched_unpin();
1104}
1105#else /* !SMP */
1106/*
1107 * Normal, non-SMP, 486+ invalidation functions.
1108 * We inline these within pmap.c for speed.
1109 */
1110PMAP_INLINE void
1111pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1112{
1113
1114	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1115		invlpg(va);
1116}
1117
1118PMAP_INLINE void
1119pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1120{
1121	vm_offset_t addr;
1122
1123	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1124		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1125			invlpg(addr);
1126}
1127
1128PMAP_INLINE void
1129pmap_invalidate_all(pmap_t pmap)
1130{
1131
1132	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1133		invltlb();
1134}
1135
1136PMAP_INLINE void
1137pmap_invalidate_cache(void)
1138{
1139
1140	wbinvd();
1141}
1142
1143static void
1144pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1145{
1146
1147	if (pmap == kernel_pmap)
1148		pmap_kenter_pde(va, newpde);
1149	else
1150		pde_store(pde, newpde);
1151	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1152		pmap_update_pde_invalidate(va, newpde);
1153}
1154#endif /* !SMP */
1155
1156#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1157
1158void
1159pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1160{
1161
1162	KASSERT((sva & PAGE_MASK) == 0,
1163	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1164	KASSERT((eva & PAGE_MASK) == 0,
1165	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1166
1167	if (cpu_feature & CPUID_SS)
1168		; /* If "Self Snoop" is supported, do nothing. */
1169	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1170	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1171
1172		/*
1173		 * Otherwise, do a per-cache-line flush.  Use the mfence
1174		 * instruction to ensure that previous stores are
1175		 * included in the write-back.  The processor
1176		 * propagates the flush to other processors in the cache
1177		 * coherence domain.
1178		 */
1179		mfence();
1180		for (; sva < eva; sva += cpu_clflush_line_size)
1181			clflush(sva);
1182		mfence();
1183	} else {
1184
1185		/*
1186		 * No targeted cache flush methods are supported by the CPU,
1187		 * or the supplied range is bigger than 2MB.
1188		 * Globally invalidate the cache.
1189		 */
1190		pmap_invalidate_cache();
1191	}
1192}
1193
1194void
1195pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1196{
1197	int i;
1198
1199	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1200	    (cpu_feature & CPUID_CLFSH) == 0) {
1201		pmap_invalidate_cache();
1202	} else {
1203		for (i = 0; i < count; i++)
1204			pmap_flush_page(pages[i]);
1205	}
1206}
1207
1208/*
1209 * Are we current address space or kernel?  N.B. We return FALSE when
1210 * a pmap's page table is in use because a kernel thread is borrowing
1211 * it.  The borrowed page table can change spontaneously, making any
1212 * dependence on its continued use subject to a race condition.
1213 */
1214static __inline int
1215pmap_is_current(pmap_t pmap)
1216{
1217
1218	return (pmap == kernel_pmap ||
1219		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1220	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1221}
1222
1223/*
1224 * If the given pmap is not the current or kernel pmap, the returned pte must
1225 * be released by passing it to pmap_pte_release().
1226 */
1227pt_entry_t *
1228pmap_pte(pmap_t pmap, vm_offset_t va)
1229{
1230	pd_entry_t newpf;
1231	pd_entry_t *pde;
1232
1233	pde = pmap_pde(pmap, va);
1234	if (*pde & PG_PS)
1235		return (pde);
1236	if (*pde != 0) {
1237		/* are we current address space or kernel? */
1238		if (pmap_is_current(pmap))
1239			return (vtopte(va));
1240		mtx_lock(&PMAP2mutex);
1241		newpf = *pde & PG_FRAME;
1242		if ((*PMAP2 & PG_FRAME) != newpf) {
1243			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1244			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1245		}
1246		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1247	}
1248	return (NULL);
1249}
1250
1251/*
1252 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1253 * being NULL.
1254 */
1255static __inline void
1256pmap_pte_release(pt_entry_t *pte)
1257{
1258
1259	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1260		mtx_unlock(&PMAP2mutex);
1261}
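/*
 * Typical usage sketch (not part of the original source), mirroring
 * pmap_extract() below: a PTE obtained through pmap_pte() for a pmap that
 * is neither current nor the kernel pmap is mapped via PADDR2 under
 * PMAP2mutex and must be handed back:
 *
 *	PMAP_LOCK(pmap);
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		... examine or update *pte ...
 *		pmap_pte_release(pte);
 *	}
 *	PMAP_UNLOCK(pmap);
 *
 * For the current or kernel pmap, pmap_pte() simply returns vtopte(va)
 * and pmap_pte_release() is a no-op.
 */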
1262
1263static __inline void
1264invlcaddr(void *caddr)
1265{
1266
1267	invlpg((u_int)caddr);
1268}
1269
1270/*
1271 * Super fast pmap_pte routine best used when scanning
1272 * the pv lists.  This eliminates many coarse-grained
1273 * invltlb calls.  Note that many of the pv list
1274 * scans are across different pmaps.  It is very wasteful
1275 * to do an entire invltlb for checking a single mapping.
1276 *
1277 * If the given pmap is not the current pmap, vm_page_queue_mtx
1278 * must be held and curthread pinned to a CPU.
1279 */
1280static pt_entry_t *
1281pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1282{
1283	pd_entry_t newpf;
1284	pd_entry_t *pde;
1285
1286	pde = pmap_pde(pmap, va);
1287	if (*pde & PG_PS)
1288		return (pde);
1289	if (*pde != 0) {
1290		/* are we current address space or kernel? */
1291		if (pmap_is_current(pmap))
1292			return (vtopte(va));
1293		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1294		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1295		newpf = *pde & PG_FRAME;
1296		if ((*PMAP1 & PG_FRAME) != newpf) {
1297			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1298#ifdef SMP
1299			PMAP1cpu = PCPU_GET(cpuid);
1300#endif
1301			invlcaddr(PADDR1);
1302			PMAP1changed++;
1303		} else
1304#ifdef SMP
1305		if (PMAP1cpu != PCPU_GET(cpuid)) {
1306			PMAP1cpu = PCPU_GET(cpuid);
1307			invlcaddr(PADDR1);
1308			PMAP1changedcpu++;
1309		} else
1310#endif
1311			PMAP1unchanged++;
1312		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1313	}
1314	return (0);
1315}
1316
1317/*
1318 *	Routine:	pmap_extract
1319 *	Function:
1320 *		Extract the physical page address associated
1321 *		with the given map/virtual_address pair.
1322 */
1323vm_paddr_t
1324pmap_extract(pmap_t pmap, vm_offset_t va)
1325{
1326	vm_paddr_t rtval;
1327	pt_entry_t *pte;
1328	pd_entry_t pde;
1329
1330	rtval = 0;
1331	PMAP_LOCK(pmap);
1332	pde = pmap->pm_pdir[va >> PDRSHIFT];
1333	if (pde != 0) {
1334		if ((pde & PG_PS) != 0)
1335			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1336		else {
1337			pte = pmap_pte(pmap, va);
1338			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1339			pmap_pte_release(pte);
1340		}
1341	}
1342	PMAP_UNLOCK(pmap);
1343	return (rtval);
1344}
1345
1346/*
1347 *	Routine:	pmap_extract_and_hold
1348 *	Function:
1349 *		Atomically extract and hold the physical page
1350 *		with the given pmap and virtual address pair
1351 *		if that mapping permits the given protection.
1352 */
1353vm_page_t
1354pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1355{
1356	pd_entry_t pde;
1357	pt_entry_t pte, *ptep;
1358	vm_page_t m;
1359	vm_paddr_t pa;
1360
1361	pa = 0;
1362	m = NULL;
1363	PMAP_LOCK(pmap);
1364retry:
1365	pde = *pmap_pde(pmap, va);
1366	if (pde != 0) {
1367		if (pde & PG_PS) {
1368			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1369				if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) |
1370				       (va & PDRMASK), &pa))
1371					goto retry;
1372				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1373				    (va & PDRMASK));
1374				vm_page_hold(m);
1375			}
1376		} else {
1377			ptep = pmap_pte(pmap, va);
1378			pte = *ptep;
1379			pmap_pte_release(ptep);
1380			if (pte != 0 &&
1381			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1382				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa))
1383					goto retry;
1384				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1385				vm_page_hold(m);
1386			}
1387		}
1388	}
1389	PA_UNLOCK_COND(pa);
1390	PMAP_UNLOCK(pmap);
1391	return (m);
1392}
1393
1394/***************************************************
1395 * Low level mapping routines.....
1396 ***************************************************/
1397
1398/*
1399 * Add a wired page to the kva.
1400 * Note: not SMP coherent.
1401 *
1402 * This function may be used before pmap_bootstrap() is called.
1403 */
1404PMAP_INLINE void
1405pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1406{
1407	pt_entry_t *pte;
1408
1409	pte = vtopte(va);
1410	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1411}
1412
1413static __inline void
1414pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1415{
1416	pt_entry_t *pte;
1417
1418	pte = vtopte(va);
1419	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1420}
1421
1422/*
1423 * Remove a page from the kernel pagetables.
1424 * Note: not SMP coherent.
1425 *
1426 * This function may be used before pmap_bootstrap() is called.
1427 */
1428PMAP_INLINE void
1429pmap_kremove(vm_offset_t va)
1430{
1431	pt_entry_t *pte;
1432
1433	pte = vtopte(va);
1434	pte_clear(pte);
1435}
1436
1437/*
1438 *	Used to map a range of physical addresses into kernel
1439 *	virtual address space.
1440 *
1441 *	The value passed in '*virt' is a suggested virtual address for
1442 *	the mapping. Architectures which can support a direct-mapped
1443 *	physical to virtual region can return the appropriate address
1444 *	within that region, leaving '*virt' unchanged. Other
1445 *	architectures should map the pages starting at '*virt' and
1446 *	update '*virt' with the first usable address after the mapped
1447 *	region.
1448 */
1449vm_offset_t
1450pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1451{
1452	vm_offset_t va, sva;
1453
1454	va = sva = *virt;
1455	while (start < end) {
1456		pmap_kenter(va, start);
1457		va += PAGE_SIZE;
1458		start += PAGE_SIZE;
1459	}
1460	pmap_invalidate_range(kernel_pmap, sva, va);
1461	*virt = va;
1462	return (sva);
1463}
1464
1465
1466/*
1467 * Add a list of wired pages to the kva.
1468 * This routine is only used for temporary
1469 * kernel mappings that do not need to have
1470 * page modification or references recorded.
1471 * Note that old mappings are simply written
1472 * over.  The page *must* be wired.
1473 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1474 */
1475void
1476pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1477{
1478	pt_entry_t *endpte, oldpte, pa, *pte;
1479	vm_page_t m;
1480
1481	oldpte = 0;
1482	pte = vtopte(sva);
1483	endpte = pte + count;
1484	while (pte < endpte) {
1485		m = *ma++;
1486		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1487		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1488			oldpte |= *pte;
1489			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1490		}
1491		pte++;
1492	}
1493	if (__predict_false((oldpte & PG_V) != 0))
1494		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1495		    PAGE_SIZE);
1496}
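/*
 * Usage sketch (not part of the original source): pmap_qenter() and
 * pmap_qremove() are paired to provide short-lived kernel windows onto a
 * set of wired pages, where "kva" is assumed to be previously reserved
 * KVA covering "npages" pages:
 *
 *	pmap_qenter(kva, ma, npages);
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);
 *
 * No pv entries are created and no modified/referenced state is recorded,
 * which is why these routines are restricted to temporary mappings of
 * wired pages.
 */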
1497
1498/*
1499 * This routine tears out page mappings from the
1500 * kernel -- it is meant only for temporary mappings.
1501 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1502 */
1503void
1504pmap_qremove(vm_offset_t sva, int count)
1505{
1506	vm_offset_t va;
1507
1508	va = sva;
1509	while (count-- > 0) {
1510		pmap_kremove(va);
1511		va += PAGE_SIZE;
1512	}
1513	pmap_invalidate_range(kernel_pmap, sva, va);
1514}
1515
1516/***************************************************
1517 * Page table page management routines.....
1518 ***************************************************/
1519static __inline void
1520pmap_free_zero_pages(vm_page_t free)
1521{
1522	vm_page_t m;
1523
1524	while (free != NULL) {
1525		m = free;
1526		free = m->right;
1527		/* Preserve the page's PG_ZERO setting. */
1528		vm_page_free_toq(m);
1529	}
1530}
1531
1532/*
1533 * Schedule the specified unused page table page to be freed.  Specifically,
1534 * add the page to the specified list of pages that will be released to the
1535 * physical memory manager after the TLB has been updated.
1536 */
1537static __inline void
1538pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1539{
1540
1541	if (set_PG_ZERO)
1542		m->flags |= PG_ZERO;
1543	else
1544		m->flags &= ~PG_ZERO;
1545	m->right = *free;
1546	*free = m;
1547}
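/*
 * Illustrative sketch (not part of the original source): the "free" list
 * is a singly-linked list threaded through each vm_page's "right" field,
 * so a typical caller collects pages, performs the TLB shootdown, and
 * only then releases them:
 *
 *	vm_page_t free = NULL;
 *	... pmap_add_delayed_free_list(m, &free, TRUE); ...
 *	pmap_invalidate_page(pmap, va);
 *	pmap_free_zero_pages(free);
 *
 * Deferring vm_page_free_toq() until after the shootdown ensures that no
 * CPU still holds a TLB entry referring to a page table page that has
 * been returned to the physical memory allocator.
 */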
1548
1549/*
1550 * Inserts the specified page table page into the specified pmap's collection
1551 * of idle page table pages.  Each of a pmap's page table pages is responsible
1552 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1553 * ordered by this virtual address range.
1554 */
1555static void
1556pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1557{
1558	vm_page_t root;
1559
1560	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1561	root = pmap->pm_root;
1562	if (root == NULL) {
1563		mpte->left = NULL;
1564		mpte->right = NULL;
1565	} else {
1566		root = vm_page_splay(mpte->pindex, root);
1567		if (mpte->pindex < root->pindex) {
1568			mpte->left = root->left;
1569			mpte->right = root;
1570			root->left = NULL;
1571		} else if (mpte->pindex == root->pindex)
1572			panic("pmap_insert_pt_page: pindex already inserted");
1573		else {
1574			mpte->right = root->right;
1575			mpte->left = root;
1576			root->right = NULL;
1577		}
1578	}
1579	pmap->pm_root = mpte;
1580}
1581
1582/*
1583 * Looks for a page table page mapping the specified virtual address in the
1584 * specified pmap's collection of idle page table pages.  Returns NULL if there
1585 * is no page table page corresponding to the specified virtual address.
1586 */
1587static vm_page_t
1588pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1589{
1590	vm_page_t mpte;
1591	vm_pindex_t pindex = va >> PDRSHIFT;
1592
1593	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1594	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1595		mpte = vm_page_splay(pindex, mpte);
1596		if ((pmap->pm_root = mpte)->pindex != pindex)
1597			mpte = NULL;
1598	}
1599	return (mpte);
1600}
1601
1602/*
1603 * Removes the specified page table page from the specified pmap's collection
1604 * of idle page table pages.  The specified page table page must be a member of
1605 * the pmap's collection.
1606 */
1607static void
1608pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1609{
1610	vm_page_t root;
1611
1612	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1613	if (mpte != pmap->pm_root)
1614		vm_page_splay(mpte->pindex, pmap->pm_root);
1615	if (mpte->left == NULL)
1616		root = mpte->right;
1617	else {
1618		root = vm_page_splay(mpte->pindex, mpte->left);
1619		root->right = mpte->right;
1620	}
1621	pmap->pm_root = root;
1622}
1623
1624/*
1625 * This routine decrements a page table page's wire count.  When the
1626 * count reaches zero, the page is unmapped and queued for freeing.
1627 */
1628static __inline int
1629pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1630{
1631
1632	--m->wire_count;
1633	if (m->wire_count == 0)
1634		return (_pmap_unwire_pte_hold(pmap, m, free));
1635	else
1636		return (0);
1637}
1638
1639static int
1640_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1641{
1642	vm_offset_t pteva;
1643
1644	/*
1645	 * unmap the page table page
1646	 */
1647	pmap->pm_pdir[m->pindex] = 0;
1648	--pmap->pm_stats.resident_count;
1649
1650	/*
1651	 * This is a release store so that the ordinary store unmapping
1652	 * the page table page is globally performed before TLB shoot-
1653	 * down is begun.
1654	 */
1655	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1656
1657	/*
1658	 * Do an invltlb to make the invalidated mapping
1659	 * take effect immediately.
1660	 */
1661	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1662	pmap_invalidate_page(pmap, pteva);
1663
1664	/*
1665	 * Put page on a list so that it is released after
1666	 * *ALL* TLB shootdown is done
1667	 */
1668	pmap_add_delayed_free_list(m, free, TRUE);
1669
1670	return (1);
1671}
1672
1673/*
1674 * After removing a page table entry, this routine is used to
1675 * conditionally free the page, and manage the hold/wire counts.
1676 */
1677static int
1678pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1679{
1680	pd_entry_t ptepde;
1681	vm_page_t mpte;
1682
1683	if (va >= VM_MAXUSER_ADDRESS)
1684		return (0);
1685	ptepde = *pmap_pde(pmap, va);
1686	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1687	return (pmap_unwire_pte_hold(pmap, mpte, free));
1688}
1689
1690/*
1691 * Initialize the pmap for the swapper process.
1692 */
1693void
1694pmap_pinit0(pmap_t pmap)
1695{
1696
1697	PMAP_LOCK_INIT(pmap);
1698	/*
1699	 * Since the page table directory is shared with the kernel pmap,
1700	 * which is already included in the list "allpmaps", this pmap does
1701	 * not need to be inserted into that list.
1702	 */
1703	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1704#ifdef PAE
1705	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1706#endif
1707	pmap->pm_root = NULL;
1708	CPU_ZERO(&pmap->pm_active);
1709	PCPU_SET(curpmap, pmap);
1710	TAILQ_INIT(&pmap->pm_pvchunk);
1711	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1712}
1713
1714/*
1715 * Initialize a preallocated and zeroed pmap structure,
1716 * such as one in a vmspace structure.
1717 */
1718int
1719pmap_pinit(pmap_t pmap)
1720{
1721	vm_page_t m, ptdpg[NPGPTD];
1722	vm_paddr_t pa;
1723	static int color;
1724	int i;
1725
1726	PMAP_LOCK_INIT(pmap);
1727
1728	/*
1729	 * No need to allocate page table space yet but we do need a valid
1730	 * page directory table.
1731	 */
1732	if (pmap->pm_pdir == NULL) {
1733		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1734		    NBPTD);
1735
1736		if (pmap->pm_pdir == NULL) {
1737			PMAP_LOCK_DESTROY(pmap);
1738			return (0);
1739		}
1740#ifdef PAE
1741		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1742		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1743		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1744		    ("pmap_pinit: pdpt misaligned"));
1745		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1746		    ("pmap_pinit: pdpt above 4g"));
1747#endif
1748		pmap->pm_root = NULL;
1749	}
1750	KASSERT(pmap->pm_root == NULL,
1751	    ("pmap_pinit: pmap has reserved page table page(s)"));
1752
1753	/*
1754	 * allocate the page directory page(s)
1755	 */
1756	for (i = 0; i < NPGPTD;) {
1757		m = vm_page_alloc(NULL, color++,
1758		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1759		    VM_ALLOC_ZERO);
1760		if (m == NULL)
1761			VM_WAIT;
1762		else {
1763			ptdpg[i++] = m;
1764		}
1765	}
1766
1767	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1768
1769	for (i = 0; i < NPGPTD; i++) {
1770		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1771			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1772	}
1773
1774	mtx_lock_spin(&allpmaps_lock);
1775	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1776	/* Copy the kernel page table directory entries. */
1777	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1778	mtx_unlock_spin(&allpmaps_lock);
1779
1780	/* install self-referential address mapping entry(s) */
1781	for (i = 0; i < NPGPTD; i++) {
1782		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1783		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1784#ifdef PAE
1785		pmap->pm_pdpt[i] = pa | PG_V;
1786#endif
1787	}
1788
1789	CPU_ZERO(&pmap->pm_active);
1790	TAILQ_INIT(&pmap->pm_pvchunk);
1791	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1792
1793	return (1);
1794}
1795
1796/*
1797 * this routine is called if the page table page is not
1798 * mapped correctly.
1799 */
1800static vm_page_t
1801_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1802{
1803	vm_paddr_t ptepa;
1804	vm_page_t m;
1805
1806	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1807	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1808	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1809
1810	/*
1811	 * Allocate a page table page.
1812	 */
1813	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1814	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1815		if (flags & M_WAITOK) {
1816			PMAP_UNLOCK(pmap);
1817			vm_page_unlock_queues();
1818			VM_WAIT;
1819			vm_page_lock_queues();
1820			PMAP_LOCK(pmap);
1821		}
1822
1823		/*
1824		 * Indicate the need to retry.  While waiting, the page table
1825		 * page may have been allocated.
1826		 */
1827		return (NULL);
1828	}
1829	if ((m->flags & PG_ZERO) == 0)
1830		pmap_zero_page(m);
1831
1832	/*
1833	 * Map the pagetable page into the process address space, if
1834	 * it isn't already there.
1835	 */
1836
1837	pmap->pm_stats.resident_count++;
1838
1839	ptepa = VM_PAGE_TO_PHYS(m);
1840	pmap->pm_pdir[ptepindex] =
1841		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1842
1843	return (m);
1844}
1845
1846static vm_page_t
1847pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1848{
1849	unsigned ptepindex;
1850	pd_entry_t ptepa;
1851	vm_page_t m;
1852
1853	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1854	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1855	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1856
1857	/*
1858	 * Calculate the page table page index.
1859	 */
1860	ptepindex = va >> PDRSHIFT;
1861retry:
1862	/*
1863	 * Get the page directory entry
1864	 */
1865	ptepa = pmap->pm_pdir[ptepindex];
1866
1867	/*
1868	 * This supports switching from a 4MB page to a
1869	 * normal 4K page.
1870	 */
1871	if (ptepa & PG_PS) {
1872		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1873		ptepa = pmap->pm_pdir[ptepindex];
1874	}
1875
1876	/*
1877	 * If the page table page is already mapped, just increment its
1878	 * wire count to record the new reference.
1879	 */
1880	if (ptepa) {
1881		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1882		m->wire_count++;
1883	} else {
1884		/*
1885		 * Here if the pte page isn't mapped, or if it has
1886		 * been deallocated.
1887		 */
1888		m = _pmap_allocpte(pmap, ptepindex, flags);
1889		if (m == NULL && (flags & M_WAITOK))
1890			goto retry;
1891	}
1892	return (m);
1893}
1894
1895
1896/***************************************************
1897 * Pmap allocation/deallocation routines.
1898 ***************************************************/
1899
1900#ifdef SMP
1901/*
1902 * Deal with an SMP shootdown of other users of the pmap that we are
1903 * trying to dispose of.  This can be a bit hairy.
1904 */
1905static cpuset_t *lazymask;
1906static u_int lazyptd;
1907static volatile u_int lazywait;
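/*
 * State shared with the lazy-pmap IPI handler: lazyptd holds the %cr3
 * value of the pmap being torn down, lazymask points at its pm_active
 * set, and lazywait is the completion flag.  pmap_lazyfix() initializes
 * them while holding smp_ipi_mtx; each targeted CPU then clears its bit
 * in lazymask and sets lazywait once it has switched away from the pmap.
 */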
1908
1909void pmap_lazyfix_action(void);
1910
1911void
1912pmap_lazyfix_action(void)
1913{
1914
1915#ifdef COUNT_IPIS
1916	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1917#endif
1918	if (rcr3() == lazyptd)
1919		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1920	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1921	atomic_store_rel_int(&lazywait, 1);
1922}
1923
1924static void
1925pmap_lazyfix_self(u_int cpuid)
1926{
1927
1928	if (rcr3() == lazyptd)
1929		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1930	CPU_CLR_ATOMIC(cpuid, lazymask);
1931}
1932
1933
1934static void
1935pmap_lazyfix(pmap_t pmap)
1936{
1937	cpuset_t mymask, mask;
1938	u_int cpuid, spins;
1939	int lsb;
1940
1941	mask = pmap->pm_active;
1942	while (!CPU_EMPTY(&mask)) {
1943		spins = 50000000;
1944
1945		/* Find least significant set bit. */
1946		lsb = cpusetobj_ffs(&mask);
1947		MPASS(lsb != 0);
1948		lsb--;
1949		CPU_SETOF(lsb, &mask);
1950		mtx_lock_spin(&smp_ipi_mtx);
1951#ifdef PAE
1952		lazyptd = vtophys(pmap->pm_pdpt);
1953#else
1954		lazyptd = vtophys(pmap->pm_pdir);
1955#endif
1956		cpuid = PCPU_GET(cpuid);
1957
1958		/* Use a cpuset so the single-CPU case is a simple comparison. */
1959		CPU_SETOF(cpuid, &mymask);
1960		if (!CPU_CMP(&mask, &mymask)) {
1961			lazymask = &pmap->pm_active;
1962			pmap_lazyfix_self(cpuid);
1963		} else {
1964			atomic_store_rel_int((u_int *)&lazymask,
1965			    (u_int)&pmap->pm_active);
1966			atomic_store_rel_int(&lazywait, 0);
1967			ipi_selected(mask, IPI_LAZYPMAP);
1968			while (lazywait == 0) {
1969				ia32_pause();
1970				if (--spins == 0)
1971					break;
1972			}
1973		}
1974		mtx_unlock_spin(&smp_ipi_mtx);
1975		if (spins == 0)
1976			printf("pmap_lazyfix: spun for 50000000\n");
1977		mask = pmap->pm_active;
1978	}
1979}
1980
1981#else	/* SMP */
1982
1983/*
1984 * Cleaning up on a uniprocessor is easy.  We are unlikely to ever
1985 * execute this code because the cleanup is deferred until the parent
1986 * does a wait(2), which means that another userland process has run
1987 * and the CPU is most likely no longer using this pmap's page directory.
1988 */
1989static void
1990pmap_lazyfix(pmap_t pmap)
1991{
1992	u_int cr3;
1993
1994	cr3 = vtophys(pmap->pm_pdir);
1995	if (cr3 == rcr3()) {
1996		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1997		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
1998	}
1999}
2000#endif	/* SMP */
2001
2002/*
2003 * Release any resources held by the given physical map.
2004 * Called when a pmap initialized by pmap_pinit is being released.
2005 * Should only be called if the map contains no valid mappings.
2006 */
2007void
2008pmap_release(pmap_t pmap)
2009{
2010	vm_page_t m, ptdpg[NPGPTD];
2011	int i;
2012
2013	KASSERT(pmap->pm_stats.resident_count == 0,
2014	    ("pmap_release: pmap resident count %ld != 0",
2015	    pmap->pm_stats.resident_count));
2016	KASSERT(pmap->pm_root == NULL,
2017	    ("pmap_release: pmap has reserved page table page(s)"));
2018
2019	pmap_lazyfix(pmap);
2020	mtx_lock_spin(&allpmaps_lock);
2021	LIST_REMOVE(pmap, pm_list);
2022	mtx_unlock_spin(&allpmaps_lock);
2023
2024	for (i = 0; i < NPGPTD; i++)
2025		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2026		    PG_FRAME);
2027
2028	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2029	    sizeof(*pmap->pm_pdir));
2030
2031	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2032
2033	for (i = 0; i < NPGPTD; i++) {
2034		m = ptdpg[i];
2035#ifdef PAE
2036		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2037		    ("pmap_release: got wrong ptd page"));
2038#endif
2039		m->wire_count--;
2040		atomic_subtract_int(&cnt.v_wire_count, 1);
2041		vm_page_free_zero(m);
2042	}
2043	PMAP_LOCK_DESTROY(pmap);
2044}
2045
2046static int
2047kvm_size(SYSCTL_HANDLER_ARGS)
2048{
2049	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2050
2051	return (sysctl_handle_long(oidp, &ksize, 0, req));
2052}
2053SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2054    0, 0, kvm_size, "IU", "Size of KVM");
2055
2056static int
2057kvm_free(SYSCTL_HANDLER_ARGS)
2058{
2059	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2060
2061	return (sysctl_handle_long(oidp, &kfree, 0, req));
2062}
2063SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2064    0, 0, kvm_free, "IU", "Amount of KVM free");
2065
2066/*
2067 * grow the number of kernel page table entries, if needed
2068 */
2069void
2070pmap_growkernel(vm_offset_t addr)
2071{
2072	vm_paddr_t ptppaddr;
2073	vm_page_t nkpg;
2074	pd_entry_t newpdir;
2075
2076	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2077	addr = roundup2(addr, NBPDR);
2078	if (addr - 1 >= kernel_map->max_offset)
2079		addr = kernel_map->max_offset;
2080	while (kernel_vm_end < addr) {
2081		if (pdir_pde(PTD, kernel_vm_end)) {
2082			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2083			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2084				kernel_vm_end = kernel_map->max_offset;
2085				break;
2086			}
2087			continue;
2088		}
2089
2090		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2091		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2092		    VM_ALLOC_ZERO);
2093		if (nkpg == NULL)
2094			panic("pmap_growkernel: no memory to grow kernel");
2095
2096		nkpt++;
2097
2098		if ((nkpg->flags & PG_ZERO) == 0)
2099			pmap_zero_page(nkpg);
2100		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2101		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2102		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2103
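		/*
		 * Install the new kernel PDE.  Since kernel PDEs are
		 * replicated into every page directory, pmap_kenter_pde()
		 * is expected to update each pmap on the allpmaps list,
		 * not just the kernel PTD written above.
		 */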
2104		pmap_kenter_pde(kernel_vm_end, newpdir);
2105		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2106		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2107			kernel_vm_end = kernel_map->max_offset;
2108			break;
2109		}
2110	}
2111}
2112
2113
2114/***************************************************
2115 * Page management routines.
2116 ***************************************************/
2117
2118CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2119CTASSERT(_NPCM == 11);
2120
2121static __inline struct pv_chunk *
2122pv_to_chunk(pv_entry_t pv)
2123{
2124
2125	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2126}
2127
2128#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
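/*
 * Each pv_chunk occupies exactly one page of kernel VA (see the CTASSERT
 * above), so masking the low PAGE_MASK bits off a pv_entry pointer in
 * pv_to_chunk() recovers its containing chunk, and PV_PMAP() then reads
 * the owning pmap from the chunk header without a per-entry back pointer.
 */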
2129
2130#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2131#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2132
2133static uint32_t pc_freemask[11] = {
2134	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2135	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2136	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2137	PC_FREE0_9, PC_FREE10
2138};
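/*
 * pc_freemask[] is the all-free value of pc_map[]: a set bit marks a free
 * pv_entry slot in the chunk.  Ten full 32-bit words plus the low 16 bits
 * of the eleventh cover 10 * 32 + 16 = 336 slots, which is how many
 * pv_entry structures fit in a PAGE_SIZE chunk after its header.
 */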
2139
2140SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2141	"Current number of pv entries");
2142
2143#ifdef PV_STATS
2144static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2145
2146SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2147	"Current number of pv entry chunks");
2148SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2149	"Current number of pv entry chunks allocated");
2150SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2151	"Total number of pv entry chunks freed");
2152SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2153	"Number of times a pv chunk page allocation was tried and failed.");
2154
2155static long pv_entry_frees, pv_entry_allocs;
2156static int pv_entry_spare;
2157
2158SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2159	"Current number of pv entry frees");
2160SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2161	"Current number of pv entry allocs");
2162SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2163	"Current number of spare pv entries");
2164
2165static int pmap_collect_inactive, pmap_collect_active;
2166
2167SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
2168	"Number of times pmap_collect has been called on the inactive queue");
2169SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
2170	"Number of times pmap_collect has been called on the active queue");
2171#endif
2172
2173/*
2174 * We are in a serious low-memory condition.  Resort to
2175 * drastic measures to free some pages so that we can allocate
2176 * another pv entry chunk.  This is normally called first to
2177 * unmap inactive pages and, only if necessary, active pages.
2178 */
2179static void
2180pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
2181{
2182	pd_entry_t *pde;
2183	pmap_t pmap;
2184	pt_entry_t *pte, tpte;
2185	pv_entry_t next_pv, pv;
2186	vm_offset_t va;
2187	vm_page_t m, free;
2188
2189	sched_pin();
2190	TAILQ_FOREACH(m, &vpq->pl, pageq) {
2191		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
2192			continue;
2193		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
2194			va = pv->pv_va;
2195			pmap = PV_PMAP(pv);
2196			/* Avoid deadlock and lock recursion. */
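			/*
			 * Blocking on PMAP_LOCK() is safe only when "pmap"
			 * sorts after "locked_pmap", since pmap locks are
			 * taken in ascending address order; otherwise the
			 * lock is merely tried.
			 */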
2197			if (pmap > locked_pmap)
2198				PMAP_LOCK(pmap);
2199			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2200				continue;
2201			pmap->pm_stats.resident_count--;
2202			pde = pmap_pde(pmap, va);
2203			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2204			    " a 4mpage in page %p's pv list", m));
2205			pte = pmap_pte_quick(pmap, va);
2206			tpte = pte_load_clear(pte);
2207			KASSERT((tpte & PG_W) == 0,
2208			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
2209			if (tpte & PG_A)
2210				vm_page_flag_set(m, PG_REFERENCED);
2211			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2212				vm_page_dirty(m);
2213			free = NULL;
2214			pmap_unuse_pt(pmap, va, &free);
2215			pmap_invalidate_page(pmap, va);
2216			pmap_free_zero_pages(free);
2217			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2218			free_pv_entry(pmap, pv);
2219			if (pmap != locked_pmap)
2220				PMAP_UNLOCK(pmap);
2221		}
2222		if (TAILQ_EMPTY(&m->md.pv_list) &&
2223		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
2224			vm_page_flag_clear(m, PG_WRITEABLE);
2225	}
2226	sched_unpin();
2227}
2228
2229
2230/*
2231 * free the pv_entry back to the free list
2232 */
2233static void
2234free_pv_entry(pmap_t pmap, pv_entry_t pv)
2235{
2236	vm_page_t m;
2237	struct pv_chunk *pc;
2238	int idx, field, bit;
2239
2240	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2241	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2242	PV_STAT(pv_entry_frees++);
2243	PV_STAT(pv_entry_spare++);
2244	pv_entry_count--;
2245	pc = pv_to_chunk(pv);
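	/*
	 * Convert the entry's slot within the chunk into a (word, bit)
	 * position in pc_map[]; for example, slot 45 becomes field 1,
	 * bit 13, since 45 = 1 * 32 + 13.
	 */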
2246	idx = pv - &pc->pc_pventry[0];
2247	field = idx / 32;
2248	bit = idx % 32;
2249	pc->pc_map[field] |= 1ul << bit;
2250	/* move to head of list */
2251	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2252	for (idx = 0; idx < _NPCM; idx++)
2253		if (pc->pc_map[idx] != pc_freemask[idx]) {
2254			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2255			return;
2256		}
2257	PV_STAT(pv_entry_spare -= _NPCPV);
2258	PV_STAT(pc_chunk_count--);
2259	PV_STAT(pc_chunk_frees++);
2260	/* entire chunk is free, return it */
2261	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2262	pmap_qremove((vm_offset_t)pc, 1);
2263	vm_page_unwire(m, 0);
2264	vm_page_free(m);
2265	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2266}
2267
2268/*
2269 * get a new pv_entry, allocating a block from the system
2270 * when needed.
2271 */
2272static pv_entry_t
2273get_pv_entry(pmap_t pmap, int try)
2274{
2275	static const struct timeval printinterval = { 60, 0 };
2276	static struct timeval lastprint;
2277	static vm_pindex_t colour;
2278	struct vpgqueues *pq;
2279	int bit, field;
2280	pv_entry_t pv;
2281	struct pv_chunk *pc;
2282	vm_page_t m;
2283
2284	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2285	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2286	PV_STAT(pv_entry_allocs++);
2287	pv_entry_count++;
2288	if (pv_entry_count > pv_entry_high_water)
2289		if (ratecheck(&lastprint, &printinterval))
2290			printf("Approaching the limit on PV entries, consider "
2291			    "increasing either the vm.pmap.shpgperproc or the "
2292			    "vm.pmap.pv_entry_max tunable.\n");
2293	pq = NULL;
2294retry:
2295	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2296	if (pc != NULL) {
2297		for (field = 0; field < _NPCM; field++) {
2298			if (pc->pc_map[field]) {
2299				bit = bsfl(pc->pc_map[field]);
2300				break;
2301			}
2302		}
2303		if (field < _NPCM) {
2304			pv = &pc->pc_pventry[field * 32 + bit];
2305			pc->pc_map[field] &= ~(1ul << bit);
2306			/* If this was the last free entry, move the chunk to the list tail. */
2307			for (field = 0; field < _NPCM; field++)
2308				if (pc->pc_map[field] != 0) {
2309					PV_STAT(pv_entry_spare--);
2310					return (pv);	/* not full, return */
2311				}
2312			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2313			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2314			PV_STAT(pv_entry_spare--);
2315			return (pv);
2316		}
2317	}
2318	/*
2319	 * Access to the ptelist "pv_vafree" is synchronized by the page
2320	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2321	 * remain non-empty until pmap_ptelist_alloc() completes.
2322	 */
2323	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2324	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2325	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2326		if (try) {
2327			pv_entry_count--;
2328			PV_STAT(pc_chunk_tryfail++);
2329			return (NULL);
2330		}
2331		/*
2332		 * Reclaim pv entries: At first, destroy mappings to
2333		 * inactive pages.  After that, if a pv chunk entry
2334		 * is still needed, destroy mappings to active pages.
2335		 */
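		/*
		 * The escalation below runs at most twice per allocation:
		 * pq goes from NULL to PQ_INACTIVE to PQ_ACTIVE.  A third
		 * failure means the pv entry limit itself is exhausted,
		 * hence the panic suggesting vm.pmap.shpgperproc.
		 */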
2336		if (pq == NULL) {
2337			PV_STAT(pmap_collect_inactive++);
2338			pq = &vm_page_queues[PQ_INACTIVE];
2339		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2340			PV_STAT(pmap_collect_active++);
2341			pq = &vm_page_queues[PQ_ACTIVE];
2342		} else
2343			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2344		pmap_collect(pmap, pq);
2345		goto retry;
2346	}
2347	PV_STAT(pc_chunk_count++);
2348	PV_STAT(pc_chunk_allocs++);
2349	colour++;
2350	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2351	pmap_qenter((vm_offset_t)pc, &m, 1);
2352	pc->pc_pmap = pmap;
2353	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2354	for (field = 1; field < _NPCM; field++)
2355		pc->pc_map[field] = pc_freemask[field];
2356	pv = &pc->pc_pventry[0];
2357	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2358	PV_STAT(pv_entry_spare += _NPCPV - 1);
2359	return (pv);
2360}
2361
2362static __inline pv_entry_t
2363pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2364{
2365	pv_entry_t pv;
2366
2367	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2368	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2369		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2370			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2371			break;
2372		}
2373	}
2374	return (pv);
2375}
2376
2377static void
2378pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2379{
2380	struct md_page *pvh;
2381	pv_entry_t pv;
2382	vm_offset_t va_last;
2383	vm_page_t m;
2384
2385	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2386	KASSERT((pa & PDRMASK) == 0,
2387	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2388
2389	/*
2390	 * Transfer the 4mpage's pv entry for this mapping to the first
2391	 * page's pv list.
2392	 */
2393	pvh = pa_to_pvh(pa);
2394	va = trunc_4mpage(va);
2395	pv = pmap_pvh_remove(pvh, pmap, va);
2396	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2397	m = PHYS_TO_VM_PAGE(pa);
2398	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2399	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2400	va_last = va + NBPDR - PAGE_SIZE;
2401	do {
2402		m++;
2403		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2404		    ("pmap_pv_demote_pde: page %p is not managed", m));
2405		va += PAGE_SIZE;
2406		pmap_insert_entry(pmap, va, m);
2407	} while (va < va_last);
2408}
2409
2410static void
2411pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2412{
2413	struct md_page *pvh;
2414	pv_entry_t pv;
2415	vm_offset_t va_last;
2416	vm_page_t m;
2417
2418	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2419	KASSERT((pa & PDRMASK) == 0,
2420	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2421
2422	/*
2423	 * Transfer the first page's pv entry for this mapping to the
2424	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2425	 * to get_pv_entry(), a transfer avoids the possibility that
2426	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2427	 * removes one of the mappings that is being promoted.
2428	 */
2429	m = PHYS_TO_VM_PAGE(pa);
2430	va = trunc_4mpage(va);
2431	pv = pmap_pvh_remove(&m->md, pmap, va);
2432	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2433	pvh = pa_to_pvh(pa);
2434	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2435	/* Free the remaining NPTEPG - 1 pv entries. */
2436	va_last = va + NBPDR - PAGE_SIZE;
2437	do {
2438		m++;
2439		va += PAGE_SIZE;
2440		pmap_pvh_free(&m->md, pmap, va);
2441	} while (va < va_last);
2442}
2443
2444static void
2445pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2446{
2447	pv_entry_t pv;
2448
2449	pv = pmap_pvh_remove(pvh, pmap, va);
2450	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2451	free_pv_entry(pmap, pv);
2452}
2453
2454static void
2455pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2456{
2457	struct md_page *pvh;
2458
2459	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2460	pmap_pvh_free(&m->md, pmap, va);
2461	if (TAILQ_EMPTY(&m->md.pv_list)) {
2462		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2463		if (TAILQ_EMPTY(&pvh->pv_list))
2464			vm_page_flag_clear(m, PG_WRITEABLE);
2465	}
2466}
2467
2468/*
2469 * Create a pv entry for page at pa for
2470 * (pmap, va).
2471 */
2472static void
2473pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2474{
2475	pv_entry_t pv;
2476
2477	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2478	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2479	pv = get_pv_entry(pmap, FALSE);
2480	pv->pv_va = va;
2481	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2482}
2483
2484/*
2485 * Conditionally create a pv entry.
2486 */
2487static boolean_t
2488pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2489{
2490	pv_entry_t pv;
2491
2492	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2493	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2494	if (pv_entry_count < pv_entry_high_water &&
2495	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2496		pv->pv_va = va;
2497		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2498		return (TRUE);
2499	} else
2500		return (FALSE);
2501}
2502
2503/*
2504 * Create the pv entries for each of the pages within a superpage.
2505 */
2506static boolean_t
2507pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2508{
2509	struct md_page *pvh;
2510	pv_entry_t pv;
2511
2512	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2513	if (pv_entry_count < pv_entry_high_water &&
2514	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2515		pv->pv_va = va;
2516		pvh = pa_to_pvh(pa);
2517		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2518		return (TRUE);
2519	} else
2520		return (FALSE);
2521}
2522
2523/*
2524 * Fills a page table page with mappings to consecutive physical pages.
2525 */
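/*
 * For example, given newpte = pa | PG_V | PG_RW, the loop below installs
 * NPTEPG 4KB mappings covering [pa, pa + NBPDR), which is how a demoted
 * 2/4MB mapping is re-expressed one page at a time.
 */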
2526static void
2527pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2528{
2529	pt_entry_t *pte;
2530
2531	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2532		*pte = newpte;
2533		newpte += PAGE_SIZE;
2534	}
2535}
2536
2537/*
2538 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2539 * 2- or 4MB page mapping is invalidated.
2540 */
2541static boolean_t
2542pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2543{
2544	pd_entry_t newpde, oldpde;
2545	pt_entry_t *firstpte, newpte;
2546	vm_paddr_t mptepa;
2547	vm_page_t free, mpte;
2548
2549	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2550	oldpde = *pde;
2551	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2552	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2553	mpte = pmap_lookup_pt_page(pmap, va);
2554	if (mpte != NULL)
2555		pmap_remove_pt_page(pmap, mpte);
2556	else {
2557		KASSERT((oldpde & PG_W) == 0,
2558		    ("pmap_demote_pde: page table page for a wired mapping"
2559		    " is missing"));
2560
2561		/*
2562		 * Invalidate the 2- or 4MB page mapping and return
2563		 * "failure" if the mapping was never accessed or the
2564		 * allocation of the new page table page fails.
2565		 */
2566		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2567		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2568		    VM_ALLOC_WIRED)) == NULL) {
2569			free = NULL;
2570			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2571			pmap_invalidate_page(pmap, trunc_4mpage(va));
2572			pmap_free_zero_pages(free);
2573			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2574			    " in pmap %p", va, pmap);
2575			return (FALSE);
2576		}
2577		if (va < VM_MAXUSER_ADDRESS)
2578			pmap->pm_stats.resident_count++;
2579	}
2580	mptepa = VM_PAGE_TO_PHYS(mpte);
2581
2582	/*
2583	 * If the page mapping is in the kernel's address space, then the
2584	 * KPTmap can provide access to the page table page.  Otherwise,
2585	 * temporarily map the page table page (mpte) into the kernel's
2586	 * address space at either PADDR1 or PADDR2.
2587	 */
2588	if (va >= KERNBASE)
2589		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2590	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2591		if ((*PMAP1 & PG_FRAME) != mptepa) {
2592			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2593#ifdef SMP
2594			PMAP1cpu = PCPU_GET(cpuid);
2595#endif
2596			invlcaddr(PADDR1);
2597			PMAP1changed++;
2598		} else
2599#ifdef SMP
2600		if (PMAP1cpu != PCPU_GET(cpuid)) {
2601			PMAP1cpu = PCPU_GET(cpuid);
2602			invlcaddr(PADDR1);
2603			PMAP1changedcpu++;
2604		} else
2605#endif
2606			PMAP1unchanged++;
2607		firstpte = PADDR1;
2608	} else {
2609		mtx_lock(&PMAP2mutex);
2610		if ((*PMAP2 & PG_FRAME) != mptepa) {
2611			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2612			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2613		}
2614		firstpte = PADDR2;
2615	}
2616	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2617	KASSERT((oldpde & PG_A) != 0,
2618	    ("pmap_demote_pde: oldpde is missing PG_A"));
2619	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2620	    ("pmap_demote_pde: oldpde is missing PG_M"));
2621	newpte = oldpde & ~PG_PS;
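	/*
	 * The PAT index bit sits at a different position in a PDE than in
	 * a PTE (PG_PDE_PAT vs. PG_PTE_PAT), so if it is set, the
	 * exclusive-or below clears the former and sets the latter.
	 */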
2622	if ((newpte & PG_PDE_PAT) != 0)
2623		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2624
2625	/*
2626	 * If the page table page is new, initialize it.
2627	 */
2628	if (mpte->wire_count == 1) {
2629		mpte->wire_count = NPTEPG;
2630		pmap_fill_ptp(firstpte, newpte);
2631	}
2632	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2633	    ("pmap_demote_pde: firstpte and newpte map different physical"
2634	    " addresses"));
2635
2636	/*
2637	 * If the mapping has changed attributes, update the page table
2638	 * entries.
2639	 */
2640	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2641		pmap_fill_ptp(firstpte, newpte);
2642
2643	/*
2644	 * Demote the mapping.  This pmap is locked.  The old PDE has
2645	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2646	 * set.  Thus, there is no danger of a race with another
2647	 * processor changing the setting of PG_A and/or PG_M between
2648	 * the read above and the store below.
2649	 */
2650	if (workaround_erratum383)
2651		pmap_update_pde(pmap, va, pde, newpde);
2652	else if (pmap == kernel_pmap)
2653		pmap_kenter_pde(va, newpde);
2654	else
2655		pde_store(pde, newpde);
2656	if (firstpte == PADDR2)
2657		mtx_unlock(&PMAP2mutex);
2658
2659	/*
2660	 * Invalidate the recursive mapping of the page table page.
2661	 */
2662	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2663
2664	/*
2665	 * Demote the pv entry.  This depends on the earlier demotion
2666	 * of the mapping.  Specifically, the (re)creation of a per-
2667	 * page pv entry might trigger the execution of pmap_collect(),
2668	 * which might reclaim a newly (re)created per-page pv entry
2669	 * and destroy the associated mapping.  In order to destroy
2670	 * the mapping, the PDE must have already changed from mapping
2671	 * the 2mpage to referencing the page table page.
2672	 */
2673	if ((oldpde & PG_MANAGED) != 0)
2674		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2675
2676	pmap_pde_demotions++;
2677	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2678	    " in pmap %p", va, pmap);
2679	return (TRUE);
2680}
2681
2682/*
2683 * pmap_remove_pde: unmap a 2- or 4MB (superpage) mapping from a process
2684 */
2685static void
2686pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2687    vm_page_t *free)
2688{
2689	struct md_page *pvh;
2690	pd_entry_t oldpde;
2691	vm_offset_t eva, va;
2692	vm_page_t m, mpte;
2693
2694	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2695	KASSERT((sva & PDRMASK) == 0,
2696	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2697	oldpde = pte_load_clear(pdq);
2698	if (oldpde & PG_W)
2699		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2700
2701	/*
2702	 * Machines that don't support invlpg also don't support
2703	 * PG_G.
2704	 */
2705	if (oldpde & PG_G)
2706		pmap_invalidate_page(kernel_pmap, sva);
2707	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2708	if (oldpde & PG_MANAGED) {
2709		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2710		pmap_pvh_free(pvh, pmap, sva);
2711		eva = sva + NBPDR;
2712		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2713		    va < eva; va += PAGE_SIZE, m++) {
2714			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2715				vm_page_dirty(m);
2716			if (oldpde & PG_A)
2717				vm_page_flag_set(m, PG_REFERENCED);
2718			if (TAILQ_EMPTY(&m->md.pv_list) &&
2719			    TAILQ_EMPTY(&pvh->pv_list))
2720				vm_page_flag_clear(m, PG_WRITEABLE);
2721		}
2722	}
2723	if (pmap == kernel_pmap) {
2724		if (!pmap_demote_pde(pmap, pdq, sva))
2725			panic("pmap_remove_pde: failed demotion");
2726	} else {
2727		mpte = pmap_lookup_pt_page(pmap, sva);
2728		if (mpte != NULL) {
2729			pmap_remove_pt_page(pmap, mpte);
2730			pmap->pm_stats.resident_count--;
2731			KASSERT(mpte->wire_count == NPTEPG,
2732			    ("pmap_remove_pde: pte page wire count error"));
2733			mpte->wire_count = 0;
2734			pmap_add_delayed_free_list(mpte, free, FALSE);
2735			atomic_subtract_int(&cnt.v_wire_count, 1);
2736		}
2737	}
2738}
2739
2740/*
2741 * pmap_remove_pte: unmap a single 4KB page from a process address space
2742 */
2743static int
2744pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2745{
2746	pt_entry_t oldpte;
2747	vm_page_t m;
2748
2749	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2750	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2751	oldpte = pte_load_clear(ptq);
2752	if (oldpte & PG_W)
2753		pmap->pm_stats.wired_count -= 1;
2754	/*
2755	 * Machines that don't support invlpg also don't support
2756	 * PG_G.
2757	 */
2758	if (oldpte & PG_G)
2759		pmap_invalidate_page(kernel_pmap, va);
2760	pmap->pm_stats.resident_count -= 1;
2761	if (oldpte & PG_MANAGED) {
2762		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2763		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2764			vm_page_dirty(m);
2765		if (oldpte & PG_A)
2766			vm_page_flag_set(m, PG_REFERENCED);
2767		pmap_remove_entry(pmap, m, va);
2768	}
2769	return (pmap_unuse_pt(pmap, va, free));
2770}
2771
2772/*
2773 * Remove a single page from a process address space
2774 */
2775static void
2776pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2777{
2778	pt_entry_t *pte;
2779
2780	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2781	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2782	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2783	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2784		return;
2785	pmap_remove_pte(pmap, pte, va, free);
2786	pmap_invalidate_page(pmap, va);
2787}
2788
2789/*
2790 *	Remove the given range of addresses from the specified map.
2791 *
2792 *	It is assumed that the start and end are properly
2793 *	rounded to the page size.
2794 */
2795void
2796pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2797{
2798	vm_offset_t pdnxt;
2799	pd_entry_t ptpaddr;
2800	pt_entry_t *pte;
2801	vm_page_t free = NULL;
2802	int anyvalid;
2803
2804	/*
2805	 * Perform an unsynchronized read.  This is, however, safe.
2806	 */
2807	if (pmap->pm_stats.resident_count == 0)
2808		return;
2809
2810	anyvalid = 0;
2811
2812	vm_page_lock_queues();
2813	sched_pin();
2814	PMAP_LOCK(pmap);
2815
2816	/*
2817	 * Special handling for removing a single page: this is a very
2818	 * common operation, so it is worth short-circuiting the general
2819	 * loop below.
2820	 */
2821	if ((sva + PAGE_SIZE == eva) &&
2822	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2823		pmap_remove_page(pmap, sva, &free);
2824		goto out;
2825	}
2826
2827	for (; sva < eva; sva = pdnxt) {
2828		unsigned pdirindex;
2829
2830		/*
2831		 * Calculate index for next page table.
2832		 */
2833		pdnxt = (sva + NBPDR) & ~PDRMASK;
2834		if (pdnxt < sva)
2835			pdnxt = eva;
2836		if (pmap->pm_stats.resident_count == 0)
2837			break;
2838
2839		pdirindex = sva >> PDRSHIFT;
2840		ptpaddr = pmap->pm_pdir[pdirindex];
2841
2842		/*
2843		 * Weed out invalid mappings.  We assume that the page
2844		 * directory is always allocated and lives in kernel VA space.
2845		 */
2846		if (ptpaddr == 0)
2847			continue;
2848
2849		/*
2850		 * Check for large page.
2851		 */
2852		if ((ptpaddr & PG_PS) != 0) {
2853			/*
2854			 * Are we removing the entire large page?  If not,
2855			 * demote the mapping and fall through.
2856			 */
2857			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2858				/*
2859				 * The TLB entry for a PG_G mapping is
2860				 * invalidated by pmap_remove_pde().
2861				 */
2862				if ((ptpaddr & PG_G) == 0)
2863					anyvalid = 1;
2864				pmap_remove_pde(pmap,
2865				    &pmap->pm_pdir[pdirindex], sva, &free);
2866				continue;
2867			} else if (!pmap_demote_pde(pmap,
2868			    &pmap->pm_pdir[pdirindex], sva)) {
2869				/* The large page mapping was destroyed. */
2870				continue;
2871			}
2872		}
2873
2874		/*
2875		 * Limit our scan to either the end of the va represented
2876		 * by the current page table page, or to the end of the
2877		 * range being removed.
2878		 */
2879		if (pdnxt > eva)
2880			pdnxt = eva;
2881
2882		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2883		    sva += PAGE_SIZE) {
2884			if (*pte == 0)
2885				continue;
2886
2887			/*
2888			 * The TLB entry for a PG_G mapping is invalidated
2889			 * by pmap_remove_pte().
2890			 */
2891			if ((*pte & PG_G) == 0)
2892				anyvalid = 1;
2893			if (pmap_remove_pte(pmap, pte, sva, &free))
2894				break;
2895		}
2896	}
2897out:
2898	sched_unpin();
2899	if (anyvalid)
2900		pmap_invalidate_all(pmap);
2901	vm_page_unlock_queues();
2902	PMAP_UNLOCK(pmap);
2903	pmap_free_zero_pages(free);
2904}
2905
2906/*
2907 *	Routine:	pmap_remove_all
2908 *	Function:
2909 *		Removes this physical page from
2910 *		all physical maps in which it resides.
2911 *		Reflects back modify bits to the pager.
2912 *
2913 *	Notes:
2914 *		Original versions of this routine were very
2915 *		inefficient because they iteratively called
2916 *		pmap_remove (slow...)
2917 */
2918
2919void
2920pmap_remove_all(vm_page_t m)
2921{
2922	struct md_page *pvh;
2923	pv_entry_t pv;
2924	pmap_t pmap;
2925	pt_entry_t *pte, tpte;
2926	pd_entry_t *pde;
2927	vm_offset_t va;
2928	vm_page_t free;
2929
2930	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2931	    ("pmap_remove_all: page %p is not managed", m));
2932	free = NULL;
2933	vm_page_lock_queues();
2934	sched_pin();
2935	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2936	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2937		va = pv->pv_va;
2938		pmap = PV_PMAP(pv);
2939		PMAP_LOCK(pmap);
2940		pde = pmap_pde(pmap, va);
2941		(void)pmap_demote_pde(pmap, pde, va);
2942		PMAP_UNLOCK(pmap);
2943	}
2944	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2945		pmap = PV_PMAP(pv);
2946		PMAP_LOCK(pmap);
2947		pmap->pm_stats.resident_count--;
2948		pde = pmap_pde(pmap, pv->pv_va);
2949		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2950		    " a 4mpage in page %p's pv list", m));
2951		pte = pmap_pte_quick(pmap, pv->pv_va);
2952		tpte = pte_load_clear(pte);
2953		if (tpte & PG_W)
2954			pmap->pm_stats.wired_count--;
2955		if (tpte & PG_A)
2956			vm_page_flag_set(m, PG_REFERENCED);
2957
2958		/*
2959		 * Update the vm_page_t clean and reference bits.
2960		 */
2961		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2962			vm_page_dirty(m);
2963		pmap_unuse_pt(pmap, pv->pv_va, &free);
2964		pmap_invalidate_page(pmap, pv->pv_va);
2965		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2966		free_pv_entry(pmap, pv);
2967		PMAP_UNLOCK(pmap);
2968	}
2969	vm_page_flag_clear(m, PG_WRITEABLE);
2970	sched_unpin();
2971	vm_page_unlock_queues();
2972	pmap_free_zero_pages(free);
2973}
2974
2975/*
2976 * pmap_protect_pde: do the things to protect a 4mpage in a process
2977 */
2978static boolean_t
2979pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2980{
2981	pd_entry_t newpde, oldpde;
2982	vm_offset_t eva, va;
2983	vm_page_t m;
2984	boolean_t anychanged;
2985
2986	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2987	KASSERT((sva & PDRMASK) == 0,
2988	    ("pmap_protect_pde: sva is not 4mpage aligned"));
2989	anychanged = FALSE;
2990retry:
2991	oldpde = newpde = *pde;
2992	if (oldpde & PG_MANAGED) {
2993		eva = sva + NBPDR;
2994		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2995		    va < eva; va += PAGE_SIZE, m++)
2996			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2997				vm_page_dirty(m);
2998	}
2999	if ((prot & VM_PROT_WRITE) == 0)
3000		newpde &= ~(PG_RW | PG_M);
3001#ifdef PAE
3002	if ((prot & VM_PROT_EXECUTE) == 0)
3003		newpde |= pg_nx;
3004#endif
3005	if (newpde != oldpde) {
3006		if (!pde_cmpset(pde, oldpde, newpde))
3007			goto retry;
3008		if (oldpde & PG_G)
3009			pmap_invalidate_page(pmap, sva);
3010		else
3011			anychanged = TRUE;
3012	}
3013	return (anychanged);
3014}
3015
3016/*
3017 *	Set the physical protection on the
3018 *	specified range of this map as requested.
3019 */
3020void
3021pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3022{
3023	vm_offset_t pdnxt;
3024	pd_entry_t ptpaddr;
3025	pt_entry_t *pte;
3026	int anychanged;
3027
3028	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3029		pmap_remove(pmap, sva, eva);
3030		return;
3031	}
3032
3033#ifdef PAE
3034	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3035	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3036		return;
3037#else
3038	if (prot & VM_PROT_WRITE)
3039		return;
3040#endif
3041
3042	anychanged = 0;
3043
3044	vm_page_lock_queues();
3045	sched_pin();
3046	PMAP_LOCK(pmap);
3047	for (; sva < eva; sva = pdnxt) {
3048		pt_entry_t obits, pbits;
3049		unsigned pdirindex;
3050
3051		pdnxt = (sva + NBPDR) & ~PDRMASK;
3052		if (pdnxt < sva)
3053			pdnxt = eva;
3054
3055		pdirindex = sva >> PDRSHIFT;
3056		ptpaddr = pmap->pm_pdir[pdirindex];
3057
3058		/*
3059		 * Weed out invalid mappings.  We assume that the page
3060		 * directory is always allocated and lives in kernel VA space.
3061		 */
3062		if (ptpaddr == 0)
3063			continue;
3064
3065		/*
3066		 * Check for large page.
3067		 */
3068		if ((ptpaddr & PG_PS) != 0) {
3069			/*
3070			 * Are we protecting the entire large page?  If not,
3071			 * demote the mapping and fall through.
3072			 */
3073			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3074				/*
3075				 * The TLB entry for a PG_G mapping is
3076				 * invalidated by pmap_protect_pde().
3077				 */
3078				if (pmap_protect_pde(pmap,
3079				    &pmap->pm_pdir[pdirindex], sva, prot))
3080					anychanged = 1;
3081				continue;
3082			} else if (!pmap_demote_pde(pmap,
3083			    &pmap->pm_pdir[pdirindex], sva)) {
3084				/* The large page mapping was destroyed. */
3085				continue;
3086			}
3087		}
3088
3089		if (pdnxt > eva)
3090			pdnxt = eva;
3091
3092		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3093		    sva += PAGE_SIZE) {
3094			vm_page_t m;
3095
3096retry:
3097			/*
3098			 * Regardless of whether a pte is 32 or 64 bits in
3099			 * size, PG_RW, PG_A, and PG_M are among the least
3100			 * significant 32 bits.
3101			 */
3102			obits = pbits = *pte;
3103			if ((pbits & PG_V) == 0)
3104				continue;
3105
3106			if ((prot & VM_PROT_WRITE) == 0) {
3107				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3108				    (PG_MANAGED | PG_M | PG_RW)) {
3109					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3110					vm_page_dirty(m);
3111				}
3112				pbits &= ~(PG_RW | PG_M);
3113			}
3114#ifdef PAE
3115			if ((prot & VM_PROT_EXECUTE) == 0)
3116				pbits |= pg_nx;
3117#endif
3118
3119			if (pbits != obits) {
3120#ifdef PAE
3121				if (!atomic_cmpset_64(pte, obits, pbits))
3122					goto retry;
3123#else
3124				if (!atomic_cmpset_int((u_int *)pte, obits,
3125				    pbits))
3126					goto retry;
3127#endif
3128				if (obits & PG_G)
3129					pmap_invalidate_page(pmap, sva);
3130				else
3131					anychanged = 1;
3132			}
3133		}
3134	}
3135	sched_unpin();
3136	if (anychanged)
3137		pmap_invalidate_all(pmap);
3138	vm_page_unlock_queues();
3139	PMAP_UNLOCK(pmap);
3140}
3141
3142/*
3143 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3144 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3145 * For promotion to occur, two conditions must be met: (1) the 4KB page
3146 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3147 * mappings must have identical characteristics.
3148 *
3149 * Managed (PG_MANAGED) mappings within the kernel address space are not
3150 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3151 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3152 * pmap.
3153 */
3154static void
3155pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3156{
3157	pd_entry_t newpde;
3158	pt_entry_t *firstpte, oldpte, pa, *pte;
3159	vm_offset_t oldpteva;
3160	vm_page_t mpte;
3161
3162	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3163
3164	/*
3165	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3166	 * either invalid, unused, or does not map the first 4KB physical page
3167	 * within a 2- or 4MB page.
3168	 */
3169	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3170setpde:
3171	newpde = *firstpte;
3172	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3173		pmap_pde_p_failures++;
3174		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3175		    " in pmap %p", va, pmap);
3176		return;
3177	}
3178	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3179		pmap_pde_p_failures++;
3180		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3181		    " in pmap %p", va, pmap);
3182		return;
3183	}
3184	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3185		/*
3186		 * When PG_M is already clear, PG_RW can be cleared without
3187		 * a TLB invalidation.
3188		 */
3189		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3190		    ~PG_RW))
3191			goto setpde;
3192		newpde &= ~PG_RW;
3193	}
3194
3195	/*
3196	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3197	 * PTE maps an unexpected 4KB physical page or does not have identical
3198	 * characteristics to the first PTE.
3199	 */
3200	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
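	/*
	 * "pa" folds the required PG_A and PG_V bits into the expected
	 * physical address of the last 4KB page, so the single comparison
	 * in the loop below verifies both that each PTE is valid and
	 * accessed and that it maps the next lower physical page.
	 */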
3201	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3202setpte:
3203		oldpte = *pte;
3204		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3205			pmap_pde_p_failures++;
3206			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3207			    " in pmap %p", va, pmap);
3208			return;
3209		}
3210		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3211			/*
3212			 * When PG_M is already clear, PG_RW can be cleared
3213			 * without a TLB invalidation.
3214			 */
3215			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3216			    oldpte & ~PG_RW))
3217				goto setpte;
3218			oldpte &= ~PG_RW;
3219			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3220			    (va & ~PDRMASK);
3221			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3222			    " in pmap %p", oldpteva, pmap);
3223		}
3224		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3225			pmap_pde_p_failures++;
3226			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3227			    " in pmap %p", va, pmap);
3228			return;
3229		}
3230		pa -= PAGE_SIZE;
3231	}
3232
3233	/*
3234	 * Save the page table page in its current state until the PDE
3235	 * mapping the superpage is demoted by pmap_demote_pde() or
3236	 * destroyed by pmap_remove_pde().
3237	 */
3238	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3239	KASSERT(mpte >= vm_page_array &&
3240	    mpte < &vm_page_array[vm_page_array_size],
3241	    ("pmap_promote_pde: page table page is out of range"));
3242	KASSERT(mpte->pindex == va >> PDRSHIFT,
3243	    ("pmap_promote_pde: page table page's pindex is wrong"));
3244	pmap_insert_pt_page(pmap, mpte);
3245
3246	/*
3247	 * Promote the pv entries.
3248	 */
3249	if ((newpde & PG_MANAGED) != 0)
3250		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3251
3252	/*
3253	 * Propagate the PAT index to its proper position.
3254	 */
3255	if ((newpde & PG_PTE_PAT) != 0)
3256		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3257
3258	/*
3259	 * Map the superpage.
3260	 */
3261	if (workaround_erratum383)
3262		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3263	else if (pmap == kernel_pmap)
3264		pmap_kenter_pde(va, PG_PS | newpde);
3265	else
3266		pde_store(pde, PG_PS | newpde);
3267
3268	pmap_pde_promotions++;
3269	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3270	    " in pmap %p", va, pmap);
3271}
3272
3273/*
3274 *	Insert the given physical page (p) at
3275 *	the specified virtual address (v) in the
3276 *	target physical map with the protection requested.
3277 *
3278 *	If specified, the page will be wired down, meaning
3279 *	that the related pte can not be reclaimed.
3280 *
3281 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3282 *	or lose information.  That is, this routine must actually
3283 *	insert this page into the given map NOW.
3284 */
3285void
3286pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3287    vm_prot_t prot, boolean_t wired)
3288{
3289	pd_entry_t *pde;
3290	pt_entry_t *pte;
3291	pt_entry_t newpte, origpte;
3292	pv_entry_t pv;
3293	vm_paddr_t opa, pa;
3294	vm_page_t mpte, om;
3295	boolean_t invlva;
3296
3297	va = trunc_page(va);
3298	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3299	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3300	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3301	    va));
3302	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3303	    VM_OBJECT_LOCKED(m->object),
3304	    ("pmap_enter: page %p is not busy", m));
3305
3306	mpte = NULL;
3307
3308	vm_page_lock_queues();
3309	PMAP_LOCK(pmap);
3310	sched_pin();
3311
3312	/*
3313	 * In the case that a page table page is not
3314	 * resident, we are creating it here.
3315	 */
3316	if (va < VM_MAXUSER_ADDRESS) {
3317		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3318	}
3319
3320	pde = pmap_pde(pmap, va);
3321	if ((*pde & PG_PS) != 0)
3322		panic("pmap_enter: attempted pmap_enter on 4MB page");
3323	pte = pmap_pte_quick(pmap, va);
3324
3325	/*
3326	 * The page directory entry is not valid, so we need a new PT page.
3327	 */
3328	if (pte == NULL) {
3329		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3330			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3331	}
3332
3333	pa = VM_PAGE_TO_PHYS(m);
3334	om = NULL;
3335	origpte = *pte;
3336	opa = origpte & PG_FRAME;
3337
3338	/*
3339	 * Mapping has not changed, must be protection or wiring change.
3340	 */
3341	if (origpte && (opa == pa)) {
3342		/*
3343		 * Wiring change, just update stats. We don't worry about
3344		 * wiring PT pages as they remain resident as long as there
3345		 * are valid mappings in them. Hence, if a user page is wired,
3346		 * the PT page will be also.
3347		 */
3348		if (wired && ((origpte & PG_W) == 0))
3349			pmap->pm_stats.wired_count++;
3350		else if (!wired && (origpte & PG_W))
3351			pmap->pm_stats.wired_count--;
3352
3353		/*
3354		 * Remove extra pte reference
3355		 */
3356		if (mpte)
3357			mpte->wire_count--;
3358
3359		if (origpte & PG_MANAGED) {
3360			om = m;
3361			pa |= PG_MANAGED;
3362		}
3363		goto validate;
3364	}
3365
3366	pv = NULL;
3367
3368	/*
3369	 * Mapping has changed, invalidate old range and fall through to
3370	 * handle validating new mapping.
3371	 */
3372	if (opa) {
3373		if (origpte & PG_W)
3374			pmap->pm_stats.wired_count--;
3375		if (origpte & PG_MANAGED) {
3376			om = PHYS_TO_VM_PAGE(opa);
3377			pv = pmap_pvh_remove(&om->md, pmap, va);
3378		}
3379		if (mpte != NULL) {
3380			mpte->wire_count--;
3381			KASSERT(mpte->wire_count > 0,
3382			    ("pmap_enter: missing reference to page table page,"
3383			     " va: 0x%x", va));
3384		}
3385	} else
3386		pmap->pm_stats.resident_count++;
3387
3388	/*
3389	 * Enter on the PV list if part of our managed memory.
3390	 */
3391	if ((m->oflags & VPO_UNMANAGED) == 0) {
3392		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3393		    ("pmap_enter: managed mapping within the clean submap"));
3394		if (pv == NULL)
3395			pv = get_pv_entry(pmap, FALSE);
3396		pv->pv_va = va;
3397		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3398		pa |= PG_MANAGED;
3399	} else if (pv != NULL)
3400		free_pv_entry(pmap, pv);
3401
3402	/*
3403	 * Increment counters
3404	 */
3405	if (wired)
3406		pmap->pm_stats.wired_count++;
3407
3408validate:
3409	/*
3410	 * Now validate mapping with desired protection/wiring.
3411	 */
3412	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3413	if ((prot & VM_PROT_WRITE) != 0) {
3414		newpte |= PG_RW;
3415		if ((newpte & PG_MANAGED) != 0)
3416			vm_page_flag_set(m, PG_WRITEABLE);
3417	}
3418#ifdef PAE
3419	if ((prot & VM_PROT_EXECUTE) == 0)
3420		newpte |= pg_nx;
3421#endif
3422	if (wired)
3423		newpte |= PG_W;
3424	if (va < VM_MAXUSER_ADDRESS)
3425		newpte |= PG_U;
3426	if (pmap == kernel_pmap)
3427		newpte |= pgeflag;
3428
3429	/*
3430	 * if the mapping or permission bits are different, we need
3431	 * to update the pte.
3432	 */
3433	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3434		newpte |= PG_A;
3435		if ((access & VM_PROT_WRITE) != 0)
3436			newpte |= PG_M;
3437		if (origpte & PG_V) {
3438			invlva = FALSE;
3439			origpte = pte_load_store(pte, newpte);
3440			if (origpte & PG_A) {
3441				if (origpte & PG_MANAGED)
3442					vm_page_flag_set(om, PG_REFERENCED);
3443				if (opa != VM_PAGE_TO_PHYS(m))
3444					invlva = TRUE;
3445#ifdef PAE
3446				if ((origpte & PG_NX) == 0 &&
3447				    (newpte & PG_NX) != 0)
3448					invlva = TRUE;
3449#endif
3450			}
3451			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3452				if ((origpte & PG_MANAGED) != 0)
3453					vm_page_dirty(om);
3454				if ((prot & VM_PROT_WRITE) == 0)
3455					invlva = TRUE;
3456			}
3457			if ((origpte & PG_MANAGED) != 0 &&
3458			    TAILQ_EMPTY(&om->md.pv_list) &&
3459			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))
3460				vm_page_flag_clear(om, PG_WRITEABLE);
3461			if (invlva)
3462				pmap_invalidate_page(pmap, va);
3463		} else
3464			pte_store(pte, newpte);
3465	}
3466
3467	/*
3468	 * If both the page table page and the reservation are fully
3469	 * populated, then attempt promotion.
3470	 */
3471	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3472	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3473		pmap_promote_pde(pmap, pde, va);
3474
3475	sched_unpin();
3476	vm_page_unlock_queues();
3477	PMAP_UNLOCK(pmap);
3478}
3479
3480/*
3481 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3482 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3483 * blocking, (2) a mapping already exists at the specified virtual address, or
3484 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3485 */
3486static boolean_t
3487pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3488{
3489	pd_entry_t *pde, newpde;
3490
3491	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3492	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3493	pde = pmap_pde(pmap, va);
3494	if (*pde != 0) {
3495		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#x"
3496		    " in pmap %p", va, pmap);
3497		return (FALSE);
3498	}
3499	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3500	    PG_PS | PG_V;
3501	if ((m->oflags & VPO_UNMANAGED) == 0) {
3502		newpde |= PG_MANAGED;
3503
3504		/*
3505		 * Abort this mapping if its PV entry could not be created.
3506		 */
3507		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3508			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#x"
3509			    " in pmap %p", va, pmap);
3510			return (FALSE);
3511		}
3512	}
3513#ifdef PAE
3514	if ((prot & VM_PROT_EXECUTE) == 0)
3515		newpde |= pg_nx;
3516#endif
3517	if (va < VM_MAXUSER_ADDRESS)
3518		newpde |= PG_U;
3519
3520	/*
3521	 * Increment counters.
3522	 */
3523	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3524
3525	/*
3526	 * Map the superpage.
3527	 */
3528	pde_store(pde, newpde);
3529
3530	pmap_pde_mappings++;
3531	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#x"
3532	    " in pmap %p", va, pmap);
3533	return (TRUE);
3534}
3535
3536/*
3537 * Maps a sequence of resident pages belonging to the same object.
3538 * The sequence begins with the given page m_start.  This page is
3539 * mapped at the given virtual address start.  Each subsequent page is
3540 * mapped at a virtual address that is offset from start by the same
3541 * amount as the page is offset from m_start within the object.  The
3542 * last page in the sequence is the page with the largest offset from
3543 * m_start that can be mapped at a virtual address less than the given
3544 * virtual address end.  Not every virtual page between start and end
3545 * is mapped; only those for which a resident page exists with the
3546 * corresponding offset from m_start are mapped.
3547 */
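/*
 * A 2/4MB mapping is attempted for a run of pages only when its start is
 * superpage aligned both virtually and physically, the whole run fits
 * below "end", superpages are enabled, and the page's reservation is
 * fully populated; otherwise pages are entered one at a time via
 * pmap_enter_quick_locked().
 */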
3548void
3549pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3550    vm_page_t m_start, vm_prot_t prot)
3551{
3552	vm_offset_t va;
3553	vm_page_t m, mpte;
3554	vm_pindex_t diff, psize;
3555
3556	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3557	psize = atop(end - start);
3558	mpte = NULL;
3559	m = m_start;
3560	vm_page_lock_queues();
3561	PMAP_LOCK(pmap);
3562	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3563		va = start + ptoa(diff);
3564		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3565		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3566		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3567		    pmap_enter_pde(pmap, va, m, prot))
3568			m = &m[NBPDR / PAGE_SIZE - 1];
3569		else
3570			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3571			    mpte);
3572		m = TAILQ_NEXT(m, listq);
3573	}
3574	vm_page_unlock_queues();
3575	PMAP_UNLOCK(pmap);
3576}
3577
3578/*
3579 * This code makes some *MAJOR* assumptions:
3580 * 1. The current pmap and the target pmap exist.
3581 * 2. The mapping is not wired.
3582 * 3. Read-only access.
3583 * 4. No page table pages.
3584 * Under these assumptions it is *MUCH* faster than pmap_enter...
3585 */
3586
3587void
3588pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3589{
3590
3591	vm_page_lock_queues();
3592	PMAP_LOCK(pmap);
3593	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3594	vm_page_unlock_queues();
3595	PMAP_UNLOCK(pmap);
3596}
3597
3598static vm_page_t
3599pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3600    vm_prot_t prot, vm_page_t mpte)
3601{
3602	pt_entry_t *pte;
3603	vm_paddr_t pa;
3604	vm_page_t free;
3605
3606	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3607	    (m->oflags & VPO_UNMANAGED) != 0,
3608	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3609	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3610	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3611
3612	/*
3613	 * In the case that a page table page is not
3614	 * resident, we are creating it here.
3615	 */
3616	if (va < VM_MAXUSER_ADDRESS) {
3617		unsigned ptepindex;
3618		pd_entry_t ptepa;
3619
3620		/*
3621		 * Calculate the page table page index.
3622		 */
3623		ptepindex = va >> PDRSHIFT;
3624		if (mpte && (mpte->pindex == ptepindex)) {
3625			mpte->wire_count++;
3626		} else {
3627			/*
3628			 * Get the page directory entry
3629			 */
3630			ptepa = pmap->pm_pdir[ptepindex];
3631
3632			/*
3633			 * If the page table page is already mapped, just
3634			 * increment its wire count to record the new reference.
3635			 */
3636			if (ptepa) {
3637				if (ptepa & PG_PS)
3638					return (NULL);
3639				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3640				mpte->wire_count++;
3641			} else {
3642				mpte = _pmap_allocpte(pmap, ptepindex,
3643				    M_NOWAIT);
3644				if (mpte == NULL)
3645					return (mpte);
3646			}
3647		}
3648	} else {
3649		mpte = NULL;
3650	}
3651
3652	/*
3653	 * This call to vtopte makes the assumption that we are
3654	 * entering the page into the current pmap.  In order to support
3655	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3656	 * But that isn't as quick as vtopte.
3657	 */
3658	pte = vtopte(va);
3659	if (*pte) {
3660		if (mpte != NULL) {
3661			mpte->wire_count--;
3662			mpte = NULL;
3663		}
3664		return (mpte);
3665	}
3666
3667	/*
3668	 * Enter on the PV list if part of our managed memory.
3669	 */
3670	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3671	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3672		if (mpte != NULL) {
3673			free = NULL;
3674			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3675				pmap_invalidate_page(pmap, va);
3676				pmap_free_zero_pages(free);
3677			}
3678
3679			mpte = NULL;
3680		}
3681		return (mpte);
3682	}
3683
3684	/*
3685	 * Increment counters
3686	 */
3687	pmap->pm_stats.resident_count++;
3688
3689	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3690#ifdef PAE
3691	if ((prot & VM_PROT_EXECUTE) == 0)
3692		pa |= pg_nx;
3693#endif
3694
3695	/*
3696	 * Now validate mapping with RO protection
3697	 */
3698	if ((m->oflags & VPO_UNMANAGED) != 0)
3699		pte_store(pte, pa | PG_V | PG_U);
3700	else
3701		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3702	return (mpte);
3703}
3704
3705/*
3706 * Make a temporary mapping for a physical address.  This is only intended
3707 * to be used for panic dumps.
3708 */
3709void *
3710pmap_kenter_temporary(vm_paddr_t pa, int i)
3711{
3712	vm_offset_t va;
3713
3714	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3715	pmap_kenter(va, pa);
3716	invlpg(va);
3717	return ((void *)crashdumpmap);
3718}
3719
3720/*
3721 * This code maps large physical mmap regions into the processor
3722 * address space with 2/4MB pages.  Some shortcuts are taken: only
3723 * physically contiguous, superpage-aligned device memory is handled.
3724 */
3725void
3726pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3727    vm_pindex_t pindex, vm_size_t size)
3728{
3729	pd_entry_t *pde;
3730	vm_paddr_t pa, ptepa;
3731	vm_page_t p;
3732	int pat_mode;
3733
3734	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3735	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3736	    ("pmap_object_init_pt: non-device object"));
3737	if (pseflag &&
3738	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3739		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3740			return;
3741		p = vm_page_lookup(object, pindex);
3742		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3743		    ("pmap_object_init_pt: invalid page %p", p));
3744		pat_mode = p->md.pat_mode;
3745
3746		/*
3747		 * Abort the mapping if the first page is not physically
3748		 * aligned to a 2/4MB page boundary.
3749		 */
3750		ptepa = VM_PAGE_TO_PHYS(p);
3751		if (ptepa & (NBPDR - 1))
3752			return;
3753
3754		/*
3755		 * Skip the first page.  Abort the mapping if the rest of
3756		 * the pages are not physically contiguous or have differing
3757		 * memory attributes.
3758		 */
3759		p = TAILQ_NEXT(p, listq);
3760		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3761		    pa += PAGE_SIZE) {
3762			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3763			    ("pmap_object_init_pt: invalid page %p", p));
3764			if (pa != VM_PAGE_TO_PHYS(p) ||
3765			    pat_mode != p->md.pat_mode)
3766				return;
3767			p = TAILQ_NEXT(p, listq);
3768		}
3769
3770		/*
3771		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3772		 * "size" is a multiple of 2/4M, adding the PAT setting to
3773		 * "pa" will not affect the termination of this loop.
3774		 */
3775		PMAP_LOCK(pmap);
3776		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3777		    size; pa += NBPDR) {
3778			pde = pmap_pde(pmap, addr);
3779			if (*pde == 0) {
3780				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3781				    PG_U | PG_RW | PG_V);
3782				pmap->pm_stats.resident_count += NBPDR /
3783				    PAGE_SIZE;
3784				pmap_pde_mappings++;
3785			}
3786			/* Else continue on if the PDE is already valid. */
3787			addr += NBPDR;
3788		}
3789		PMAP_UNLOCK(pmap);
3790	}
3791}
3792
3793/*
3794 *	Routine:	pmap_change_wiring
3795 *	Function:	Change the wiring attribute for a map/virtual-address
3796 *			pair.
3797 *	In/out conditions:
3798 *			The mapping must already exist in the pmap.
3799 */
3800void
3801pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3802{
3803	pd_entry_t *pde;
3804	pt_entry_t *pte;
3805	boolean_t are_queues_locked;
3806
3807	are_queues_locked = FALSE;
3808retry:
3809	PMAP_LOCK(pmap);
3810	pde = pmap_pde(pmap, va);
3811	if ((*pde & PG_PS) != 0) {
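		/*
		 * Only demote the 4mpage if the requested wiring actually
		 * differs from the mapping's current wiring; otherwise,
		 * there is nothing to change.
		 */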
3812		if (!wired != ((*pde & PG_W) == 0)) {
3813			if (!are_queues_locked) {
3814				are_queues_locked = TRUE;
3815				if (!mtx_trylock(&vm_page_queue_mtx)) {
3816					PMAP_UNLOCK(pmap);
3817					vm_page_lock_queues();
3818					goto retry;
3819				}
3820			}
3821			if (!pmap_demote_pde(pmap, pde, va))
3822				panic("pmap_change_wiring: demotion failed");
3823		} else
3824			goto out;
3825	}
3826	pte = pmap_pte(pmap, va);
3827
3828	if (wired && !pmap_pte_w(pte))
3829		pmap->pm_stats.wired_count++;
3830	else if (!wired && pmap_pte_w(pte))
3831		pmap->pm_stats.wired_count--;
3832
3833	/*
3834	 * Wiring is not a hardware characteristic so there is no need to
3835	 * invalidate TLB.
3836	 */
3837	pmap_pte_set_w(pte, wired);
3838	pmap_pte_release(pte);
3839out:
3840	if (are_queues_locked)
3841		vm_page_unlock_queues();
3842	PMAP_UNLOCK(pmap);
3843}
3844
3847/*
3848 *	Copy the range specified by src_addr/len
3849 *	from the source map to the range dst_addr/len
3850 *	in the destination map.
3851 *
3852 *	This routine is only advisory and need not do anything.
3853 */
3854
3855void
3856pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3857    vm_offset_t src_addr)
3858{
3859	vm_page_t   free;
3860	vm_offset_t addr;
3861	vm_offset_t end_addr = src_addr + len;
3862	vm_offset_t pdnxt;
3863
3864	if (dst_addr != src_addr)
3865		return;
3866
3867	if (!pmap_is_current(src_pmap))
3868		return;
3869
3870	vm_page_lock_queues();
3871	if (dst_pmap < src_pmap) {
3872		PMAP_LOCK(dst_pmap);
3873		PMAP_LOCK(src_pmap);
3874	} else {
3875		PMAP_LOCK(src_pmap);
3876		PMAP_LOCK(dst_pmap);
3877	}
3878	sched_pin();
3879	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3880		pt_entry_t *src_pte, *dst_pte;
3881		vm_page_t dstmpte, srcmpte;
3882		pd_entry_t srcptepaddr;
3883		unsigned ptepindex;
3884
3885		KASSERT(addr < UPT_MIN_ADDRESS,
3886		    ("pmap_copy: invalid to pmap_copy page tables"));
3887
3888		pdnxt = (addr + NBPDR) & ~PDRMASK;
3889		if (pdnxt < addr)
3890			pdnxt = end_addr;
3891		ptepindex = addr >> PDRSHIFT;
3892
3893		srcptepaddr = src_pmap->pm_pdir[ptepindex];
3894		if (srcptepaddr == 0)
3895			continue;
3896
3897		if (srcptepaddr & PG_PS) {
3898			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3899			    ((srcptepaddr & PG_MANAGED) == 0 ||
3900			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3901			    PG_PS_FRAME))) {
3902				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3903				    ~PG_W;
3904				dst_pmap->pm_stats.resident_count +=
3905				    NBPDR / PAGE_SIZE;
3906			}
3907			continue;
3908		}
3909
3910		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3911		KASSERT(srcmpte->wire_count > 0,
3912		    ("pmap_copy: source page table page is unused"));
3913
3914		if (pdnxt > end_addr)
3915			pdnxt = end_addr;
3916
3917		src_pte = vtopte(addr);
3918		while (addr < pdnxt) {
3919			pt_entry_t ptetemp;
3920			ptetemp = *src_pte;
3921			/*
3922			 * We only virtual-copy managed pages.
3923			 */
3924			if ((ptetemp & PG_MANAGED) != 0) {
3925				dstmpte = pmap_allocpte(dst_pmap, addr,
3926				    M_NOWAIT);
3927				if (dstmpte == NULL)
3928					goto out;
3929				dst_pte = pmap_pte_quick(dst_pmap, addr);
3930				if (*dst_pte == 0 &&
3931				    pmap_try_insert_pv_entry(dst_pmap, addr,
3932				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3933					/*
3934					 * Clear the wired, modified, and
3935					 * accessed (referenced) bits
3936					 * during the copy.
3937					 */
3938					*dst_pte = ptetemp & ~(PG_W | PG_M |
3939					    PG_A);
3940					dst_pmap->pm_stats.resident_count++;
3941				} else {
3942					free = NULL;
3943					if (pmap_unwire_pte_hold(dst_pmap,
3944					    dstmpte, &free)) {
3945						pmap_invalidate_page(dst_pmap,
3946						    addr);
3947						pmap_free_zero_pages(free);
3948					}
3949					goto out;
3950				}
3951				if (dstmpte->wire_count >= srcmpte->wire_count)
3952					break;
3953			}
3954			addr += PAGE_SIZE;
3955			src_pte++;
3956		}
3957	}
3958out:
3959	sched_unpin();
3960	vm_page_unlock_queues();
3961	PMAP_UNLOCK(src_pmap);
3962	PMAP_UNLOCK(dst_pmap);
3963}
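
/*
 * Note (informational): pmap_copy() is reached from the VM map copying done
 * at fork() time.  Because the routine is only advisory, bailing out early --
 * when the source pmap is not the current pmap, or when a page table page or
 * pv entry cannot be allocated without sleeping -- is always safe; any
 * mapping that is not copied here is simply recreated later by a demand
 * fault in the child.
 */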
3964
3965static __inline void
3966pagezero(void *page)
3967{
3968#if defined(I686_CPU)
3969	if (cpu_class == CPUCLASS_686) {
3970#if defined(CPU_ENABLE_SSE)
3971		if (cpu_feature & CPUID_SSE2)
3972			sse2_pagezero(page);
3973		else
3974#endif
3975			i686_pagezero(page);
3976	} else
3977#endif
3978		bzero(page, PAGE_SIZE);
3979}
3980
3981/*
3982 *	pmap_zero_page zeros the specified hardware page by mapping
3983 *	the page into KVM and using bzero to clear its contents.
3984 */
3985void
3986pmap_zero_page(vm_page_t m)
3987{
3988	struct sysmaps *sysmaps;
3989
3990	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3991	mtx_lock(&sysmaps->lock);
3992	if (*sysmaps->CMAP2)
3993		panic("pmap_zero_page: CMAP2 busy");
3994	sched_pin();
3995	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3996	    pmap_cache_bits(m->md.pat_mode, 0);
3997	invlcaddr(sysmaps->CADDR2);
3998	pagezero(sysmaps->CADDR2);
3999	*sysmaps->CMAP2 = 0;
4000	sched_unpin();
4001	mtx_unlock(&sysmaps->lock);
4002}
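
/*
 * Note (informational): the per-CPU CMAP2/CADDR2 window, combined with
 * sched_pin(), lets each CPU zero pages through its own mapping using only
 * a local invlcaddr(), avoiding both a global mapping lock and cross-CPU
 * TLB shootdowns.
 */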
4003
4004/*
4005 *	pmap_zero_page_area zeros the specified hardware page by mapping
4006 *	the page into KVM and using bzero to clear its contents.
4007 *
4008 *	off and size may not cover an area beyond a single hardware page.
4009 */
4010void
4011pmap_zero_page_area(vm_page_t m, int off, int size)
4012{
4013	struct sysmaps *sysmaps;
4014
4015	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4016	mtx_lock(&sysmaps->lock);
4017	if (*sysmaps->CMAP2)
4018		panic("pmap_zero_page_area: CMAP2 busy");
4019	sched_pin();
4020	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4021	    pmap_cache_bits(m->md.pat_mode, 0);
4022	invlcaddr(sysmaps->CADDR2);
4023	if (off == 0 && size == PAGE_SIZE)
4024		pagezero(sysmaps->CADDR2);
4025	else
4026		bzero((char *)sysmaps->CADDR2 + off, size);
4027	*sysmaps->CMAP2 = 0;
4028	sched_unpin();
4029	mtx_unlock(&sysmaps->lock);
4030}
4031
4032/*
4033 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4034 *	the page into KVM and using bzero to clear its contents.  This
4035 *	is intended to be called from the vm_pagezero process only and
4036 *	outside of Giant.
4037 */
4038void
4039pmap_zero_page_idle(vm_page_t m)
4040{
4041
4042	if (*CMAP3)
4043		panic("pmap_zero_page_idle: CMAP3 busy");
4044	sched_pin();
4045	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4046	    pmap_cache_bits(m->md.pat_mode, 0);
4047	invlcaddr(CADDR3);
4048	pagezero(CADDR3);
4049	*CMAP3 = 0;
4050	sched_unpin();
4051}
4052
4053/*
4054 *	pmap_copy_page copies the specified (machine independent)
4055 *	page by mapping the page into virtual memory and using
4056 *	bcopy to copy the page, one machine dependent page at a
4057 *	time.
4058 */
4059void
4060pmap_copy_page(vm_page_t src, vm_page_t dst)
4061{
4062	struct sysmaps *sysmaps;
4063
4064	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4065	mtx_lock(&sysmaps->lock);
4066	if (*sysmaps->CMAP1)
4067		panic("pmap_copy_page: CMAP1 busy");
4068	if (*sysmaps->CMAP2)
4069		panic("pmap_copy_page: CMAP2 busy");
4070	sched_pin();
4071	invlpg((u_int)sysmaps->CADDR1);
4072	invlpg((u_int)sysmaps->CADDR2);
4073	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4074	    pmap_cache_bits(src->md.pat_mode, 0);
4075	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4076	    pmap_cache_bits(dst->md.pat_mode, 0);
4077	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4078	*sysmaps->CMAP1 = 0;
4079	*sysmaps->CMAP2 = 0;
4080	sched_unpin();
4081	mtx_unlock(&sysmaps->lock);
4082}
4083
4084/*
4085 * Returns true if the pmap's pv is one of the first
4086 * 16 pvs linked to from this page.  This count may
4087 * be changed upwards or downwards in the future; it
4088 * is only necessary that true be returned for a small
4089 * subset of pmaps for proper page aging.
4090 */
4091boolean_t
4092pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4093{
4094	struct md_page *pvh;
4095	pv_entry_t pv;
4096	int loops = 0;
4097	boolean_t rv;
4098
4099	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4100	    ("pmap_page_exists_quick: page %p is not managed", m));
4101	rv = FALSE;
4102	vm_page_lock_queues();
4103	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4104		if (PV_PMAP(pv) == pmap) {
4105			rv = TRUE;
4106			break;
4107		}
4108		loops++;
4109		if (loops >= 16)
4110			break;
4111	}
4112	if (!rv && loops < 16) {
4113		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4114		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4115			if (PV_PMAP(pv) == pmap) {
4116				rv = TRUE;
4117				break;
4118			}
4119			loops++;
4120			if (loops >= 16)
4121				break;
4122		}
4123	}
4124	vm_page_unlock_queues();
4125	return (rv);
4126}
4127
4128/*
4129 *	pmap_page_wired_mappings:
4130 *
4131 *	Return the number of managed mappings to the given physical page
4132 *	that are wired.
4133 */
4134int
4135pmap_page_wired_mappings(vm_page_t m)
4136{
4137	int count;
4138
4139	count = 0;
4140	if ((m->oflags & VPO_UNMANAGED) != 0)
4141		return (count);
4142	vm_page_lock_queues();
4143	count = pmap_pvh_wired_mappings(&m->md, count);
4144	count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count);
4145	vm_page_unlock_queues();
4146	return (count);
4147}
4148
4149/*
4150 *	pmap_pvh_wired_mappings:
4151 *
4152 *	Return the updated number "count" of managed mappings that are wired.
4153 */
4154static int
4155pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4156{
4157	pmap_t pmap;
4158	pt_entry_t *pte;
4159	pv_entry_t pv;
4160
4161	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4162	sched_pin();
4163	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4164		pmap = PV_PMAP(pv);
4165		PMAP_LOCK(pmap);
4166		pte = pmap_pte_quick(pmap, pv->pv_va);
4167		if ((*pte & PG_W) != 0)
4168			count++;
4169		PMAP_UNLOCK(pmap);
4170	}
4171	sched_unpin();
4172	return (count);
4173}
4174
4175/*
4176 * Returns TRUE if the given page is mapped individually or as part of
4177 * a 4mpage.  Otherwise, returns FALSE.
4178 */
4179boolean_t
4180pmap_page_is_mapped(vm_page_t m)
4181{
4182	boolean_t rv;
4183
4184	if ((m->oflags & VPO_UNMANAGED) != 0)
4185		return (FALSE);
4186	vm_page_lock_queues();
4187	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4188	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
4189	vm_page_unlock_queues();
4190	return (rv);
4191}
4192
4193/*
4194 * Remove all pages from the specified address space; this aids
4195 * process exit speeds.  Also, this code is special-cased for the
4196 * current process only, but can have the more generic (and
4197 * slightly slower) mode enabled.  This is much faster than
4198 * pmap_remove in the case of running down an entire address
4199 * space.
4200 */
4201void
4202pmap_remove_pages(pmap_t pmap)
4203{
4204	pt_entry_t *pte, tpte;
4205	vm_page_t free = NULL;
4206	vm_page_t m, mpte, mt;
4207	pv_entry_t pv;
4208	struct md_page *pvh;
4209	struct pv_chunk *pc, *npc;
4210	int field, idx;
4211	int32_t bit;
4212	uint32_t inuse, bitmask;
4213	int allfree;
4214
4215	if (pmap != PCPU_GET(curpmap)) {
4216		printf("warning: pmap_remove_pages called with non-current pmap\n");
4217		return;
4218	}
4219	vm_page_lock_queues();
4220	PMAP_LOCK(pmap);
4221	sched_pin();
4222	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4223		allfree = 1;
4224		for (field = 0; field < _NPCM; field++) {
4225			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4226			while (inuse != 0) {
4227				bit = bsfl(inuse);
4228				bitmask = 1UL << bit;
4229				idx = field * 32 + bit;
4230				pv = &pc->pc_pventry[idx];
4231				inuse &= ~bitmask;
4232
4233				pte = pmap_pde(pmap, pv->pv_va);
4234				tpte = *pte;
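				/*
				 * For a 4KB mapping, reread the entry as a
				 * PTE and mask off its PAT bit, which shares
				 * the bit position of PG_PS in a PDE, so that
				 * the PG_PS tests below see only superpages.
				 */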
4235				if ((tpte & PG_PS) == 0) {
4236					pte = vtopte(pv->pv_va);
4237					tpte = *pte & ~PG_PTE_PAT;
4238				}
4239
4240				if (tpte == 0) {
4241					printf(
4242					    "TPTE at %p  IS ZERO @ VA %08x\n",
4243					    pte, pv->pv_va);
4244					panic("bad pte");
4245				}
4246
4247/*
4248 * We cannot remove wired pages from a process' mapping at this time
4249 */
4250				if (tpte & PG_W) {
4251					allfree = 0;
4252					continue;
4253				}
4254
4255				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4256				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4257				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4258				    m, (uintmax_t)m->phys_addr,
4259				    (uintmax_t)tpte));
4260
4261				KASSERT(m < &vm_page_array[vm_page_array_size],
4262					("pmap_remove_pages: bad tpte %#jx",
4263					(uintmax_t)tpte));
4264
4265				pte_clear(pte);
4266
4267				/*
4268				 * Update the vm_page_t clean/reference bits.
4269				 */
4270				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4271					if ((tpte & PG_PS) != 0) {
4272						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4273							vm_page_dirty(mt);
4274					} else
4275						vm_page_dirty(m);
4276				}
4277
4278				/* Mark free */
4279				PV_STAT(pv_entry_frees++);
4280				PV_STAT(pv_entry_spare++);
4281				pv_entry_count--;
4282				pc->pc_map[field] |= bitmask;
4283				if ((tpte & PG_PS) != 0) {
4284					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4285					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4286					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4287					if (TAILQ_EMPTY(&pvh->pv_list)) {
4288						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4289							if (TAILQ_EMPTY(&mt->md.pv_list))
4290								vm_page_flag_clear(mt, PG_WRITEABLE);
4291					}
4292					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4293					if (mpte != NULL) {
4294						pmap_remove_pt_page(pmap, mpte);
4295						pmap->pm_stats.resident_count--;
4296						KASSERT(mpte->wire_count == NPTEPG,
4297						    ("pmap_remove_pages: pte page wire count error"));
4298						mpte->wire_count = 0;
4299						pmap_add_delayed_free_list(mpte, &free, FALSE);
4300						atomic_subtract_int(&cnt.v_wire_count, 1);
4301					}
4302				} else {
4303					pmap->pm_stats.resident_count--;
4304					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4305					if (TAILQ_EMPTY(&m->md.pv_list)) {
4306						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4307						if (TAILQ_EMPTY(&pvh->pv_list))
4308							vm_page_flag_clear(m, PG_WRITEABLE);
4309					}
4310					pmap_unuse_pt(pmap, pv->pv_va, &free);
4311				}
4312			}
4313		}
4314		if (allfree) {
4315			PV_STAT(pv_entry_spare -= _NPCPV);
4316			PV_STAT(pc_chunk_count--);
4317			PV_STAT(pc_chunk_frees++);
4318			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4319			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4320			pmap_qremove((vm_offset_t)pc, 1);
4321			vm_page_unwire(m, 0);
4322			vm_page_free(m);
4323			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4324		}
4325	}
4326	sched_unpin();
4327	pmap_invalidate_all(pmap);
4328	vm_page_unlock_queues();
4329	PMAP_UNLOCK(pmap);
4330	pmap_free_zero_pages(free);
4331}
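
/*
 * Note (informational): this routine is used when an entire address space is
 * being torn down, as at process exit, where walking the pv chunks is much
 * cheaper than walking every map entry through pmap_remove().  Wired
 * mappings are deliberately skipped above; they are removed later, when the
 * corresponding map entries are unwired and deleted.
 */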
4332
4333/*
4334 *	pmap_is_modified:
4335 *
4336 *	Return whether or not the specified physical page was modified
4337 *	in any physical maps.
4338 */
4339boolean_t
4340pmap_is_modified(vm_page_t m)
4341{
4342	boolean_t rv;
4343
4344	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4345	    ("pmap_is_modified: page %p is not managed", m));
4346
4347	/*
4348	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be
4349	 * concurrently set while the object is locked.  Thus, if PG_WRITEABLE
4350	 * is clear, no PTEs can have PG_M set.
4351	 */
4352	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4353	if ((m->oflags & VPO_BUSY) == 0 &&
4354	    (m->flags & PG_WRITEABLE) == 0)
4355		return (FALSE);
4356	vm_page_lock_queues();
4357	rv = pmap_is_modified_pvh(&m->md) ||
4358	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
4359	vm_page_unlock_queues();
4360	return (rv);
4361}
4362
4363/*
4364 * Returns TRUE if any of the given mappings were used to modify
4365 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4366 * mappings are supported.
4367 */
4368static boolean_t
4369pmap_is_modified_pvh(struct md_page *pvh)
4370{
4371	pv_entry_t pv;
4372	pt_entry_t *pte;
4373	pmap_t pmap;
4374	boolean_t rv;
4375
4376	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4377	rv = FALSE;
4378	sched_pin();
4379	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4380		pmap = PV_PMAP(pv);
4381		PMAP_LOCK(pmap);
4382		pte = pmap_pte_quick(pmap, pv->pv_va);
4383		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4384		PMAP_UNLOCK(pmap);
4385		if (rv)
4386			break;
4387	}
4388	sched_unpin();
4389	return (rv);
4390}
4391
4392/*
4393 *	pmap_is_prefaultable:
4394 *
4395 *	Return whether or not the specified virtual address is eligible
4396 *	for prefault.
4397 */
4398boolean_t
4399pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4400{
4401	pd_entry_t *pde;
4402	pt_entry_t *pte;
4403	boolean_t rv;
4404
4405	rv = FALSE;
4406	PMAP_LOCK(pmap);
4407	pde = pmap_pde(pmap, addr);
4408	if (*pde != 0 && (*pde & PG_PS) == 0) {
4409		pte = vtopte(addr);
4410		rv = *pte == 0;
4411	}
4412	PMAP_UNLOCK(pmap);
4413	return (rv);
4414}
4415
4416/*
4417 *	pmap_is_referenced:
4418 *
4419 *	Return whether or not the specified physical page was referenced
4420 *	in any physical maps.
4421 */
4422boolean_t
4423pmap_is_referenced(vm_page_t m)
4424{
4425	boolean_t rv;
4426
4427	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4428	    ("pmap_is_referenced: page %p is not managed", m));
4429	vm_page_lock_queues();
4430	rv = pmap_is_referenced_pvh(&m->md) ||
4431	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
4432	vm_page_unlock_queues();
4433	return (rv);
4434}
4435
4436/*
4437 * Returns TRUE if any of the given mappings were referenced and FALSE
4438 * otherwise.  Both page and 4mpage mappings are supported.
4439 */
4440static boolean_t
4441pmap_is_referenced_pvh(struct md_page *pvh)
4442{
4443	pv_entry_t pv;
4444	pt_entry_t *pte;
4445	pmap_t pmap;
4446	boolean_t rv;
4447
4448	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4449	rv = FALSE;
4450	sched_pin();
4451	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4452		pmap = PV_PMAP(pv);
4453		PMAP_LOCK(pmap);
4454		pte = pmap_pte_quick(pmap, pv->pv_va);
4455		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4456		PMAP_UNLOCK(pmap);
4457		if (rv)
4458			break;
4459	}
4460	sched_unpin();
4461	return (rv);
4462}
4463
4464/*
4465 * Clear the write and modified bits in each of the given page's mappings.
4466 */
4467void
4468pmap_remove_write(vm_page_t m)
4469{
4470	struct md_page *pvh;
4471	pv_entry_t next_pv, pv;
4472	pmap_t pmap;
4473	pd_entry_t *pde;
4474	pt_entry_t oldpte, *pte;
4475	vm_offset_t va;
4476
4477	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4478	    ("pmap_remove_write: page %p is not managed", m));
4479
4480	/*
4481	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be set by
4482	 * another thread while the object is locked.  Thus, if PG_WRITEABLE
4483	 * is clear, no page table entries need updating.
4484	 */
4485	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4486	if ((m->oflags & VPO_BUSY) == 0 &&
4487	    (m->flags & PG_WRITEABLE) == 0)
4488		return;
4489	vm_page_lock_queues();
4490	sched_pin();
4491	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4492	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4493		va = pv->pv_va;
4494		pmap = PV_PMAP(pv);
4495		PMAP_LOCK(pmap);
4496		pde = pmap_pde(pmap, va);
4497		if ((*pde & PG_RW) != 0)
4498			(void)pmap_demote_pde(pmap, pde, va);
4499		PMAP_UNLOCK(pmap);
4500	}
4501	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4502		pmap = PV_PMAP(pv);
4503		PMAP_LOCK(pmap);
4504		pde = pmap_pde(pmap, pv->pv_va);
4505		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4506		    " a 4mpage in page %p's pv list", m));
4507		pte = pmap_pte_quick(pmap, pv->pv_va);
4508retry:
4509		oldpte = *pte;
4510		if ((oldpte & PG_RW) != 0) {
4511			/*
4512			 * Regardless of whether a pte is 32 or 64 bits
4513			 * in size, PG_RW and PG_M are among the least
4514			 * significant 32 bits.
4515			 */
4516			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4517			    oldpte & ~(PG_RW | PG_M)))
4518				goto retry;
4519			if ((oldpte & PG_M) != 0)
4520				vm_page_dirty(m);
4521			pmap_invalidate_page(pmap, pv->pv_va);
4522		}
4523		PMAP_UNLOCK(pmap);
4524	}
4525	vm_page_flag_clear(m, PG_WRITEABLE);
4526	sched_unpin();
4527	vm_page_unlock_queues();
4528}
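
/*
 * Note (informational): this is how the rest of the VM system revokes write
 * access to a page -- for example, before the page is laundered by the
 * pageout code -- so that any later write must fault and re-dirty the page.
 */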
4529
4530/*
4531 *	pmap_ts_referenced:
4532 *
4533 *	Return a count of reference bits for a page, clearing those bits.
4534 *	It is not necessary for every reference bit to be cleared, but it
4535 *	is necessary that 0 only be returned when there are truly no
4536 *	reference bits set.
4537 *
4538 *	XXX: The exact number of bits to check and clear is a matter that
4539 *	should be tested and standardized at some point in the future for
4540 *	optimal aging of shared pages.
4541 */
4542int
4543pmap_ts_referenced(vm_page_t m)
4544{
4545	struct md_page *pvh;
4546	pv_entry_t pv, pvf, pvn;
4547	pmap_t pmap;
4548	pd_entry_t oldpde, *pde;
4549	pt_entry_t *pte;
4550	vm_offset_t va;
4551	int rtval = 0;
4552
4553	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4554	    ("pmap_ts_referenced: page %p is not managed", m));
4555	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4556	vm_page_lock_queues();
4557	sched_pin();
4558	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4559		va = pv->pv_va;
4560		pmap = PV_PMAP(pv);
4561		PMAP_LOCK(pmap);
4562		pde = pmap_pde(pmap, va);
4563		oldpde = *pde;
4564		if ((oldpde & PG_A) != 0) {
4565			if (pmap_demote_pde(pmap, pde, va)) {
4566				if ((oldpde & PG_W) == 0) {
4567					/*
4568					 * Remove the mapping to a single page
4569					 * so that a subsequent access may
4570					 * repromote.  Since the underlying
4571					 * page table page is fully populated,
4572					 * this removal never frees a page
4573					 * table page.
4574					 */
4575					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4576					    PG_PS_FRAME);
4577					pmap_remove_page(pmap, va, NULL);
4578					rtval++;
4579					if (rtval > 4) {
4580						PMAP_UNLOCK(pmap);
4581						goto out;
4582					}
4583				}
4584			}
4585		}
4586		PMAP_UNLOCK(pmap);
4587	}
4588	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4589		pvf = pv;
4590		do {
4591			pvn = TAILQ_NEXT(pv, pv_list);
4592			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4593			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4594			pmap = PV_PMAP(pv);
4595			PMAP_LOCK(pmap);
4596			pde = pmap_pde(pmap, pv->pv_va);
4597			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4598			    " found a 4mpage in page %p's pv list", m));
4599			pte = pmap_pte_quick(pmap, pv->pv_va);
4600			if ((*pte & PG_A) != 0) {
4601				atomic_clear_int((u_int *)pte, PG_A);
4602				pmap_invalidate_page(pmap, pv->pv_va);
4603				rtval++;
4604				if (rtval > 4)
4605					pvn = NULL;
4606			}
4607			PMAP_UNLOCK(pmap);
4608		} while ((pv = pvn) != NULL && pv != pvf);
4609	}
4610out:
4611	sched_unpin();
4612	vm_page_unlock_queues();
4613	return (rtval);
4614}
4615
4616/*
4617 *	Clear the modify bits on the specified physical page.
4618 */
4619void
4620pmap_clear_modify(vm_page_t m)
4621{
4622	struct md_page *pvh;
4623	pv_entry_t next_pv, pv;
4624	pmap_t pmap;
4625	pd_entry_t oldpde, *pde;
4626	pt_entry_t oldpte, *pte;
4627	vm_offset_t va;
4628
4629	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4630	    ("pmap_clear_modify: page %p is not managed", m));
4631	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4632	KASSERT((m->oflags & VPO_BUSY) == 0,
4633	    ("pmap_clear_modify: page %p is busy", m));
4634
4635	/*
4636	 * If the page is not PG_WRITEABLE, then no PTEs can have PG_M set.
4637	 * If the object containing the page is locked and the page is not
4638	 * VPO_BUSY, then PG_WRITEABLE cannot be concurrently set.
4639	 */
4640	if ((m->flags & PG_WRITEABLE) == 0)
4641		return;
4642	vm_page_lock_queues();
4643	sched_pin();
4644	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4645	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4646		va = pv->pv_va;
4647		pmap = PV_PMAP(pv);
4648		PMAP_LOCK(pmap);
4649		pde = pmap_pde(pmap, va);
4650		oldpde = *pde;
4651		if ((oldpde & PG_RW) != 0) {
4652			if (pmap_demote_pde(pmap, pde, va)) {
4653				if ((oldpde & PG_W) == 0) {
4654					/*
4655					 * Write protect the mapping to a
4656					 * single page so that a subsequent
4657					 * write access may repromote.
4658					 */
4659					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4660					    PG_PS_FRAME);
4661					pte = pmap_pte_quick(pmap, va);
4662					oldpte = *pte;
4663					if ((oldpte & PG_V) != 0) {
4664						/*
4665						 * Regardless of whether a pte is 32 or 64 bits
4666						 * in size, PG_RW and PG_M are among the least
4667						 * significant 32 bits.
4668						 */
4669						while (!atomic_cmpset_int((u_int *)pte,
4670						    oldpte,
4671						    oldpte & ~(PG_M | PG_RW)))
4672							oldpte = *pte;
4673						vm_page_dirty(m);
4674						pmap_invalidate_page(pmap, va);
4675					}
4676				}
4677			}
4678		}
4679		PMAP_UNLOCK(pmap);
4680	}
4681	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4682		pmap = PV_PMAP(pv);
4683		PMAP_LOCK(pmap);
4684		pde = pmap_pde(pmap, pv->pv_va);
4685		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4686		    " a 4mpage in page %p's pv list", m));
4687		pte = pmap_pte_quick(pmap, pv->pv_va);
4688		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4689			/*
4690			 * Regardless of whether a pte is 32 or 64 bits
4691			 * in size, PG_M is among the least significant
4692			 * 32 bits.
4693			 */
4694			atomic_clear_int((u_int *)pte, PG_M);
4695			pmap_invalidate_page(pmap, pv->pv_va);
4696		}
4697		PMAP_UNLOCK(pmap);
4698	}
4699	sched_unpin();
4700	vm_page_unlock_queues();
4701}
4702
4703/*
4704 *	pmap_clear_reference:
4705 *
4706 *	Clear the reference bit on the specified physical page.
4707 */
4708void
4709pmap_clear_reference(vm_page_t m)
4710{
4711	struct md_page *pvh;
4712	pv_entry_t next_pv, pv;
4713	pmap_t pmap;
4714	pd_entry_t oldpde, *pde;
4715	pt_entry_t *pte;
4716	vm_offset_t va;
4717
4718	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4719	    ("pmap_clear_reference: page %p is not managed", m));
4720	vm_page_lock_queues();
4721	sched_pin();
4722	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4723	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4724		va = pv->pv_va;
4725		pmap = PV_PMAP(pv);
4726		PMAP_LOCK(pmap);
4727		pde = pmap_pde(pmap, va);
4728		oldpde = *pde;
4729		if ((oldpde & PG_A) != 0) {
4730			if (pmap_demote_pde(pmap, pde, va)) {
4731				/*
4732				 * Remove the mapping to a single page so
4733				 * that a subsequent access may repromote.
4734				 * Since the underlying page table page is
4735				 * fully populated, this removal never frees
4736				 * a page table page.
4737				 */
4738				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4739				    PG_PS_FRAME);
4740				pmap_remove_page(pmap, va, NULL);
4741			}
4742		}
4743		PMAP_UNLOCK(pmap);
4744	}
4745	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4746		pmap = PV_PMAP(pv);
4747		PMAP_LOCK(pmap);
4748		pde = pmap_pde(pmap, pv->pv_va);
4749		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4750		    " a 4mpage in page %p's pv list", m));
4751		pte = pmap_pte_quick(pmap, pv->pv_va);
4752		if ((*pte & PG_A) != 0) {
4753			/*
4754			 * Regardless of whether a pte is 32 or 64 bits
4755			 * in size, PG_A is among the least significant
4756			 * 32 bits.
4757			 */
4758			atomic_clear_int((u_int *)pte, PG_A);
4759			pmap_invalidate_page(pmap, pv->pv_va);
4760		}
4761		PMAP_UNLOCK(pmap);
4762	}
4763	sched_unpin();
4764	vm_page_unlock_queues();
4765}
4766
4767/*
4768 * Miscellaneous support routines follow
4769 */
4770
4771/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4772static __inline void
4773pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4774{
4775	u_int opte, npte;
4776
4777	/*
4778	 * The cache mode bits are all in the low 32-bits of the
4779	 * PTE, so we can just spin on updating the low 32-bits.
4780	 */
4781	do {
4782		opte = *(u_int *)pte;
4783		npte = opte & ~PG_PTE_CACHE;
4784		npte |= cache_bits;
4785	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4786}
4787
4788/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4789static __inline void
4790pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4791{
4792	u_int opde, npde;
4793
4794	/*
4795	 * The cache mode bits are all in the low 32-bits of the
4796	 * PDE, so we can just spin on updating the low 32-bits.
4797	 */
4798	do {
4799		opde = *(u_int *)pde;
4800		npde = opde & ~PG_PDE_CACHE;
4801		npde |= cache_bits;
4802	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4803}
4804
4805/*
4806 * Map a set of physical memory pages into the kernel virtual
4807 * address space. Return a pointer to where it is mapped. This
4808 * routine is intended to be used for mapping device memory,
4809 * NOT real memory.
4810 */
4811void *
4812pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4813{
4814	vm_offset_t va, offset;
4815	vm_size_t tmpsize;
4816
4817	offset = pa & PAGE_MASK;
4818	size = roundup(offset + size, PAGE_SIZE);
4819	pa = pa & PG_FRAME;
4820
4821	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4822		va = KERNBASE + pa;
4823	else
4824		va = kmem_alloc_nofault(kernel_map, size);
4825	if (!va)
4826		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4827
4828	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4829		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4830	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4831	pmap_invalidate_cache_range(va, va + size);
4832	return ((void *)(va + offset));
4833}
4834
4835void *
4836pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4837{
4838
4839	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4840}
4841
4842void *
4843pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4844{
4845
4846	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4847}
4848
4849void
4850pmap_unmapdev(vm_offset_t va, vm_size_t size)
4851{
4852	vm_offset_t base, offset, tmpva;
4853
4854	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4855		return;
4856	base = trunc_page(va);
4857	offset = va & PAGE_MASK;
4858	size = roundup(offset + size, PAGE_SIZE);
4859	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4860		pmap_kremove(tmpva);
4861	pmap_invalidate_range(kernel_pmap, va, tmpva);
4862	kmem_free(kernel_map, base, size);
4863}
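
/*
 * Usage sketch (illustrative only, not compiled): a device driver typically
 * maps its register window once at attach time and unmaps it at detach:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(rman_get_start(res), rman_get_size(res));
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, rman_get_size(res));
 *
 * In practice most drivers have this done for them by the bus code when a
 * SYS_RES_MEMORY resource is activated, rather than calling these routines
 * directly.
 */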
4864
4865/*
4866 * Sets the memory attribute for the specified page.
4867 */
4868void
4869pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4870{
4871
4872	m->md.pat_mode = ma;
4873	if ((m->flags & PG_FICTITIOUS) != 0)
4874		return;
4875
4876	/*
4877	 * If "m" is a normal page, flush it from the cache.
4878	 * See pmap_invalidate_cache_range().
4879	 *
4880	 * First, try to find an existing mapping of the page by sf
4881	 * buffer. sf_buf_invalidate_cache() modifies mapping and
4882	 * flushes the cache.
4883	 */
4884	if (sf_buf_invalidate_cache(m))
4885		return;
4886
4887	/*
4888	 * If page is not mapped by sf buffer, but CPU does not
4889	 * support self snoop, map the page transient and do
4890	 * invalidation. In the worst case, whole cache is flushed by
4891	 * pmap_invalidate_cache_range().
4892	 */
4893	if ((cpu_feature & CPUID_SS) == 0)
4894		pmap_flush_page(m);
4895}
4896
4897static void
4898pmap_flush_page(vm_page_t m)
4899{
4900	struct sysmaps *sysmaps;
4901	vm_offset_t sva, eva;
4902
4903	if ((cpu_feature & CPUID_CLFSH) != 0) {
4904		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4905		mtx_lock(&sysmaps->lock);
4906		if (*sysmaps->CMAP2)
4907			panic("pmap_flush_page: CMAP2 busy");
4908		sched_pin();
4909		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4910		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4911		invlcaddr(sysmaps->CADDR2);
4912		sva = (vm_offset_t)sysmaps->CADDR2;
4913		eva = sva + PAGE_SIZE;
4914
4915		/*
4916		 * Use mfence despite the ordering implied by
4917		 * mtx_{un,}lock() because clflush is not guaranteed
4918		 * to be ordered by any other instruction.
4919		 */
4920		mfence();
4921		for (; sva < eva; sva += cpu_clflush_line_size)
4922			clflush(sva);
4923		mfence();
4924		*sysmaps->CMAP2 = 0;
4925		sched_unpin();
4926		mtx_unlock(&sysmaps->lock);
4927	} else
4928		pmap_invalidate_cache();
4929}
4930
4931/*
4932 * Changes the specified virtual address range's memory type to that given by
4933 * the parameter "mode".  The specified virtual address range must be
4934 * completely contained within the kernel map.
4935 *
4936 * Returns zero if the change completed successfully, and either EINVAL or
4937 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4938 * of the virtual address range was not mapped, and ENOMEM is returned if
4939 * there was insufficient memory available to complete the change.
4940 */
4941int
4942pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4943{
4944	vm_offset_t base, offset, tmpva;
4945	pd_entry_t *pde;
4946	pt_entry_t *pte;
4947	int cache_bits_pte, cache_bits_pde;
4948	boolean_t changed;
4949
4950	base = trunc_page(va);
4951	offset = va & PAGE_MASK;
4952	size = roundup(offset + size, PAGE_SIZE);
4953
4954	/*
4955	 * Only supported on kernel virtual addresses above the recursive map.
4956	 */
4957	if (base < VM_MIN_KERNEL_ADDRESS)
4958		return (EINVAL);
4959
4960	cache_bits_pde = pmap_cache_bits(mode, 1);
4961	cache_bits_pte = pmap_cache_bits(mode, 0);
4962	changed = FALSE;
4963
4964	/*
4965	 * Pages that aren't mapped aren't supported.  Also break down
4966	 * 2/4MB pages into 4KB pages if required.
4967	 */
4968	PMAP_LOCK(kernel_pmap);
4969	for (tmpva = base; tmpva < base + size; ) {
4970		pde = pmap_pde(kernel_pmap, tmpva);
4971		if (*pde == 0) {
4972			PMAP_UNLOCK(kernel_pmap);
4973			return (EINVAL);
4974		}
4975		if (*pde & PG_PS) {
4976			/*
4977			 * If the current 2/4MB page already has
4978			 * the required memory type, then we need not
4979			 * demote this page.  Just increment tmpva to
4980			 * the next 2/4MB page frame.
4981			 */
4982			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4983				tmpva = trunc_4mpage(tmpva) + NBPDR;
4984				continue;
4985			}
4986
4987			/*
4988			 * If the current offset aligns with a 2/4MB
4989			 * page frame and there is at least 2/4MB left
4990			 * within the range, then we need not break
4991			 * down this page into 4KB pages.
4992			 */
4993			if ((tmpva & PDRMASK) == 0 &&
4994			    tmpva + PDRMASK < base + size) {
4995				tmpva += NBPDR;
4996				continue;
4997			}
4998			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
4999				PMAP_UNLOCK(kernel_pmap);
5000				return (ENOMEM);
5001			}
5002		}
5003		pte = vtopte(tmpva);
5004		if (*pte == 0) {
5005			PMAP_UNLOCK(kernel_pmap);
5006			return (EINVAL);
5007		}
5008		tmpva += PAGE_SIZE;
5009	}
5010	PMAP_UNLOCK(kernel_pmap);
5011
5012	/*
5013	 * Ok, all the pages exist, so run through them updating their
5014	 * cache mode if required.
5015	 */
5016	for (tmpva = base; tmpva < base + size; ) {
5017		pde = pmap_pde(kernel_pmap, tmpva);
5018		if (*pde & PG_PS) {
5019			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5020				pmap_pde_attr(pde, cache_bits_pde);
5021				changed = TRUE;
5022			}
5023			tmpva = trunc_4mpage(tmpva) + NBPDR;
5024		} else {
5025			pte = vtopte(tmpva);
5026			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5027				pmap_pte_attr(pte, cache_bits_pte);
5028				changed = TRUE;
5029			}
5030			tmpva += PAGE_SIZE;
5031		}
5032	}
5033
5034	/*
5035	 * Flush the CPU caches so that no stale data remains cached
5036	 * under the old memory type.
5037	 */
5038	if (changed) {
5039		pmap_invalidate_range(kernel_pmap, base, tmpva);
5040		pmap_invalidate_cache_range(base, tmpva);
5041	}
5042	return (0);
5043}
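
/*
 * Usage sketch (illustrative only, not compiled): a graphics driver that has
 * mapped a frame buffer with pmap_mapdev() might switch the mapping to
 * write-combining:
 *
 *	error = pmap_change_attr((vm_offset_t)fb_base, fb_size,
 *	    PAT_WRITE_COMBINING);
 *
 * where "fb_base" and "fb_size" are hypothetical driver variables.  On CPUs
 * without a usable PAT, the cache bits chosen by pmap_cache_bits() fall back
 * to a compatible memory type.
 */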
5044
5045/*
5046 * perform the pmap work for mincore
5047 */
5048int
5049pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5050{
5051	pd_entry_t *pdep;
5052	pt_entry_t *ptep, pte;
5053	vm_paddr_t pa;
5054	int val;
5055
5056	PMAP_LOCK(pmap);
5057retry:
5058	pdep = pmap_pde(pmap, addr);
5059	if (*pdep != 0) {
5060		if (*pdep & PG_PS) {
5061			pte = *pdep;
5062			/* Compute the physical address of the 4KB page. */
5063			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5064			    PG_FRAME;
5065			val = MINCORE_SUPER;
5066		} else {
5067			ptep = pmap_pte(pmap, addr);
5068			pte = *ptep;
5069			pmap_pte_release(ptep);
5070			pa = pte & PG_FRAME;
5071			val = 0;
5072		}
5073	} else {
5074		pte = 0;
5075		pa = 0;
5076		val = 0;
5077	}
5078	if ((pte & PG_V) != 0) {
5079		val |= MINCORE_INCORE;
5080		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5081			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5082		if ((pte & PG_A) != 0)
5083			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5084	}
5085	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5086	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5087	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5088		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5089		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5090			goto retry;
5091	} else
5092		PA_UNLOCK_COND(*locked_pa);
5093	PMAP_UNLOCK(pmap);
5094	return (val);
5095}
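
/*
 * Example (informational): for an address backed by a valid, writeable,
 * dirty, and referenced 4MB mapping, the code above returns MINCORE_SUPER |
 * MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 * MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; the machine-independent
 * mincore(2) code then condenses these bits into the per-page status vector
 * handed back to user space.
 */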
5096
5097void
5098pmap_activate(struct thread *td)
5099{
5100	pmap_t	pmap, oldpmap;
5101	u_int	cpuid;
5102	u_int32_t  cr3;
5103
5104	critical_enter();
5105	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5106	oldpmap = PCPU_GET(curpmap);
5107	cpuid = PCPU_GET(cpuid);
5108#if defined(SMP)
5109	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5110	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5111#else
5112	CPU_CLR(cpuid, &oldpmap->pm_active);
5113	CPU_SET(cpuid, &pmap->pm_active);
5114#endif
5115#ifdef PAE
5116	cr3 = vtophys(pmap->pm_pdpt);
5117#else
5118	cr3 = vtophys(pmap->pm_pdir);
5119#endif
5120	/*
5121	 * pmap_activate is for the current thread on the current cpu
5122	 */
5123	td->td_pcb->pcb_cr3 = cr3;
5124	load_cr3(cr3);
5125	PCPU_SET(curpmap, pmap);
5126	critical_exit();
5127}
5128
5129void
5130pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5131{
5132}
5133
5134/*
5135 *	Increase the starting virtual address of the given mapping if a
5136 *	different alignment might result in more superpage mappings.
5137 */
5138void
5139pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5140    vm_offset_t *addr, vm_size_t size)
5141{
5142	vm_offset_t superpage_offset;
5143
5144	if (size < NBPDR)
5145		return;
5146	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5147		offset += ptoa(object->pg_color);
5148	superpage_offset = offset & PDRMASK;
5149	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5150	    (*addr & PDRMASK) == superpage_offset)
5151		return;
5152	if ((*addr & PDRMASK) < superpage_offset)
5153		*addr = (*addr & ~PDRMASK) + superpage_offset;
5154	else
5155		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5156}
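
/*
 * Worked example (a sketch, assuming 4MB superpages, so PDRMASK ==
 * 0x3fffff): for offset == 0x423000, size == 0x800000, and a requested
 * *addr of 0x20010000, superpage_offset is 0x23000.  Since
 * (*addr & PDRMASK) == 0x10000 is less than 0x23000, *addr is advanced to
 * 0x20023000.  The virtual address and the object offset are then congruent
 * modulo NBPDR, so any fully populated, 4MB-aligned portion of the mapping
 * can later be promoted to a superpage.
 */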
5157
5158
5159#if defined(PMAP_DEBUG)
5160int
pmap_pid_dump(int pid)
5161{
5162	pmap_t pmap;
5163	struct proc *p;
5164	int npte = 0;
5165	int index;
5166
5167	sx_slock(&allproc_lock);
5168	FOREACH_PROC_IN_SYSTEM(p) {
5169		if (p->p_pid != pid)
5170			continue;
5171
5172		if (p->p_vmspace) {
5173			int i,j;
5174			index = 0;
5175			pmap = vmspace_pmap(p->p_vmspace);
5176			for (i = 0; i < NPDEPTD; i++) {
5177				pd_entry_t *pde;
5178				pt_entry_t *pte;
5179				vm_offset_t base = i << PDRSHIFT;
5180
5181				pde = &pmap->pm_pdir[i];
5182				if (pde && pmap_pde_v(pde)) {
5183					for (j = 0; j < NPTEPG; j++) {
5184						vm_offset_t va = base + (j << PAGE_SHIFT);
5185						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5186							if (index) {
5187								index = 0;
5188								printf("\n");
5189							}
5190							sx_sunlock(&allproc_lock);
5191							return (npte);
5192						}
5193						pte = pmap_pte(pmap, va);
5194						if (pte && pmap_pte_v(pte)) {
5195							pt_entry_t pa;
5196							vm_page_t m;
5197							pa = *pte;
5198							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5199							printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
5200								va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
5201							npte++;
5202							index++;
5203							if (index >= 2) {
5204								index = 0;
5205								printf("\n");
5206							} else {
5207								printf(" ");
5208							}
5209						}
5210					}
5211				}
5212			}
5213		}
5214	}
5215	sx_sunlock(&allproc_lock);
5216	return (npte);
5217}
5218#endif
5219
5220#if defined(DEBUG)
5221
5222static void	pads(pmap_t pm);
5223void		pmap_pvdump(vm_offset_t pa);
5224
5225/* Print the address space of the given pmap. */
5226static void
5227pads(pmap_t pm)
5228{
5229	int i, j;
5230	vm_offset_t va;
5231	pt_entry_t *ptep;
5232
5233	if (pm == kernel_pmap)
5234		return;
5235	for (i = 0; i < NPDEPTD; i++)
5236		if (pm->pm_pdir[i])
5237			for (j = 0; j < NPTEPG; j++) {
5238				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5239				if (pm == kernel_pmap && va < KERNBASE)
5240					continue;
5241				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5242					continue;
5243				ptep = pmap_pte(pm, va);
5244				if (pmap_pte_v(ptep))
5245					printf("%x:%jx ", va, (uintmax_t)*ptep);
5246			}
5247
5248}
5249
5250void
5251pmap_pvdump(vm_paddr_t pa)
5252{
5253	pv_entry_t pv;
5254	pmap_t pmap;
5255	vm_page_t m;
5256
5257	printf("pa %jx", (uintmax_t)pa);
5258	m = PHYS_TO_VM_PAGE(pa);
5259	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5260		pmap = PV_PMAP(pv);
5261		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5262		pads(pmap);
5263	}
5264	printf(" ");
5265}
5266#endif
5267