pmap.c revision 238005
1130803Smarcel/*-
2130803Smarcel * Copyright (c) 1991 Regents of the University of California.
3130803Smarcel * All rights reserved.
4130803Smarcel * Copyright (c) 1994 John S. Dyson
5130803Smarcel * All rights reserved.
6130803Smarcel * Copyright (c) 1994 David Greenman
7130803Smarcel * All rights reserved.
8130803Smarcel * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9130803Smarcel * All rights reserved.
10130803Smarcel *
11130803Smarcel * This code is derived from software contributed to Berkeley by
12130803Smarcel * the Systems Programming Group of the University of Utah Computer
13130803Smarcel * Science Department and William Jolitz of UUNET Technologies Inc.
14130803Smarcel *
15130803Smarcel * Redistribution and use in source and binary forms, with or without
16130803Smarcel * modification, are permitted provided that the following conditions
17130803Smarcel * are met:
18130803Smarcel * 1. Redistributions of source code must retain the above copyright
19130803Smarcel *    notice, this list of conditions and the following disclaimer.
20130803Smarcel * 2. Redistributions in binary form must reproduce the above copyright
21130803Smarcel *    notice, this list of conditions and the following disclaimer in the
22130803Smarcel *    documentation and/or other materials provided with the distribution.
23130803Smarcel * 3. All advertising materials mentioning features or use of this software
24130803Smarcel *    must display the following acknowledgement:
25130803Smarcel *	This product includes software developed by the University of
26130803Smarcel *	California, Berkeley and its contributors.
27130803Smarcel * 4. Neither the name of the University nor the names of its contributors
28130803Smarcel *    may be used to endorse or promote products derived from this software
29130803Smarcel *    without specific prior written permission.
30130803Smarcel *
31130803Smarcel * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32130803Smarcel * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33130803Smarcel * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34130803Smarcel * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35130803Smarcel * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36130803Smarcel * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37130803Smarcel * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38130803Smarcel * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39130803Smarcel * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40130803Smarcel * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41130803Smarcel * SUCH DAMAGE.
42130803Smarcel *
43130803Smarcel *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44130803Smarcel */
45130803Smarcel/*-
46130803Smarcel * Copyright (c) 2003 Networks Associates Technology, Inc.
47130803Smarcel * All rights reserved.
48130803Smarcel *
49130803Smarcel * This software was developed for the FreeBSD Project by Jake Burkholder,
50130803Smarcel * Safeport Network Services, and Network Associates Laboratories, the
51130803Smarcel * Security Research Division of Network Associates, Inc. under
52130803Smarcel * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53130803Smarcel * CHATS research program.
54130803Smarcel *
55130803Smarcel * Redistribution and use in source and binary forms, with or without
56130803Smarcel * modification, are permitted provided that the following conditions
57130803Smarcel * are met:
58130803Smarcel * 1. Redistributions of source code must retain the above copyright
59130803Smarcel *    notice, this list of conditions and the following disclaimer.
60130803Smarcel * 2. Redistributions in binary form must reproduce the above copyright
61130803Smarcel *    notice, this list of conditions and the following disclaimer in the
62130803Smarcel *    documentation and/or other materials provided with the distribution.
63130803Smarcel *
64130803Smarcel * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65130803Smarcel * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66130803Smarcel * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67130803Smarcel * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68130803Smarcel * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69130803Smarcel * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70130803Smarcel * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71130803Smarcel * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72130803Smarcel * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73130803Smarcel * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74130803Smarcel * SUCH DAMAGE.
75130803Smarcel */
76130803Smarcel
77130803Smarcel#include <sys/cdefs.h>
78130803Smarcel__FBSDID("$FreeBSD: stable/9/sys/i386/i386/pmap.c 238005 2012-07-02 17:22:38Z alc $");
79130803Smarcel
80130803Smarcel/*
81130803Smarcel *	Manages physical address maps.
82130803Smarcel *
83130803Smarcel *	In addition to hardware address maps, this
84130803Smarcel *	module is called upon to provide software-use-only
85130803Smarcel *	maps which may or may not be stored in the same
86130803Smarcel *	form as hardware maps.  These pseudo-maps are
87130803Smarcel *	used to store intermediate results from copy
88130803Smarcel *	operations to and from address spaces.
89130803Smarcel *
90130803Smarcel *	Since the information managed by this module is
91130803Smarcel *	also stored by the logical address mapping module,
92130803Smarcel *	this module may throw away valid virtual-to-physical
93130803Smarcel *	mappings at almost any time.  However, invalidations
94130803Smarcel *	of virtual-to-physical mappings must be done as
95130803Smarcel *	requested.
96130803Smarcel *
97130803Smarcel *	In order to cope with hardware architectures which
98130803Smarcel *	make virtual-to-physical map invalidates expensive,
99130803Smarcel *	this module may delay invalidate or reduced protection
100130803Smarcel *	operations until such time as they are actually
101130803Smarcel *	necessary.  This module is given full information as
102130803Smarcel *	to which processors are currently using which maps,
103130803Smarcel *	and to when physical maps must be made correct.
104130803Smarcel */
105130803Smarcel
106130803Smarcel#include "opt_cpu.h"
107130803Smarcel#include "opt_pmap.h"
108130803Smarcel#include "opt_smp.h"
109130803Smarcel#include "opt_xbox.h"
110130803Smarcel
111130803Smarcel#include <sys/param.h>
112130803Smarcel#include <sys/systm.h>
113130803Smarcel#include <sys/kernel.h>
114130803Smarcel#include <sys/ktr.h>
115130803Smarcel#include <sys/lock.h>
116130803Smarcel#include <sys/malloc.h>
117130803Smarcel#include <sys/mman.h>
118130803Smarcel#include <sys/msgbuf.h>
119130803Smarcel#include <sys/mutex.h>
120130803Smarcel#include <sys/proc.h>
121130803Smarcel#include <sys/rwlock.h>
122130803Smarcel#include <sys/sf_buf.h>
123130803Smarcel#include <sys/sx.h>
124130803Smarcel#include <sys/vmmeter.h>
125130803Smarcel#include <sys/sched.h>
126130803Smarcel#include <sys/sysctl.h>
127130803Smarcel#ifdef SMP
128130803Smarcel#include <sys/smp.h>
129130803Smarcel#else
130130803Smarcel#include <sys/cpuset.h>
131130803Smarcel#endif
132130803Smarcel
133130803Smarcel#include <vm/vm.h>
134130803Smarcel#include <vm/vm_param.h>
135130803Smarcel#include <vm/vm_kern.h>
136130803Smarcel#include <vm/vm_page.h>
137130803Smarcel#include <vm/vm_map.h>
138130803Smarcel#include <vm/vm_object.h>
139130803Smarcel#include <vm/vm_extern.h>
140130803Smarcel#include <vm/vm_pageout.h>
141130803Smarcel#include <vm/vm_pager.h>
142130803Smarcel#include <vm/vm_reserv.h>
143130803Smarcel#include <vm/uma.h>
144130803Smarcel
145130803Smarcel#include <machine/cpu.h>
146130803Smarcel#include <machine/cputypes.h>
147130803Smarcel#include <machine/md_var.h>
148130803Smarcel#include <machine/pcb.h>
149130803Smarcel#include <machine/specialreg.h>
150130803Smarcel#ifdef SMP
151130803Smarcel#include <machine/smp.h>
152130803Smarcel#endif
153130803Smarcel
154130803Smarcel#ifdef XBOX
155130803Smarcel#include <machine/xbox.h>
156130803Smarcel#endif
157130803Smarcel
158130803Smarcel#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
159130803Smarcel#define CPU_ENABLE_SSE
160130803Smarcel#endif
161130803Smarcel
162130803Smarcel#ifndef PMAP_SHPGPERPROC
163130803Smarcel#define PMAP_SHPGPERPROC 200
164130803Smarcel#endif
165130803Smarcel
166130803Smarcel#if !defined(DIAGNOSTIC)
167130803Smarcel#ifdef __GNUC_GNU_INLINE__
168130803Smarcel#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
169130803Smarcel#else
170130803Smarcel#define PMAP_INLINE	extern inline
171130803Smarcel#endif
172130803Smarcel#else
173130803Smarcel#define PMAP_INLINE
174130803Smarcel#endif
175130803Smarcel
176130803Smarcel#ifdef PV_STATS
177130803Smarcel#define PV_STAT(x)	do { x ; } while (0)
178130803Smarcel#else
179130803Smarcel#define PV_STAT(x)	do { } while (0)
180130803Smarcel#endif
181130803Smarcel
182130803Smarcel#define	pa_index(pa)	((pa) >> PDRSHIFT)
183130803Smarcel#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
184130803Smarcel
185130803Smarcel/*
186130803Smarcel * Get PDEs and PTEs for user/kernel address space
187130803Smarcel */
188130803Smarcel#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
189130803Smarcel#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
190130803Smarcel
191130803Smarcel#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
192130803Smarcel#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
193130803Smarcel#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
194130803Smarcel#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
195130803Smarcel#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
196130803Smarcel
197130803Smarcel#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
198130803Smarcel    atomic_clear_int((u_int *)(pte), PG_W))
199130803Smarcel#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
200130803Smarcel
201130803Smarcelstruct pmap kernel_pmap_store;
202130803SmarcelLIST_HEAD(pmaplist, pmap);
203130803Smarcelstatic struct pmaplist allpmaps;
204130803Smarcelstatic struct mtx allpmaps_lock;
205130803Smarcel
206130803Smarcelvm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
207130803Smarcelvm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
208130803Smarcelint pgeflag = 0;		/* PG_G or-in */
209130803Smarcelint pseflag = 0;		/* PG_PS or-in */
210130803Smarcel
211130803Smarcelstatic int nkpt = NKPT;
212130803Smarcelvm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
213130803Smarcelextern u_int32_t KERNend;
214130803Smarcelextern u_int32_t KPTphys;
215130803Smarcel
216130803Smarcel#ifdef PAE
217130803Smarcelpt_entry_t pg_nx;
218130803Smarcelstatic uma_zone_t pdptzone;
219130803Smarcel#endif
220130803Smarcel
221130803SmarcelSYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
222130803Smarcel
223130803Smarcelstatic int pat_works = 1;
224130803SmarcelSYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
225130803Smarcel    "Is page attribute table fully functional?");
226130803Smarcel
227130803Smarcelstatic int pg_ps_enabled = 1;
228130803SmarcelSYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
229130803Smarcel    "Are large page mappings enabled?");
230130803Smarcel
231130803Smarcel#define	PAT_INDEX_SIZE	8
232130803Smarcelstatic int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
233130803Smarcel
234130803Smarcel/*
235130803Smarcel * Isolate the global pv list lock from data and other locks to prevent false
236130803Smarcel * sharing within the cache.
237130803Smarcel */
238130803Smarcelstatic struct {
239130803Smarcel	struct rwlock	lock;
240130803Smarcel	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
241130803Smarcel} pvh_global __aligned(CACHE_LINE_SIZE);
242130803Smarcel
243130803Smarcel#define	pvh_global_lock	pvh_global.lock
244130803Smarcel
245130803Smarcel/*
246130803Smarcel * Data for the pv entry allocation mechanism
247130803Smarcel */
248130803Smarcelstatic TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
249130803Smarcelstatic int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
250130803Smarcelstatic struct md_page *pv_table;
251130803Smarcelstatic int shpgperproc = PMAP_SHPGPERPROC;
252130803Smarcel
253130803Smarcelstruct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
254130803Smarcelint pv_maxchunks;			/* How many chunks we have KVA for */
255130803Smarcelvm_offset_t pv_vafree;			/* freelist stored in the PTE */
256130803Smarcel
257130803Smarcel/*
258130803Smarcel * All those kernel PT submaps that BSD is so fond of
259130803Smarcel */
260130803Smarcelstruct sysmaps {
261130803Smarcel	struct	mtx lock;
262130803Smarcel	pt_entry_t *CMAP1;
263130803Smarcel	pt_entry_t *CMAP2;
264130803Smarcel	caddr_t	CADDR1;
265130803Smarcel	caddr_t	CADDR2;
266130803Smarcel};
267130803Smarcelstatic struct sysmaps sysmaps_pcpu[MAXCPU];
268130803Smarcelpt_entry_t *CMAP1 = 0;
269130803Smarcelstatic pt_entry_t *CMAP3;
270130803Smarcelstatic pd_entry_t *KPTD;
271130803Smarcelcaddr_t CADDR1 = 0, ptvmmap = 0;
272130803Smarcelstatic caddr_t CADDR3;
273130803Smarcelstruct msgbuf *msgbufp = 0;
274130803Smarcel
275130803Smarcel/*
276130803Smarcel * Crashdump maps.
277130803Smarcel */
278130803Smarcelstatic caddr_t crashdumpmap;
279130803Smarcel
280130803Smarcelstatic pt_entry_t *PMAP1 = 0, *PMAP2;
281130803Smarcelstatic pt_entry_t *PADDR1 = 0, *PADDR2;
282130803Smarcel#ifdef SMP
283130803Smarcelstatic int PMAP1cpu;
284130803Smarcelstatic int PMAP1changedcpu;
285130803SmarcelSYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
286130803Smarcel	   &PMAP1changedcpu, 0,
287130803Smarcel	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
288130803Smarcel#endif
289130803Smarcelstatic int PMAP1changed;
290130803SmarcelSYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
291130803Smarcel	   &PMAP1changed, 0,
292130803Smarcel	   "Number of times pmap_pte_quick changed PMAP1");
293130803Smarcelstatic int PMAP1unchanged;
294130803SmarcelSYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
295130803Smarcel	   &PMAP1unchanged, 0,
296130803Smarcel	   "Number of times pmap_pte_quick didn't change PMAP1");
297130803Smarcelstatic struct mtx PMAP2mutex;
298130803Smarcel
299130803Smarcelstatic void	free_pv_chunk(struct pv_chunk *pc);
300130803Smarcelstatic void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
301130803Smarcelstatic pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
302130803Smarcelstatic void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
303130803Smarcelstatic boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
304130803Smarcelstatic void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
305static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
306static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
307		    vm_offset_t va);
308static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
309
310static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
311static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
312    vm_prot_t prot);
313static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
314    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
315static void pmap_flush_page(vm_page_t m);
316static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
317static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
318static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
319static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
320static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
321static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
322static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
323static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
324static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
325static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
326    vm_prot_t prot);
327static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
328static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
329    vm_page_t *free);
330static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
331    vm_page_t *free);
332static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
333static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
334    vm_page_t *free);
335static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
336					vm_offset_t va);
337static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
338static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
339    vm_page_t m);
340static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
341    pd_entry_t newpde);
342static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
343
344static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
345
346static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
347static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
348static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
349static void pmap_pte_release(pt_entry_t *pte);
350static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
351#ifdef PAE
352static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
353#endif
354static void pmap_set_pg(void);
355
356static __inline void pagezero(void *page);
357
358CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
359CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
360
361/*
362 * If you get an error here, then you set KVA_PAGES wrong! See the
363 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
364 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
365 */
366CTASSERT(KERNBASE % (1 << 24) == 0);
367
368/*
369 *	Bootstrap the system enough to run with virtual memory.
370 *
371 *	On the i386 this is called after mapping has already been enabled
372 *	and just syncs the pmap module with what has already been done.
373 *	[We can't call it easily with mapping off since the kernel is not
374 *	mapped with PA == VA, hence we would have to relocate every address
375 *	from the linked base (virtual) address "KERNBASE" to the actual
376 *	(physical) address starting relative to 0]
377 */
378void
379pmap_bootstrap(vm_paddr_t firstaddr)
380{
381	vm_offset_t va;
382	pt_entry_t *pte, *unused;
383	struct sysmaps *sysmaps;
384	int i;
385
386	/*
387	 * Initialize the first available kernel virtual address.  However,
388	 * using "firstaddr" may waste a few pages of the kernel virtual
389	 * address space, because locore may not have mapped every physical
390	 * page that it allocated.  Preferably, locore would provide a first
391	 * unused virtual address in addition to "firstaddr".
392	 */
393	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
394
395	virtual_end = VM_MAX_KERNEL_ADDRESS;
396
397	/*
398	 * Initialize the kernel pmap (which is statically allocated).
399	 */
400	PMAP_LOCK_INIT(kernel_pmap);
401	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
402#ifdef PAE
403	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
404#endif
405	kernel_pmap->pm_root = NULL;
406	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
407	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
408
409 	/*
410	 * Initialize the global pv list lock.
411	 */
412	rw_init(&pvh_global_lock, "pvh global");
413
414	LIST_INIT(&allpmaps);
415
416	/*
417	 * Request a spin mutex so that changes to allpmaps cannot be
418	 * preempted by smp_rendezvous_cpus().  Otherwise,
419	 * pmap_update_pde_kernel() could access allpmaps while it is
420	 * being changed.
421	 */
422	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
423	mtx_lock_spin(&allpmaps_lock);
424	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
425	mtx_unlock_spin(&allpmaps_lock);
426
427	/*
428	 * Reserve some special page table entries/VA space for temporary
429	 * mapping of pages.
430	 */
431#define	SYSMAP(c, p, v, n)	\
432	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
433
434	va = virtual_avail;
435	pte = vtopte(va);
436
437	/*
438	 * CMAP1/CMAP2 are used for zeroing and copying pages.
439	 * CMAP3 is used for the idle process page zeroing.
440	 */
441	for (i = 0; i < MAXCPU; i++) {
442		sysmaps = &sysmaps_pcpu[i];
443		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
444		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
445		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
446	}
447	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
448	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
449
450	/*
451	 * Crashdump maps.
452	 */
453	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
454
455	/*
456	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
457	 */
458	SYSMAP(caddr_t, unused, ptvmmap, 1)
459
460	/*
461	 * msgbufp is used to map the system message buffer.
462	 */
463	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
464
465	/*
466	 * KPTmap is used by pmap_kextract().
467	 *
468	 * KPTmap is first initialized by locore.  However, that initial
469	 * KPTmap can only support NKPT page table pages.  Here, a larger
470	 * KPTmap is created that can support KVA_PAGES page table pages.
471	 */
472	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
473
474	for (i = 0; i < NKPT; i++)
475		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
476
477	/*
478	 * Adjust the start of the KPTD and KPTmap so that the implementation
479	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
480	 */
481	KPTD -= KPTDI;
482	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
483
484	/*
485	 * ptemap is used for pmap_pte_quick
486	 */
487	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
488	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
489
490	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
491
492	virtual_avail = va;
493
494	/*
495	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
496	 * physical memory region that is used by the ACPI wakeup code.  This
497	 * mapping must not have PG_G set.
498	 */
499#ifdef XBOX
500	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
501	 * an early stadium, we cannot yet neatly map video memory ... :-(
502	 * Better fixes are very welcome! */
503	if (!arch_i386_is_xbox)
504#endif
505	for (i = 1; i < NKPT; i++)
506		PTD[i] = 0;
507
508	/* Initialize the PAT MSR if present. */
509	pmap_init_pat();
510
511	/* Turn on PG_G on kernel page(s) */
512	pmap_set_pg();
513}
514
515/*
516 * Setup the PAT MSR.
517 */
518void
519pmap_init_pat(void)
520{
521	int pat_table[PAT_INDEX_SIZE];
522	uint64_t pat_msr;
523	u_long cr0, cr4;
524	int i;
525
526	/* Set default PAT index table. */
527	for (i = 0; i < PAT_INDEX_SIZE; i++)
528		pat_table[i] = -1;
529	pat_table[PAT_WRITE_BACK] = 0;
530	pat_table[PAT_WRITE_THROUGH] = 1;
531	pat_table[PAT_UNCACHEABLE] = 3;
532	pat_table[PAT_WRITE_COMBINING] = 3;
533	pat_table[PAT_WRITE_PROTECTED] = 3;
534	pat_table[PAT_UNCACHED] = 3;
535
536	/* Bail if this CPU doesn't implement PAT. */
537	if ((cpu_feature & CPUID_PAT) == 0) {
538		for (i = 0; i < PAT_INDEX_SIZE; i++)
539			pat_index[i] = pat_table[i];
540		pat_works = 0;
541		return;
542	}
543
544	/*
545	 * Due to some Intel errata, we can only safely use the lower 4
546	 * PAT entries.
547	 *
548	 *   Intel Pentium III Processor Specification Update
549	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
550	 * or Mode C Paging)
551	 *
552	 *   Intel Pentium IV  Processor Specification Update
553	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
554	 */
555	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
556	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
557		pat_works = 0;
558
559	/* Initialize default PAT entries. */
560	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
561	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
562	    PAT_VALUE(2, PAT_UNCACHED) |
563	    PAT_VALUE(3, PAT_UNCACHEABLE) |
564	    PAT_VALUE(4, PAT_WRITE_BACK) |
565	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
566	    PAT_VALUE(6, PAT_UNCACHED) |
567	    PAT_VALUE(7, PAT_UNCACHEABLE);
568
569	if (pat_works) {
570		/*
571		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
572		 * Program 5 and 6 as WP and WC.
573		 * Leave 4 and 7 as WB and UC.
574		 */
575		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
576		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
577		    PAT_VALUE(6, PAT_WRITE_COMBINING);
578		pat_table[PAT_UNCACHED] = 2;
579		pat_table[PAT_WRITE_PROTECTED] = 5;
580		pat_table[PAT_WRITE_COMBINING] = 6;
581	} else {
582		/*
583		 * Just replace PAT Index 2 with WC instead of UC-.
584		 */
585		pat_msr &= ~PAT_MASK(2);
586		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
587		pat_table[PAT_WRITE_COMBINING] = 2;
588	}
589
590	/* Disable PGE. */
591	cr4 = rcr4();
592	load_cr4(cr4 & ~CR4_PGE);
593
594	/* Disable caches (CD = 1, NW = 0). */
595	cr0 = rcr0();
596	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
597
598	/* Flushes caches and TLBs. */
599	wbinvd();
600	invltlb();
601
602	/* Update PAT and index table. */
603	wrmsr(MSR_PAT, pat_msr);
604	for (i = 0; i < PAT_INDEX_SIZE; i++)
605		pat_index[i] = pat_table[i];
606
607	/* Flush caches and TLBs again. */
608	wbinvd();
609	invltlb();
610
611	/* Restore caches and PGE. */
612	load_cr0(cr0);
613	load_cr4(cr4);
614}
615
616/*
617 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
618 */
619static void
620pmap_set_pg(void)
621{
622	pt_entry_t *pte;
623	vm_offset_t va, endva;
624
625	if (pgeflag == 0)
626		return;
627
628	endva = KERNBASE + KERNend;
629
630	if (pseflag) {
631		va = KERNBASE + KERNLOAD;
632		while (va  < endva) {
633			pdir_pde(PTD, va) |= pgeflag;
634			invltlb();	/* Play it safe, invltlb() every time */
635			va += NBPDR;
636		}
637	} else {
638		va = (vm_offset_t)btext;
639		while (va < endva) {
640			pte = vtopte(va);
641			if (*pte)
642				*pte |= pgeflag;
643			invltlb();	/* Play it safe, invltlb() every time */
644			va += PAGE_SIZE;
645		}
646	}
647}
648
649/*
650 * Initialize a vm_page's machine-dependent fields.
651 */
652void
653pmap_page_init(vm_page_t m)
654{
655
656	TAILQ_INIT(&m->md.pv_list);
657	m->md.pat_mode = PAT_WRITE_BACK;
658}
659
660#ifdef PAE
661static void *
662pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
663{
664
665	/* Inform UMA that this allocator uses kernel_map/object. */
666	*flags = UMA_SLAB_KERNEL;
667	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
668	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
669}
670#endif
671
672/*
673 * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
674 * Requirements:
675 *  - Must deal with pages in order to ensure that none of the PG_* bits
676 *    are ever set, PG_V in particular.
677 *  - Assumes we can write to ptes without pte_store() atomic ops, even
678 *    on PAE systems.  This should be ok.
679 *  - Assumes nothing will ever test these addresses for 0 to indicate
680 *    no mapping instead of correctly checking PG_V.
681 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
682 * Because PG_V is never set, there can be no mappings to invalidate.
683 */
684static vm_offset_t
685pmap_ptelist_alloc(vm_offset_t *head)
686{
687	pt_entry_t *pte;
688	vm_offset_t va;
689
690	va = *head;
691	if (va == 0)
692		return (va);	/* Out of memory */
693	pte = vtopte(va);
694	*head = *pte;
695	if (*head & PG_V)
696		panic("pmap_ptelist_alloc: va with PG_V set!");
697	*pte = 0;
698	return (va);
699}
700
701static void
702pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
703{
704	pt_entry_t *pte;
705
706	if (va & PG_V)
707		panic("pmap_ptelist_free: freeing va with PG_V set!");
708	pte = vtopte(va);
709	*pte = *head;		/* virtual! PG_V is 0 though */
710	*head = va;
711}
712
713static void
714pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
715{
716	int i;
717	vm_offset_t va;
718
719	*head = 0;
720	for (i = npages - 1; i >= 0; i--) {
721		va = (vm_offset_t)base + i * PAGE_SIZE;
722		pmap_ptelist_free(head, va);
723	}
724}
725
726
727/*
728 *	Initialize the pmap module.
729 *	Called by vm_init, to initialize any structures that the pmap
730 *	system needs to map virtual memory.
731 */
732void
733pmap_init(void)
734{
735	vm_page_t mpte;
736	vm_size_t s;
737	int i, pv_npg;
738
739	/*
740	 * Initialize the vm page array entries for the kernel pmap's
741	 * page table pages.
742	 */
743	for (i = 0; i < NKPT; i++) {
744		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
745		KASSERT(mpte >= vm_page_array &&
746		    mpte < &vm_page_array[vm_page_array_size],
747		    ("pmap_init: page table page is out of range"));
748		mpte->pindex = i + KPTDI;
749		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
750	}
751
752	/*
753	 * Initialize the address space (zone) for the pv entries.  Set a
754	 * high water mark so that the system can recover from excessive
755	 * numbers of pv entries.
756	 */
757	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
758	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
759	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
760	pv_entry_max = roundup(pv_entry_max, _NPCPV);
761	pv_entry_high_water = 9 * (pv_entry_max / 10);
762
763	/*
764	 * If the kernel is running in a virtual machine on an AMD Family 10h
765	 * processor, then it must assume that MCA is enabled by the virtual
766	 * machine monitor.
767	 */
768	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
769	    CPUID_TO_FAMILY(cpu_id) == 0x10)
770		workaround_erratum383 = 1;
771
772	/*
773	 * Are large page mappings supported and enabled?
774	 */
775	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
776	if (pseflag == 0)
777		pg_ps_enabled = 0;
778	else if (pg_ps_enabled) {
779		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
780		    ("pmap_init: can't assign to pagesizes[1]"));
781		pagesizes[1] = NBPDR;
782	}
783
784	/*
785	 * Calculate the size of the pv head table for superpages.
786	 */
787	for (i = 0; phys_avail[i + 1]; i += 2);
788	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
789
790	/*
791	 * Allocate memory for the pv head table for superpages.
792	 */
793	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
794	s = round_page(s);
795	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
796	for (i = 0; i < pv_npg; i++)
797		TAILQ_INIT(&pv_table[i].pv_list);
798
799	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
800	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
801	    PAGE_SIZE * pv_maxchunks);
802	if (pv_chunkbase == NULL)
803		panic("pmap_init: not enough kvm for pv chunks");
804	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
805#ifdef PAE
806	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
807	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
808	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
809	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
810#endif
811}
812
813
814SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
815	"Max number of PV entries");
816SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
817	"Page share factor per proc");
818
819SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
820    "2/4MB page mapping counters");
821
822static u_long pmap_pde_demotions;
823SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
824    &pmap_pde_demotions, 0, "2/4MB page demotions");
825
826static u_long pmap_pde_mappings;
827SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
828    &pmap_pde_mappings, 0, "2/4MB page mappings");
829
830static u_long pmap_pde_p_failures;
831SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
832    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
833
834static u_long pmap_pde_promotions;
835SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
836    &pmap_pde_promotions, 0, "2/4MB page promotions");
837
838/***************************************************
839 * Low level helper routines.....
840 ***************************************************/
841
842/*
843 * Determine the appropriate bits to set in a PTE or PDE for a specified
844 * caching mode.
845 */
846int
847pmap_cache_bits(int mode, boolean_t is_pde)
848{
849	int cache_bits, pat_flag, pat_idx;
850
851	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
852		panic("Unknown caching mode %d\n", mode);
853
854	/* The PAT bit is different for PTE's and PDE's. */
855	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
856
857	/* Map the caching mode to a PAT index. */
858	pat_idx = pat_index[mode];
859
860	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
861	cache_bits = 0;
862	if (pat_idx & 0x4)
863		cache_bits |= pat_flag;
864	if (pat_idx & 0x2)
865		cache_bits |= PG_NC_PCD;
866	if (pat_idx & 0x1)
867		cache_bits |= PG_NC_PWT;
868	return (cache_bits);
869}
870
871/*
872 * The caller is responsible for maintaining TLB consistency.
873 */
874static void
875pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
876{
877	pd_entry_t *pde;
878	pmap_t pmap;
879	boolean_t PTD_updated;
880
881	PTD_updated = FALSE;
882	mtx_lock_spin(&allpmaps_lock);
883	LIST_FOREACH(pmap, &allpmaps, pm_list) {
884		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
885		    PG_FRAME))
886			PTD_updated = TRUE;
887		pde = pmap_pde(pmap, va);
888		pde_store(pde, newpde);
889	}
890	mtx_unlock_spin(&allpmaps_lock);
891	KASSERT(PTD_updated,
892	    ("pmap_kenter_pde: current page table is not in allpmaps"));
893}
894
895/*
896 * After changing the page size for the specified virtual address in the page
897 * table, flush the corresponding entries from the processor's TLB.  Only the
898 * calling processor's TLB is affected.
899 *
900 * The calling thread must be pinned to a processor.
901 */
902static void
903pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
904{
905	u_long cr4;
906
907	if ((newpde & PG_PS) == 0)
908		/* Demotion: flush a specific 2MB page mapping. */
909		invlpg(va);
910	else if ((newpde & PG_G) == 0)
911		/*
912		 * Promotion: flush every 4KB page mapping from the TLB
913		 * because there are too many to flush individually.
914		 */
915		invltlb();
916	else {
917		/*
918		 * Promotion: flush every 4KB page mapping from the TLB,
919		 * including any global (PG_G) mappings.
920		 */
921		cr4 = rcr4();
922		load_cr4(cr4 & ~CR4_PGE);
923		/*
924		 * Although preemption at this point could be detrimental to
925		 * performance, it would not lead to an error.  PG_G is simply
926		 * ignored if CR4.PGE is clear.  Moreover, in case this block
927		 * is re-entered, the load_cr4() either above or below will
928		 * modify CR4.PGE flushing the TLB.
929		 */
930		load_cr4(cr4 | CR4_PGE);
931	}
932}
933#ifdef SMP
934/*
935 * For SMP, these functions have to use the IPI mechanism for coherence.
936 *
937 * N.B.: Before calling any of the following TLB invalidation functions,
938 * the calling processor must ensure that all stores updating a non-
939 * kernel page table are globally performed.  Otherwise, another
940 * processor could cache an old, pre-update entry without being
941 * invalidated.  This can happen one of two ways: (1) The pmap becomes
942 * active on another processor after its pm_active field is checked by
943 * one of the following functions but before a store updating the page
944 * table is globally performed. (2) The pmap becomes active on another
945 * processor before its pm_active field is checked but due to
946 * speculative loads one of the following functions stills reads the
947 * pmap as inactive on the other processor.
948 *
949 * The kernel page table is exempt because its pm_active field is
950 * immutable.  The kernel page table is always active on every
951 * processor.
952 */
953void
954pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
955{
956	cpuset_t other_cpus;
957	u_int cpuid;
958
959	sched_pin();
960	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
961		invlpg(va);
962		smp_invlpg(va);
963	} else {
964		cpuid = PCPU_GET(cpuid);
965		other_cpus = all_cpus;
966		CPU_CLR(cpuid, &other_cpus);
967		if (CPU_ISSET(cpuid, &pmap->pm_active))
968			invlpg(va);
969		CPU_AND(&other_cpus, &pmap->pm_active);
970		if (!CPU_EMPTY(&other_cpus))
971			smp_masked_invlpg(other_cpus, va);
972	}
973	sched_unpin();
974}
975
976void
977pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
978{
979	cpuset_t other_cpus;
980	vm_offset_t addr;
981	u_int cpuid;
982
983	sched_pin();
984	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
985		for (addr = sva; addr < eva; addr += PAGE_SIZE)
986			invlpg(addr);
987		smp_invlpg_range(sva, eva);
988	} else {
989		cpuid = PCPU_GET(cpuid);
990		other_cpus = all_cpus;
991		CPU_CLR(cpuid, &other_cpus);
992		if (CPU_ISSET(cpuid, &pmap->pm_active))
993			for (addr = sva; addr < eva; addr += PAGE_SIZE)
994				invlpg(addr);
995		CPU_AND(&other_cpus, &pmap->pm_active);
996		if (!CPU_EMPTY(&other_cpus))
997			smp_masked_invlpg_range(other_cpus, sva, eva);
998	}
999	sched_unpin();
1000}
1001
1002void
1003pmap_invalidate_all(pmap_t pmap)
1004{
1005	cpuset_t other_cpus;
1006	u_int cpuid;
1007
1008	sched_pin();
1009	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1010		invltlb();
1011		smp_invltlb();
1012	} else {
1013		cpuid = PCPU_GET(cpuid);
1014		other_cpus = all_cpus;
1015		CPU_CLR(cpuid, &other_cpus);
1016		if (CPU_ISSET(cpuid, &pmap->pm_active))
1017			invltlb();
1018		CPU_AND(&other_cpus, &pmap->pm_active);
1019		if (!CPU_EMPTY(&other_cpus))
1020			smp_masked_invltlb(other_cpus);
1021	}
1022	sched_unpin();
1023}
1024
1025void
1026pmap_invalidate_cache(void)
1027{
1028
1029	sched_pin();
1030	wbinvd();
1031	smp_cache_flush();
1032	sched_unpin();
1033}
1034
1035struct pde_action {
1036	cpuset_t invalidate;	/* processors that invalidate their TLB */
1037	vm_offset_t va;
1038	pd_entry_t *pde;
1039	pd_entry_t newpde;
1040	u_int store;		/* processor that updates the PDE */
1041};
1042
1043static void
1044pmap_update_pde_kernel(void *arg)
1045{
1046	struct pde_action *act = arg;
1047	pd_entry_t *pde;
1048	pmap_t pmap;
1049
1050	if (act->store == PCPU_GET(cpuid)) {
1051
1052		/*
1053		 * Elsewhere, this operation requires allpmaps_lock for
1054		 * synchronization.  Here, it does not because it is being
1055		 * performed in the context of an all_cpus rendezvous.
1056		 */
1057		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1058			pde = pmap_pde(pmap, act->va);
1059			pde_store(pde, act->newpde);
1060		}
1061	}
1062}
1063
1064static void
1065pmap_update_pde_user(void *arg)
1066{
1067	struct pde_action *act = arg;
1068
1069	if (act->store == PCPU_GET(cpuid))
1070		pde_store(act->pde, act->newpde);
1071}
1072
1073static void
1074pmap_update_pde_teardown(void *arg)
1075{
1076	struct pde_action *act = arg;
1077
1078	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1079		pmap_update_pde_invalidate(act->va, act->newpde);
1080}
1081
1082/*
1083 * Change the page size for the specified virtual address in a way that
1084 * prevents any possibility of the TLB ever having two entries that map the
1085 * same virtual address using different page sizes.  This is the recommended
1086 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1087 * machine check exception for a TLB state that is improperly diagnosed as a
1088 * hardware error.
1089 */
1090static void
1091pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1092{
1093	struct pde_action act;
1094	cpuset_t active, other_cpus;
1095	u_int cpuid;
1096
1097	sched_pin();
1098	cpuid = PCPU_GET(cpuid);
1099	other_cpus = all_cpus;
1100	CPU_CLR(cpuid, &other_cpus);
1101	if (pmap == kernel_pmap)
1102		active = all_cpus;
1103	else
1104		active = pmap->pm_active;
1105	if (CPU_OVERLAP(&active, &other_cpus)) {
1106		act.store = cpuid;
1107		act.invalidate = active;
1108		act.va = va;
1109		act.pde = pde;
1110		act.newpde = newpde;
1111		CPU_SET(cpuid, &active);
1112		smp_rendezvous_cpus(active,
1113		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1114		    pmap_update_pde_kernel : pmap_update_pde_user,
1115		    pmap_update_pde_teardown, &act);
1116	} else {
1117		if (pmap == kernel_pmap)
1118			pmap_kenter_pde(va, newpde);
1119		else
1120			pde_store(pde, newpde);
1121		if (CPU_ISSET(cpuid, &active))
1122			pmap_update_pde_invalidate(va, newpde);
1123	}
1124	sched_unpin();
1125}
1126#else /* !SMP */
1127/*
1128 * Normal, non-SMP, 486+ invalidation functions.
1129 * We inline these within pmap.c for speed.
1130 */
1131PMAP_INLINE void
1132pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1133{
1134
1135	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1136		invlpg(va);
1137}
1138
1139PMAP_INLINE void
1140pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1141{
1142	vm_offset_t addr;
1143
1144	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1145		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1146			invlpg(addr);
1147}
1148
1149PMAP_INLINE void
1150pmap_invalidate_all(pmap_t pmap)
1151{
1152
1153	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1154		invltlb();
1155}
1156
1157PMAP_INLINE void
1158pmap_invalidate_cache(void)
1159{
1160
1161	wbinvd();
1162}
1163
1164static void
1165pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1166{
1167
1168	if (pmap == kernel_pmap)
1169		pmap_kenter_pde(va, newpde);
1170	else
1171		pde_store(pde, newpde);
1172	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1173		pmap_update_pde_invalidate(va, newpde);
1174}
1175#endif /* !SMP */
1176
1177#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1178
1179void
1180pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1181{
1182
1183	KASSERT((sva & PAGE_MASK) == 0,
1184	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1185	KASSERT((eva & PAGE_MASK) == 0,
1186	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1187
1188	if (cpu_feature & CPUID_SS)
1189		; /* If "Self Snoop" is supported, do nothing. */
1190	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1191	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1192
1193		/*
1194		 * Otherwise, do per-cache line flush.  Use the mfence
1195		 * instruction to insure that previous stores are
1196		 * included in the write-back.  The processor
1197		 * propagates flush to other processors in the cache
1198		 * coherence domain.
1199		 */
1200		mfence();
1201		for (; sva < eva; sva += cpu_clflush_line_size)
1202			clflush(sva);
1203		mfence();
1204	} else {
1205
1206		/*
1207		 * No targeted cache flush methods are supported by CPU,
1208		 * or the supplied range is bigger than 2MB.
1209		 * Globally invalidate cache.
1210		 */
1211		pmap_invalidate_cache();
1212	}
1213}
1214
1215void
1216pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1217{
1218	int i;
1219
1220	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1221	    (cpu_feature & CPUID_CLFSH) == 0) {
1222		pmap_invalidate_cache();
1223	} else {
1224		for (i = 0; i < count; i++)
1225			pmap_flush_page(pages[i]);
1226	}
1227}
1228
1229/*
1230 * Are we current address space or kernel?  N.B. We return FALSE when
1231 * a pmap's page table is in use because a kernel thread is borrowing
1232 * it.  The borrowed page table can change spontaneously, making any
1233 * dependence on its continued use subject to a race condition.
1234 */
1235static __inline int
1236pmap_is_current(pmap_t pmap)
1237{
1238
1239	return (pmap == kernel_pmap ||
1240	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1241	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1242}
1243
1244/*
1245 * If the given pmap is not the current or kernel pmap, the returned pte must
1246 * be released by passing it to pmap_pte_release().
1247 */
1248pt_entry_t *
1249pmap_pte(pmap_t pmap, vm_offset_t va)
1250{
1251	pd_entry_t newpf;
1252	pd_entry_t *pde;
1253
1254	pde = pmap_pde(pmap, va);
1255	if (*pde & PG_PS)
1256		return (pde);
1257	if (*pde != 0) {
1258		/* are we current address space or kernel? */
1259		if (pmap_is_current(pmap))
1260			return (vtopte(va));
1261		mtx_lock(&PMAP2mutex);
1262		newpf = *pde & PG_FRAME;
1263		if ((*PMAP2 & PG_FRAME) != newpf) {
1264			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1265			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1266		}
1267		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1268	}
1269	return (NULL);
1270}
1271
1272/*
1273 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1274 * being NULL.
1275 */
1276static __inline void
1277pmap_pte_release(pt_entry_t *pte)
1278{
1279
1280	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1281		mtx_unlock(&PMAP2mutex);
1282}
1283
1284static __inline void
1285invlcaddr(void *caddr)
1286{
1287
1288	invlpg((u_int)caddr);
1289}
1290
1291/*
1292 * Super fast pmap_pte routine best used when scanning
1293 * the pv lists.  This eliminates many coarse-grained
1294 * invltlb calls.  Note that many of the pv list
1295 * scans are across different pmaps.  It is very wasteful
1296 * to do an entire invltlb for checking a single mapping.
1297 *
1298 * If the given pmap is not the current pmap, pvh_global_lock
1299 * must be held and curthread pinned to a CPU.
1300 */
1301static pt_entry_t *
1302pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1303{
1304	pd_entry_t newpf;
1305	pd_entry_t *pde;
1306
1307	pde = pmap_pde(pmap, va);
1308	if (*pde & PG_PS)
1309		return (pde);
1310	if (*pde != 0) {
1311		/* are we current address space or kernel? */
1312		if (pmap_is_current(pmap))
1313			return (vtopte(va));
1314		rw_assert(&pvh_global_lock, RA_WLOCKED);
1315		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1316		newpf = *pde & PG_FRAME;
1317		if ((*PMAP1 & PG_FRAME) != newpf) {
1318			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1319#ifdef SMP
1320			PMAP1cpu = PCPU_GET(cpuid);
1321#endif
1322			invlcaddr(PADDR1);
1323			PMAP1changed++;
1324		} else
1325#ifdef SMP
1326		if (PMAP1cpu != PCPU_GET(cpuid)) {
1327			PMAP1cpu = PCPU_GET(cpuid);
1328			invlcaddr(PADDR1);
1329			PMAP1changedcpu++;
1330		} else
1331#endif
1332			PMAP1unchanged++;
1333		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1334	}
1335	return (0);
1336}
1337
1338/*
1339 *	Routine:	pmap_extract
1340 *	Function:
1341 *		Extract the physical page address associated
1342 *		with the given map/virtual_address pair.
1343 */
1344vm_paddr_t
1345pmap_extract(pmap_t pmap, vm_offset_t va)
1346{
1347	vm_paddr_t rtval;
1348	pt_entry_t *pte;
1349	pd_entry_t pde;
1350
1351	rtval = 0;
1352	PMAP_LOCK(pmap);
1353	pde = pmap->pm_pdir[va >> PDRSHIFT];
1354	if (pde != 0) {
1355		if ((pde & PG_PS) != 0)
1356			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1357		else {
1358			pte = pmap_pte(pmap, va);
1359			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1360			pmap_pte_release(pte);
1361		}
1362	}
1363	PMAP_UNLOCK(pmap);
1364	return (rtval);
1365}
1366
1367/*
1368 *	Routine:	pmap_extract_and_hold
1369 *	Function:
1370 *		Atomically extract and hold the physical page
1371 *		with the given pmap and virtual address pair
1372 *		if that mapping permits the given protection.
1373 */
1374vm_page_t
1375pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1376{
1377	pd_entry_t pde;
1378	pt_entry_t pte, *ptep;
1379	vm_page_t m;
1380	vm_paddr_t pa;
1381
1382	pa = 0;
1383	m = NULL;
1384	PMAP_LOCK(pmap);
1385retry:
1386	pde = *pmap_pde(pmap, va);
1387	if (pde != 0) {
1388		if (pde & PG_PS) {
1389			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1390				if (vm_page_pa_tryrelock(pmap, (pde &
1391				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1392					goto retry;
1393				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1394				    (va & PDRMASK));
1395				vm_page_hold(m);
1396			}
1397		} else {
1398			ptep = pmap_pte(pmap, va);
1399			pte = *ptep;
1400			pmap_pte_release(ptep);
1401			if (pte != 0 &&
1402			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1403				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1404				    &pa))
1405					goto retry;
1406				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1407				vm_page_hold(m);
1408			}
1409		}
1410	}
1411	PA_UNLOCK_COND(pa);
1412	PMAP_UNLOCK(pmap);
1413	return (m);
1414}
1415
1416/***************************************************
1417 * Low level mapping routines.....
1418 ***************************************************/
1419
1420/*
1421 * Add a wired page to the kva.
1422 * Note: not SMP coherent.
1423 *
1424 * This function may be used before pmap_bootstrap() is called.
1425 */
1426PMAP_INLINE void
1427pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1428{
1429	pt_entry_t *pte;
1430
1431	pte = vtopte(va);
1432	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1433}
1434
1435static __inline void
1436pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1437{
1438	pt_entry_t *pte;
1439
1440	pte = vtopte(va);
1441	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1442}
1443
1444/*
1445 * Remove a page from the kernel pagetables.
1446 * Note: not SMP coherent.
1447 *
1448 * This function may be used before pmap_bootstrap() is called.
1449 */
1450PMAP_INLINE void
1451pmap_kremove(vm_offset_t va)
1452{
1453	pt_entry_t *pte;
1454
1455	pte = vtopte(va);
1456	pte_clear(pte);
1457}
1458
1459/*
1460 *	Used to map a range of physical addresses into kernel
1461 *	virtual address space.
1462 *
1463 *	The value passed in '*virt' is a suggested virtual address for
1464 *	the mapping. Architectures which can support a direct-mapped
1465 *	physical to virtual region can return the appropriate address
1466 *	within that region, leaving '*virt' unchanged. Other
1467 *	architectures should map the pages starting at '*virt' and
1468 *	update '*virt' with the first usable address after the mapped
1469 *	region.
1470 */
1471vm_offset_t
1472pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1473{
1474	vm_offset_t va, sva;
1475	vm_paddr_t superpage_offset;
1476	pd_entry_t newpde;
1477
1478	va = *virt;
1479	/*
1480	 * Does the physical address range's size and alignment permit at
1481	 * least one superpage mapping to be created?
1482	 */
1483	superpage_offset = start & PDRMASK;
1484	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1485		/*
1486		 * Increase the starting virtual address so that its alignment
1487		 * does not preclude the use of superpage mappings.
1488		 */
1489		if ((va & PDRMASK) < superpage_offset)
1490			va = (va & ~PDRMASK) + superpage_offset;
1491		else if ((va & PDRMASK) > superpage_offset)
1492			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1493	}
1494	sva = va;
1495	while (start < end) {
1496		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1497		    pseflag) {
1498			KASSERT((va & PDRMASK) == 0,
1499			    ("pmap_map: misaligned va %#x", va));
1500			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1501			pmap_kenter_pde(va, newpde);
1502			va += NBPDR;
1503			start += NBPDR;
1504		} else {
1505			pmap_kenter(va, start);
1506			va += PAGE_SIZE;
1507			start += PAGE_SIZE;
1508		}
1509	}
1510	pmap_invalidate_range(kernel_pmap, sva, va);
1511	*virt = va;
1512	return (sva);
1513}
1514
1515
1516/*
1517 * Add a list of wired pages to the kva
1518 * this routine is only used for temporary
1519 * kernel mappings that do not need to have
1520 * page modification or references recorded.
1521 * Note that old mappings are simply written
1522 * over.  The page *must* be wired.
1523 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1524 */
1525void
1526pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1527{
1528	pt_entry_t *endpte, oldpte, pa, *pte;
1529	vm_page_t m;
1530
1531	oldpte = 0;
1532	pte = vtopte(sva);
1533	endpte = pte + count;
1534	while (pte < endpte) {
1535		m = *ma++;
1536		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1537		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1538			oldpte |= *pte;
1539			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1540		}
1541		pte++;
1542	}
1543	if (__predict_false((oldpte & PG_V) != 0))
1544		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1545		    PAGE_SIZE);
1546}
1547
1548/*
1549 * This routine tears out page mappings from the
1550 * kernel -- it is meant only for temporary mappings.
1551 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1552 */
1553void
1554pmap_qremove(vm_offset_t sva, int count)
1555{
1556	vm_offset_t va;
1557
1558	va = sva;
1559	while (count-- > 0) {
1560		pmap_kremove(va);
1561		va += PAGE_SIZE;
1562	}
1563	pmap_invalidate_range(kernel_pmap, sva, va);
1564}
1565
1566/***************************************************
1567 * Page table page management routines.....
1568 ***************************************************/
1569static __inline void
1570pmap_free_zero_pages(vm_page_t free)
1571{
1572	vm_page_t m;
1573
1574	while (free != NULL) {
1575		m = free;
1576		free = m->right;
1577		/* Preserve the page's PG_ZERO setting. */
1578		vm_page_free_toq(m);
1579	}
1580}
1581
1582/*
1583 * Schedule the specified unused page table page to be freed.  Specifically,
1584 * add the page to the specified list of pages that will be released to the
1585 * physical memory manager after the TLB has been updated.
1586 */
1587static __inline void
1588pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1589{
1590
1591	if (set_PG_ZERO)
1592		m->flags |= PG_ZERO;
1593	else
1594		m->flags &= ~PG_ZERO;
1595	m->right = *free;
1596	*free = m;
1597}
1598
1599/*
1600 * Inserts the specified page table page into the specified pmap's collection
1601 * of idle page table pages.  Each of a pmap's page table pages is responsible
1602 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1603 * ordered by this virtual address range.
1604 */
1605static void
1606pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1607{
1608	vm_page_t root;
1609
1610	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1611	root = pmap->pm_root;
1612	if (root == NULL) {
1613		mpte->left = NULL;
1614		mpte->right = NULL;
1615	} else {
1616		root = vm_page_splay(mpte->pindex, root);
1617		if (mpte->pindex < root->pindex) {
1618			mpte->left = root->left;
1619			mpte->right = root;
1620			root->left = NULL;
1621		} else if (mpte->pindex == root->pindex)
1622			panic("pmap_insert_pt_page: pindex already inserted");
1623		else {
1624			mpte->right = root->right;
1625			mpte->left = root;
1626			root->right = NULL;
1627		}
1628	}
1629	pmap->pm_root = mpte;
1630}
1631
1632/*
1633 * Looks for a page table page mapping the specified virtual address in the
1634 * specified pmap's collection of idle page table pages.  Returns NULL if there
1635 * is no page table page corresponding to the specified virtual address.
1636 */
1637static vm_page_t
1638pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1639{
1640	vm_page_t mpte;
1641	vm_pindex_t pindex = va >> PDRSHIFT;
1642
1643	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1644	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1645		mpte = vm_page_splay(pindex, mpte);
1646		if ((pmap->pm_root = mpte)->pindex != pindex)
1647			mpte = NULL;
1648	}
1649	return (mpte);
1650}
1651
1652/*
1653 * Removes the specified page table page from the specified pmap's collection
1654 * of idle page table pages.  The specified page table page must be a member of
1655 * the pmap's collection.
1656 */
1657static void
1658pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1659{
1660	vm_page_t root;
1661
1662	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1663	if (mpte != pmap->pm_root)
1664		vm_page_splay(mpte->pindex, pmap->pm_root);
1665	if (mpte->left == NULL)
1666		root = mpte->right;
1667	else {
1668		root = vm_page_splay(mpte->pindex, mpte->left);
1669		root->right = mpte->right;
1670	}
1671	pmap->pm_root = root;
1672}
1673
1674/*
1675 * This routine unholds page table pages, and if the hold count
1676 * drops to zero, then it decrements the wire count.
1677 */
1678static __inline int
1679pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1680{
1681
1682	--m->wire_count;
1683	if (m->wire_count == 0)
1684		return (_pmap_unwire_pte_hold(pmap, m, free));
1685	else
1686		return (0);
1687}
1688
1689static int
1690_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1691{
1692	vm_offset_t pteva;
1693
1694	/*
1695	 * unmap the page table page
1696	 */
1697	pmap->pm_pdir[m->pindex] = 0;
1698	--pmap->pm_stats.resident_count;
1699
1700	/*
1701	 * This is a release store so that the ordinary store unmapping
1702	 * the page table page is globally performed before TLB shoot-
1703	 * down is begun.
1704	 */
1705	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1706
1707	/*
1708	 * Do an invltlb to make the invalidated mapping
1709	 * take effect immediately.
1710	 */
1711	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1712	pmap_invalidate_page(pmap, pteva);
1713
1714	/*
1715	 * Put page on a list so that it is released after
1716	 * *ALL* TLB shootdown is done
1717	 */
1718	pmap_add_delayed_free_list(m, free, TRUE);
1719
1720	return (1);
1721}
1722
1723/*
1724 * After removing a page table entry, this routine is used to
1725 * conditionally free the page, and manage the hold/wire counts.
1726 */
1727static int
1728pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1729{
1730	pd_entry_t ptepde;
1731	vm_page_t mpte;
1732
1733	if (va >= VM_MAXUSER_ADDRESS)
1734		return (0);
1735	ptepde = *pmap_pde(pmap, va);
1736	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1737	return (pmap_unwire_pte_hold(pmap, mpte, free));
1738}
1739
1740/*
1741 * Initialize the pmap for the swapper process.
1742 */
1743void
1744pmap_pinit0(pmap_t pmap)
1745{
1746
1747	PMAP_LOCK_INIT(pmap);
1748	/*
1749	 * Since the page table directory is shared with the kernel pmap,
1750	 * which is already included in the list "allpmaps", this pmap does
1751	 * not need to be inserted into that list.
1752	 */
1753	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1754#ifdef PAE
1755	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1756#endif
1757	pmap->pm_root = NULL;
1758	CPU_ZERO(&pmap->pm_active);
1759	PCPU_SET(curpmap, pmap);
1760	TAILQ_INIT(&pmap->pm_pvchunk);
1761	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1762}
1763
1764/*
1765 * Initialize a preallocated and zeroed pmap structure,
1766 * such as one in a vmspace structure.
1767 */
1768int
1769pmap_pinit(pmap_t pmap)
1770{
1771	vm_page_t m, ptdpg[NPGPTD];
1772	vm_paddr_t pa;
1773	int i;
1774
1775	PMAP_LOCK_INIT(pmap);
1776
1777	/*
1778	 * No need to allocate page table space yet but we do need a valid
1779	 * page directory table.
1780	 */
1781	if (pmap->pm_pdir == NULL) {
1782		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1783		    NBPTD);
1784		if (pmap->pm_pdir == NULL) {
1785			PMAP_LOCK_DESTROY(pmap);
1786			return (0);
1787		}
1788#ifdef PAE
1789		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1790		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1791		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1792		    ("pmap_pinit: pdpt misaligned"));
1793		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1794		    ("pmap_pinit: pdpt above 4g"));
1795#endif
1796		pmap->pm_root = NULL;
1797	}
1798	KASSERT(pmap->pm_root == NULL,
1799	    ("pmap_pinit: pmap has reserved page table page(s)"));
1800
1801	/*
1802	 * allocate the page directory page(s)
1803	 */
1804	for (i = 0; i < NPGPTD;) {
1805		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1806		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1807		if (m == NULL)
1808			VM_WAIT;
1809		else {
1810			ptdpg[i++] = m;
1811		}
1812	}
1813
1814	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1815
1816	for (i = 0; i < NPGPTD; i++)
1817		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1818			pagezero(pmap->pm_pdir + (i * NPDEPG));
1819
1820	mtx_lock_spin(&allpmaps_lock);
1821	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1822	/* Copy the kernel page table directory entries. */
1823	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1824	mtx_unlock_spin(&allpmaps_lock);
1825
1826	/* install self-referential address mapping entry(s) */
1827	for (i = 0; i < NPGPTD; i++) {
1828		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1829		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1830#ifdef PAE
1831		pmap->pm_pdpt[i] = pa | PG_V;
1832#endif
1833	}
1834
1835	CPU_ZERO(&pmap->pm_active);
1836	TAILQ_INIT(&pmap->pm_pvchunk);
1837	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1838
1839	return (1);
1840}
1841
1842/*
1843 * this routine is called if the page table page is not
1844 * mapped correctly.
1845 */
1846static vm_page_t
1847_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
1848{
1849	vm_paddr_t ptepa;
1850	vm_page_t m;
1851
1852	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1853	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1854	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1855
1856	/*
1857	 * Allocate a page table page.
1858	 */
1859	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1860	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1861		if (flags & M_WAITOK) {
1862			PMAP_UNLOCK(pmap);
1863			rw_wunlock(&pvh_global_lock);
1864			VM_WAIT;
1865			rw_wlock(&pvh_global_lock);
1866			PMAP_LOCK(pmap);
1867		}
1868
1869		/*
1870		 * Indicate the need to retry.  While waiting, the page table
1871		 * page may have been allocated.
1872		 */
1873		return (NULL);
1874	}
1875	if ((m->flags & PG_ZERO) == 0)
1876		pmap_zero_page(m);
1877
1878	/*
1879	 * Map the pagetable page into the process address space, if
1880	 * it isn't already there.
1881	 */
1882
1883	pmap->pm_stats.resident_count++;
1884
1885	ptepa = VM_PAGE_TO_PHYS(m);
1886	pmap->pm_pdir[ptepindex] =
1887		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1888
1889	return (m);
1890}
1891
1892static vm_page_t
1893pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1894{
1895	u_int ptepindex;
1896	pd_entry_t ptepa;
1897	vm_page_t m;
1898
1899	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1900	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1901	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1902
1903	/*
1904	 * Calculate pagetable page index
1905	 */
1906	ptepindex = va >> PDRSHIFT;
1907retry:
1908	/*
1909	 * Get the page directory entry
1910	 */
1911	ptepa = pmap->pm_pdir[ptepindex];
1912
1913	/*
1914	 * This supports switching from a 4MB page to a
1915	 * normal 4K page.
1916	 */
1917	if (ptepa & PG_PS) {
1918		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1919		ptepa = pmap->pm_pdir[ptepindex];
1920	}
1921
1922	/*
1923	 * If the page table page is mapped, we just increment the
1924	 * hold count, and activate it.
1925	 */
1926	if (ptepa) {
1927		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1928		m->wire_count++;
1929	} else {
1930		/*
1931		 * Here if the pte page isn't mapped, or if it has
1932		 * been deallocated.
1933		 */
1934		m = _pmap_allocpte(pmap, ptepindex, flags);
1935		if (m == NULL && (flags & M_WAITOK))
1936			goto retry;
1937	}
1938	return (m);
1939}
1940
1941
1942/***************************************************
1943* Pmap allocation/deallocation routines.
1944 ***************************************************/
1945
1946#ifdef SMP
1947/*
1948 * Deal with a SMP shootdown of other users of the pmap that we are
1949 * trying to dispose of.  This can be a bit hairy.
1950 */
1951static cpuset_t *lazymask;
1952static u_int lazyptd;
1953static volatile u_int lazywait;
1954
1955void pmap_lazyfix_action(void);
1956
1957void
1958pmap_lazyfix_action(void)
1959{
1960
1961#ifdef COUNT_IPIS
1962	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1963#endif
1964	if (rcr3() == lazyptd)
1965		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1966	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1967	atomic_store_rel_int(&lazywait, 1);
1968}
1969
1970static void
1971pmap_lazyfix_self(u_int cpuid)
1972{
1973
1974	if (rcr3() == lazyptd)
1975		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1976	CPU_CLR_ATOMIC(cpuid, lazymask);
1977}
1978
1979
1980static void
1981pmap_lazyfix(pmap_t pmap)
1982{
1983	cpuset_t mymask, mask;
1984	u_int cpuid, spins;
1985	int lsb;
1986
1987	mask = pmap->pm_active;
1988	while (!CPU_EMPTY(&mask)) {
1989		spins = 50000000;
1990
1991		/* Find least significant set bit. */
1992		lsb = cpusetobj_ffs(&mask);
1993		MPASS(lsb != 0);
1994		lsb--;
1995		CPU_SETOF(lsb, &mask);
1996		mtx_lock_spin(&smp_ipi_mtx);
1997#ifdef PAE
1998		lazyptd = vtophys(pmap->pm_pdpt);
1999#else
2000		lazyptd = vtophys(pmap->pm_pdir);
2001#endif
2002		cpuid = PCPU_GET(cpuid);
2003
2004		/* Use a cpuset just for having an easy check. */
2005		CPU_SETOF(cpuid, &mymask);
2006		if (!CPU_CMP(&mask, &mymask)) {
2007			lazymask = &pmap->pm_active;
2008			pmap_lazyfix_self(cpuid);
2009		} else {
2010			atomic_store_rel_int((u_int *)&lazymask,
2011			    (u_int)&pmap->pm_active);
2012			atomic_store_rel_int(&lazywait, 0);
2013			ipi_selected(mask, IPI_LAZYPMAP);
2014			while (lazywait == 0) {
2015				ia32_pause();
2016				if (--spins == 0)
2017					break;
2018			}
2019		}
2020		mtx_unlock_spin(&smp_ipi_mtx);
2021		if (spins == 0)
2022			printf("pmap_lazyfix: spun for 50000000\n");
2023		mask = pmap->pm_active;
2024	}
2025}
2026
2027#else	/* SMP */
2028
2029/*
2030 * Cleaning up on uniprocessor is easy.  For various reasons, we're
2031 * unlikely to have to even execute this code, including the fact
2032 * that the cleanup is deferred until the parent does a wait(2), which
2033 * means that another userland process has run.
2034 */
2035static void
2036pmap_lazyfix(pmap_t pmap)
2037{
2038	u_int cr3;
2039
2040	cr3 = vtophys(pmap->pm_pdir);
2041	if (cr3 == rcr3()) {
2042		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
2043		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2044	}
2045}
2046#endif	/* SMP */
2047
2048/*
2049 * Release any resources held by the given physical map.
2050 * Called when a pmap initialized by pmap_pinit is being released.
2051 * Should only be called if the map contains no valid mappings.
2052 */
2053void
2054pmap_release(pmap_t pmap)
2055{
2056	vm_page_t m, ptdpg[NPGPTD];
2057	int i;
2058
2059	KASSERT(pmap->pm_stats.resident_count == 0,
2060	    ("pmap_release: pmap resident count %ld != 0",
2061	    pmap->pm_stats.resident_count));
2062	KASSERT(pmap->pm_root == NULL,
2063	    ("pmap_release: pmap has reserved page table page(s)"));
2064
2065	pmap_lazyfix(pmap);
2066	mtx_lock_spin(&allpmaps_lock);
2067	LIST_REMOVE(pmap, pm_list);
2068	mtx_unlock_spin(&allpmaps_lock);
2069
2070	for (i = 0; i < NPGPTD; i++)
2071		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2072		    PG_FRAME);
2073
2074	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2075	    sizeof(*pmap->pm_pdir));
2076
2077	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2078
2079	for (i = 0; i < NPGPTD; i++) {
2080		m = ptdpg[i];
2081#ifdef PAE
2082		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2083		    ("pmap_release: got wrong ptd page"));
2084#endif
2085		m->wire_count--;
2086		atomic_subtract_int(&cnt.v_wire_count, 1);
2087		vm_page_free_zero(m);
2088	}
2089	PMAP_LOCK_DESTROY(pmap);
2090}
2091
2092static int
2093kvm_size(SYSCTL_HANDLER_ARGS)
2094{
2095	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2096
2097	return (sysctl_handle_long(oidp, &ksize, 0, req));
2098}
2099SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2100    0, 0, kvm_size, "IU", "Size of KVM");
2101
2102static int
2103kvm_free(SYSCTL_HANDLER_ARGS)
2104{
2105	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2106
2107	return (sysctl_handle_long(oidp, &kfree, 0, req));
2108}
2109SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2110    0, 0, kvm_free, "IU", "Amount of KVM free");
2111
2112/*
2113 * grow the number of kernel page table entries, if needed
2114 */
2115void
2116pmap_growkernel(vm_offset_t addr)
2117{
2118	vm_paddr_t ptppaddr;
2119	vm_page_t nkpg;
2120	pd_entry_t newpdir;
2121
2122	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2123	addr = roundup2(addr, NBPDR);
2124	if (addr - 1 >= kernel_map->max_offset)
2125		addr = kernel_map->max_offset;
2126	while (kernel_vm_end < addr) {
2127		if (pdir_pde(PTD, kernel_vm_end)) {
2128			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2129			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2130				kernel_vm_end = kernel_map->max_offset;
2131				break;
2132			}
2133			continue;
2134		}
2135
2136		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2137		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2138		    VM_ALLOC_ZERO);
2139		if (nkpg == NULL)
2140			panic("pmap_growkernel: no memory to grow kernel");
2141
2142		nkpt++;
2143
2144		if ((nkpg->flags & PG_ZERO) == 0)
2145			pmap_zero_page(nkpg);
2146		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2147		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2148		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2149
2150		pmap_kenter_pde(kernel_vm_end, newpdir);
2151		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2152		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2153			kernel_vm_end = kernel_map->max_offset;
2154			break;
2155		}
2156	}
2157}
2158
2159
2160/***************************************************
2161 * page management routines.
2162 ***************************************************/
2163
2164CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2165CTASSERT(_NPCM == 11);
2166CTASSERT(_NPCPV == 336);
2167
2168static __inline struct pv_chunk *
2169pv_to_chunk(pv_entry_t pv)
2170{
2171
2172	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2173}
2174
2175#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2176
2177#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2178#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2179
2180static const uint32_t pc_freemask[_NPCM] = {
2181	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2182	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2183	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2184	PC_FREE0_9, PC_FREE10
2185};
2186
2187SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2188	"Current number of pv entries");
2189
2190#ifdef PV_STATS
2191static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2192
2193SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2194	"Current number of pv entry chunks");
2195SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2196	"Current number of pv entry chunks allocated");
2197SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2198	"Current number of pv entry chunks frees");
2199SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2200	"Number of times tried to get a chunk page but failed.");
2201
2202static long pv_entry_frees, pv_entry_allocs;
2203static int pv_entry_spare;
2204
2205SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2206	"Current number of pv entry frees");
2207SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2208	"Current number of pv entry allocs");
2209SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2210	"Current number of spare pv entries");
2211#endif
2212
2213/*
2214 * We are in a serious low memory condition.  Resort to
2215 * drastic measures to free some pages so we can allocate
2216 * another pv entry chunk.
2217 */
2218static vm_page_t
2219pmap_pv_reclaim(pmap_t locked_pmap)
2220{
2221	struct pch newtail;
2222	struct pv_chunk *pc;
2223	struct md_page *pvh;
2224	pd_entry_t *pde;
2225	pmap_t pmap;
2226	pt_entry_t *pte, tpte;
2227	pv_entry_t pv;
2228	vm_offset_t va;
2229	vm_page_t free, m, m_pc;
2230	uint32_t inuse;
2231	int bit, field, freed;
2232
2233	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2234	pmap = NULL;
2235	free = m_pc = NULL;
2236	TAILQ_INIT(&newtail);
2237	sched_pin();
2238	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2239	    free == NULL)) {
2240		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2241		if (pmap != pc->pc_pmap) {
2242			if (pmap != NULL) {
2243				pmap_invalidate_all(pmap);
2244				if (pmap != locked_pmap)
2245					PMAP_UNLOCK(pmap);
2246			}
2247			pmap = pc->pc_pmap;
2248			/* Avoid deadlock and lock recursion. */
2249			if (pmap > locked_pmap)
2250				PMAP_LOCK(pmap);
2251			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2252				pmap = NULL;
2253				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2254				continue;
2255			}
2256		}
2257
2258		/*
2259		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2260		 */
2261		freed = 0;
2262		for (field = 0; field < _NPCM; field++) {
2263			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2264			    inuse != 0; inuse &= ~(1UL << bit)) {
2265				bit = bsfl(inuse);
2266				pv = &pc->pc_pventry[field * 32 + bit];
2267				va = pv->pv_va;
2268				pde = pmap_pde(pmap, va);
2269				if ((*pde & PG_PS) != 0)
2270					continue;
2271				pte = pmap_pte_quick(pmap, va);
2272				if ((*pte & PG_W) != 0)
2273					continue;
2274				tpte = pte_load_clear(pte);
2275				if ((tpte & PG_G) != 0)
2276					pmap_invalidate_page(pmap, va);
2277				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2278				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2279					vm_page_dirty(m);
2280				if ((tpte & PG_A) != 0)
2281					vm_page_aflag_set(m, PGA_REFERENCED);
2282				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2283				if (TAILQ_EMPTY(&m->md.pv_list) &&
2284				    (m->flags & PG_FICTITIOUS) == 0) {
2285					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2286					if (TAILQ_EMPTY(&pvh->pv_list)) {
2287						vm_page_aflag_clear(m,
2288						    PGA_WRITEABLE);
2289					}
2290				}
2291				pc->pc_map[field] |= 1UL << bit;
2292				pmap_unuse_pt(pmap, va, &free);
2293				freed++;
2294			}
2295		}
2296		if (freed == 0) {
2297			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2298			continue;
2299		}
2300		/* Every freed mapping is for a 4 KB page. */
2301		pmap->pm_stats.resident_count -= freed;
2302		PV_STAT(pv_entry_frees += freed);
2303		PV_STAT(pv_entry_spare += freed);
2304		pv_entry_count -= freed;
2305		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2306		for (field = 0; field < _NPCM; field++)
2307			if (pc->pc_map[field] != pc_freemask[field]) {
2308				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2309				    pc_list);
2310				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2311
2312				/*
2313				 * One freed pv entry in locked_pmap is
2314				 * sufficient.
2315				 */
2316				if (pmap == locked_pmap)
2317					goto out;
2318				break;
2319			}
2320		if (field == _NPCM) {
2321			PV_STAT(pv_entry_spare -= _NPCPV);
2322			PV_STAT(pc_chunk_count--);
2323			PV_STAT(pc_chunk_frees++);
2324			/* Entire chunk is free; return it. */
2325			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2326			pmap_qremove((vm_offset_t)pc, 1);
2327			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2328			break;
2329		}
2330	}
2331out:
2332	sched_unpin();
2333	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2334	if (pmap != NULL) {
2335		pmap_invalidate_all(pmap);
2336		if (pmap != locked_pmap)
2337			PMAP_UNLOCK(pmap);
2338	}
2339	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
2340		m_pc = free;
2341		free = m_pc->right;
2342		/* Recycle a freed page table page. */
2343		m_pc->wire_count = 1;
2344		atomic_add_int(&cnt.v_wire_count, 1);
2345	}
2346	pmap_free_zero_pages(free);
2347	return (m_pc);
2348}
2349
2350/*
2351 * free the pv_entry back to the free list
2352 */
2353static void
2354free_pv_entry(pmap_t pmap, pv_entry_t pv)
2355{
2356	struct pv_chunk *pc;
2357	int idx, field, bit;
2358
2359	rw_assert(&pvh_global_lock, RA_WLOCKED);
2360	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2361	PV_STAT(pv_entry_frees++);
2362	PV_STAT(pv_entry_spare++);
2363	pv_entry_count--;
2364	pc = pv_to_chunk(pv);
2365	idx = pv - &pc->pc_pventry[0];
2366	field = idx / 32;
2367	bit = idx % 32;
2368	pc->pc_map[field] |= 1ul << bit;
2369	for (idx = 0; idx < _NPCM; idx++)
2370		if (pc->pc_map[idx] != pc_freemask[idx]) {
2371			/*
2372			 * 98% of the time, pc is already at the head of the
2373			 * list.  If it isn't already, move it to the head.
2374			 */
2375			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2376			    pc)) {
2377				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2378				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2379				    pc_list);
2380			}
2381			return;
2382		}
2383	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2384	free_pv_chunk(pc);
2385}
2386
2387static void
2388free_pv_chunk(struct pv_chunk *pc)
2389{
2390	vm_page_t m;
2391
2392 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2393	PV_STAT(pv_entry_spare -= _NPCPV);
2394	PV_STAT(pc_chunk_count--);
2395	PV_STAT(pc_chunk_frees++);
2396	/* entire chunk is free, return it */
2397	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2398	pmap_qremove((vm_offset_t)pc, 1);
2399	vm_page_unwire(m, 0);
2400	vm_page_free(m);
2401	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2402}
2403
2404/*
2405 * get a new pv_entry, allocating a block from the system
2406 * when needed.
2407 */
2408static pv_entry_t
2409get_pv_entry(pmap_t pmap, boolean_t try)
2410{
2411	static const struct timeval printinterval = { 60, 0 };
2412	static struct timeval lastprint;
2413	int bit, field;
2414	pv_entry_t pv;
2415	struct pv_chunk *pc;
2416	vm_page_t m;
2417
2418	rw_assert(&pvh_global_lock, RA_WLOCKED);
2419	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2420	PV_STAT(pv_entry_allocs++);
2421	pv_entry_count++;
2422	if (pv_entry_count > pv_entry_high_water)
2423		if (ratecheck(&lastprint, &printinterval))
2424			printf("Approaching the limit on PV entries, consider "
2425			    "increasing either the vm.pmap.shpgperproc or the "
2426			    "vm.pmap.pv_entry_max tunable.\n");
2427retry:
2428	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2429	if (pc != NULL) {
2430		for (field = 0; field < _NPCM; field++) {
2431			if (pc->pc_map[field]) {
2432				bit = bsfl(pc->pc_map[field]);
2433				break;
2434			}
2435		}
2436		if (field < _NPCM) {
2437			pv = &pc->pc_pventry[field * 32 + bit];
2438			pc->pc_map[field] &= ~(1ul << bit);
2439			/* If this was the last item, move it to tail */
2440			for (field = 0; field < _NPCM; field++)
2441				if (pc->pc_map[field] != 0) {
2442					PV_STAT(pv_entry_spare--);
2443					return (pv);	/* not full, return */
2444				}
2445			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2446			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2447			PV_STAT(pv_entry_spare--);
2448			return (pv);
2449		}
2450	}
2451	/*
2452	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2453	 * global lock.  If "pv_vafree" is currently non-empty, it will
2454	 * remain non-empty until pmap_ptelist_alloc() completes.
2455	 */
2456	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2457	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2458		if (try) {
2459			pv_entry_count--;
2460			PV_STAT(pc_chunk_tryfail++);
2461			return (NULL);
2462		}
2463		m = pmap_pv_reclaim(pmap);
2464		if (m == NULL)
2465			goto retry;
2466	}
2467	PV_STAT(pc_chunk_count++);
2468	PV_STAT(pc_chunk_allocs++);
2469	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2470	pmap_qenter((vm_offset_t)pc, &m, 1);
2471	pc->pc_pmap = pmap;
2472	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2473	for (field = 1; field < _NPCM; field++)
2474		pc->pc_map[field] = pc_freemask[field];
2475	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2476	pv = &pc->pc_pventry[0];
2477	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2478	PV_STAT(pv_entry_spare += _NPCPV - 1);
2479	return (pv);
2480}
2481
2482static __inline pv_entry_t
2483pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2484{
2485	pv_entry_t pv;
2486
2487	rw_assert(&pvh_global_lock, RA_WLOCKED);
2488	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2489		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2490			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2491			break;
2492		}
2493	}
2494	return (pv);
2495}
2496
2497static void
2498pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2499{
2500	struct md_page *pvh;
2501	pv_entry_t pv;
2502	vm_offset_t va_last;
2503	vm_page_t m;
2504
2505	rw_assert(&pvh_global_lock, RA_WLOCKED);
2506	KASSERT((pa & PDRMASK) == 0,
2507	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2508
2509	/*
2510	 * Transfer the 4mpage's pv entry for this mapping to the first
2511	 * page's pv list.
2512	 */
2513	pvh = pa_to_pvh(pa);
2514	va = trunc_4mpage(va);
2515	pv = pmap_pvh_remove(pvh, pmap, va);
2516	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2517	m = PHYS_TO_VM_PAGE(pa);
2518	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2519	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2520	va_last = va + NBPDR - PAGE_SIZE;
2521	do {
2522		m++;
2523		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2524		    ("pmap_pv_demote_pde: page %p is not managed", m));
2525		va += PAGE_SIZE;
2526		pmap_insert_entry(pmap, va, m);
2527	} while (va < va_last);
2528}
2529
2530static void
2531pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2532{
2533	struct md_page *pvh;
2534	pv_entry_t pv;
2535	vm_offset_t va_last;
2536	vm_page_t m;
2537
2538	rw_assert(&pvh_global_lock, RA_WLOCKED);
2539	KASSERT((pa & PDRMASK) == 0,
2540	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2541
2542	/*
2543	 * Transfer the first page's pv entry for this mapping to the
2544	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2545	 * to get_pv_entry(), a transfer avoids the possibility that
2546	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2547	 * removes one of the mappings that is being promoted.
2548	 */
2549	m = PHYS_TO_VM_PAGE(pa);
2550	va = trunc_4mpage(va);
2551	pv = pmap_pvh_remove(&m->md, pmap, va);
2552	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2553	pvh = pa_to_pvh(pa);
2554	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2555	/* Free the remaining NPTEPG - 1 pv entries. */
2556	va_last = va + NBPDR - PAGE_SIZE;
2557	do {
2558		m++;
2559		va += PAGE_SIZE;
2560		pmap_pvh_free(&m->md, pmap, va);
2561	} while (va < va_last);
2562}
2563
2564static void
2565pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2566{
2567	pv_entry_t pv;
2568
2569	pv = pmap_pvh_remove(pvh, pmap, va);
2570	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2571	free_pv_entry(pmap, pv);
2572}
2573
2574static void
2575pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2576{
2577	struct md_page *pvh;
2578
2579	rw_assert(&pvh_global_lock, RA_WLOCKED);
2580	pmap_pvh_free(&m->md, pmap, va);
2581	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2582		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2583		if (TAILQ_EMPTY(&pvh->pv_list))
2584			vm_page_aflag_clear(m, PGA_WRITEABLE);
2585	}
2586}
2587
2588/*
2589 * Create a pv entry for page at pa for
2590 * (pmap, va).
2591 */
2592static void
2593pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2594{
2595	pv_entry_t pv;
2596
2597	rw_assert(&pvh_global_lock, RA_WLOCKED);
2598	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2599	pv = get_pv_entry(pmap, FALSE);
2600	pv->pv_va = va;
2601	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2602}
2603
2604/*
2605 * Conditionally create a pv entry.
2606 */
2607static boolean_t
2608pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2609{
2610	pv_entry_t pv;
2611
2612	rw_assert(&pvh_global_lock, RA_WLOCKED);
2613	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2614	if (pv_entry_count < pv_entry_high_water &&
2615	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2616		pv->pv_va = va;
2617		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2618		return (TRUE);
2619	} else
2620		return (FALSE);
2621}
2622
2623/*
2624 * Create the pv entries for each of the pages within a superpage.
2625 */
2626static boolean_t
2627pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2628{
2629	struct md_page *pvh;
2630	pv_entry_t pv;
2631
2632	rw_assert(&pvh_global_lock, RA_WLOCKED);
2633	if (pv_entry_count < pv_entry_high_water &&
2634	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2635		pv->pv_va = va;
2636		pvh = pa_to_pvh(pa);
2637		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2638		return (TRUE);
2639	} else
2640		return (FALSE);
2641}
2642
2643/*
2644 * Fills a page table page with mappings to consecutive physical pages.
2645 */
2646static void
2647pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2648{
2649	pt_entry_t *pte;
2650
2651	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2652		*pte = newpte;
2653		newpte += PAGE_SIZE;
2654	}
2655}
2656
2657/*
2658 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2659 * 2- or 4MB page mapping is invalidated.
2660 */
2661static boolean_t
2662pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2663{
2664	pd_entry_t newpde, oldpde;
2665	pt_entry_t *firstpte, newpte;
2666	vm_paddr_t mptepa;
2667	vm_page_t free, mpte;
2668
2669	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2670	oldpde = *pde;
2671	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2672	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2673	mpte = pmap_lookup_pt_page(pmap, va);
2674	if (mpte != NULL)
2675		pmap_remove_pt_page(pmap, mpte);
2676	else {
2677		KASSERT((oldpde & PG_W) == 0,
2678		    ("pmap_demote_pde: page table page for a wired mapping"
2679		    " is missing"));
2680
2681		/*
2682		 * Invalidate the 2- or 4MB page mapping and return
2683		 * "failure" if the mapping was never accessed or the
2684		 * allocation of the new page table page fails.
2685		 */
2686		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2687		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2688		    VM_ALLOC_WIRED)) == NULL) {
2689			free = NULL;
2690			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2691			pmap_invalidate_page(pmap, trunc_4mpage(va));
2692			pmap_free_zero_pages(free);
2693			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2694			    " in pmap %p", va, pmap);
2695			return (FALSE);
2696		}
2697		if (va < VM_MAXUSER_ADDRESS)
2698			pmap->pm_stats.resident_count++;
2699	}
2700	mptepa = VM_PAGE_TO_PHYS(mpte);
2701
2702	/*
2703	 * If the page mapping is in the kernel's address space, then the
2704	 * KPTmap can provide access to the page table page.  Otherwise,
2705	 * temporarily map the page table page (mpte) into the kernel's
2706	 * address space at either PADDR1 or PADDR2.
2707	 */
2708	if (va >= KERNBASE)
2709		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2710	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2711		if ((*PMAP1 & PG_FRAME) != mptepa) {
2712			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2713#ifdef SMP
2714			PMAP1cpu = PCPU_GET(cpuid);
2715#endif
2716			invlcaddr(PADDR1);
2717			PMAP1changed++;
2718		} else
2719#ifdef SMP
2720		if (PMAP1cpu != PCPU_GET(cpuid)) {
2721			PMAP1cpu = PCPU_GET(cpuid);
2722			invlcaddr(PADDR1);
2723			PMAP1changedcpu++;
2724		} else
2725#endif
2726			PMAP1unchanged++;
2727		firstpte = PADDR1;
2728	} else {
2729		mtx_lock(&PMAP2mutex);
2730		if ((*PMAP2 & PG_FRAME) != mptepa) {
2731			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2732			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2733		}
2734		firstpte = PADDR2;
2735	}
2736	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2737	KASSERT((oldpde & PG_A) != 0,
2738	    ("pmap_demote_pde: oldpde is missing PG_A"));
2739	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2740	    ("pmap_demote_pde: oldpde is missing PG_M"));
2741	newpte = oldpde & ~PG_PS;
2742	if ((newpte & PG_PDE_PAT) != 0)
2743		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2744
2745	/*
2746	 * If the page table page is new, initialize it.
2747	 */
2748	if (mpte->wire_count == 1) {
2749		mpte->wire_count = NPTEPG;
2750		pmap_fill_ptp(firstpte, newpte);
2751	}
2752	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2753	    ("pmap_demote_pde: firstpte and newpte map different physical"
2754	    " addresses"));
2755
2756	/*
2757	 * If the mapping has changed attributes, update the page table
2758	 * entries.
2759	 */
2760	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2761		pmap_fill_ptp(firstpte, newpte);
2762
2763	/*
2764	 * Demote the mapping.  This pmap is locked.  The old PDE has
2765	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2766	 * set.  Thus, there is no danger of a race with another
2767	 * processor changing the setting of PG_A and/or PG_M between
2768	 * the read above and the store below.
2769	 */
2770	if (workaround_erratum383)
2771		pmap_update_pde(pmap, va, pde, newpde);
2772	else if (pmap == kernel_pmap)
2773		pmap_kenter_pde(va, newpde);
2774	else
2775		pde_store(pde, newpde);
2776	if (firstpte == PADDR2)
2777		mtx_unlock(&PMAP2mutex);
2778
2779	/*
2780	 * Invalidate the recursive mapping of the page table page.
2781	 */
2782	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2783
2784	/*
2785	 * Demote the pv entry.  This depends on the earlier demotion
2786	 * of the mapping.  Specifically, the (re)creation of a per-
2787	 * page pv entry might trigger the execution of pmap_collect(),
2788	 * which might reclaim a newly (re)created per-page pv entry
2789	 * and destroy the associated mapping.  In order to destroy
2790	 * the mapping, the PDE must have already changed from mapping
2791	 * the 2mpage to referencing the page table page.
2792	 */
2793	if ((oldpde & PG_MANAGED) != 0)
2794		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2795
2796	pmap_pde_demotions++;
2797	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2798	    " in pmap %p", va, pmap);
2799	return (TRUE);
2800}
2801
2802/*
2803 * pmap_remove_pde: do the things to unmap a superpage in a process
2804 */
2805static void
2806pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2807    vm_page_t *free)
2808{
2809	struct md_page *pvh;
2810	pd_entry_t oldpde;
2811	vm_offset_t eva, va;
2812	vm_page_t m, mpte;
2813
2814	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2815	KASSERT((sva & PDRMASK) == 0,
2816	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2817	oldpde = pte_load_clear(pdq);
2818	if (oldpde & PG_W)
2819		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2820
2821	/*
2822	 * Machines that don't support invlpg, also don't support
2823	 * PG_G.
2824	 */
2825	if (oldpde & PG_G)
2826		pmap_invalidate_page(kernel_pmap, sva);
2827	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2828	if (oldpde & PG_MANAGED) {
2829		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2830		pmap_pvh_free(pvh, pmap, sva);
2831		eva = sva + NBPDR;
2832		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2833		    va < eva; va += PAGE_SIZE, m++) {
2834			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2835				vm_page_dirty(m);
2836			if (oldpde & PG_A)
2837				vm_page_aflag_set(m, PGA_REFERENCED);
2838			if (TAILQ_EMPTY(&m->md.pv_list) &&
2839			    TAILQ_EMPTY(&pvh->pv_list))
2840				vm_page_aflag_clear(m, PGA_WRITEABLE);
2841		}
2842	}
2843	if (pmap == kernel_pmap) {
2844		if (!pmap_demote_pde(pmap, pdq, sva))
2845			panic("pmap_remove_pde: failed demotion");
2846	} else {
2847		mpte = pmap_lookup_pt_page(pmap, sva);
2848		if (mpte != NULL) {
2849			pmap_remove_pt_page(pmap, mpte);
2850			pmap->pm_stats.resident_count--;
2851			KASSERT(mpte->wire_count == NPTEPG,
2852			    ("pmap_remove_pde: pte page wire count error"));
2853			mpte->wire_count = 0;
2854			pmap_add_delayed_free_list(mpte, free, FALSE);
2855			atomic_subtract_int(&cnt.v_wire_count, 1);
2856		}
2857	}
2858}
2859
2860/*
2861 * pmap_remove_pte: do the things to unmap a page in a process
2862 */
2863static int
2864pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2865{
2866	pt_entry_t oldpte;
2867	vm_page_t m;
2868
2869	rw_assert(&pvh_global_lock, RA_WLOCKED);
2870	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2871	oldpte = pte_load_clear(ptq);
2872	if (oldpte & PG_W)
2873		pmap->pm_stats.wired_count -= 1;
2874	/*
2875	 * Machines that don't support invlpg, also don't support
2876	 * PG_G.
2877	 */
2878	if (oldpte & PG_G)
2879		pmap_invalidate_page(kernel_pmap, va);
2880	pmap->pm_stats.resident_count -= 1;
2881	if (oldpte & PG_MANAGED) {
2882		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2883		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2884			vm_page_dirty(m);
2885		if (oldpte & PG_A)
2886			vm_page_aflag_set(m, PGA_REFERENCED);
2887		pmap_remove_entry(pmap, m, va);
2888	}
2889	return (pmap_unuse_pt(pmap, va, free));
2890}
2891
2892/*
2893 * Remove a single page from a process address space
2894 */
2895static void
2896pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2897{
2898	pt_entry_t *pte;
2899
2900	rw_assert(&pvh_global_lock, RA_WLOCKED);
2901	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2902	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2903	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2904		return;
2905	pmap_remove_pte(pmap, pte, va, free);
2906	pmap_invalidate_page(pmap, va);
2907}
2908
2909/*
2910 *	Remove the given range of addresses from the specified map.
2911 *
2912 *	It is assumed that the start and end are properly
2913 *	rounded to the page size.
2914 */
2915void
2916pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2917{
2918	vm_offset_t pdnxt;
2919	pd_entry_t ptpaddr;
2920	pt_entry_t *pte;
2921	vm_page_t free = NULL;
2922	int anyvalid;
2923
2924	/*
2925	 * Perform an unsynchronized read.  This is, however, safe.
2926	 */
2927	if (pmap->pm_stats.resident_count == 0)
2928		return;
2929
2930	anyvalid = 0;
2931
2932	rw_wlock(&pvh_global_lock);
2933	sched_pin();
2934	PMAP_LOCK(pmap);
2935
2936	/*
2937	 * special handling of removing one page.  a very
2938	 * common operation and easy to short circuit some
2939	 * code.
2940	 */
2941	if ((sva + PAGE_SIZE == eva) &&
2942	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2943		pmap_remove_page(pmap, sva, &free);
2944		goto out;
2945	}
2946
2947	for (; sva < eva; sva = pdnxt) {
2948		u_int pdirindex;
2949
2950		/*
2951		 * Calculate index for next page table.
2952		 */
2953		pdnxt = (sva + NBPDR) & ~PDRMASK;
2954		if (pdnxt < sva)
2955			pdnxt = eva;
2956		if (pmap->pm_stats.resident_count == 0)
2957			break;
2958
2959		pdirindex = sva >> PDRSHIFT;
2960		ptpaddr = pmap->pm_pdir[pdirindex];
2961
2962		/*
2963		 * Weed out invalid mappings. Note: we assume that the page
2964		 * directory table is always allocated, and in kernel virtual.
2965		 */
2966		if (ptpaddr == 0)
2967			continue;
2968
2969		/*
2970		 * Check for large page.
2971		 */
2972		if ((ptpaddr & PG_PS) != 0) {
2973			/*
2974			 * Are we removing the entire large page?  If not,
2975			 * demote the mapping and fall through.
2976			 */
2977			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2978				/*
2979				 * The TLB entry for a PG_G mapping is
2980				 * invalidated by pmap_remove_pde().
2981				 */
2982				if ((ptpaddr & PG_G) == 0)
2983					anyvalid = 1;
2984				pmap_remove_pde(pmap,
2985				    &pmap->pm_pdir[pdirindex], sva, &free);
2986				continue;
2987			} else if (!pmap_demote_pde(pmap,
2988			    &pmap->pm_pdir[pdirindex], sva)) {
2989				/* The large page mapping was destroyed. */
2990				continue;
2991			}
2992		}
2993
2994		/*
2995		 * Limit our scan to either the end of the va represented
2996		 * by the current page table page, or to the end of the
2997		 * range being removed.
2998		 */
2999		if (pdnxt > eva)
3000			pdnxt = eva;
3001
3002		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3003		    sva += PAGE_SIZE) {
3004			if (*pte == 0)
3005				continue;
3006
3007			/*
3008			 * The TLB entry for a PG_G mapping is invalidated
3009			 * by pmap_remove_pte().
3010			 */
3011			if ((*pte & PG_G) == 0)
3012				anyvalid = 1;
3013			if (pmap_remove_pte(pmap, pte, sva, &free))
3014				break;
3015		}
3016	}
3017out:
3018	sched_unpin();
3019	if (anyvalid)
3020		pmap_invalidate_all(pmap);
3021	rw_wunlock(&pvh_global_lock);
3022	PMAP_UNLOCK(pmap);
3023	pmap_free_zero_pages(free);
3024}
3025
3026/*
3027 *	Routine:	pmap_remove_all
3028 *	Function:
3029 *		Removes this physical page from
3030 *		all physical maps in which it resides.
3031 *		Reflects back modify bits to the pager.
3032 *
3033 *	Notes:
3034 *		Original versions of this routine were very
3035 *		inefficient because they iteratively called
3036 *		pmap_remove (slow...)
3037 */
3038
3039void
3040pmap_remove_all(vm_page_t m)
3041{
3042	struct md_page *pvh;
3043	pv_entry_t pv;
3044	pmap_t pmap;
3045	pt_entry_t *pte, tpte;
3046	pd_entry_t *pde;
3047	vm_offset_t va;
3048	vm_page_t free;
3049
3050	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3051	    ("pmap_remove_all: page %p is not managed", m));
3052	free = NULL;
3053	rw_wlock(&pvh_global_lock);
3054	sched_pin();
3055	if ((m->flags & PG_FICTITIOUS) != 0)
3056		goto small_mappings;
3057	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3058	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3059		va = pv->pv_va;
3060		pmap = PV_PMAP(pv);
3061		PMAP_LOCK(pmap);
3062		pde = pmap_pde(pmap, va);
3063		(void)pmap_demote_pde(pmap, pde, va);
3064		PMAP_UNLOCK(pmap);
3065	}
3066small_mappings:
3067	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3068		pmap = PV_PMAP(pv);
3069		PMAP_LOCK(pmap);
3070		pmap->pm_stats.resident_count--;
3071		pde = pmap_pde(pmap, pv->pv_va);
3072		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3073		    " a 4mpage in page %p's pv list", m));
3074		pte = pmap_pte_quick(pmap, pv->pv_va);
3075		tpte = pte_load_clear(pte);
3076		if (tpte & PG_W)
3077			pmap->pm_stats.wired_count--;
3078		if (tpte & PG_A)
3079			vm_page_aflag_set(m, PGA_REFERENCED);
3080
3081		/*
3082		 * Update the vm_page_t clean and reference bits.
3083		 */
3084		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3085			vm_page_dirty(m);
3086		pmap_unuse_pt(pmap, pv->pv_va, &free);
3087		pmap_invalidate_page(pmap, pv->pv_va);
3088		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3089		free_pv_entry(pmap, pv);
3090		PMAP_UNLOCK(pmap);
3091	}
3092	vm_page_aflag_clear(m, PGA_WRITEABLE);
3093	sched_unpin();
3094	rw_wunlock(&pvh_global_lock);
3095	pmap_free_zero_pages(free);
3096}
3097
3098/*
3099 * pmap_protect_pde: do the things to protect a 4mpage in a process
3100 */
3101static boolean_t
3102pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3103{
3104	pd_entry_t newpde, oldpde;
3105	vm_offset_t eva, va;
3106	vm_page_t m;
3107	boolean_t anychanged;
3108
3109	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3110	KASSERT((sva & PDRMASK) == 0,
3111	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3112	anychanged = FALSE;
3113retry:
3114	oldpde = newpde = *pde;
3115	if (oldpde & PG_MANAGED) {
3116		eva = sva + NBPDR;
3117		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3118		    va < eva; va += PAGE_SIZE, m++)
3119			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3120				vm_page_dirty(m);
3121	}
3122	if ((prot & VM_PROT_WRITE) == 0)
3123		newpde &= ~(PG_RW | PG_M);
3124#ifdef PAE
3125	if ((prot & VM_PROT_EXECUTE) == 0)
3126		newpde |= pg_nx;
3127#endif
3128	if (newpde != oldpde) {
3129		if (!pde_cmpset(pde, oldpde, newpde))
3130			goto retry;
3131		if (oldpde & PG_G)
3132			pmap_invalidate_page(pmap, sva);
3133		else
3134			anychanged = TRUE;
3135	}
3136	return (anychanged);
3137}
3138
3139/*
3140 *	Set the physical protection on the
3141 *	specified range of this map as requested.
3142 */
3143void
3144pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3145{
3146	vm_offset_t pdnxt;
3147	pd_entry_t ptpaddr;
3148	pt_entry_t *pte;
3149	boolean_t anychanged, pv_lists_locked;
3150
3151	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3152		pmap_remove(pmap, sva, eva);
3153		return;
3154	}
3155
3156#ifdef PAE
3157	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3158	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3159		return;
3160#else
3161	if (prot & VM_PROT_WRITE)
3162		return;
3163#endif
3164
3165	if (pmap_is_current(pmap))
3166		pv_lists_locked = FALSE;
3167	else {
3168		pv_lists_locked = TRUE;
3169resume:
3170		rw_wlock(&pvh_global_lock);
3171		sched_pin();
3172	}
3173	anychanged = FALSE;
3174
3175	PMAP_LOCK(pmap);
3176	for (; sva < eva; sva = pdnxt) {
3177		pt_entry_t obits, pbits;
3178		u_int pdirindex;
3179
3180		pdnxt = (sva + NBPDR) & ~PDRMASK;
3181		if (pdnxt < sva)
3182			pdnxt = eva;
3183
3184		pdirindex = sva >> PDRSHIFT;
3185		ptpaddr = pmap->pm_pdir[pdirindex];
3186
3187		/*
3188		 * Weed out invalid mappings. Note: we assume that the page
3189		 * directory table is always allocated, and in kernel virtual.
3190		 */
3191		if (ptpaddr == 0)
3192			continue;
3193
3194		/*
3195		 * Check for large page.
3196		 */
3197		if ((ptpaddr & PG_PS) != 0) {
3198			/*
3199			 * Are we protecting the entire large page?  If not,
3200			 * demote the mapping and fall through.
3201			 */
3202			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3203				/*
3204				 * The TLB entry for a PG_G mapping is
3205				 * invalidated by pmap_protect_pde().
3206				 */
3207				if (pmap_protect_pde(pmap,
3208				    &pmap->pm_pdir[pdirindex], sva, prot))
3209					anychanged = TRUE;
3210				continue;
3211			} else {
3212				if (!pv_lists_locked) {
3213					pv_lists_locked = TRUE;
3214					if (!rw_try_wlock(&pvh_global_lock)) {
3215						if (anychanged)
3216							pmap_invalidate_all(
3217							    pmap);
3218						PMAP_UNLOCK(pmap);
3219						goto resume;
3220					}
3221				}
3222				if (!pmap_demote_pde(pmap,
3223				    &pmap->pm_pdir[pdirindex], sva)) {
3224					/*
3225					 * The large page mapping was
3226					 * destroyed.
3227					 */
3228					continue;
3229				}
3230			}
3231		}
3232
3233		if (pdnxt > eva)
3234			pdnxt = eva;
3235
3236		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3237		    sva += PAGE_SIZE) {
3238			vm_page_t m;
3239
3240retry:
3241			/*
3242			 * Regardless of whether a pte is 32 or 64 bits in
3243			 * size, PG_RW, PG_A, and PG_M are among the least
3244			 * significant 32 bits.
3245			 */
3246			obits = pbits = *pte;
3247			if ((pbits & PG_V) == 0)
3248				continue;
3249
3250			if ((prot & VM_PROT_WRITE) == 0) {
3251				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3252				    (PG_MANAGED | PG_M | PG_RW)) {
3253					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3254					vm_page_dirty(m);
3255				}
3256				pbits &= ~(PG_RW | PG_M);
3257			}
3258#ifdef PAE
3259			if ((prot & VM_PROT_EXECUTE) == 0)
3260				pbits |= pg_nx;
3261#endif
3262
3263			if (pbits != obits) {
3264#ifdef PAE
3265				if (!atomic_cmpset_64(pte, obits, pbits))
3266					goto retry;
3267#else
3268				if (!atomic_cmpset_int((u_int *)pte, obits,
3269				    pbits))
3270					goto retry;
3271#endif
3272				if (obits & PG_G)
3273					pmap_invalidate_page(pmap, sva);
3274				else
3275					anychanged = TRUE;
3276			}
3277		}
3278	}
3279	if (anychanged)
3280		pmap_invalidate_all(pmap);
3281	if (pv_lists_locked) {
3282		sched_unpin();
3283		rw_wunlock(&pvh_global_lock);
3284	}
3285	PMAP_UNLOCK(pmap);
3286}
3287
3288/*
3289 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
3290 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3291 * For promotion to occur, two conditions must be met: (1) the 4KB page
3292 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3293 * mappings must have identical characteristics.
3294 *
3295 * Managed (PG_MANAGED) mappings within the kernel address space are not
3296 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3297 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3298 * pmap.
3299 */
3300static void
3301pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3302{
3303	pd_entry_t newpde;
3304	pt_entry_t *firstpte, oldpte, pa, *pte;
3305	vm_offset_t oldpteva;
3306	vm_page_t mpte;
3307
3308	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3309
3310	/*
3311	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3312	 * either invalid, unused, or does not map the first 4KB physical page
3313	 * within a 2- or 4MB page.
3314	 */
3315	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3316setpde:
3317	newpde = *firstpte;
3318	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3319		pmap_pde_p_failures++;
3320		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3321		    " in pmap %p", va, pmap);
3322		return;
3323	}
3324	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3325		pmap_pde_p_failures++;
3326		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3327		    " in pmap %p", va, pmap);
3328		return;
3329	}
3330	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3331		/*
3332		 * When PG_M is already clear, PG_RW can be cleared without
3333		 * a TLB invalidation.
3334		 */
3335		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3336		    ~PG_RW))
3337			goto setpde;
3338		newpde &= ~PG_RW;
3339	}
3340
3341	/*
3342	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3343	 * PTE maps an unexpected 4KB physical page or does not have identical
3344	 * characteristics to the first PTE.
3345	 */
3346	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3347	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3348setpte:
3349		oldpte = *pte;
3350		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3351			pmap_pde_p_failures++;
3352			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3353			    " in pmap %p", va, pmap);
3354			return;
3355		}
3356		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3357			/*
3358			 * When PG_M is already clear, PG_RW can be cleared
3359			 * without a TLB invalidation.
3360			 */
3361			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3362			    oldpte & ~PG_RW))
3363				goto setpte;
3364			oldpte &= ~PG_RW;
3365			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3366			    (va & ~PDRMASK);
3367			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3368			    " in pmap %p", oldpteva, pmap);
3369		}
3370		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3371			pmap_pde_p_failures++;
3372			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3373			    " in pmap %p", va, pmap);
3374			return;
3375		}
3376		pa -= PAGE_SIZE;
3377	}
3378
3379	/*
3380	 * Save the page table page in its current state until the PDE
3381	 * mapping the superpage is demoted by pmap_demote_pde() or
3382	 * destroyed by pmap_remove_pde().
3383	 */
3384	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3385	KASSERT(mpte >= vm_page_array &&
3386	    mpte < &vm_page_array[vm_page_array_size],
3387	    ("pmap_promote_pde: page table page is out of range"));
3388	KASSERT(mpte->pindex == va >> PDRSHIFT,
3389	    ("pmap_promote_pde: page table page's pindex is wrong"));
3390	pmap_insert_pt_page(pmap, mpte);
3391
3392	/*
3393	 * Promote the pv entries.
3394	 */
3395	if ((newpde & PG_MANAGED) != 0)
3396		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3397
3398	/*
3399	 * Propagate the PAT index to its proper position.
3400	 */
3401	if ((newpde & PG_PTE_PAT) != 0)
3402		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3403
3404	/*
3405	 * Map the superpage.
3406	 */
3407	if (workaround_erratum383)
3408		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3409	else if (pmap == kernel_pmap)
3410		pmap_kenter_pde(va, PG_PS | newpde);
3411	else
3412		pde_store(pde, PG_PS | newpde);
3413
3414	pmap_pde_promotions++;
3415	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3416	    " in pmap %p", va, pmap);
3417}
3418
3419/*
3420 *	Insert the given physical page (p) at
3421 *	the specified virtual address (v) in the
3422 *	target physical map with the protection requested.
3423 *
3424 *	If specified, the page will be wired down, meaning
3425 *	that the related pte can not be reclaimed.
3426 *
3427 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3428 *	or lose information.  That is, this routine must actually
3429 *	insert this page into the given map NOW.
3430 */
3431void
3432pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3433    vm_prot_t prot, boolean_t wired)
3434{
3435	pd_entry_t *pde;
3436	pt_entry_t *pte;
3437	pt_entry_t newpte, origpte;
3438	pv_entry_t pv;
3439	vm_paddr_t opa, pa;
3440	vm_page_t mpte, om;
3441	boolean_t invlva;
3442
3443	va = trunc_page(va);
3444	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3445	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3446	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3447	    va));
3448	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3449	    VM_OBJECT_LOCKED(m->object),
3450	    ("pmap_enter: page %p is not busy", m));
3451
3452	mpte = NULL;
3453
3454	rw_wlock(&pvh_global_lock);
3455	PMAP_LOCK(pmap);
3456	sched_pin();
3457
3458	/*
3459	 * In the case that a page table page is not
3460	 * resident, we are creating it here.
3461	 */
3462	if (va < VM_MAXUSER_ADDRESS) {
3463		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3464	}
3465
3466	pde = pmap_pde(pmap, va);
3467	if ((*pde & PG_PS) != 0)
3468		panic("pmap_enter: attempted pmap_enter on 4MB page");
3469	pte = pmap_pte_quick(pmap, va);
3470
3471	/*
3472	 * Page Directory table entry not valid, we need a new PT page
3473	 */
3474	if (pte == NULL) {
3475		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3476			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3477	}
3478
3479	pa = VM_PAGE_TO_PHYS(m);
3480	om = NULL;
3481	origpte = *pte;
3482	opa = origpte & PG_FRAME;
3483
3484	/*
3485	 * Mapping has not changed, must be protection or wiring change.
3486	 */
3487	if (origpte && (opa == pa)) {
3488		/*
3489		 * Wiring change, just update stats. We don't worry about
3490		 * wiring PT pages as they remain resident as long as there
3491		 * are valid mappings in them. Hence, if a user page is wired,
3492		 * the PT page will be also.
3493		 */
3494		if (wired && ((origpte & PG_W) == 0))
3495			pmap->pm_stats.wired_count++;
3496		else if (!wired && (origpte & PG_W))
3497			pmap->pm_stats.wired_count--;
3498
3499		/*
3500		 * Remove extra pte reference
3501		 */
3502		if (mpte)
3503			mpte->wire_count--;
3504
3505		if (origpte & PG_MANAGED) {
3506			om = m;
3507			pa |= PG_MANAGED;
3508		}
3509		goto validate;
3510	}
3511
3512	pv = NULL;
3513
3514	/*
3515	 * Mapping has changed, invalidate old range and fall through to
3516	 * handle validating new mapping.
3517	 */
3518	if (opa) {
3519		if (origpte & PG_W)
3520			pmap->pm_stats.wired_count--;
3521		if (origpte & PG_MANAGED) {
3522			om = PHYS_TO_VM_PAGE(opa);
3523			pv = pmap_pvh_remove(&om->md, pmap, va);
3524		}
3525		if (mpte != NULL) {
3526			mpte->wire_count--;
3527			KASSERT(mpte->wire_count > 0,
3528			    ("pmap_enter: missing reference to page table page,"
3529			     " va: 0x%x", va));
3530		}
3531	} else
3532		pmap->pm_stats.resident_count++;
3533
3534	/*
3535	 * Enter on the PV list if part of our managed memory.
3536	 */
3537	if ((m->oflags & VPO_UNMANAGED) == 0) {
3538		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3539		    ("pmap_enter: managed mapping within the clean submap"));
3540		if (pv == NULL)
3541			pv = get_pv_entry(pmap, FALSE);
3542		pv->pv_va = va;
3543		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3544		pa |= PG_MANAGED;
3545	} else if (pv != NULL)
3546		free_pv_entry(pmap, pv);
3547
3548	/*
3549	 * Increment counters
3550	 */
3551	if (wired)
3552		pmap->pm_stats.wired_count++;
3553
3554validate:
3555	/*
3556	 * Now validate mapping with desired protection/wiring.
3557	 */
3558	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3559	if ((prot & VM_PROT_WRITE) != 0) {
3560		newpte |= PG_RW;
3561		if ((newpte & PG_MANAGED) != 0)
3562			vm_page_aflag_set(m, PGA_WRITEABLE);
3563	}
3564#ifdef PAE
3565	if ((prot & VM_PROT_EXECUTE) == 0)
3566		newpte |= pg_nx;
3567#endif
3568	if (wired)
3569		newpte |= PG_W;
3570	if (va < VM_MAXUSER_ADDRESS)
3571		newpte |= PG_U;
3572	if (pmap == kernel_pmap)
3573		newpte |= pgeflag;
3574
3575	/*
3576	 * if the mapping or permission bits are different, we need
3577	 * to update the pte.
3578	 */
3579	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3580		newpte |= PG_A;
3581		if ((access & VM_PROT_WRITE) != 0)
3582			newpte |= PG_M;
3583		if (origpte & PG_V) {
3584			invlva = FALSE;
3585			origpte = pte_load_store(pte, newpte);
3586			if (origpte & PG_A) {
3587				if (origpte & PG_MANAGED)
3588					vm_page_aflag_set(om, PGA_REFERENCED);
3589				if (opa != VM_PAGE_TO_PHYS(m))
3590					invlva = TRUE;
3591#ifdef PAE
3592				if ((origpte & PG_NX) == 0 &&
3593				    (newpte & PG_NX) != 0)
3594					invlva = TRUE;
3595#endif
3596			}
3597			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3598				if ((origpte & PG_MANAGED) != 0)
3599					vm_page_dirty(om);
3600				if ((prot & VM_PROT_WRITE) == 0)
3601					invlva = TRUE;
3602			}
3603			if ((origpte & PG_MANAGED) != 0 &&
3604			    TAILQ_EMPTY(&om->md.pv_list) &&
3605			    ((om->flags & PG_FICTITIOUS) != 0 ||
3606			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3607				vm_page_aflag_clear(om, PGA_WRITEABLE);
3608			if (invlva)
3609				pmap_invalidate_page(pmap, va);
3610		} else
3611			pte_store(pte, newpte);
3612	}
3613
3614	/*
3615	 * If both the page table page and the reservation are fully
3616	 * populated, then attempt promotion.
3617	 */
3618	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3619	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3620	    vm_reserv_level_iffullpop(m) == 0)
3621		pmap_promote_pde(pmap, pde, va);
3622
3623	sched_unpin();
3624	rw_wunlock(&pvh_global_lock);
3625	PMAP_UNLOCK(pmap);
3626}
3627
3628/*
3629 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3630 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3631 * blocking, (2) a mapping already exists at the specified virtual address, or
3632 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3633 */
3634static boolean_t
3635pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3636{
3637	pd_entry_t *pde, newpde;
3638
3639	rw_assert(&pvh_global_lock, RA_WLOCKED);
3640	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3641	pde = pmap_pde(pmap, va);
3642	if (*pde != 0) {
3643		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3644		    " in pmap %p", va, pmap);
3645		return (FALSE);
3646	}
3647	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3648	    PG_PS | PG_V;
3649	if ((m->oflags & VPO_UNMANAGED) == 0) {
3650		newpde |= PG_MANAGED;
3651
3652		/*
3653		 * Abort this mapping if its PV entry could not be created.
3654		 */
3655		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3656			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3657			    " in pmap %p", va, pmap);
3658			return (FALSE);
3659		}
3660	}
3661#ifdef PAE
3662	if ((prot & VM_PROT_EXECUTE) == 0)
3663		newpde |= pg_nx;
3664#endif
3665	if (va < VM_MAXUSER_ADDRESS)
3666		newpde |= PG_U;
3667
3668	/*
3669	 * Increment counters.
3670	 */
3671	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3672
3673	/*
3674	 * Map the superpage.
3675	 */
3676	pde_store(pde, newpde);
3677
3678	pmap_pde_mappings++;
3679	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3680	    " in pmap %p", va, pmap);
3681	return (TRUE);
3682}
3683
3684/*
3685 * Maps a sequence of resident pages belonging to the same object.
3686 * The sequence begins with the given page m_start.  This page is
3687 * mapped at the given virtual address start.  Each subsequent page is
3688 * mapped at a virtual address that is offset from start by the same
3689 * amount as the page is offset from m_start within the object.  The
3690 * last page in the sequence is the page with the largest offset from
3691 * m_start that can be mapped at a virtual address less than the given
3692 * virtual address end.  Not every virtual page between start and end
3693 * is mapped; only those for which a resident page exists with the
3694 * corresponding offset from m_start are mapped.
3695 */
3696void
3697pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3698    vm_page_t m_start, vm_prot_t prot)
3699{
3700	vm_offset_t va;
3701	vm_page_t m, mpte;
3702	vm_pindex_t diff, psize;
3703
3704	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3705	psize = atop(end - start);
3706	mpte = NULL;
3707	m = m_start;
3708	rw_wlock(&pvh_global_lock);
3709	PMAP_LOCK(pmap);
3710	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3711		va = start + ptoa(diff);
3712		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3713		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3714		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3715		    pmap_enter_pde(pmap, va, m, prot))
3716			m = &m[NBPDR / PAGE_SIZE - 1];
3717		else
3718			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3719			    mpte);
3720		m = TAILQ_NEXT(m, listq);
3721	}
3722	rw_wunlock(&pvh_global_lock);
3723	PMAP_UNLOCK(pmap);
3724}
3725
3726/*
3727 * this code makes some *MAJOR* assumptions:
3728 * 1. Current pmap & pmap exists.
3729 * 2. Not wired.
3730 * 3. Read access.
3731 * 4. No page table pages.
3732 * but is *MUCH* faster than pmap_enter...
3733 */
3734
3735void
3736pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3737{
3738
3739	rw_wlock(&pvh_global_lock);
3740	PMAP_LOCK(pmap);
3741	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3742	rw_wunlock(&pvh_global_lock);
3743	PMAP_UNLOCK(pmap);
3744}
3745
3746static vm_page_t
3747pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3748    vm_prot_t prot, vm_page_t mpte)
3749{
3750	pt_entry_t *pte;
3751	vm_paddr_t pa;
3752	vm_page_t free;
3753
3754	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3755	    (m->oflags & VPO_UNMANAGED) != 0,
3756	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3757	rw_assert(&pvh_global_lock, RA_WLOCKED);
3758	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3759
3760	/*
3761	 * In the case that a page table page is not
3762	 * resident, we are creating it here.
3763	 */
3764	if (va < VM_MAXUSER_ADDRESS) {
3765		u_int ptepindex;
3766		pd_entry_t ptepa;
3767
3768		/*
3769		 * Calculate pagetable page index
3770		 */
3771		ptepindex = va >> PDRSHIFT;
3772		if (mpte && (mpte->pindex == ptepindex)) {
3773			mpte->wire_count++;
3774		} else {
3775			/*
3776			 * Get the page directory entry
3777			 */
3778			ptepa = pmap->pm_pdir[ptepindex];
3779
3780			/*
3781			 * If the page table page is mapped, we just increment
3782			 * the hold count, and activate it.
3783			 */
3784			if (ptepa) {
3785				if (ptepa & PG_PS)
3786					return (NULL);
3787				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3788				mpte->wire_count++;
3789			} else {
3790				mpte = _pmap_allocpte(pmap, ptepindex,
3791				    M_NOWAIT);
3792				if (mpte == NULL)
3793					return (mpte);
3794			}
3795		}
3796	} else {
3797		mpte = NULL;
3798	}
3799
3800	/*
3801	 * This call to vtopte makes the assumption that we are
3802	 * entering the page into the current pmap.  In order to support
3803	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3804	 * But that isn't as quick as vtopte.
3805	 */
3806	pte = vtopte(va);
3807	if (*pte) {
3808		if (mpte != NULL) {
3809			mpte->wire_count--;
3810			mpte = NULL;
3811		}
3812		return (mpte);
3813	}
3814
3815	/*
3816	 * Enter on the PV list if part of our managed memory.
3817	 */
3818	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3819	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3820		if (mpte != NULL) {
3821			free = NULL;
3822			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3823				pmap_invalidate_page(pmap, va);
3824				pmap_free_zero_pages(free);
3825			}
3826
3827			mpte = NULL;
3828		}
3829		return (mpte);
3830	}
3831
3832	/*
3833	 * Increment counters
3834	 */
3835	pmap->pm_stats.resident_count++;
3836
3837	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3838#ifdef PAE
3839	if ((prot & VM_PROT_EXECUTE) == 0)
3840		pa |= pg_nx;
3841#endif
3842
3843	/*
3844	 * Now validate mapping with RO protection
3845	 */
3846	if ((m->oflags & VPO_UNMANAGED) != 0)
3847		pte_store(pte, pa | PG_V | PG_U);
3848	else
3849		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3850	return (mpte);
3851}
3852
3853/*
3854 * Make a temporary mapping for a physical address.  This is only intended
3855 * to be used for panic dumps.
3856 */
3857void *
3858pmap_kenter_temporary(vm_paddr_t pa, int i)
3859{
3860	vm_offset_t va;
3861
3862	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3863	pmap_kenter(va, pa);
3864	invlpg(va);
3865	return ((void *)crashdumpmap);
3866}
3867
3868/*
3869 * This code maps large physical mmap regions into the
3870 * processor address space.  Note that some shortcuts
3871 * are taken, but the code works.
3872 */
3873void
3874pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3875    vm_pindex_t pindex, vm_size_t size)
3876{
3877	pd_entry_t *pde;
3878	vm_paddr_t pa, ptepa;
3879	vm_page_t p;
3880	int pat_mode;
3881
3882	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3883	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3884	    ("pmap_object_init_pt: non-device object"));
3885	if (pseflag &&
3886	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3887		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3888			return;
3889		p = vm_page_lookup(object, pindex);
3890		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3891		    ("pmap_object_init_pt: invalid page %p", p));
3892		pat_mode = p->md.pat_mode;
3893
3894		/*
3895		 * Abort the mapping if the first page is not physically
3896		 * aligned to a 2/4MB page boundary.
3897		 */
3898		ptepa = VM_PAGE_TO_PHYS(p);
3899		if (ptepa & (NBPDR - 1))
3900			return;
3901
3902		/*
3903		 * Skip the first page.  Abort the mapping if the rest of
3904		 * the pages are not physically contiguous or have differing
3905		 * memory attributes.
3906		 */
3907		p = TAILQ_NEXT(p, listq);
3908		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3909		    pa += PAGE_SIZE) {
3910			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3911			    ("pmap_object_init_pt: invalid page %p", p));
3912			if (pa != VM_PAGE_TO_PHYS(p) ||
3913			    pat_mode != p->md.pat_mode)
3914				return;
3915			p = TAILQ_NEXT(p, listq);
3916		}
3917
3918		/*
3919		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3920		 * "size" is a multiple of 2/4M, adding the PAT setting to
3921		 * "pa" will not affect the termination of this loop.
3922		 */
3923		PMAP_LOCK(pmap);
3924		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3925		    size; pa += NBPDR) {
3926			pde = pmap_pde(pmap, addr);
3927			if (*pde == 0) {
3928				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3929				    PG_U | PG_RW | PG_V);
3930				pmap->pm_stats.resident_count += NBPDR /
3931				    PAGE_SIZE;
3932				pmap_pde_mappings++;
3933			}
3934			/* Else continue on if the PDE is already valid. */
3935			addr += NBPDR;
3936		}
3937		PMAP_UNLOCK(pmap);
3938	}
3939}
3940
3941/*
3942 *	Routine:	pmap_change_wiring
3943 *	Function:	Change the wiring attribute for a map/virtual-address
3944 *			pair.
3945 *	In/out conditions:
3946 *			The mapping must already exist in the pmap.
3947 */
3948void
3949pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3950{
3951	pd_entry_t *pde;
3952	pt_entry_t *pte;
3953	boolean_t are_queues_locked;
3954
3955	are_queues_locked = FALSE;
3956retry:
3957	PMAP_LOCK(pmap);
3958	pde = pmap_pde(pmap, va);
3959	if ((*pde & PG_PS) != 0) {
3960		if (!wired != ((*pde & PG_W) == 0)) {
3961			if (!are_queues_locked) {
3962				are_queues_locked = TRUE;
3963				if (!rw_try_wlock(&pvh_global_lock)) {
3964					PMAP_UNLOCK(pmap);
3965					rw_wlock(&pvh_global_lock);
3966					goto retry;
3967				}
3968			}
3969			if (!pmap_demote_pde(pmap, pde, va))
3970				panic("pmap_change_wiring: demotion failed");
3971		} else
3972			goto out;
3973	}
3974	pte = pmap_pte(pmap, va);
3975
3976	if (wired && !pmap_pte_w(pte))
3977		pmap->pm_stats.wired_count++;
3978	else if (!wired && pmap_pte_w(pte))
3979		pmap->pm_stats.wired_count--;
3980
3981	/*
3982	 * Wiring is not a hardware characteristic so there is no need to
3983	 * invalidate TLB.
3984	 */
3985	pmap_pte_set_w(pte, wired);
3986	pmap_pte_release(pte);
3987out:
3988	if (are_queues_locked)
3989		rw_wunlock(&pvh_global_lock);
3990	PMAP_UNLOCK(pmap);
3991}
3992
3993
3994
3995/*
3996 *	Copy the range specified by src_addr/len
3997 *	from the source map to the range dst_addr/len
3998 *	in the destination map.
3999 *
4000 *	This routine is only advisory and need not do anything.
4001 */
4002
4003void
4004pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4005    vm_offset_t src_addr)
4006{
4007	vm_page_t   free;
4008	vm_offset_t addr;
4009	vm_offset_t end_addr = src_addr + len;
4010	vm_offset_t pdnxt;
4011
4012	if (dst_addr != src_addr)
4013		return;
4014
4015	if (!pmap_is_current(src_pmap))
4016		return;
4017
4018	rw_wlock(&pvh_global_lock);
4019	if (dst_pmap < src_pmap) {
4020		PMAP_LOCK(dst_pmap);
4021		PMAP_LOCK(src_pmap);
4022	} else {
4023		PMAP_LOCK(src_pmap);
4024		PMAP_LOCK(dst_pmap);
4025	}
4026	sched_pin();
4027	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4028		pt_entry_t *src_pte, *dst_pte;
4029		vm_page_t dstmpte, srcmpte;
4030		pd_entry_t srcptepaddr;
4031		u_int ptepindex;
4032
4033		KASSERT(addr < UPT_MIN_ADDRESS,
4034		    ("pmap_copy: invalid to pmap_copy page tables"));
4035
4036		pdnxt = (addr + NBPDR) & ~PDRMASK;
4037		if (pdnxt < addr)
4038			pdnxt = end_addr;
4039		ptepindex = addr >> PDRSHIFT;
4040
4041		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4042		if (srcptepaddr == 0)
4043			continue;
4044
4045		if (srcptepaddr & PG_PS) {
4046			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4047			    ((srcptepaddr & PG_MANAGED) == 0 ||
4048			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4049			    PG_PS_FRAME))) {
4050				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4051				    ~PG_W;
4052				dst_pmap->pm_stats.resident_count +=
4053				    NBPDR / PAGE_SIZE;
4054			}
4055			continue;
4056		}
4057
4058		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4059		KASSERT(srcmpte->wire_count > 0,
4060		    ("pmap_copy: source page table page is unused"));
4061
4062		if (pdnxt > end_addr)
4063			pdnxt = end_addr;
4064
4065		src_pte = vtopte(addr);
4066		while (addr < pdnxt) {
4067			pt_entry_t ptetemp;
4068			ptetemp = *src_pte;
4069			/*
4070			 * we only virtual copy managed pages
4071			 */
4072			if ((ptetemp & PG_MANAGED) != 0) {
4073				dstmpte = pmap_allocpte(dst_pmap, addr,
4074				    M_NOWAIT);
4075				if (dstmpte == NULL)
4076					goto out;
4077				dst_pte = pmap_pte_quick(dst_pmap, addr);
4078				if (*dst_pte == 0 &&
4079				    pmap_try_insert_pv_entry(dst_pmap, addr,
4080				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4081					/*
4082					 * Clear the wired, modified, and
4083					 * accessed (referenced) bits
4084					 * during the copy.
4085					 */
4086					*dst_pte = ptetemp & ~(PG_W | PG_M |
4087					    PG_A);
4088					dst_pmap->pm_stats.resident_count++;
4089	 			} else {
4090					free = NULL;
4091					if (pmap_unwire_pte_hold(dst_pmap,
4092					    dstmpte, &free)) {
4093						pmap_invalidate_page(dst_pmap,
4094						    addr);
4095						pmap_free_zero_pages(free);
4096					}
4097					goto out;
4098				}
4099				if (dstmpte->wire_count >= srcmpte->wire_count)
4100					break;
4101			}
4102			addr += PAGE_SIZE;
4103			src_pte++;
4104		}
4105	}
4106out:
4107	sched_unpin();
4108	rw_wunlock(&pvh_global_lock);
4109	PMAP_UNLOCK(src_pmap);
4110	PMAP_UNLOCK(dst_pmap);
4111}
4112
4113static __inline void
4114pagezero(void *page)
4115{
4116#if defined(I686_CPU)
4117	if (cpu_class == CPUCLASS_686) {
4118#if defined(CPU_ENABLE_SSE)
4119		if (cpu_feature & CPUID_SSE2)
4120			sse2_pagezero(page);
4121		else
4122#endif
4123			i686_pagezero(page);
4124	} else
4125#endif
4126		bzero(page, PAGE_SIZE);
4127}
4128
4129/*
4130 *	pmap_zero_page zeros the specified hardware page by mapping
4131 *	the page into KVM and using bzero to clear its contents.
4132 */
4133void
4134pmap_zero_page(vm_page_t m)
4135{
4136	struct sysmaps *sysmaps;
4137
4138	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4139	mtx_lock(&sysmaps->lock);
4140	if (*sysmaps->CMAP2)
4141		panic("pmap_zero_page: CMAP2 busy");
4142	sched_pin();
4143	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4144	    pmap_cache_bits(m->md.pat_mode, 0);
4145	invlcaddr(sysmaps->CADDR2);
4146	pagezero(sysmaps->CADDR2);
4147	*sysmaps->CMAP2 = 0;
4148	sched_unpin();
4149	mtx_unlock(&sysmaps->lock);
4150}
4151
4152/*
4153 *	pmap_zero_page_area zeros the specified hardware page by mapping
4154 *	the page into KVM and using bzero to clear its contents.
4155 *
4156 *	off and size may not cover an area beyond a single hardware page.
4157 */
4158void
4159pmap_zero_page_area(vm_page_t m, int off, int size)
4160{
4161	struct sysmaps *sysmaps;
4162
4163	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4164	mtx_lock(&sysmaps->lock);
4165	if (*sysmaps->CMAP2)
4166		panic("pmap_zero_page_area: CMAP2 busy");
4167	sched_pin();
4168	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4169	    pmap_cache_bits(m->md.pat_mode, 0);
4170	invlcaddr(sysmaps->CADDR2);
4171	if (off == 0 && size == PAGE_SIZE)
4172		pagezero(sysmaps->CADDR2);
4173	else
4174		bzero((char *)sysmaps->CADDR2 + off, size);
4175	*sysmaps->CMAP2 = 0;
4176	sched_unpin();
4177	mtx_unlock(&sysmaps->lock);
4178}
4179
4180/*
4181 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4182 *	the page into KVM and using bzero to clear its contents.  This
4183 *	is intended to be called from the vm_pagezero process only and
4184 *	outside of Giant.
4185 */
4186void
4187pmap_zero_page_idle(vm_page_t m)
4188{
4189
4190	if (*CMAP3)
4191		panic("pmap_zero_page_idle: CMAP3 busy");
4192	sched_pin();
4193	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4194	    pmap_cache_bits(m->md.pat_mode, 0);
4195	invlcaddr(CADDR3);
4196	pagezero(CADDR3);
4197	*CMAP3 = 0;
4198	sched_unpin();
4199}
4200
4201/*
4202 *	pmap_copy_page copies the specified (machine independent)
4203 *	page by mapping the page into virtual memory and using
4204 *	bcopy to copy the page, one machine dependent page at a
4205 *	time.
4206 */
4207void
4208pmap_copy_page(vm_page_t src, vm_page_t dst)
4209{
4210	struct sysmaps *sysmaps;
4211
4212	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4213	mtx_lock(&sysmaps->lock);
4214	if (*sysmaps->CMAP1)
4215		panic("pmap_copy_page: CMAP1 busy");
4216	if (*sysmaps->CMAP2)
4217		panic("pmap_copy_page: CMAP2 busy");
4218	sched_pin();
4219	invlpg((u_int)sysmaps->CADDR1);
4220	invlpg((u_int)sysmaps->CADDR2);
4221	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4222	    pmap_cache_bits(src->md.pat_mode, 0);
4223	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4224	    pmap_cache_bits(dst->md.pat_mode, 0);
4225	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4226	*sysmaps->CMAP1 = 0;
4227	*sysmaps->CMAP2 = 0;
4228	sched_unpin();
4229	mtx_unlock(&sysmaps->lock);
4230}
4231
4232/*
4233 * Returns true if the pmap's pv is one of the first
4234 * 16 pvs linked to from this page.  This count may
4235 * be changed upwards or downwards in the future; it
4236 * is only necessary that true be returned for a small
4237 * subset of pmaps for proper page aging.
4238 */
4239boolean_t
4240pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4241{
4242	struct md_page *pvh;
4243	pv_entry_t pv;
4244	int loops = 0;
4245	boolean_t rv;
4246
4247	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4248	    ("pmap_page_exists_quick: page %p is not managed", m));
4249	rv = FALSE;
4250	rw_wlock(&pvh_global_lock);
4251	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4252		if (PV_PMAP(pv) == pmap) {
4253			rv = TRUE;
4254			break;
4255		}
4256		loops++;
4257		if (loops >= 16)
4258			break;
4259	}
4260	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4261		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4262		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4263			if (PV_PMAP(pv) == pmap) {
4264				rv = TRUE;
4265				break;
4266			}
4267			loops++;
4268			if (loops >= 16)
4269				break;
4270		}
4271	}
4272	rw_wunlock(&pvh_global_lock);
4273	return (rv);
4274}
4275
4276/*
4277 *	pmap_page_wired_mappings:
4278 *
4279 *	Return the number of managed mappings to the given physical page
4280 *	that are wired.
4281 */
4282int
4283pmap_page_wired_mappings(vm_page_t m)
4284{
4285	int count;
4286
4287	count = 0;
4288	if ((m->oflags & VPO_UNMANAGED) != 0)
4289		return (count);
4290	rw_wlock(&pvh_global_lock);
4291	count = pmap_pvh_wired_mappings(&m->md, count);
4292	if ((m->flags & PG_FICTITIOUS) == 0) {
4293	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4294	        count);
4295	}
4296	rw_wunlock(&pvh_global_lock);
4297	return (count);
4298}
4299
4300/*
4301 *	pmap_pvh_wired_mappings:
4302 *
4303 *	Return the updated number "count" of managed mappings that are wired.
4304 */
4305static int
4306pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4307{
4308	pmap_t pmap;
4309	pt_entry_t *pte;
4310	pv_entry_t pv;
4311
4312	rw_assert(&pvh_global_lock, RA_WLOCKED);
4313	sched_pin();
4314	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4315		pmap = PV_PMAP(pv);
4316		PMAP_LOCK(pmap);
4317		pte = pmap_pte_quick(pmap, pv->pv_va);
4318		if ((*pte & PG_W) != 0)
4319			count++;
4320		PMAP_UNLOCK(pmap);
4321	}
4322	sched_unpin();
4323	return (count);
4324}
4325
4326/*
4327 * Returns TRUE if the given page is mapped individually or as part of
4328 * a 4mpage.  Otherwise, returns FALSE.
4329 */
4330boolean_t
4331pmap_page_is_mapped(vm_page_t m)
4332{
4333	boolean_t rv;
4334
4335	if ((m->oflags & VPO_UNMANAGED) != 0)
4336		return (FALSE);
4337	rw_wlock(&pvh_global_lock);
4338	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4339	    ((m->flags & PG_FICTITIOUS) == 0 &&
4340	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4341	rw_wunlock(&pvh_global_lock);
4342	return (rv);
4343}
4344
4345/*
4346 * Remove all pages from specified address space
4347 * this aids process exit speeds.  Also, this code
4348 * is special cased for current process only, but
4349 * can have the more generic (and slightly slower)
4350 * mode enabled.  This is much faster than pmap_remove
4351 * in the case of running down an entire address space.
4352 */
4353void
4354pmap_remove_pages(pmap_t pmap)
4355{
4356	pt_entry_t *pte, tpte;
4357	vm_page_t free = NULL;
4358	vm_page_t m, mpte, mt;
4359	pv_entry_t pv;
4360	struct md_page *pvh;
4361	struct pv_chunk *pc, *npc;
4362	int field, idx;
4363	int32_t bit;
4364	uint32_t inuse, bitmask;
4365	int allfree;
4366
4367	if (pmap != PCPU_GET(curpmap)) {
4368		printf("warning: pmap_remove_pages called with non-current pmap\n");
4369		return;
4370	}
4371	rw_wlock(&pvh_global_lock);
4372	PMAP_LOCK(pmap);
4373	sched_pin();
4374	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4375		allfree = 1;
4376		for (field = 0; field < _NPCM; field++) {
4377			inuse = ~pc->pc_map[field] & pc_freemask[field];
4378			while (inuse != 0) {
4379				bit = bsfl(inuse);
4380				bitmask = 1UL << bit;
4381				idx = field * 32 + bit;
4382				pv = &pc->pc_pventry[idx];
4383				inuse &= ~bitmask;
4384
4385				pte = pmap_pde(pmap, pv->pv_va);
4386				tpte = *pte;
4387				if ((tpte & PG_PS) == 0) {
4388					pte = vtopte(pv->pv_va);
4389					tpte = *pte & ~PG_PTE_PAT;
4390				}
4391
4392				if (tpte == 0) {
4393					printf(
4394					    "TPTE at %p  IS ZERO @ VA %08x\n",
4395					    pte, pv->pv_va);
4396					panic("bad pte");
4397				}
4398
4399/*
4400 * We cannot remove wired pages from a process' mapping at this time
4401 */
4402				if (tpte & PG_W) {
4403					allfree = 0;
4404					continue;
4405				}
4406
4407				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4408				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4409				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4410				    m, (uintmax_t)m->phys_addr,
4411				    (uintmax_t)tpte));
4412
4413				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4414				    m < &vm_page_array[vm_page_array_size],
4415				    ("pmap_remove_pages: bad tpte %#jx",
4416				    (uintmax_t)tpte));
4417
4418				pte_clear(pte);
4419
4420				/*
4421				 * Update the vm_page_t clean/reference bits.
4422				 */
4423				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4424					if ((tpte & PG_PS) != 0) {
4425						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4426							vm_page_dirty(mt);
4427					} else
4428						vm_page_dirty(m);
4429				}
4430
4431				/* Mark free */
4432				PV_STAT(pv_entry_frees++);
4433				PV_STAT(pv_entry_spare++);
4434				pv_entry_count--;
4435				pc->pc_map[field] |= bitmask;
4436				if ((tpte & PG_PS) != 0) {
4437					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4438					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4439					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4440					if (TAILQ_EMPTY(&pvh->pv_list)) {
4441						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4442							if (TAILQ_EMPTY(&mt->md.pv_list))
4443								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4444					}
4445					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4446					if (mpte != NULL) {
4447						pmap_remove_pt_page(pmap, mpte);
4448						pmap->pm_stats.resident_count--;
4449						KASSERT(mpte->wire_count == NPTEPG,
4450						    ("pmap_remove_pages: pte page wire count error"));
4451						mpte->wire_count = 0;
4452						pmap_add_delayed_free_list(mpte, &free, FALSE);
4453						atomic_subtract_int(&cnt.v_wire_count, 1);
4454					}
4455				} else {
4456					pmap->pm_stats.resident_count--;
4457					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4458					if (TAILQ_EMPTY(&m->md.pv_list) &&
4459					    (m->flags & PG_FICTITIOUS) == 0) {
4460						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4461						if (TAILQ_EMPTY(&pvh->pv_list))
4462							vm_page_aflag_clear(m, PGA_WRITEABLE);
4463					}
4464					pmap_unuse_pt(pmap, pv->pv_va, &free);
4465				}
4466			}
4467		}
4468		if (allfree) {
4469			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4470			free_pv_chunk(pc);
4471		}
4472	}
4473	sched_unpin();
4474	pmap_invalidate_all(pmap);
4475	rw_wunlock(&pvh_global_lock);
4476	PMAP_UNLOCK(pmap);
4477	pmap_free_zero_pages(free);
4478}
4479
4480/*
4481 *	pmap_is_modified:
4482 *
4483 *	Return whether or not the specified physical page was modified
4484 *	in any physical maps.
4485 */
4486boolean_t
4487pmap_is_modified(vm_page_t m)
4488{
4489	boolean_t rv;
4490
4491	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4492	    ("pmap_is_modified: page %p is not managed", m));
4493
4494	/*
4495	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4496	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4497	 * is clear, no PTEs can have PG_M set.
4498	 */
4499	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4500	if ((m->oflags & VPO_BUSY) == 0 &&
4501	    (m->aflags & PGA_WRITEABLE) == 0)
4502		return (FALSE);
4503	rw_wlock(&pvh_global_lock);
4504	rv = pmap_is_modified_pvh(&m->md) ||
4505	    ((m->flags & PG_FICTITIOUS) == 0 &&
4506	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4507	rw_wunlock(&pvh_global_lock);
4508	return (rv);
4509}
4510
4511/*
4512 * Returns TRUE if any of the given mappings were used to modify
4513 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4514 * mappings are supported.
4515 */
4516static boolean_t
4517pmap_is_modified_pvh(struct md_page *pvh)
4518{
4519	pv_entry_t pv;
4520	pt_entry_t *pte;
4521	pmap_t pmap;
4522	boolean_t rv;
4523
4524	rw_assert(&pvh_global_lock, RA_WLOCKED);
4525	rv = FALSE;
4526	sched_pin();
4527	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4528		pmap = PV_PMAP(pv);
4529		PMAP_LOCK(pmap);
4530		pte = pmap_pte_quick(pmap, pv->pv_va);
4531		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4532		PMAP_UNLOCK(pmap);
4533		if (rv)
4534			break;
4535	}
4536	sched_unpin();
4537	return (rv);
4538}
4539
4540/*
4541 *	pmap_is_prefaultable:
4542 *
4543 *	Return whether or not the specified virtual address is elgible
4544 *	for prefault.
4545 */
4546boolean_t
4547pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4548{
4549	pd_entry_t *pde;
4550	pt_entry_t *pte;
4551	boolean_t rv;
4552
4553	rv = FALSE;
4554	PMAP_LOCK(pmap);
4555	pde = pmap_pde(pmap, addr);
4556	if (*pde != 0 && (*pde & PG_PS) == 0) {
4557		pte = vtopte(addr);
4558		rv = *pte == 0;
4559	}
4560	PMAP_UNLOCK(pmap);
4561	return (rv);
4562}
4563
4564/*
4565 *	pmap_is_referenced:
4566 *
4567 *	Return whether or not the specified physical page was referenced
4568 *	in any physical maps.
4569 */
4570boolean_t
4571pmap_is_referenced(vm_page_t m)
4572{
4573	boolean_t rv;
4574
4575	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4576	    ("pmap_is_referenced: page %p is not managed", m));
4577	rw_wlock(&pvh_global_lock);
4578	rv = pmap_is_referenced_pvh(&m->md) ||
4579	    ((m->flags & PG_FICTITIOUS) == 0 &&
4580	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4581	rw_wunlock(&pvh_global_lock);
4582	return (rv);
4583}
4584
4585/*
4586 * Returns TRUE if any of the given mappings were referenced and FALSE
4587 * otherwise.  Both page and 4mpage mappings are supported.
4588 */
4589static boolean_t
4590pmap_is_referenced_pvh(struct md_page *pvh)
4591{
4592	pv_entry_t pv;
4593	pt_entry_t *pte;
4594	pmap_t pmap;
4595	boolean_t rv;
4596
4597	rw_assert(&pvh_global_lock, RA_WLOCKED);
4598	rv = FALSE;
4599	sched_pin();
4600	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4601		pmap = PV_PMAP(pv);
4602		PMAP_LOCK(pmap);
4603		pte = pmap_pte_quick(pmap, pv->pv_va);
4604		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4605		PMAP_UNLOCK(pmap);
4606		if (rv)
4607			break;
4608	}
4609	sched_unpin();
4610	return (rv);
4611}
4612
4613/*
4614 * Clear the write and modified bits in each of the given page's mappings.
4615 */
4616void
4617pmap_remove_write(vm_page_t m)
4618{
4619	struct md_page *pvh;
4620	pv_entry_t next_pv, pv;
4621	pmap_t pmap;
4622	pd_entry_t *pde;
4623	pt_entry_t oldpte, *pte;
4624	vm_offset_t va;
4625
4626	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4627	    ("pmap_remove_write: page %p is not managed", m));
4628
4629	/*
4630	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4631	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4632	 * is clear, no page table entries need updating.
4633	 */
4634	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4635	if ((m->oflags & VPO_BUSY) == 0 &&
4636	    (m->aflags & PGA_WRITEABLE) == 0)
4637		return;
4638	rw_wlock(&pvh_global_lock);
4639	sched_pin();
4640	if ((m->flags & PG_FICTITIOUS) != 0)
4641		goto small_mappings;
4642	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4643	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4644		va = pv->pv_va;
4645		pmap = PV_PMAP(pv);
4646		PMAP_LOCK(pmap);
4647		pde = pmap_pde(pmap, va);
4648		if ((*pde & PG_RW) != 0)
4649			(void)pmap_demote_pde(pmap, pde, va);
4650		PMAP_UNLOCK(pmap);
4651	}
4652small_mappings:
4653	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4654		pmap = PV_PMAP(pv);
4655		PMAP_LOCK(pmap);
4656		pde = pmap_pde(pmap, pv->pv_va);
4657		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
4658		    " a 4mpage in page %p's pv list", m));
4659		pte = pmap_pte_quick(pmap, pv->pv_va);
4660retry:
4661		oldpte = *pte;
4662		if ((oldpte & PG_RW) != 0) {
4663			/*
4664			 * Regardless of whether a pte is 32 or 64 bits
4665			 * in size, PG_RW and PG_M are among the least
4666			 * significant 32 bits.
4667			 */
4668			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4669			    oldpte & ~(PG_RW | PG_M)))
4670				goto retry;
4671			if ((oldpte & PG_M) != 0)
4672				vm_page_dirty(m);
4673			pmap_invalidate_page(pmap, pv->pv_va);
4674		}
4675		PMAP_UNLOCK(pmap);
4676	}
4677	vm_page_aflag_clear(m, PGA_WRITEABLE);
4678	sched_unpin();
4679	rw_wunlock(&pvh_global_lock);
4680}
4681
4682/*
4683 *	pmap_ts_referenced:
4684 *
4685 *	Return a count of reference bits for a page, clearing those bits.
4686 *	It is not necessary for every reference bit to be cleared, but it
4687 *	is necessary that 0 only be returned when there are truly no
4688 *	reference bits set.
4689 *
4690 *	XXX: The exact number of bits to check and clear is a matter that
4691 *	should be tested and standardized at some point in the future for
4692 *	optimal aging of shared pages.
4693 */
4694int
4695pmap_ts_referenced(vm_page_t m)
4696{
4697	struct md_page *pvh;
4698	pv_entry_t pv, pvf, pvn;
4699	pmap_t pmap;
4700	pd_entry_t oldpde, *pde;
4701	pt_entry_t *pte;
4702	vm_offset_t va;
4703	int rtval = 0;
4704
4705	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4706	    ("pmap_ts_referenced: page %p is not managed", m));
4707	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4708	rw_wlock(&pvh_global_lock);
4709	sched_pin();
4710	if ((m->flags & PG_FICTITIOUS) != 0)
4711		goto small_mappings;
4712	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4713		va = pv->pv_va;
4714		pmap = PV_PMAP(pv);
4715		PMAP_LOCK(pmap);
4716		pde = pmap_pde(pmap, va);
4717		oldpde = *pde;
4718		if ((oldpde & PG_A) != 0) {
4719			if (pmap_demote_pde(pmap, pde, va)) {
4720				if ((oldpde & PG_W) == 0) {
4721					/*
4722					 * Remove the mapping to a single page
4723					 * so that a subsequent access may
4724					 * repromote.  Since the underlying
4725					 * page table page is fully populated,
4726					 * this removal never frees a page
4727					 * table page.
4728					 */
4729					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4730					    PG_PS_FRAME);
4731					pmap_remove_page(pmap, va, NULL);
4732					rtval++;
4733					if (rtval > 4) {
4734						PMAP_UNLOCK(pmap);
4735						goto out;
4736					}
4737				}
4738			}
4739		}
4740		PMAP_UNLOCK(pmap);
4741	}
4742small_mappings:
4743	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4744		pvf = pv;
4745		do {
4746			pvn = TAILQ_NEXT(pv, pv_list);
4747			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4748			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4749			pmap = PV_PMAP(pv);
4750			PMAP_LOCK(pmap);
4751			pde = pmap_pde(pmap, pv->pv_va);
4752			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4753			    " found a 4mpage in page %p's pv list", m));
4754			pte = pmap_pte_quick(pmap, pv->pv_va);
4755			if ((*pte & PG_A) != 0) {
4756				atomic_clear_int((u_int *)pte, PG_A);
4757				pmap_invalidate_page(pmap, pv->pv_va);
4758				rtval++;
4759				if (rtval > 4)
4760					pvn = NULL;
4761			}
4762			PMAP_UNLOCK(pmap);
4763		} while ((pv = pvn) != NULL && pv != pvf);
4764	}
4765out:
4766	sched_unpin();
4767	rw_wunlock(&pvh_global_lock);
4768	return (rtval);
4769}
4770
4771/*
4772 *	Clear the modify bits on the specified physical page.
4773 */
4774void
4775pmap_clear_modify(vm_page_t m)
4776{
4777	struct md_page *pvh;
4778	pv_entry_t next_pv, pv;
4779	pmap_t pmap;
4780	pd_entry_t oldpde, *pde;
4781	pt_entry_t oldpte, *pte;
4782	vm_offset_t va;
4783
4784	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4785	    ("pmap_clear_modify: page %p is not managed", m));
4786	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4787	KASSERT((m->oflags & VPO_BUSY) == 0,
4788	    ("pmap_clear_modify: page %p is busy", m));
4789
4790	/*
4791	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4792	 * If the object containing the page is locked and the page is not
4793	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4794	 */
4795	if ((m->aflags & PGA_WRITEABLE) == 0)
4796		return;
4797	rw_wlock(&pvh_global_lock);
4798	sched_pin();
4799	if ((m->flags & PG_FICTITIOUS) != 0)
4800		goto small_mappings;
4801	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4802	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4803		va = pv->pv_va;
4804		pmap = PV_PMAP(pv);
4805		PMAP_LOCK(pmap);
4806		pde = pmap_pde(pmap, va);
4807		oldpde = *pde;
4808		if ((oldpde & PG_RW) != 0) {
4809			if (pmap_demote_pde(pmap, pde, va)) {
4810				if ((oldpde & PG_W) == 0) {
4811					/*
4812					 * Write protect the mapping to a
4813					 * single page so that a subsequent
4814					 * write access may repromote.
4815					 */
4816					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4817					    PG_PS_FRAME);
4818					pte = pmap_pte_quick(pmap, va);
4819					oldpte = *pte;
4820					if ((oldpte & PG_V) != 0) {
4821						/*
4822						 * Regardless of whether a pte is 32 or 64 bits
4823						 * in size, PG_RW and PG_M are among the least
4824						 * significant 32 bits.
4825						 */
4826						while (!atomic_cmpset_int((u_int *)pte,
4827						    oldpte,
4828						    oldpte & ~(PG_M | PG_RW)))
4829							oldpte = *pte;
4830						vm_page_dirty(m);
4831						pmap_invalidate_page(pmap, va);
4832					}
4833				}
4834			}
4835		}
4836		PMAP_UNLOCK(pmap);
4837	}
4838small_mappings:
4839	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4840		pmap = PV_PMAP(pv);
4841		PMAP_LOCK(pmap);
4842		pde = pmap_pde(pmap, pv->pv_va);
4843		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4844		    " a 4mpage in page %p's pv list", m));
4845		pte = pmap_pte_quick(pmap, pv->pv_va);
4846		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4847			/*
4848			 * Regardless of whether a pte is 32 or 64 bits
4849			 * in size, PG_M is among the least significant
4850			 * 32 bits.
4851			 */
4852			atomic_clear_int((u_int *)pte, PG_M);
4853			pmap_invalidate_page(pmap, pv->pv_va);
4854		}
4855		PMAP_UNLOCK(pmap);
4856	}
4857	sched_unpin();
4858	rw_wunlock(&pvh_global_lock);
4859}
4860
4861/*
4862 *	pmap_clear_reference:
4863 *
4864 *	Clear the reference bit on the specified physical page.
4865 */
4866void
4867pmap_clear_reference(vm_page_t m)
4868{
4869	struct md_page *pvh;
4870	pv_entry_t next_pv, pv;
4871	pmap_t pmap;
4872	pd_entry_t oldpde, *pde;
4873	pt_entry_t *pte;
4874	vm_offset_t va;
4875
4876	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4877	    ("pmap_clear_reference: page %p is not managed", m));
4878	rw_wlock(&pvh_global_lock);
4879	sched_pin();
4880	if ((m->flags & PG_FICTITIOUS) != 0)
4881		goto small_mappings;
4882	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4883	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4884		va = pv->pv_va;
4885		pmap = PV_PMAP(pv);
4886		PMAP_LOCK(pmap);
4887		pde = pmap_pde(pmap, va);
4888		oldpde = *pde;
4889		if ((oldpde & PG_A) != 0) {
4890			if (pmap_demote_pde(pmap, pde, va)) {
4891				/*
4892				 * Remove the mapping to a single page so
4893				 * that a subsequent access may repromote.
4894				 * Since the underlying page table page is
4895				 * fully populated, this removal never frees
4896				 * a page table page.
4897				 */
4898				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4899				    PG_PS_FRAME);
4900				pmap_remove_page(pmap, va, NULL);
4901			}
4902		}
4903		PMAP_UNLOCK(pmap);
4904	}
4905small_mappings:
4906	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4907		pmap = PV_PMAP(pv);
4908		PMAP_LOCK(pmap);
4909		pde = pmap_pde(pmap, pv->pv_va);
4910		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4911		    " a 4mpage in page %p's pv list", m));
4912		pte = pmap_pte_quick(pmap, pv->pv_va);
4913		if ((*pte & PG_A) != 0) {
4914			/*
4915			 * Regardless of whether a pte is 32 or 64 bits
4916			 * in size, PG_A is among the least significant
4917			 * 32 bits.
4918			 */
4919			atomic_clear_int((u_int *)pte, PG_A);
4920			pmap_invalidate_page(pmap, pv->pv_va);
4921		}
4922		PMAP_UNLOCK(pmap);
4923	}
4924	sched_unpin();
4925	rw_wunlock(&pvh_global_lock);
4926}
4927
4928/*
4929 * Miscellaneous support routines follow
4930 */
4931
4932/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4933static __inline void
4934pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4935{
4936	u_int opte, npte;
4937
4938	/*
4939	 * The cache mode bits are all in the low 32-bits of the
4940	 * PTE, so we can just spin on updating the low 32-bits.
4941	 */
4942	do {
4943		opte = *(u_int *)pte;
4944		npte = opte & ~PG_PTE_CACHE;
4945		npte |= cache_bits;
4946	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4947}
4948
4949/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4950static __inline void
4951pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4952{
4953	u_int opde, npde;
4954
4955	/*
4956	 * The cache mode bits are all in the low 32-bits of the
4957	 * PDE, so we can just spin on updating the low 32-bits.
4958	 */
4959	do {
4960		opde = *(u_int *)pde;
4961		npde = opde & ~PG_PDE_CACHE;
4962		npde |= cache_bits;
4963	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4964}
4965
4966/*
4967 * Map a set of physical memory pages into the kernel virtual
4968 * address space. Return a pointer to where it is mapped. This
4969 * routine is intended to be used for mapping device memory,
4970 * NOT real memory.
4971 */
4972void *
4973pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4974{
4975	vm_offset_t va, offset;
4976	vm_size_t tmpsize;
4977
4978	offset = pa & PAGE_MASK;
4979	size = roundup(offset + size, PAGE_SIZE);
4980	pa = pa & PG_FRAME;
4981
4982	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4983		va = KERNBASE + pa;
4984	else
4985		va = kmem_alloc_nofault(kernel_map, size);
4986	if (!va)
4987		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4988
4989	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4990		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4991	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4992	pmap_invalidate_cache_range(va, va + size);
4993	return ((void *)(va + offset));
4994}
4995
4996void *
4997pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4998{
4999
5000	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5001}
5002
5003void *
5004pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5005{
5006
5007	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5008}
5009
5010void
5011pmap_unmapdev(vm_offset_t va, vm_size_t size)
5012{
5013	vm_offset_t base, offset, tmpva;
5014
5015	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5016		return;
5017	base = trunc_page(va);
5018	offset = va & PAGE_MASK;
5019	size = roundup(offset + size, PAGE_SIZE);
5020	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
5021		pmap_kremove(tmpva);
5022	pmap_invalidate_range(kernel_pmap, va, tmpva);
5023	kmem_free(kernel_map, base, size);
5024}
5025
5026/*
5027 * Sets the memory attribute for the specified page.
5028 */
5029void
5030pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5031{
5032
5033	m->md.pat_mode = ma;
5034	if ((m->flags & PG_FICTITIOUS) != 0)
5035		return;
5036
5037	/*
5038	 * If "m" is a normal page, flush it from the cache.
5039	 * See pmap_invalidate_cache_range().
5040	 *
5041	 * First, try to find an existing mapping of the page by sf
5042	 * buffer. sf_buf_invalidate_cache() modifies mapping and
5043	 * flushes the cache.
5044	 */
5045	if (sf_buf_invalidate_cache(m))
5046		return;
5047
5048	/*
5049	 * If page is not mapped by sf buffer, but CPU does not
5050	 * support self snoop, map the page transient and do
5051	 * invalidation. In the worst case, whole cache is flushed by
5052	 * pmap_invalidate_cache_range().
5053	 */
5054	if ((cpu_feature & CPUID_SS) == 0)
5055		pmap_flush_page(m);
5056}
5057
5058static void
5059pmap_flush_page(vm_page_t m)
5060{
5061	struct sysmaps *sysmaps;
5062	vm_offset_t sva, eva;
5063
5064	if ((cpu_feature & CPUID_CLFSH) != 0) {
5065		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5066		mtx_lock(&sysmaps->lock);
5067		if (*sysmaps->CMAP2)
5068			panic("pmap_flush_page: CMAP2 busy");
5069		sched_pin();
5070		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5071		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5072		invlcaddr(sysmaps->CADDR2);
5073		sva = (vm_offset_t)sysmaps->CADDR2;
5074		eva = sva + PAGE_SIZE;
5075
5076		/*
5077		 * Use mfence despite the ordering implied by
5078		 * mtx_{un,}lock() because clflush is not guaranteed
5079		 * to be ordered by any other instruction.
5080		 */
5081		mfence();
5082		for (; sva < eva; sva += cpu_clflush_line_size)
5083			clflush(sva);
5084		mfence();
5085		*sysmaps->CMAP2 = 0;
5086		sched_unpin();
5087		mtx_unlock(&sysmaps->lock);
5088	} else
5089		pmap_invalidate_cache();
5090}
5091
5092/*
5093 * Changes the specified virtual address range's memory type to that given by
5094 * the parameter "mode".  The specified virtual address range must be
5095 * completely contained within either the kernel map.
5096 *
5097 * Returns zero if the change completed successfully, and either EINVAL or
5098 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5099 * of the virtual address range was not mapped, and ENOMEM is returned if
5100 * there was insufficient memory available to complete the change.
5101 */
5102int
5103pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5104{
5105	vm_offset_t base, offset, tmpva;
5106	pd_entry_t *pde;
5107	pt_entry_t *pte;
5108	int cache_bits_pte, cache_bits_pde;
5109	boolean_t changed;
5110
5111	base = trunc_page(va);
5112	offset = va & PAGE_MASK;
5113	size = roundup(offset + size, PAGE_SIZE);
5114
5115	/*
5116	 * Only supported on kernel virtual addresses above the recursive map.
5117	 */
5118	if (base < VM_MIN_KERNEL_ADDRESS)
5119		return (EINVAL);
5120
5121	cache_bits_pde = pmap_cache_bits(mode, 1);
5122	cache_bits_pte = pmap_cache_bits(mode, 0);
5123	changed = FALSE;
5124
5125	/*
5126	 * Pages that aren't mapped aren't supported.  Also break down
5127	 * 2/4MB pages into 4KB pages if required.
5128	 */
5129	PMAP_LOCK(kernel_pmap);
5130	for (tmpva = base; tmpva < base + size; ) {
5131		pde = pmap_pde(kernel_pmap, tmpva);
5132		if (*pde == 0) {
5133			PMAP_UNLOCK(kernel_pmap);
5134			return (EINVAL);
5135		}
5136		if (*pde & PG_PS) {
5137			/*
5138			 * If the current 2/4MB page already has
5139			 * the required memory type, then we need not
5140			 * demote this page.  Just increment tmpva to
5141			 * the next 2/4MB page frame.
5142			 */
5143			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5144				tmpva = trunc_4mpage(tmpva) + NBPDR;
5145				continue;
5146			}
5147
5148			/*
5149			 * If the current offset aligns with a 2/4MB
5150			 * page frame and there is at least 2/4MB left
5151			 * within the range, then we need not break
5152			 * down this page into 4KB pages.
5153			 */
5154			if ((tmpva & PDRMASK) == 0 &&
5155			    tmpva + PDRMASK < base + size) {
5156				tmpva += NBPDR;
5157				continue;
5158			}
5159			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5160				PMAP_UNLOCK(kernel_pmap);
5161				return (ENOMEM);
5162			}
5163		}
5164		pte = vtopte(tmpva);
5165		if (*pte == 0) {
5166			PMAP_UNLOCK(kernel_pmap);
5167			return (EINVAL);
5168		}
5169		tmpva += PAGE_SIZE;
5170	}
5171	PMAP_UNLOCK(kernel_pmap);
5172
5173	/*
5174	 * Ok, all the pages exist, so run through them updating their
5175	 * cache mode if required.
5176	 */
5177	for (tmpva = base; tmpva < base + size; ) {
5178		pde = pmap_pde(kernel_pmap, tmpva);
5179		if (*pde & PG_PS) {
5180			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5181				pmap_pde_attr(pde, cache_bits_pde);
5182				changed = TRUE;
5183			}
5184			tmpva = trunc_4mpage(tmpva) + NBPDR;
5185		} else {
5186			pte = vtopte(tmpva);
5187			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5188				pmap_pte_attr(pte, cache_bits_pte);
5189				changed = TRUE;
5190			}
5191			tmpva += PAGE_SIZE;
5192		}
5193	}
5194
5195	/*
5196	 * Flush CPU caches to make sure any data isn't cached that
5197	 * shouldn't be, etc.
5198	 */
5199	if (changed) {
5200		pmap_invalidate_range(kernel_pmap, base, tmpva);
5201		pmap_invalidate_cache_range(base, tmpva);
5202	}
5203	return (0);
5204}
5205
5206/*
5207 * perform the pmap work for mincore
5208 */
5209int
5210pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5211{
5212	pd_entry_t *pdep;
5213	pt_entry_t *ptep, pte;
5214	vm_paddr_t pa;
5215	int val;
5216
5217	PMAP_LOCK(pmap);
5218retry:
5219	pdep = pmap_pde(pmap, addr);
5220	if (*pdep != 0) {
5221		if (*pdep & PG_PS) {
5222			pte = *pdep;
5223			/* Compute the physical address of the 4KB page. */
5224			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5225			    PG_FRAME;
5226			val = MINCORE_SUPER;
5227		} else {
5228			ptep = pmap_pte(pmap, addr);
5229			pte = *ptep;
5230			pmap_pte_release(ptep);
5231			pa = pte & PG_FRAME;
5232			val = 0;
5233		}
5234	} else {
5235		pte = 0;
5236		pa = 0;
5237		val = 0;
5238	}
5239	if ((pte & PG_V) != 0) {
5240		val |= MINCORE_INCORE;
5241		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5242			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5243		if ((pte & PG_A) != 0)
5244			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5245	}
5246	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5247	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5248	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5249		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5250		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5251			goto retry;
5252	} else
5253		PA_UNLOCK_COND(*locked_pa);
5254	PMAP_UNLOCK(pmap);
5255	return (val);
5256}
5257
5258void
5259pmap_activate(struct thread *td)
5260{
5261	pmap_t	pmap, oldpmap;
5262	u_int	cpuid;
5263	u_int32_t  cr3;
5264
5265	critical_enter();
5266	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5267	oldpmap = PCPU_GET(curpmap);
5268	cpuid = PCPU_GET(cpuid);
5269#if defined(SMP)
5270	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5271	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5272#else
5273	CPU_CLR(cpuid, &oldpmap->pm_active);
5274	CPU_SET(cpuid, &pmap->pm_active);
5275#endif
5276#ifdef PAE
5277	cr3 = vtophys(pmap->pm_pdpt);
5278#else
5279	cr3 = vtophys(pmap->pm_pdir);
5280#endif
5281	/*
5282	 * pmap_activate is for the current thread on the current cpu
5283	 */
5284	td->td_pcb->pcb_cr3 = cr3;
5285	load_cr3(cr3);
5286	PCPU_SET(curpmap, pmap);
5287	critical_exit();
5288}
5289
5290void
5291pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5292{
5293}
5294
5295/*
5296 *	Increase the starting virtual address of the given mapping if a
5297 *	different alignment might result in more superpage mappings.
5298 */
5299void
5300pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5301    vm_offset_t *addr, vm_size_t size)
5302{
5303	vm_offset_t superpage_offset;
5304
5305	if (size < NBPDR)
5306		return;
5307	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5308		offset += ptoa(object->pg_color);
5309	superpage_offset = offset & PDRMASK;
5310	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5311	    (*addr & PDRMASK) == superpage_offset)
5312		return;
5313	if ((*addr & PDRMASK) < superpage_offset)
5314		*addr = (*addr & ~PDRMASK) + superpage_offset;
5315	else
5316		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5317}
5318
5319
5320#if defined(PMAP_DEBUG)
5321pmap_pid_dump(int pid)
5322{
5323	pmap_t pmap;
5324	struct proc *p;
5325	int npte = 0;
5326	int index;
5327
5328	sx_slock(&allproc_lock);
5329	FOREACH_PROC_IN_SYSTEM(p) {
5330		if (p->p_pid != pid)
5331			continue;
5332
5333		if (p->p_vmspace) {
5334			int i,j;
5335			index = 0;
5336			pmap = vmspace_pmap(p->p_vmspace);
5337			for (i = 0; i < NPDEPTD; i++) {
5338				pd_entry_t *pde;
5339				pt_entry_t *pte;
5340				vm_offset_t base = i << PDRSHIFT;
5341
5342				pde = &pmap->pm_pdir[i];
5343				if (pde && pmap_pde_v(pde)) {
5344					for (j = 0; j < NPTEPG; j++) {
5345						vm_offset_t va = base + (j << PAGE_SHIFT);
5346						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5347							if (index) {
5348								index = 0;
5349								printf("\n");
5350							}
5351							sx_sunlock(&allproc_lock);
5352							return (npte);
5353						}
5354						pte = pmap_pte(pmap, va);
5355						if (pte && pmap_pte_v(pte)) {
5356							pt_entry_t pa;
5357							vm_page_t m;
5358							pa = *pte;
5359							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5360							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5361								va, pa, m->hold_count, m->wire_count, m->flags);
5362							npte++;
5363							index++;
5364							if (index >= 2) {
5365								index = 0;
5366								printf("\n");
5367							} else {
5368								printf(" ");
5369							}
5370						}
5371					}
5372				}
5373			}
5374		}
5375	}
5376	sx_sunlock(&allproc_lock);
5377	return (npte);
5378}
5379#endif
5380
5381#if defined(DEBUG)
5382
5383static void	pads(pmap_t pm);
5384void		pmap_pvdump(vm_paddr_t pa);
5385
5386/* print address space of pmap*/
5387static void
5388pads(pmap_t pm)
5389{
5390	int i, j;
5391	vm_paddr_t va;
5392	pt_entry_t *ptep;
5393
5394	if (pm == kernel_pmap)
5395		return;
5396	for (i = 0; i < NPDEPTD; i++)
5397		if (pm->pm_pdir[i])
5398			for (j = 0; j < NPTEPG; j++) {
5399				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5400				if (pm == kernel_pmap && va < KERNBASE)
5401					continue;
5402				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5403					continue;
5404				ptep = pmap_pte(pm, va);
5405				if (pmap_pte_v(ptep))
5406					printf("%x:%x ", va, *ptep);
5407			};
5408
5409}
5410
5411void
5412pmap_pvdump(vm_paddr_t pa)
5413{
5414	pv_entry_t pv;
5415	pmap_t pmap;
5416	vm_page_t m;
5417
5418	printf("pa %x", pa);
5419	m = PHYS_TO_VM_PAGE(pa);
5420	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5421		pmap = PV_PMAP(pv);
5422		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5423		pads(pmap);
5424	}
5425	printf(" ");
5426}
5427#endif
5428