/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/9/sys/i386/i386/pmap.c 237952 2012-07-02 05:57:44Z alc $");

/*
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidations expensive,
 *	this module may delay invalidation or protection-reduction
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and when physical maps must be made correct.
 */

#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#else
#include <sys/cpuset.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif

#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
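
/*
 * Note (illustrative): pa_index() selects the 2/4MB superpage that contains
 * the physical address "pa" (PDRSHIFT is 22 without PAE and 21 with PAE),
 * and pa_to_pvh() returns that superpage's pv header in pv_table.  For
 * example, on a non-PAE kernel pa_to_pvh(0x00801234) and pa_to_pvh(0x00900000)
 * name the same md_page, because both addresses fall within the 4MB frame
 * starting at 0x00800000.
 */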

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt = NKPT;
vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
extern u_int32_t KERNend;
extern u_int32_t KPTphys;

#ifdef PAE
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#endif

SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
int pv_maxchunks;			/* How many chunks we have KVA for */
vm_offset_t pv_vafree;			/* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
struct sysmaps {
	struct	mtx lock;
	pt_entry_t *CMAP1;
	pt_entry_t *CMAP2;
	caddr_t	CADDR1;
	caddr_t	CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
pt_entry_t *CMAP1 = 0;
static pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
caddr_t CADDR1 = 0, ptvmmap = 0;
static caddr_t CADDR3;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
	   &PMAP1changedcpu, 0,
	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
	   &PMAP1changed, 0,
	   "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
	   &PMAP1unchanged, 0,
	   "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    vm_page_t *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    vm_page_t *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
					vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif
static void pmap_set_pg(void);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * If you get an error here, then you set KVA_PAGES wrong! See the
 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
 */
CTASSERT(KERNBASE % (1 << 24) == 0);

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On the i386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct sysmaps *sysmaps;
	int i;

	/*
	 * Initialize the first available kernel virtual address.  However,
	 * using "firstaddr" may waste a few pages of the kernel virtual
	 * address space, because locore may not have mapped every physical
	 * page that it allocated.  Preferably, locore would provide a first
	 * unused virtual address in addition to "firstaddr".
	 */
	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#ifdef PAE
	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
	kernel_pmap->pm_root = NULL;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	LIST_INIT(&allpmaps);

	/*
	 * Request a spin mutex so that changes to allpmaps cannot be
	 * preempted by smp_rendezvous_cpus().  Otherwise,
	 * pmap_update_pde_kernel() could access allpmaps while it is
	 * being changed.
	 */
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
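	/*
	 * Illustrative note: each SYSMAP() invocation hands out "n" pages of
	 * the VA cursor "va" and the matching PTE slots from "pte".  For
	 * example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below sets CADDR1 to the
	 * current va, points CMAP1 at that va's PTE, and advances both
	 * cursors by one page.
	 */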

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	for (i = 0; i < MAXCPU; i++) {
		sysmaps = &sysmaps_pcpu[i];
		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
	}
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP3, CADDR3, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by locore.  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

	/*
	 * Adjust the start of the KPTD and KPTmap so that the implementation
	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
	 */
	KPTD -= KPTDI;
	KPTmap -= i386_btop(KPTDI << PDRSHIFT);

	/*
	 * ptemap is used for pmap_pte_quick
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
	 * physical memory region that is used by the ACPI wakeup code.  This
	 * mapping must not have PG_G set.
	 */
#ifdef XBOX
	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
	 * an early stage, we cannot yet neatly map video memory ... :-(
	 * Better fixes are very welcome! */
	if (!arch_i386_is_xbox)
#endif
	for (i = 1; i < NKPT; i++)
		PTD[i] = 0;

	/* Initialize the PAT MSR if present. */
	pmap_init_pat();

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}
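
/*
 * Illustrative summary of the result above: when pat_works is set, the PAT
 * ends up as index 0..7 = WB, WT, UC-, UC, WB, WP, WC, UC; otherwise only
 * index 2 is changed, giving WB, WT, WC, UC, WB, WT, UC-, UC.
 */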

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
static void
pmap_set_pg(void)
{
	pt_entry_t *pte;
	vm_offset_t va, endva;

	if (pgeflag == 0)
		return;

	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va < endva) {
			pdir_pde(PTD, va) |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += PAGE_SIZE;
		}
	}
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

#ifdef PAE
static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
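/*
 * Illustrative note: the values threaded through the PTE slots below are
 * page-aligned kernel virtual addresses, so their low 12 bits are zero and
 * PG_V (bit 0) can never appear in a free-list entry; this is why the
 * "deal with pages" requirement above is sufficient to keep PG_V clear.
 */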
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		return (va);	/* Out of memory */
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}

static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}
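
/*
 * Example (illustrative): pmap_ptelist_init(&head, base, 3) pushes the pages
 * in reverse order, leaving head = base, with base's PTE slot holding
 * base + PAGE_SIZE, whose slot holds base + 2 * PAGE_SIZE, whose slot holds 0
 * (the end-of-list marker checked by pmap_ptelist_alloc()).
 */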


/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running in a virtual machine on an AMD Family 10h
	 * processor, then it must assume that MCA is enabled by the virtual
	 * machine monitor.
	 */
	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x10)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
	    PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#ifdef PAE
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif
}


SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
	"Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
	"Page share factor per proc");

SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}
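
/*
 * Example (illustrative): with a fully functional PAT, pmap_init_pat() maps
 * PAT_WRITE_COMBINING to index 6 (binary 110), so pmap_cache_bits() returns
 * PG_PTE_PAT | PG_NC_PCD for a PTE (or PG_PDE_PAT | PG_NC_PCD for a PDE); on
 * a CPU without PAT the mode stays at index 3 (011), yielding
 * PG_NC_PCD | PG_NC_PWT.
 */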

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;
	pmap_t pmap;
	boolean_t PTD_updated;

	PTD_updated = FALSE;
	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
		    PG_FRAME))
			PTD_updated = TRUE;
		pde = pmap_pde(pmap, va);
		pde_store(pde, newpde);
	}
	mtx_unlock_spin(&allpmaps_lock);
	KASSERT(PTD_updated,
	    ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		smp_invltlb();
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};
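
/*
 * Illustrative note: during the smp_rendezvous_cpus() call in
 * pmap_update_pde(), only the CPU named by pde_action.store writes the new
 * PDE, while every CPU in pde_action.invalidate flushes its own TLB in the
 * teardown step, so no processor can be left holding a stale mapping after
 * the rendezvous completes.
 */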

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;
	pmap_t pmap;

	if (act->store == PCPU_GET(cpuid)) {

		/*
		 * Elsewhere, this operation requires allpmaps_lock for
		 * synchronization.  Here, it does not because it is being
		 * performed in the context of an all_cpus rendezvous.
		 */
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, act->va);
			pde_store(pde, act->newpde);
		}
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS)
		; /* If "Self Snoop" is supported, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {

		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates the flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by the CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}
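
/*
 * Example (illustrative): flushing a 64KB, page-aligned buffer on a CPU with
 * a 64-byte cache line issues 64KB / 64 = 1024 clflush instructions between
 * the two mfences; a range of 2MB or more falls through to the wbinvd-based
 * pmap_invalidate_cache() instead.
 */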
1196151497Sru
1197151497Sruvoid
1198151497Srupmap_invalidate_cache_pages(vm_page_t *pages, int count)
1199151497Sru{
1200151497Sru	int i;
1201151497Sru
1202151497Sru	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1203151497Sru	    (cpu_feature & CPUID_CLFSH) == 0) {
1204151497Sru		pmap_invalidate_cache();
1205151497Sru	} else {
1206151497Sru		for (i = 0; i < count; i++)
1207151497Sru			pmap_flush_page(pages[i]);
1208151497Sru	}
1209151497Sru}
1210151497Sru
1211151497Sru/*
1212151497Sru * Are we current address space or kernel?  N.B. We return FALSE when
1213151497Sru * a pmap's page table is in use because a kernel thread is borrowing
1214151497Sru * it.  The borrowed page table can change spontaneously, making any
1215151497Sru * dependence on its continued use subject to a race condition.
1216151497Sru */
1217151497Srustatic __inline int
1218151497Srupmap_is_current(pmap_t pmap)
1219151497Sru{
1220151497Sru
1221151497Sru	return (pmap == kernel_pmap ||
1222151497Sru	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1223151497Sru	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1224151497Sru}
1225151497Sru
1226151497Sru/*
1227151497Sru * If the given pmap is not the current or kernel pmap, the returned pte must
1228151497Sru * be released by passing it to pmap_pte_release().
1229151497Sru */
1230151497Srupt_entry_t *
1231151497Srupmap_pte(pmap_t pmap, vm_offset_t va)
1232151497Sru{
1233151497Sru	pd_entry_t newpf;
1234151497Sru	pd_entry_t *pde;
1235151497Sru
1236151497Sru	pde = pmap_pde(pmap, va);
1237151497Sru	if (*pde & PG_PS)
1238151497Sru		return (pde);
1239151497Sru	if (*pde != 0) {
1240151497Sru		/* are we current address space or kernel? */
1241151497Sru		if (pmap_is_current(pmap))
1242151497Sru			return (vtopte(va));
1243151497Sru		mtx_lock(&PMAP2mutex);
1244151497Sru		newpf = *pde & PG_FRAME;
1245151497Sru		if ((*PMAP2 & PG_FRAME) != newpf) {
1246151497Sru			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1247151497Sru			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1248151497Sru		}
1249151497Sru		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1250151497Sru	}
1251151497Sru	return (NULL);
1252151497Sru}
1253151497Sru
1254151497Sru/*
1255151497Sru * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1256151497Sru * being NULL.
1257151497Sru */
1258151497Srustatic __inline void
1259151497Srupmap_pte_release(pt_entry_t *pte)
1260151497Sru{
1261151497Sru
1262151497Sru	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1263151497Sru		mtx_unlock(&PMAP2mutex);
1264151497Sru}
1265151497Sru
1266151497Srustatic __inline void
1267151497Sruinvlcaddr(void *caddr)
1268151497Sru{
1269151497Sru
1270151497Sru	invlpg((u_int)caddr);
1271151497Sru}
1272151497Sru
1273151497Sru/*
1274151497Sru * Super fast pmap_pte routine best used when scanning
1275151497Sru * the pv lists.  This eliminates many coarse-grained
1276151497Sru * invltlb calls.  Note that many of the pv list
1277151497Sru * scans are across different pmaps.  It is very wasteful
1278151497Sru * to do an entire invltlb for checking a single mapping.
1279151497Sru *
1280151497Sru * If the given pmap is not the current pmap, vm_page_queue_mtx
1281151497Sru * must be held and curthread pinned to a CPU.
1282151497Sru */
1283151497Srustatic pt_entry_t *
1284151497Srupmap_pte_quick(pmap_t pmap, vm_offset_t va)
1285151497Sru{
1286151497Sru	pd_entry_t newpf;
1287151497Sru	pd_entry_t *pde;
1288151497Sru
1289151497Sru	pde = pmap_pde(pmap, va);
1290151497Sru	if (*pde & PG_PS)
1291151497Sru		return (pde);
1292151497Sru	if (*pde != 0) {
1293151497Sru		/* are we current address space or kernel? */
1294151497Sru		if (pmap_is_current(pmap))
1295151497Sru			return (vtopte(va));
1296151497Sru		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1297151497Sru		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1298151497Sru		newpf = *pde & PG_FRAME;
1299151497Sru		if ((*PMAP1 & PG_FRAME) != newpf) {
1300151497Sru			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1301151497Sru#ifdef SMP
1302151497Sru			PMAP1cpu = PCPU_GET(cpuid);
1303151497Sru#endif
1304151497Sru			invlcaddr(PADDR1);
1305151497Sru			PMAP1changed++;
1306151497Sru		} else
1307151497Sru#ifdef SMP
1308151497Sru		if (PMAP1cpu != PCPU_GET(cpuid)) {
1309151497Sru			PMAP1cpu = PCPU_GET(cpuid);
1310151497Sru			invlcaddr(PADDR1);
1311151497Sru			PMAP1changedcpu++;
1312151497Sru		} else
1313151497Sru#endif
1314151497Sru			PMAP1unchanged++;
1315151497Sru		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1316151497Sru	}
1317151497Sru	return (0);
1318151497Sru}
1319151497Sru
1320151497Sru/*
1321151497Sru *	Routine:	pmap_extract
1322151497Sru *	Function:
1323151497Sru *		Extract the physical page address associated
1324151497Sru *		with the given map/virtual_address pair.
1325151497Sru */
1326151497Sruvm_paddr_t
1327151497Srupmap_extract(pmap_t pmap, vm_offset_t va)
1328151497Sru{
1329151497Sru	vm_paddr_t rtval;
1330151497Sru	pt_entry_t *pte;
1331151497Sru	pd_entry_t pde;
1332151497Sru
1333151497Sru	rtval = 0;
1334151497Sru	PMAP_LOCK(pmap);
1335151497Sru	pde = pmap->pm_pdir[va >> PDRSHIFT];
1336151497Sru	if (pde != 0) {
1337151497Sru		if ((pde & PG_PS) != 0)
1338151497Sru			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1339151497Sru		else {
1340151497Sru			pte = pmap_pte(pmap, va);
1341151497Sru			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1342151497Sru			pmap_pte_release(pte);
1343151497Sru		}
1344151497Sru	}
1345151497Sru	PMAP_UNLOCK(pmap);
1346151497Sru	return (rtval);
1347151497Sru}
1348151497Sru
1349151497Sru/*
1350151497Sru *	Routine:	pmap_extract_and_hold
1351151497Sru *	Function:
1352151497Sru *		Atomically extract and hold the physical page
1353151497Sru *		with the given pmap and virtual address pair
1354151497Sru *		if that mapping permits the given protection.
1355151497Sru */
1356151497Sruvm_page_t
1357151497Srupmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1358151497Sru{
1359151497Sru	pd_entry_t pde;
1360151497Sru	pt_entry_t pte, *ptep;
1361151497Sru	vm_page_t m;
1362151497Sru	vm_paddr_t pa;
1363151497Sru
1364151497Sru	pa = 0;
1365151497Sru	m = NULL;
1366151497Sru	PMAP_LOCK(pmap);
1367151497Sruretry:
1368151497Sru	pde = *pmap_pde(pmap, va);
1369151497Sru	if (pde != 0) {
1370151497Sru		if (pde & PG_PS) {
1371151497Sru			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1372151497Sru				if (vm_page_pa_tryrelock(pmap, (pde &
1373151497Sru				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1374151497Sru					goto retry;
1375151497Sru				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1376151497Sru				    (va & PDRMASK));
1377151497Sru				vm_page_hold(m);
1378151497Sru			}
1379151497Sru		} else {
1380151497Sru			ptep = pmap_pte(pmap, va);
1381151497Sru			pte = *ptep;
1382151497Sru			pmap_pte_release(ptep);
1383151497Sru			if (pte != 0 &&
1384151497Sru			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1385151497Sru				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1386151497Sru				    &pa))
1387151497Sru					goto retry;
1388151497Sru				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1389151497Sru				vm_page_hold(m);
1390151497Sru			}
1391151497Sru		}
1392151497Sru	}
1393151497Sru	PA_UNLOCK_COND(pa);
1394151497Sru	PMAP_UNLOCK(pmap);
1395151497Sru	return (m);
1396151497Sru}
1397151497Sru
1398151497Sru/***************************************************
1399151497Sru * Low level mapping routines.....
1400151497Sru ***************************************************/
1401151497Sru
1402151497Sru/*
1403151497Sru * Add a wired page to the kva.
1404151497Sru * Note: not SMP coherent.
1405151497Sru *
1406151497Sru * This function may be used before pmap_bootstrap() is called.
1407151497Sru */
1408151497SruPMAP_INLINE void
1409151497Srupmap_kenter(vm_offset_t va, vm_paddr_t pa)
1410151497Sru{
1411151497Sru	pt_entry_t *pte;
1412151497Sru
1413151497Sru	pte = vtopte(va);
1414151497Sru	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1415151497Sru}
1416151497Sru
1417151497Srustatic __inline void
1418151497Srupmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1419151497Sru{
1420151497Sru	pt_entry_t *pte;
1421151497Sru
1422151497Sru	pte = vtopte(va);
1423151497Sru	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1424151497Sru}
1425151497Sru
1426151497Sru/*
1427151497Sru * Remove a page from the kernel pagetables.
1428151497Sru * Note: not SMP coherent.
1429151497Sru *
1430151497Sru * This function may be used before pmap_bootstrap() is called.
1431151497Sru */
1432151497SruPMAP_INLINE void
1433151497Srupmap_kremove(vm_offset_t va)
1434151497Sru{
1435151497Sru	pt_entry_t *pte;
1436151497Sru
1437151497Sru	pte = vtopte(va);
1438151497Sru	pte_clear(pte);
1439151497Sru}
1440151497Sru
1441151497Sru/*
1442151497Sru *	Used to map a range of physical addresses into kernel
1443151497Sru *	virtual address space.
1444151497Sru *
1445151497Sru *	The value passed in '*virt' is a suggested virtual address for
1446151497Sru *	the mapping. Architectures which can support a direct-mapped
1447151497Sru *	physical to virtual region can return the appropriate address
1448151497Sru *	within that region, leaving '*virt' unchanged. Other
1449151497Sru *	architectures should map the pages starting at '*virt' and
1450151497Sru *	update '*virt' with the first usable address after the mapped
1451151497Sru *	region.
1452151497Sru */
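/*
 * Usage sketch (illustrative; "kva_cursor" and "first_free_kva" are
 * hypothetical names): an early-boot caller that wants the physical
 * range [start, end) mapped might do
 *
 *	vm_offset_t kva_cursor = first_free_kva;
 *	vm_offset_t mapped = pmap_map(&kva_cursor, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *
 * and then access the range at 'mapped'.  On i386 'kva_cursor' is
 * advanced past the new mappings; a direct-mapped architecture could
 * instead leave it untouched.
 */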
1453151497Sruvm_offset_t
1454151497Srupmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1455151497Sru{
1456151497Sru	vm_offset_t va, sva;
1457151497Sru	vm_paddr_t superpage_offset;
1458151497Sru	pd_entry_t newpde;
1459151497Sru
1460151497Sru	va = *virt;
1461151497Sru	/*
1462151497Sru	 * Does the physical address range's size and alignment permit at
1463151497Sru	 * least one superpage mapping to be created?
1464151497Sru	 */
1465151497Sru	superpage_offset = start & PDRMASK;
1466151497Sru	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1467151497Sru		/*
1468151497Sru		 * Increase the starting virtual address so that its alignment
1469151497Sru		 * does not preclude the use of superpage mappings.
1470151497Sru		 */
1471151497Sru		if ((va & PDRMASK) < superpage_offset)
1472151497Sru			va = (va & ~PDRMASK) + superpage_offset;
1473151497Sru		else if ((va & PDRMASK) > superpage_offset)
1474151497Sru			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1475151497Sru	}
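	/*
	 * Example of the adjustment above (illustrative, non-PAE values,
	 * assuming the range is large enough to hold a full superpage):
	 * with start == 0x10280000, superpage_offset == 0x280000.  If the
	 * caller suggested va == 0xc1000000, then (va & PDRMASK) == 0 is
	 * below superpage_offset, so va becomes 0xc1280000 and the
	 * 4MB-aligned portion of the range can be mapped with superpages.
	 */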
1476151497Sru	sva = va;
1477151497Sru	while (start < end) {
1478151497Sru		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1479151497Sru		    pseflag) {
1480151497Sru			KASSERT((va & PDRMASK) == 0,
1481151497Sru			    ("pmap_map: misaligned va %#x", va));
1482151497Sru			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1483151497Sru			pmap_kenter_pde(va, newpde);
1484151497Sru			va += NBPDR;
1485151497Sru			start += NBPDR;
1486151497Sru		} else {
1487151497Sru			pmap_kenter(va, start);
1488151497Sru			va += PAGE_SIZE;
1489151497Sru			start += PAGE_SIZE;
1490151497Sru		}
1491151497Sru	}
1492151497Sru	pmap_invalidate_range(kernel_pmap, sva, va);
1493151497Sru	*virt = va;
1494151497Sru	return (sva);
1495151497Sru}
1496151497Sru
1497151497Sru
1498151497Sru/*
1499151497Sru * Add a list of wired pages to the kva.
1500151497Sru * This routine is only used for temporary
1501151497Sru * kernel mappings that do not need to have
1502151497Sru * page modification or references recorded.
1503151497Sru * Note that old mappings are simply written
1504151497Sru * over.  The page *must* be wired.
1505151497Sru * Note: SMP coherent.  Uses a ranged shootdown IPI.
1506151497Sru */
1507151497Sruvoid
1508151497Srupmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1509151497Sru{
1510151497Sru	pt_entry_t *endpte, oldpte, pa, *pte;
1511151497Sru	vm_page_t m;
1512151497Sru
1513151497Sru	oldpte = 0;
1514151497Sru	pte = vtopte(sva);
1515151497Sru	endpte = pte + count;
1516151497Sru	while (pte < endpte) {
1517151497Sru		m = *ma++;
1518151497Sru		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1519151497Sru		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1520151497Sru			oldpte |= *pte;
1521151497Sru			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1522151497Sru		}
1523151497Sru		pte++;
1524151497Sru	}
1525151497Sru	if (__predict_false((oldpte & PG_V) != 0))
1526151497Sru		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1527151497Sru		    PAGE_SIZE);
1528151497Sru}
1529151497Sru
1530151497Sru/*
1531151497Sru * This routine tears out page mappings from the
1532151497Sru * kernel -- it is meant only for temporary mappings.
1533151497Sru * Note: SMP coherent.  Uses a ranged shootdown IPI.
1534151497Sru */
1535151497Sruvoid
1536151497Srupmap_qremove(vm_offset_t sva, int count)
1537151497Sru{
1538151497Sru	vm_offset_t va;
1539151497Sru
1540151497Sru	va = sva;
1541151497Sru	while (count-- > 0) {
1542151497Sru		pmap_kremove(va);
1543151497Sru		va += PAGE_SIZE;
1544151497Sru	}
1545151497Sru	pmap_invalidate_range(kernel_pmap, sva, va);
1546151497Sru}
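/*
 * Usage sketch (illustrative; "kva", "ma" and "npages" are hypothetical
 * names): pmap_qenter() and pmap_qremove() are normally used as a pair
 * for short-lived kernel mappings of wired pages, e.g.
 *
 *	pmap_qenter(kva, ma, npages);
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);
 *
 * pmap_qremove() always performs a ranged TLB shootdown, while
 * pmap_qenter() only does so when it overwrites valid entries.
 */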
1547151497Sru
1548151497Sru/***************************************************
1549151497Sru * Page table page management routines.....
1550151497Sru ***************************************************/
1551151497Srustatic __inline void
1552151497Srupmap_free_zero_pages(vm_page_t free)
1553151497Sru{
1554151497Sru	vm_page_t m;
1555151497Sru
1556151497Sru	while (free != NULL) {
1557151497Sru		m = free;
1558151497Sru		free = m->right;
1559151497Sru		/* Preserve the page's PG_ZERO setting. */
1560151497Sru		vm_page_free_toq(m);
1561151497Sru	}
1562151497Sru}
1563151497Sru
1564151497Sru/*
1565151497Sru * Schedule the specified unused page table page to be freed.  Specifically,
1566151497Sru * add the page to the specified list of pages that will be released to the
1567151497Sru * physical memory manager after the TLB has been updated.
1568151497Sru */
1569151497Srustatic __inline void
1570151497Srupmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1571151497Sru{
1572151497Sru
1573151497Sru	if (set_PG_ZERO)
1574151497Sru		m->flags |= PG_ZERO;
1575151497Sru	else
1576151497Sru		m->flags &= ~PG_ZERO;
1577151497Sru	m->right = *free;
1578151497Sru	*free = m;
1579151497Sru}
1580151497Sru
1581151497Sru/*
1582151497Sru * Inserts the specified page table page into the specified pmap's collection
1583151497Sru * of idle page table pages.  Each of a pmap's page table pages is responsible
1584151497Sru * for mapping a distinct range of virtual addresses.  The pmap's collection is
1585151497Sru * ordered by this virtual address range.
1586151497Sru */
1587151497Srustatic void
1588151497Srupmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1589151497Sru{
1590151497Sru	vm_page_t root;
1591151497Sru
1592151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1593151497Sru	root = pmap->pm_root;
1594151497Sru	if (root == NULL) {
1595151497Sru		mpte->left = NULL;
1596151497Sru		mpte->right = NULL;
1597151497Sru	} else {
1598151497Sru		root = vm_page_splay(mpte->pindex, root);
1599151497Sru		if (mpte->pindex < root->pindex) {
1600151497Sru			mpte->left = root->left;
1601151497Sru			mpte->right = root;
1602151497Sru			root->left = NULL;
1603151497Sru		} else if (mpte->pindex == root->pindex)
1604151497Sru			panic("pmap_insert_pt_page: pindex already inserted");
1605151497Sru		else {
1606151497Sru			mpte->right = root->right;
1607151497Sru			mpte->left = root;
1608151497Sru			root->right = NULL;
1609151497Sru		}
1610151497Sru	}
1611151497Sru	pmap->pm_root = mpte;
1612151497Sru}
1613151497Sru
1614151497Sru/*
1615151497Sru * Looks for a page table page mapping the specified virtual address in the
1616151497Sru * specified pmap's collection of idle page table pages.  Returns NULL if there
1617151497Sru * is no page table page corresponding to the specified virtual address.
1618151497Sru */
1619151497Srustatic vm_page_t
1620151497Srupmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1621151497Sru{
1622151497Sru	vm_page_t mpte;
1623151497Sru	vm_pindex_t pindex = va >> PDRSHIFT;
1624151497Sru
1625151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1626151497Sru	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1627151497Sru		mpte = vm_page_splay(pindex, mpte);
1628151497Sru		if ((pmap->pm_root = mpte)->pindex != pindex)
1629151497Sru			mpte = NULL;
1630151497Sru	}
1631151497Sru	return (mpte);
1632151497Sru}
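/*
 * Example (illustrative, non-PAE values where PDRSHIFT == 22): the page
 * table page that maps virtual addresses 0x00800000-0x00bfffff has
 * pindex 2, so pmap_lookup_pt_page(pmap, 0x00a12000) splays pindex 2 to
 * the root of pm_root and returns that page if it is in the collection.
 */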
1633151497Sru
1634151497Sru/*
1635151497Sru * Removes the specified page table page from the specified pmap's collection
1636151497Sru * of idle page table pages.  The specified page table page must be a member of
1637151497Sru * the pmap's collection.
1638151497Sru */
1639151497Srustatic void
1640151497Srupmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1641151497Sru{
1642151497Sru	vm_page_t root;
1643151497Sru
1644151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1645151497Sru	if (mpte != pmap->pm_root)
1646151497Sru		vm_page_splay(mpte->pindex, pmap->pm_root);
1647151497Sru	if (mpte->left == NULL)
1648151497Sru		root = mpte->right;
1649151497Sru	else {
1650151497Sru		root = vm_page_splay(mpte->pindex, mpte->left);
1651151497Sru		root->right = mpte->right;
1652151497Sru	}
1653151497Sru	pmap->pm_root = root;
1654151497Sru}
1655151497Sru
1656151497Sru/*
1657151497Sru * This routine decrements a page table page's wire count, and if the
1658151497Sru * count drops to zero, unmaps and frees the page table page.
1659151497Sru */
1660151497Srustatic __inline int
1661151497Srupmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1662151497Sru{
1663151497Sru
1664151497Sru	--m->wire_count;
1665151497Sru	if (m->wire_count == 0)
1666151497Sru		return (_pmap_unwire_pte_hold(pmap, m, free));
1667151497Sru	else
1668151497Sru		return (0);
1669151497Sru}
1670151497Sru
1671151497Srustatic int
1672151497Sru_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1673151497Sru{
1674151497Sru	vm_offset_t pteva;
1675151497Sru
1676151497Sru	/*
1677151497Sru	 * unmap the page table page
1678151497Sru	 */
1679151497Sru	pmap->pm_pdir[m->pindex] = 0;
1680151497Sru	--pmap->pm_stats.resident_count;
1681151497Sru
1682151497Sru	/*
1683151497Sru	 * This is a release store so that the ordinary store unmapping
1684151497Sru	 * the page table page is globally performed before TLB shoot-
1685151497Sru	 * down is begun.
1686151497Sru	 */
1687151497Sru	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1688151497Sru
1689151497Sru	/*
1690151497Sru	 * Invalidate the recursive mapping of the page table page so
1691151497Sru	 * that the removal takes effect immediately.
1692151497Sru	 */
1693151497Sru	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1694151497Sru	pmap_invalidate_page(pmap, pteva);
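	/*
	 * Note (a sketch, assuming the usual non-PAE layout in which
	 * VM_MAXUSER_ADDRESS == 0xbfc00000 is also the base of the
	 * recursive page table mapping): the page table page with
	 * pindex 2 is visible at pteva == 0xbfc00000 + i386_ptob(2) ==
	 * 0xbfc02000, which is the address invalidated here after its
	 * page directory entry has been cleared.
	 */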
1695151497Sru
1696151497Sru	/*
1697151497Sru	 * Put page on a list so that it is released after
1698151497Sru	 * *ALL* TLB shootdown is done
1699151497Sru	 */
1700151497Sru	pmap_add_delayed_free_list(m, free, TRUE);
1701151497Sru
1702151497Sru	return (1);
1703151497Sru}
1704151497Sru
1705151497Sru/*
1706151497Sru * After removing a page table entry, this routine is used to
1707151497Sru * conditionally free the page table page and manage its wire count.
1708151497Sru */
1709151497Srustatic int
1710151497Srupmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1711151497Sru{
1712151497Sru	pd_entry_t ptepde;
1713151497Sru	vm_page_t mpte;
1714151497Sru
1715151497Sru	if (va >= VM_MAXUSER_ADDRESS)
1716151497Sru		return (0);
1717151497Sru	ptepde = *pmap_pde(pmap, va);
1718151497Sru	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1719151497Sru	return (pmap_unwire_pte_hold(pmap, mpte, free));
1720151497Sru}
1721151497Sru
1722151497Sru/*
1723151497Sru * Initialize the pmap for the swapper process.
1724151497Sru */
1725151497Sruvoid
1726151497Srupmap_pinit0(pmap_t pmap)
1727151497Sru{
1728151497Sru
1729151497Sru	PMAP_LOCK_INIT(pmap);
1730151497Sru	/*
1731151497Sru	 * Since the page table directory is shared with the kernel pmap,
1732151497Sru	 * which is already included in the list "allpmaps", this pmap does
1733151497Sru	 * not need to be inserted into that list.
1734151497Sru	 */
1735151497Sru	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1736151497Sru#ifdef PAE
1737151497Sru	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1738151497Sru#endif
1739151497Sru	pmap->pm_root = NULL;
1740151497Sru	CPU_ZERO(&pmap->pm_active);
1741151497Sru	PCPU_SET(curpmap, pmap);
1742151497Sru	TAILQ_INIT(&pmap->pm_pvchunk);
1743151497Sru	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1744151497Sru}
1745151497Sru
1746151497Sru/*
1747151497Sru * Initialize a preallocated and zeroed pmap structure,
1748151497Sru * such as one in a vmspace structure.
1749151497Sru */
1750151497Sruint
1751151497Srupmap_pinit(pmap_t pmap)
1752151497Sru{
1753151497Sru	vm_page_t m, ptdpg[NPGPTD];
1754151497Sru	vm_paddr_t pa;
1755151497Sru	int i;
1756151497Sru
1757151497Sru	PMAP_LOCK_INIT(pmap);
1758151497Sru
1759151497Sru	/*
1760151497Sru	 * No need to allocate page table space yet but we do need a valid
1761151497Sru	 * page directory table.
1762151497Sru	 */
1763151497Sru	if (pmap->pm_pdir == NULL) {
1764151497Sru		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1765151497Sru		    NBPTD);
1766151497Sru		if (pmap->pm_pdir == NULL) {
1767151497Sru			PMAP_LOCK_DESTROY(pmap);
1768151497Sru			return (0);
1769151497Sru		}
1770151497Sru#ifdef PAE
1771151497Sru		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1772151497Sru		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1773151497Sru		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1774151497Sru		    ("pmap_pinit: pdpt misaligned"));
1775151497Sru		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1776151497Sru		    ("pmap_pinit: pdpt above 4g"));
1777151497Sru#endif
1778151497Sru		pmap->pm_root = NULL;
1779151497Sru	}
1780151497Sru	KASSERT(pmap->pm_root == NULL,
1781151497Sru	    ("pmap_pinit: pmap has reserved page table page(s)"));
1782151497Sru
1783151497Sru	/*
1784151497Sru	 * allocate the page directory page(s)
1785151497Sru	 */
1786151497Sru	for (i = 0; i < NPGPTD;) {
1787151497Sru		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1788151497Sru		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1789151497Sru		if (m == NULL)
1790151497Sru			VM_WAIT;
1791151497Sru		else {
1792151497Sru			ptdpg[i++] = m;
1793151497Sru		}
1794151497Sru	}
1795151497Sru
1796151497Sru	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1797151497Sru
1798151497Sru	for (i = 0; i < NPGPTD; i++)
1799151497Sru		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1800151497Sru			pagezero(pmap->pm_pdir + (i * NPDEPG));
1801151497Sru
1802151497Sru	mtx_lock_spin(&allpmaps_lock);
1803151497Sru	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1804151497Sru	/* Copy the kernel page table directory entries. */
1805151497Sru	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1806151497Sru	mtx_unlock_spin(&allpmaps_lock);
1807151497Sru
1808151497Sru	/* Install the self-referential address mapping entries. */
1809151497Sru	for (i = 0; i < NPGPTD; i++) {
1810151497Sru		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1811151497Sru		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1812151497Sru#ifdef PAE
1813151497Sru		pmap->pm_pdpt[i] = pa | PG_V;
1814151497Sru#endif
1815151497Sru	}
1816151497Sru
1817151497Sru	CPU_ZERO(&pmap->pm_active);
1818151497Sru	TAILQ_INIT(&pmap->pm_pvchunk);
1819151497Sru	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1820151497Sru
1821151497Sru	return (1);
1822151497Sru}
1823151497Sru
1824151497Sru/*
1825151497Sru * This routine is called when the needed page table page is not
1826151497Sru * mapped; it allocates and installs a new page table page.
1827151497Sru */
1828151497Srustatic vm_page_t
1829151497Sru_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
1830151497Sru{
1831151497Sru	vm_paddr_t ptepa;
1832151497Sru	vm_page_t m;
1833151497Sru
1834151497Sru	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1835151497Sru	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1836151497Sru	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1837151497Sru
1838151497Sru	/*
1839151497Sru	 * Allocate a page table page.
1840151497Sru	 */
1841151497Sru	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1842151497Sru	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1843151497Sru		if (flags & M_WAITOK) {
1844151497Sru			PMAP_UNLOCK(pmap);
1845151497Sru			vm_page_unlock_queues();
1846151497Sru			VM_WAIT;
1847151497Sru			vm_page_lock_queues();
1848151497Sru			PMAP_LOCK(pmap);
1849151497Sru		}
1850151497Sru
1851151497Sru		/*
1852151497Sru		 * Indicate the need to retry.  While waiting, the page table
1853151497Sru		 * page may have been allocated.
1854151497Sru		 */
1855151497Sru		return (NULL);
1856151497Sru	}
1857151497Sru	if ((m->flags & PG_ZERO) == 0)
1858151497Sru		pmap_zero_page(m);
1859151497Sru
1860151497Sru	/*
1861151497Sru	 * Map the pagetable page into the process address space, if
1862151497Sru	 * it isn't already there.
1863151497Sru	 */
1864151497Sru
1865151497Sru	pmap->pm_stats.resident_count++;
1866151497Sru
1867151497Sru	ptepa = VM_PAGE_TO_PHYS(m);
1868151497Sru	pmap->pm_pdir[ptepindex] =
1869151497Sru		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1870151497Sru
1871151497Sru	return (m);
1872151497Sru}
1873151497Sru
1874151497Srustatic vm_page_t
1875151497Srupmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1876151497Sru{
1877151497Sru	u_int ptepindex;
1878151497Sru	pd_entry_t ptepa;
1879151497Sru	vm_page_t m;
1880151497Sru
1881151497Sru	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1882151497Sru	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1883151497Sru	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1884151497Sru
1885151497Sru	/*
1886151497Sru	 * Calculate pagetable page index
1887151497Sru	 */
1888151497Sru	ptepindex = va >> PDRSHIFT;
1889151497Sruretry:
1890151497Sru	/*
1891151497Sru	 * Get the page directory entry
1892151497Sru	 */
1893151497Sru	ptepa = pmap->pm_pdir[ptepindex];
1894151497Sru
1895151497Sru	/*
1896151497Sru	 * This supports switching from a 4MB page to a
1897151497Sru	 * normal 4K page.
1898151497Sru	 */
1899151497Sru	if (ptepa & PG_PS) {
1900151497Sru		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1901151497Sru		ptepa = pmap->pm_pdir[ptepindex];
1902151497Sru	}
1903151497Sru
1904151497Sru	/*
1905151497Sru	 * If the page table page is already mapped, we just increment
1906151497Sru	 * its wire count.
1907151497Sru	 */
1908151497Sru	if (ptepa) {
1909151497Sru		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1910151497Sru		m->wire_count++;
1911151497Sru	} else {
1912151497Sru		/*
1913151497Sru		 * Here if the pte page isn't mapped, or if it has
1914151497Sru		 * been deallocated.
1915151497Sru		 */
1916151497Sru		m = _pmap_allocpte(pmap, ptepindex, flags);
1917151497Sru		if (m == NULL && (flags & M_WAITOK))
1918151497Sru			goto retry;
1919151497Sru	}
1920151497Sru	return (m);
1921151497Sru}
1922151497Sru
1923151497Sru
1924151497Sru/***************************************************
1925151497Sru * Pmap allocation/deallocation routines.
1926151497Sru ***************************************************/
1927151497Sru
1928151497Sru#ifdef SMP
1929151497Sru/*
1930151497Sru * Deal with an SMP shootdown of other users of the pmap that we are
1931151497Sru * trying to dispose of.  This can be a bit hairy.
1932151497Sru */
1933151497Srustatic cpuset_t *lazymask;
1934151497Srustatic u_int lazyptd;
1935151497Srustatic volatile u_int lazywait;
1936151497Sru
1937151497Sruvoid pmap_lazyfix_action(void);
1938151497Sru
1939151497Sruvoid
1940151497Srupmap_lazyfix_action(void)
1941151497Sru{
1942151497Sru
1943151497Sru#ifdef COUNT_IPIS
1944151497Sru	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1945151497Sru#endif
1946151497Sru	if (rcr3() == lazyptd)
1947151497Sru		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1948151497Sru	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1949151497Sru	atomic_store_rel_int(&lazywait, 1);
1950151497Sru}
1951151497Sru
1952151497Srustatic void
1953151497Srupmap_lazyfix_self(u_int cpuid)
1954151497Sru{
1955151497Sru
1956151497Sru	if (rcr3() == lazyptd)
1957151497Sru		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1958151497Sru	CPU_CLR_ATOMIC(cpuid, lazymask);
1959151497Sru}
1960151497Sru
1961151497Sru
1962151497Srustatic void
1963151497Srupmap_lazyfix(pmap_t pmap)
1964151497Sru{
1965151497Sru	cpuset_t mymask, mask;
1966151497Sru	u_int cpuid, spins;
1967151497Sru	int lsb;
1968151497Sru
1969151497Sru	mask = pmap->pm_active;
1970151497Sru	while (!CPU_EMPTY(&mask)) {
1971151497Sru		spins = 50000000;
1972151497Sru
1973151497Sru		/* Find least significant set bit. */
1974151497Sru		lsb = cpusetobj_ffs(&mask);
1975151497Sru		MPASS(lsb != 0);
1976151497Sru		lsb--;
1977151497Sru		CPU_SETOF(lsb, &mask);
1978151497Sru		mtx_lock_spin(&smp_ipi_mtx);
1979151497Sru#ifdef PAE
1980151497Sru		lazyptd = vtophys(pmap->pm_pdpt);
1981151497Sru#else
1982151497Sru		lazyptd = vtophys(pmap->pm_pdir);
1983151497Sru#endif
1984151497Sru		cpuid = PCPU_GET(cpuid);
1985151497Sru
1986151497Sru		/* Use a cpuset just for having an easy check. */
1987151497Sru		CPU_SETOF(cpuid, &mymask);
1988151497Sru		if (!CPU_CMP(&mask, &mymask)) {
1989151497Sru			lazymask = &pmap->pm_active;
1990151497Sru			pmap_lazyfix_self(cpuid);
1991151497Sru		} else {
1992151497Sru			atomic_store_rel_int((u_int *)&lazymask,
1993151497Sru			    (u_int)&pmap->pm_active);
1994151497Sru			atomic_store_rel_int(&lazywait, 0);
1995151497Sru			ipi_selected(mask, IPI_LAZYPMAP);
1996151497Sru			while (lazywait == 0) {
1997151497Sru				ia32_pause();
1998151497Sru				if (--spins == 0)
1999151497Sru					break;
2000151497Sru			}
2001151497Sru		}
2002151497Sru		mtx_unlock_spin(&smp_ipi_mtx);
2003151497Sru		if (spins == 0)
2004151497Sru			printf("pmap_lazyfix: spun for 50000000\n");
2005151497Sru		mask = pmap->pm_active;
2006151497Sru	}
2007151497Sru}
2008151497Sru
2009151497Sru#else	/* SMP */
2010151497Sru
2011151497Sru/*
2012151497Sru * Cleaning up on uniprocessor is easy.  For various reasons, we're
2013151497Sru * unlikely to have to even execute this code, including the fact
2014151497Sru * that the cleanup is deferred until the parent does a wait(2), which
2015151497Sru * means that another userland process has run.
2016151497Sru */
2017151497Srustatic void
2018151497Srupmap_lazyfix(pmap_t pmap)
2019151497Sru{
2020151497Sru	u_int cr3;
2021151497Sru
2022151497Sru	cr3 = vtophys(pmap->pm_pdir);
2023151497Sru	if (cr3 == rcr3()) {
2024151497Sru		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
2025151497Sru		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2026151497Sru	}
2027151497Sru}
2028151497Sru#endif	/* SMP */
2029151497Sru
2030151497Sru/*
2031151497Sru * Release any resources held by the given physical map.
2032151497Sru * Called when a pmap initialized by pmap_pinit is being released.
2033151497Sru * Should only be called if the map contains no valid mappings.
2034151497Sru */
2035151497Sruvoid
2036151497Srupmap_release(pmap_t pmap)
2037151497Sru{
2038151497Sru	vm_page_t m, ptdpg[NPGPTD];
2039151497Sru	int i;
2040151497Sru
2041151497Sru	KASSERT(pmap->pm_stats.resident_count == 0,
2042151497Sru	    ("pmap_release: pmap resident count %ld != 0",
2043151497Sru	    pmap->pm_stats.resident_count));
2044151497Sru	KASSERT(pmap->pm_root == NULL,
2045151497Sru	    ("pmap_release: pmap has reserved page table page(s)"));
2046151497Sru
2047151497Sru	pmap_lazyfix(pmap);
2048151497Sru	mtx_lock_spin(&allpmaps_lock);
2049151497Sru	LIST_REMOVE(pmap, pm_list);
2050151497Sru	mtx_unlock_spin(&allpmaps_lock);
2051151497Sru
2052151497Sru	for (i = 0; i < NPGPTD; i++)
2053151497Sru		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2054151497Sru		    PG_FRAME);
2055151497Sru
2056151497Sru	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2057151497Sru	    sizeof(*pmap->pm_pdir));
2058151497Sru
2059151497Sru	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2060151497Sru
2061151497Sru	for (i = 0; i < NPGPTD; i++) {
2062151497Sru		m = ptdpg[i];
2063151497Sru#ifdef PAE
2064151497Sru		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2065151497Sru		    ("pmap_release: got wrong ptd page"));
2066151497Sru#endif
2067151497Sru		m->wire_count--;
2068151497Sru		atomic_subtract_int(&cnt.v_wire_count, 1);
2069151497Sru		vm_page_free_zero(m);
2070151497Sru	}
2071151497Sru	PMAP_LOCK_DESTROY(pmap);
2072151497Sru}
2073151497Sru
2074151497Srustatic int
2075151497Srukvm_size(SYSCTL_HANDLER_ARGS)
2076151497Sru{
2077151497Sru	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2078151497Sru
2079151497Sru	return (sysctl_handle_long(oidp, &ksize, 0, req));
2080151497Sru}
2081151497SruSYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2082151497Sru    0, 0, kvm_size, "IU", "Size of KVM");
2083151497Sru
2084151497Srustatic int
2085151497Srukvm_free(SYSCTL_HANDLER_ARGS)
2086151497Sru{
2087151497Sru	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2088151497Sru
2089151497Sru	return (sysctl_handle_long(oidp, &kfree, 0, req));
2090151497Sru}
2091151497SruSYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2092151497Sru    0, 0, kvm_free, "IU", "Amount of KVM free");
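/*
 * For example, both values can be inspected from userland with
 * "sysctl vm.kvm_size vm.kvm_free".
 */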
2093151497Sru
2094151497Sru/*
2095151497Sru * grow the number of kernel page table entries, if needed
2096151497Sru */
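/*
 * Example (illustrative, non-PAE NBPDR of 4MB): a request to cover
 * addr == 0xc1234567 is rounded up to 0xc1400000, and one new page
 * table page is allocated for each 4MB region between the current
 * kernel_vm_end and that address that lacks a page directory entry.
 */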
2097151497Sruvoid
2098151497Srupmap_growkernel(vm_offset_t addr)
2099151497Sru{
2100151497Sru	vm_paddr_t ptppaddr;
2101151497Sru	vm_page_t nkpg;
2102151497Sru	pd_entry_t newpdir;
2103151497Sru
2104151497Sru	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2105151497Sru	addr = roundup2(addr, NBPDR);
2106151497Sru	if (addr - 1 >= kernel_map->max_offset)
2107151497Sru		addr = kernel_map->max_offset;
2108151497Sru	while (kernel_vm_end < addr) {
2109151497Sru		if (pdir_pde(PTD, kernel_vm_end)) {
2110151497Sru			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2111151497Sru			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2112151497Sru				kernel_vm_end = kernel_map->max_offset;
2113151497Sru				break;
2114151497Sru			}
2115151497Sru			continue;
2116151497Sru		}
2117151497Sru
2118151497Sru		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2119151497Sru		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2120151497Sru		    VM_ALLOC_ZERO);
2121151497Sru		if (nkpg == NULL)
2122151497Sru			panic("pmap_growkernel: no memory to grow kernel");
2123151497Sru
2124151497Sru		nkpt++;
2125151497Sru
2126151497Sru		if ((nkpg->flags & PG_ZERO) == 0)
2127151497Sru			pmap_zero_page(nkpg);
2128151497Sru		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2129151497Sru		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2130151497Sru		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2131151497Sru
2132151497Sru		pmap_kenter_pde(kernel_vm_end, newpdir);
2133151497Sru		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2134151497Sru		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2135151497Sru			kernel_vm_end = kernel_map->max_offset;
2136151497Sru			break;
2137151497Sru		}
2138151497Sru	}
2139151497Sru}
2140151497Sru
2141151497Sru
2142151497Sru/***************************************************
2143151497Sru * page management routines.
2144151497Sru ***************************************************/
2145151497Sru
2146151497SruCTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2147151497SruCTASSERT(_NPCM == 11);
2148151497SruCTASSERT(_NPCPV == 336);
2149151497Sru
2150151497Srustatic __inline struct pv_chunk *
2151151497Srupv_to_chunk(pv_entry_t pv)
2152151497Sru{
2153151497Sru
2154151497Sru	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2155151497Sru}
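/*
 * Example (illustrative): because each pv_chunk occupies exactly one
 * page-aligned page of KVA, masking off the page offset of any pv
 * entry's address recovers its chunk header, e.g. a pv entry at
 * 0xc1234568 belongs to the chunk at 0xc1234000.
 */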
2156151497Sru
2157151497Sru#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2158151497Sru
2159151497Sru#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2160151497Sru#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2161151497Sru
2162151497Srustatic uint32_t pc_freemask[_NPCM] = {
2163151497Sru	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2164151497Sru	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2165151497Sru	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2166151497Sru	PC_FREE0_9, PC_FREE10
2167151497Sru};
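/*
 * The free masks above follow from the chunk geometry (a sketch based
 * on the assertions above): _NPCPV == 336 pv entries are tracked by
 * _NPCM == 11 32-bit words, i.e. 10 full words plus 16 bits of the
 * 11th, hence PC_FREE10 == 0x0000ffff.  Together with the chunk header
 * this fills exactly one PAGE_SIZE page, which the CTASSERT above
 * checks.
 */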
2168151497Sru
2169151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2170151497Sru	"Current number of pv entries");
2171151497Sru
2172151497Sru#ifdef PV_STATS
2173151497Srustatic int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2174151497Sru
2175151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2176151497Sru	"Current number of pv entry chunks");
2177151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2178151497Sru	"Current number of pv entry chunks allocated");
2179151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2180151497Sru	"Number of pv entry chunk frees");
2181151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2182151497Sru	"Number of times tried to get a chunk page but failed.");
2183151497Sru
2184151497Srustatic long pv_entry_frees, pv_entry_allocs;
2185151497Srustatic int pv_entry_spare;
2186151497Sru
2187151497SruSYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2188151497Sru	"Current number of pv entry frees");
2189151497SruSYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2190151497Sru	"Current number of pv entry allocs");
2191151497SruSYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2192151497Sru	"Current number of spare pv entries");
2193151497Sru#endif
2194151497Sru
2195151497Sru/*
2196151497Sru * We are in a serious low memory condition.  Resort to
2197151497Sru * drastic measures to free some pages so we can allocate
2198151497Sru * another pv entry chunk.
2199151497Sru */
2200151497Srustatic vm_page_t
2201151497Srupmap_pv_reclaim(pmap_t locked_pmap)
2202151497Sru{
2203151497Sru	struct pch newtail;
2204151497Sru	struct pv_chunk *pc;
2205151497Sru	struct md_page *pvh;
2206151497Sru	pd_entry_t *pde;
2207151497Sru	pmap_t pmap;
2208151497Sru	pt_entry_t *pte, tpte;
2209151497Sru	pv_entry_t pv;
2210151497Sru	vm_offset_t va;
2211151497Sru	vm_page_t free, m, m_pc;
2212151497Sru	uint32_t inuse, freemask;
2213151497Sru	int bit, field, freed;
2214151497Sru
2215151497Sru	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2216151497Sru	pmap = NULL;
2217151497Sru	free = m_pc = NULL;
2218151497Sru	TAILQ_INIT(&newtail);
2219151497Sru	sched_pin();
2220151497Sru	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2221151497Sru	    free == NULL)) {
2222151497Sru		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2223151497Sru		if (pmap != pc->pc_pmap) {
2224151497Sru			if (pmap != NULL) {
2225151497Sru				pmap_invalidate_all(pmap);
2226151497Sru				if (pmap != locked_pmap)
2227151497Sru					PMAP_UNLOCK(pmap);
2228151497Sru			}
2229151497Sru			pmap = pc->pc_pmap;
2230151497Sru			/* Avoid deadlock and lock recursion. */
2231151497Sru			if (pmap > locked_pmap)
2232151497Sru				PMAP_LOCK(pmap);
2233151497Sru			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2234151497Sru				pmap = NULL;
2235151497Sru				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2236151497Sru				continue;
2237151497Sru			}
2238151497Sru		}
2239151497Sru
2240151497Sru		/*
2241151497Sru		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2242151497Sru		 */
2243151497Sru		freed = 0;
2244151497Sru		for (field = 0; field < _NPCM; field++) {
2245151497Sru			freemask = 0;
2246151497Sru			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2247151497Sru			    inuse != 0; inuse &= ~(1UL << bit)) {
2248151497Sru				bit = bsfl(inuse);
2249151497Sru				pv = &pc->pc_pventry[field * 32 + bit];
2250151497Sru				va = pv->pv_va;
2251151497Sru				pde = pmap_pde(pmap, va);
2252151497Sru				if ((*pde & PG_PS) != 0)
2253151497Sru					continue;
2254151497Sru				pte = pmap_pte_quick(pmap, va);
2255151497Sru				if ((*pte & PG_W) != 0)
2256151497Sru					continue;
2257151497Sru				tpte = pte_load_clear(pte);
2258151497Sru				if ((tpte & PG_G) != 0)
2259151497Sru					pmap_invalidate_page(pmap, va);
2260151497Sru				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2261151497Sru				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2262151497Sru					vm_page_dirty(m);
2263151497Sru				if ((tpte & PG_A) != 0)
2264151497Sru					vm_page_aflag_set(m, PGA_REFERENCED);
2265151497Sru				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2266151497Sru				if (TAILQ_EMPTY(&m->md.pv_list) &&
2267151497Sru				    (m->flags & PG_FICTITIOUS) == 0) {
2268151497Sru					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2269151497Sru					if (TAILQ_EMPTY(&pvh->pv_list)) {
2270151497Sru						vm_page_aflag_clear(m,
2271151497Sru						    PGA_WRITEABLE);
2272151497Sru					}
2273151497Sru				}
2274151497Sru				pmap_unuse_pt(pmap, va, &free);
2275151497Sru				freemask |= 1UL << bit;
2276151497Sru				freed++;
2277151497Sru			}
2278151497Sru			pc->pc_map[field] |= freemask;
2279151497Sru		}
2280151497Sru		if (freed == 0) {
2281151497Sru			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2282151497Sru			continue;
2283151497Sru		}
2284151497Sru		pmap->pm_stats.resident_count -= freed;
2285151497Sru		PV_STAT(pv_entry_frees += freed);
2286151497Sru		PV_STAT(pv_entry_spare += freed);
2287151497Sru		pv_entry_count -= freed;
2288151497Sru		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2289151497Sru		for (field = 0; field < _NPCM; field++)
2290151497Sru			if (pc->pc_map[field] != pc_freemask[field]) {
2291151497Sru				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2292151497Sru				    pc_list);
2293151497Sru				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2294151497Sru
2295151497Sru				/*
2296151497Sru				 * One freed pv entry in locked_pmap is
2297151497Sru				 * sufficient.
2298151497Sru				 */
2299151497Sru				if (pmap == locked_pmap)
2300151497Sru					goto out;
2301151497Sru				break;
2302151497Sru			}
2303151497Sru		if (field == _NPCM) {
2304151497Sru			PV_STAT(pv_entry_spare -= _NPCPV);
2305151497Sru			PV_STAT(pc_chunk_count--);
2306151497Sru			PV_STAT(pc_chunk_frees++);
2307151497Sru			/* Entire chunk is free; return it. */
2308151497Sru			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2309151497Sru			pmap_qremove((vm_offset_t)pc, 1);
2310151497Sru			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2311151497Sru			break;
2312151497Sru		}
2313151497Sru	}
2314151497Sruout:
2315151497Sru	sched_unpin();
2316151497Sru	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2317151497Sru	if (pmap != NULL) {
2318151497Sru		pmap_invalidate_all(pmap);
2319151497Sru		if (pmap != locked_pmap)
2320151497Sru			PMAP_UNLOCK(pmap);
2321151497Sru	}
2322151497Sru	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
2323151497Sru		m_pc = free;
2324151497Sru		free = m_pc->right;
2325151497Sru		/* Recycle a freed page table page. */
2326151497Sru		m_pc->wire_count = 1;
2327151497Sru		atomic_add_int(&cnt.v_wire_count, 1);
2328151497Sru	}
2329151497Sru	pmap_free_zero_pages(free);
2330151497Sru	return (m_pc);
2331151497Sru}
2332151497Sru
2333151497Sru/*
2334151497Sru * free the pv_entry back to the free list
2335151497Sru */
2336151497Srustatic void
2337151497Srufree_pv_entry(pmap_t pmap, pv_entry_t pv)
2338151497Sru{
2339151497Sru	struct pv_chunk *pc;
2340151497Sru	int idx, field, bit;
2341151497Sru
2342151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2343151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2344151497Sru	PV_STAT(pv_entry_frees++);
2345151497Sru	PV_STAT(pv_entry_spare++);
2346151497Sru	pv_entry_count--;
2347151497Sru	pc = pv_to_chunk(pv);
2348151497Sru	idx = pv - &pc->pc_pventry[0];
2349151497Sru	field = idx / 32;
2350151497Sru	bit = idx % 32;
2351151497Sru	pc->pc_map[field] |= 1ul << bit;
2352151497Sru	/* move to head of list */
2353151497Sru	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2354151497Sru	for (idx = 0; idx < _NPCM; idx++)
2355151497Sru		if (pc->pc_map[idx] != pc_freemask[idx]) {
2356151497Sru			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2357151497Sru			return;
2358151497Sru		}
2359151497Sru	free_pv_chunk(pc);
2360151497Sru}
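/*
 * Example (illustrative): freeing the pv entry at index 70 of a chunk
 * sets bit 6 of pc_map[2] (70 == 2 * 32 + 6), marking that slot free
 * for a later get_pv_entry().
 */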
2361151497Sru
2362151497Srustatic void
2363151497Srufree_pv_chunk(struct pv_chunk *pc)
2364151497Sru{
2365151497Sru	vm_page_t m;
2366151497Sru
2367151497Sru 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2368151497Sru	PV_STAT(pv_entry_spare -= _NPCPV);
2369151497Sru	PV_STAT(pc_chunk_count--);
2370151497Sru	PV_STAT(pc_chunk_frees++);
2371151497Sru	/* entire chunk is free, return it */
2372151497Sru	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2373151497Sru	pmap_qremove((vm_offset_t)pc, 1);
2374151497Sru	vm_page_unwire(m, 0);
2375151497Sru	vm_page_free(m);
2376151497Sru	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2377151497Sru}
2378151497Sru
2379151497Sru/*
2380151497Sru * get a new pv_entry, allocating a block from the system
2381151497Sru * when needed.
2382151497Sru */
2383151497Srustatic pv_entry_t
2384151497Sruget_pv_entry(pmap_t pmap, boolean_t try)
2385151497Sru{
2386151497Sru	static const struct timeval printinterval = { 60, 0 };
2387151497Sru	static struct timeval lastprint;
2388151497Sru	int bit, field;
2389151497Sru	pv_entry_t pv;
2390151497Sru	struct pv_chunk *pc;
2391151497Sru	vm_page_t m;
2392151497Sru
2393151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2394151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2395151497Sru	PV_STAT(pv_entry_allocs++);
2396151497Sru	pv_entry_count++;
2397151497Sru	if (pv_entry_count > pv_entry_high_water)
2398151497Sru		if (ratecheck(&lastprint, &printinterval))
2399151497Sru			printf("Approaching the limit on PV entries, consider "
2400151497Sru			    "increasing either the vm.pmap.shpgperproc or the "
2401151497Sru			    "vm.pmap.pv_entry_max tunable.\n");
2402151497Sruretry:
2403151497Sru	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2404151497Sru	if (pc != NULL) {
2405151497Sru		for (field = 0; field < _NPCM; field++) {
2406151497Sru			if (pc->pc_map[field]) {
2407151497Sru				bit = bsfl(pc->pc_map[field]);
2408151497Sru				break;
2409151497Sru			}
2410151497Sru		}
2411151497Sru		if (field < _NPCM) {
2412151497Sru			pv = &pc->pc_pventry[field * 32 + bit];
2413151497Sru			pc->pc_map[field] &= ~(1ul << bit);
2414151497Sru			/* If this was the last item, move it to tail */
2415151497Sru			for (field = 0; field < _NPCM; field++)
2416151497Sru				if (pc->pc_map[field] != 0) {
2417151497Sru					PV_STAT(pv_entry_spare--);
2418151497Sru					return (pv);	/* not full, return */
2419151497Sru				}
2420151497Sru			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2421151497Sru			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2422151497Sru			if (pc != TAILQ_LAST(&pv_chunks, pch)) {
2423151497Sru				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2424151497Sru				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2425151497Sru			}
2426151497Sru			PV_STAT(pv_entry_spare--);
2427151497Sru			return (pv);
2428151497Sru		}
2429151497Sru	}
2430151497Sru	/*
2431151497Sru	 * Access to the ptelist "pv_vafree" is synchronized by the page
2432151497Sru	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2433151497Sru	 * remain non-empty until pmap_ptelist_alloc() completes.
2434151497Sru	 */
2435151497Sru	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2436151497Sru	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2437151497Sru		if (try) {
2438151497Sru			pv_entry_count--;
2439151497Sru			PV_STAT(pc_chunk_tryfail++);
2440151497Sru			return (NULL);
2441151497Sru		}
2442151497Sru		m = pmap_pv_reclaim(pmap);
2443151497Sru		if (m == NULL)
2444151497Sru			goto retry;
2445151497Sru	}
2446151497Sru	PV_STAT(pc_chunk_count++);
2447151497Sru	PV_STAT(pc_chunk_allocs++);
2448151497Sru	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2449151497Sru	pmap_qenter((vm_offset_t)pc, &m, 1);
2450151497Sru	pc->pc_pmap = pmap;
2451151497Sru	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2452151497Sru	for (field = 1; field < _NPCM; field++)
2453151497Sru		pc->pc_map[field] = pc_freemask[field];
2454151497Sru	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2455151497Sru	pv = &pc->pc_pventry[0];
2456151497Sru	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2457151497Sru	PV_STAT(pv_entry_spare += _NPCPV - 1);
2458151497Sru	return (pv);
2459151497Sru}
2460151497Sru
2461151497Srustatic __inline pv_entry_t
2462151497Srupmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2463151497Sru{
2464151497Sru	pv_entry_t pv;
2465151497Sru
2466151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2467151497Sru	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2468151497Sru		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2469151497Sru			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2470151497Sru			break;
2471151497Sru		}
2472151497Sru	}
2473151497Sru	return (pv);
2474151497Sru}
2475151497Sru
2476151497Srustatic void
2477151497Srupmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2478151497Sru{
2479151497Sru	struct md_page *pvh;
2480151497Sru	pv_entry_t pv;
2481151497Sru	vm_offset_t va_last;
2482151497Sru	vm_page_t m;
2483151497Sru
2484151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2485151497Sru	KASSERT((pa & PDRMASK) == 0,
2486151497Sru	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2487151497Sru
2488151497Sru	/*
2489151497Sru	 * Transfer the 4mpage's pv entry for this mapping to the first
2490151497Sru	 * page's pv list.
2491151497Sru	 */
2492151497Sru	pvh = pa_to_pvh(pa);
2493151497Sru	va = trunc_4mpage(va);
2494151497Sru	pv = pmap_pvh_remove(pvh, pmap, va);
2495151497Sru	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2496151497Sru	m = PHYS_TO_VM_PAGE(pa);
2497151497Sru	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2498151497Sru	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2499151497Sru	va_last = va + NBPDR - PAGE_SIZE;
2500151497Sru	do {
2501151497Sru		m++;
2502151497Sru		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2503151497Sru		    ("pmap_pv_demote_pde: page %p is not managed", m));
2504151497Sru		va += PAGE_SIZE;
2505151497Sru		pmap_insert_entry(pmap, va, m);
2506151497Sru	} while (va < va_last);
2507151497Sru}
2508151497Sru
2509151497Srustatic void
2510151497Srupmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2511151497Sru{
2512151497Sru	struct md_page *pvh;
2513151497Sru	pv_entry_t pv;
2514151497Sru	vm_offset_t va_last;
2515151497Sru	vm_page_t m;
2516151497Sru
2517151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2518151497Sru	KASSERT((pa & PDRMASK) == 0,
2519151497Sru	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2520151497Sru
2521151497Sru	/*
2522151497Sru	 * Transfer the first page's pv entry for this mapping to the
2523151497Sru	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2524151497Sru	 * to get_pv_entry(), a transfer avoids the possibility that
2525151497Sru	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2526151497Sru	 * removes one of the mappings that is being promoted.
2527151497Sru	 */
2528151497Sru	m = PHYS_TO_VM_PAGE(pa);
2529151497Sru	va = trunc_4mpage(va);
2530151497Sru	pv = pmap_pvh_remove(&m->md, pmap, va);
2531151497Sru	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2532151497Sru	pvh = pa_to_pvh(pa);
2533151497Sru	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2534151497Sru	/* Free the remaining NPTEPG - 1 pv entries. */
2535151497Sru	va_last = va + NBPDR - PAGE_SIZE;
2536151497Sru	do {
2537151497Sru		m++;
2538151497Sru		va += PAGE_SIZE;
2539151497Sru		pmap_pvh_free(&m->md, pmap, va);
2540151497Sru	} while (va < va_last);
2541151497Sru}
2542151497Sru
2543151497Srustatic void
2544151497Srupmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2545151497Sru{
2546151497Sru	pv_entry_t pv;
2547151497Sru
2548151497Sru	pv = pmap_pvh_remove(pvh, pmap, va);
2549151497Sru	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2550151497Sru	free_pv_entry(pmap, pv);
2551151497Sru}
2552151497Sru
2553151497Srustatic void
2554151497Srupmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2555151497Sru{
2556151497Sru	struct md_page *pvh;
2557151497Sru
2558151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2559151497Sru	pmap_pvh_free(&m->md, pmap, va);
2560151497Sru	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2561151497Sru		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2562151497Sru		if (TAILQ_EMPTY(&pvh->pv_list))
2563151497Sru			vm_page_aflag_clear(m, PGA_WRITEABLE);
2564151497Sru	}
2565151497Sru}
2566151497Sru
2567151497Sru/*
2568151497Sru * Create a pv entry for page at pa for
2569151497Sru * (pmap, va).
2570151497Sru */
2571151497Srustatic void
2572151497Srupmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2573151497Sru{
2574151497Sru	pv_entry_t pv;
2575151497Sru
2576151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2577151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2578151497Sru	pv = get_pv_entry(pmap, FALSE);
2579151497Sru	pv->pv_va = va;
2580151497Sru	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2581151497Sru}
2582151497Sru
2583151497Sru/*
2584151497Sru * Conditionally create a pv entry.
2585151497Sru */
2586151497Srustatic boolean_t
2587151497Srupmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2588151497Sru{
2589151497Sru	pv_entry_t pv;
2590151497Sru
2591151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2592151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2593151497Sru	if (pv_entry_count < pv_entry_high_water &&
2594151497Sru	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2595151497Sru		pv->pv_va = va;
2596151497Sru		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2597151497Sru		return (TRUE);
2598151497Sru	} else
2599151497Sru		return (FALSE);
2600151497Sru}
2601151497Sru
2602151497Sru/*
2603151497Sru * Conditionally create a pv entry for a 2- or 4MB page mapping.
2604151497Sru */
2605151497Srustatic boolean_t
2606151497Srupmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2607151497Sru{
2608151497Sru	struct md_page *pvh;
2609151497Sru	pv_entry_t pv;
2610151497Sru
2611151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2612151497Sru	if (pv_entry_count < pv_entry_high_water &&
2613151497Sru	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2614151497Sru		pv->pv_va = va;
2615151497Sru		pvh = pa_to_pvh(pa);
2616151497Sru		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2617151497Sru		return (TRUE);
2618151497Sru	} else
2619151497Sru		return (FALSE);
2620151497Sru}
2621151497Sru
2622151497Sru/*
2623151497Sru * Fills a page table page with mappings to consecutive physical pages.
2624151497Sru */
2625151497Srustatic void
2626151497Srupmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2627151497Sru{
2628151497Sru	pt_entry_t *pte;
2629151497Sru
2630151497Sru	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2631151497Sru		*pte = newpte;
2632151497Sru		newpte += PAGE_SIZE;
2633151497Sru	}
2634151497Sru}
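/*
 * Example (illustrative): when a 4MB mapping of physical address
 * 0x10000000 is demoted, pmap_fill_ptp() writes NPTEPG consecutive
 * PTEs mapping 0x10000000, 0x10001000, and so on, each carrying the
 * protection and attribute bits passed in newpte.
 */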
2635151497Sru
2636151497Sru/*
2637151497Sru * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2638151497Sru * 2- or 4MB page mapping is invalidated.
2639151497Sru */
2640151497Srustatic boolean_t
2641151497Srupmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2642151497Sru{
2643151497Sru	pd_entry_t newpde, oldpde;
2644151497Sru	pt_entry_t *firstpte, newpte;
2645151497Sru	vm_paddr_t mptepa;
2646151497Sru	vm_page_t free, mpte;
2647151497Sru
2648151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2649151497Sru	oldpde = *pde;
2650151497Sru	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2651151497Sru	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2652151497Sru	mpte = pmap_lookup_pt_page(pmap, va);
2653151497Sru	if (mpte != NULL)
2654151497Sru		pmap_remove_pt_page(pmap, mpte);
2655151497Sru	else {
2656151497Sru		KASSERT((oldpde & PG_W) == 0,
2657151497Sru		    ("pmap_demote_pde: page table page for a wired mapping"
2658151497Sru		    " is missing"));
2659151497Sru
2660151497Sru		/*
2661151497Sru		 * Invalidate the 2- or 4MB page mapping and return
2662151497Sru		 * "failure" if the mapping was never accessed or the
2663151497Sru		 * allocation of the new page table page fails.
2664151497Sru		 */
2665151497Sru		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2666151497Sru		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2667151497Sru		    VM_ALLOC_WIRED)) == NULL) {
2668151497Sru			free = NULL;
2669151497Sru			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2670151497Sru			pmap_invalidate_page(pmap, trunc_4mpage(va));
2671151497Sru			pmap_free_zero_pages(free);
2672151497Sru			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2673151497Sru			    " in pmap %p", va, pmap);
2674151497Sru			return (FALSE);
2675151497Sru		}
2676151497Sru		if (va < VM_MAXUSER_ADDRESS)
2677151497Sru			pmap->pm_stats.resident_count++;
2678151497Sru	}
2679151497Sru	mptepa = VM_PAGE_TO_PHYS(mpte);
2680151497Sru
2681151497Sru	/*
2682151497Sru	 * If the page mapping is in the kernel's address space, then the
2683151497Sru	 * KPTmap can provide access to the page table page.  Otherwise,
2684151497Sru	 * temporarily map the page table page (mpte) into the kernel's
2685151497Sru	 * address space at either PADDR1 or PADDR2.
2686151497Sru	 */
2687151497Sru	if (va >= KERNBASE)
2688151497Sru		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2689151497Sru	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2690151497Sru		if ((*PMAP1 & PG_FRAME) != mptepa) {
2691151497Sru			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2692151497Sru#ifdef SMP
2693151497Sru			PMAP1cpu = PCPU_GET(cpuid);
2694151497Sru#endif
2695151497Sru			invlcaddr(PADDR1);
2696151497Sru			PMAP1changed++;
2697151497Sru		} else
2698151497Sru#ifdef SMP
2699151497Sru		if (PMAP1cpu != PCPU_GET(cpuid)) {
2700151497Sru			PMAP1cpu = PCPU_GET(cpuid);
2701151497Sru			invlcaddr(PADDR1);
2702151497Sru			PMAP1changedcpu++;
2703151497Sru		} else
2704151497Sru#endif
2705151497Sru			PMAP1unchanged++;
2706151497Sru		firstpte = PADDR1;
2707151497Sru	} else {
2708151497Sru		mtx_lock(&PMAP2mutex);
2709151497Sru		if ((*PMAP2 & PG_FRAME) != mptepa) {
2710151497Sru			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2711151497Sru			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2712151497Sru		}
2713151497Sru		firstpte = PADDR2;
2714151497Sru	}
2715151497Sru	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2716151497Sru	KASSERT((oldpde & PG_A) != 0,
2717151497Sru	    ("pmap_demote_pde: oldpde is missing PG_A"));
2718151497Sru	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2719151497Sru	    ("pmap_demote_pde: oldpde is missing PG_M"));
2720151497Sru	newpte = oldpde & ~PG_PS;
2721151497Sru	if ((newpte & PG_PDE_PAT) != 0)
2722151497Sru		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
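	/*
	 * A sketch of the bit swap above (assuming the standard i386
	 * encodings PG_PDE_PAT == 0x1000 and PG_PTE_PAT == 0x080): the
	 * PAT selector occupies bit 12 in a superpage PDE but bit 7 in
	 * a 4KB PTE, so a set PDE PAT bit must be moved to the PTE
	 * position before the entry is used to fill the page table page.
	 */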
2723151497Sru
2724151497Sru	/*
2725151497Sru	 * If the page table page is new, initialize it.
2726151497Sru	 */
2727151497Sru	if (mpte->wire_count == 1) {
2728151497Sru		mpte->wire_count = NPTEPG;
2729151497Sru		pmap_fill_ptp(firstpte, newpte);
2730151497Sru	}
2731151497Sru	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2732151497Sru	    ("pmap_demote_pde: firstpte and newpte map different physical"
2733151497Sru	    " addresses"));
2734151497Sru
2735151497Sru	/*
2736151497Sru	 * If the mapping has changed attributes, update the page table
2737151497Sru	 * entries.
2738151497Sru	 */
2739151497Sru	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2740151497Sru		pmap_fill_ptp(firstpte, newpte);
2741151497Sru
2742151497Sru	/*
2743151497Sru	 * Demote the mapping.  This pmap is locked.  The old PDE has
2744151497Sru	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2745151497Sru	 * set.  Thus, there is no danger of a race with another
2746151497Sru	 * processor changing the setting of PG_A and/or PG_M between
2747151497Sru	 * the read above and the store below.
2748151497Sru	 */
2749151497Sru	if (workaround_erratum383)
2750151497Sru		pmap_update_pde(pmap, va, pde, newpde);
2751151497Sru	else if (pmap == kernel_pmap)
2752151497Sru		pmap_kenter_pde(va, newpde);
2753151497Sru	else
2754151497Sru		pde_store(pde, newpde);
2755151497Sru	if (firstpte == PADDR2)
2756151497Sru		mtx_unlock(&PMAP2mutex);
2757151497Sru
2758151497Sru	/*
2759151497Sru	 * Invalidate the recursive mapping of the page table page.
2760151497Sru	 */
2761151497Sru	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2762151497Sru
2763151497Sru	/*
2764151497Sru	 * Demote the pv entry.  This depends on the earlier demotion
2765151497Sru	 * of the mapping.  Specifically, the (re)creation of a per-
2766151497Sru	 * page pv entry might trigger the execution of pmap_collect(),
2767151497Sru	 * which might reclaim a newly (re)created per-page pv entry
2768151497Sru	 * and destroy the associated mapping.  In order to destroy
2769151497Sru	 * the mapping, the PDE must have already changed from mapping
2770151497Sru	 * the 2mpage to referencing the page table page.
2771151497Sru	 */
2772151497Sru	if ((oldpde & PG_MANAGED) != 0)
2773151497Sru		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2774151497Sru
2775151497Sru	pmap_pde_demotions++;
2776151497Sru	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2777151497Sru	    " in pmap %p", va, pmap);
2778151497Sru	return (TRUE);
2779151497Sru}
2780151497Sru
2781151497Sru/*
2782151497Sru * pmap_remove_pde: unmap a superpage in a process
2783151497Sru */
2784151497Srustatic void
2785151497Srupmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2786151497Sru    vm_page_t *free)
2787151497Sru{
2788151497Sru	struct md_page *pvh;
2789151497Sru	pd_entry_t oldpde;
2790151497Sru	vm_offset_t eva, va;
2791151497Sru	vm_page_t m, mpte;
2792151497Sru
2793151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2794151497Sru	KASSERT((sva & PDRMASK) == 0,
2795151497Sru	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2796151497Sru	oldpde = pte_load_clear(pdq);
2797151497Sru	if (oldpde & PG_W)
2798151497Sru		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2799151497Sru
2800151497Sru	/*
2801151497Sru	 * Machines that don't support invlpg also don't support
2802151497Sru	 * PG_G.
2803151497Sru	 */
2804151497Sru	if (oldpde & PG_G)
2805151497Sru		pmap_invalidate_page(kernel_pmap, sva);
2806151497Sru	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2807151497Sru	if (oldpde & PG_MANAGED) {
2808151497Sru		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2809151497Sru		pmap_pvh_free(pvh, pmap, sva);
2810151497Sru		eva = sva + NBPDR;
2811151497Sru		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2812151497Sru		    va < eva; va += PAGE_SIZE, m++) {
2813151497Sru			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2814151497Sru				vm_page_dirty(m);
2815151497Sru			if (oldpde & PG_A)
2816151497Sru				vm_page_aflag_set(m, PGA_REFERENCED);
2817151497Sru			if (TAILQ_EMPTY(&m->md.pv_list) &&
2818151497Sru			    TAILQ_EMPTY(&pvh->pv_list))
2819151497Sru				vm_page_aflag_clear(m, PGA_WRITEABLE);
2820151497Sru		}
2821151497Sru	}
2822151497Sru	if (pmap == kernel_pmap) {
2823151497Sru		if (!pmap_demote_pde(pmap, pdq, sva))
2824151497Sru			panic("pmap_remove_pde: failed demotion");
2825151497Sru	} else {
2826151497Sru		mpte = pmap_lookup_pt_page(pmap, sva);
2827151497Sru		if (mpte != NULL) {
2828151497Sru			pmap_remove_pt_page(pmap, mpte);
2829151497Sru			pmap->pm_stats.resident_count--;
2830151497Sru			KASSERT(mpte->wire_count == NPTEPG,
2831151497Sru			    ("pmap_remove_pde: pte page wire count error"));
2832151497Sru			mpte->wire_count = 0;
2833151497Sru			pmap_add_delayed_free_list(mpte, free, FALSE);
2834151497Sru			atomic_subtract_int(&cnt.v_wire_count, 1);
2835151497Sru		}
2836151497Sru	}
2837151497Sru}
2838151497Sru
2839151497Sru/*
2840151497Sru * pmap_remove_pte: unmap a page in a process
2841151497Sru */
2842151497Srustatic int
2843151497Srupmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2844151497Sru{
2845151497Sru	pt_entry_t oldpte;
2846151497Sru	vm_page_t m;
2847151497Sru
2848151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2849151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2850151497Sru	oldpte = pte_load_clear(ptq);
2851151497Sru	if (oldpte & PG_W)
2852151497Sru		pmap->pm_stats.wired_count -= 1;
2853151497Sru	/*
2854151497Sru	 * Machines that don't support invlpg also don't support
2855151497Sru	 * PG_G.
2856151497Sru	 */
2857151497Sru	if (oldpte & PG_G)
2858151497Sru		pmap_invalidate_page(kernel_pmap, va);
2859151497Sru	pmap->pm_stats.resident_count -= 1;
2860151497Sru	if (oldpte & PG_MANAGED) {
2861151497Sru		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2862151497Sru		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2863151497Sru			vm_page_dirty(m);
2864151497Sru		if (oldpte & PG_A)
2865151497Sru			vm_page_aflag_set(m, PGA_REFERENCED);
2866151497Sru		pmap_remove_entry(pmap, m, va);
2867151497Sru	}
2868151497Sru	return (pmap_unuse_pt(pmap, va, free));
2869151497Sru}
2870151497Sru
2871151497Sru/*
2872151497Sru * Remove a single page from a process address space
2873151497Sru */
2874151497Srustatic void
2875151497Srupmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2876151497Sru{
2877151497Sru	pt_entry_t *pte;
2878151497Sru
2879151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2880151497Sru	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2881151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2882151497Sru	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2883151497Sru		return;
2884151497Sru	pmap_remove_pte(pmap, pte, va, free);
2885151497Sru	pmap_invalidate_page(pmap, va);
2886151497Sru}
2887151497Sru
2888151497Sru/*
2889151497Sru *	Remove the given range of addresses from the specified map.
2890151497Sru *
2891151497Sru *	It is assumed that the start and end are properly
2892151497Sru *	rounded to the page size.
2893151497Sru */
2894151497Sruvoid
2895151497Srupmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2896151497Sru{
2897151497Sru	vm_offset_t pdnxt;
2898151497Sru	pd_entry_t ptpaddr;
2899151497Sru	pt_entry_t *pte;
2900151497Sru	vm_page_t free = NULL;
2901151497Sru	int anyvalid;
2902151497Sru
2903151497Sru	/*
2904151497Sru	 * Perform an unsynchronized read.  This is safe: a zero count means there is nothing to remove.
2905151497Sru	 */
2906151497Sru	if (pmap->pm_stats.resident_count == 0)
2907151497Sru		return;
2908151497Sru
2909151497Sru	anyvalid = 0;
2910151497Sru
2911151497Sru	vm_page_lock_queues();
2912151497Sru	sched_pin();
2913151497Sru	PMAP_LOCK(pmap);
2914151497Sru
2915151497Sru	/*
2916151497Sru	 * Special handling for removing a single page: a very
2917151497Sru	 * common operation for which some code is easily
2918151497Sru	 * short-circuited.
2919151497Sru	 */
2920151497Sru	if ((sva + PAGE_SIZE == eva) &&
2921151497Sru	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2922151497Sru		pmap_remove_page(pmap, sva, &free);
2923151497Sru		goto out;
2924151497Sru	}
2925151497Sru
2926151497Sru	for (; sva < eva; sva = pdnxt) {
2927151497Sru		u_int pdirindex;
2928151497Sru
2929151497Sru		/*
2930151497Sru		 * Calculate index for next page table.
2931151497Sru		 */
2932151497Sru		pdnxt = (sva + NBPDR) & ~PDRMASK;
2933151497Sru		if (pdnxt < sva)
2934151497Sru			pdnxt = eva;
2935151497Sru		if (pmap->pm_stats.resident_count == 0)
2936151497Sru			break;
2937151497Sru
2938151497Sru		pdirindex = sva >> PDRSHIFT;
2939151497Sru		ptpaddr = pmap->pm_pdir[pdirindex];
2940151497Sru
2941151497Sru		/*
2942151497Sru		 * Weed out invalid mappings. Note: we assume that the page
2943151497Sru	 * directory table is always allocated and mapped in kernel virtual memory.
2944151497Sru		 */
2945151497Sru		if (ptpaddr == 0)
2946151497Sru			continue;
2947151497Sru
2948151497Sru		/*
2949151497Sru		 * Check for large page.
2950151497Sru		 */
2951151497Sru		if ((ptpaddr & PG_PS) != 0) {
2952151497Sru			/*
2953151497Sru			 * Are we removing the entire large page?  If not,
2954151497Sru			 * demote the mapping and fall through.
2955151497Sru			 */
2956151497Sru			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2957151497Sru				/*
2958151497Sru				 * The TLB entry for a PG_G mapping is
2959151497Sru				 * invalidated by pmap_remove_pde().
2960151497Sru				 */
2961151497Sru				if ((ptpaddr & PG_G) == 0)
2962151497Sru					anyvalid = 1;
2963151497Sru				pmap_remove_pde(pmap,
2964151497Sru				    &pmap->pm_pdir[pdirindex], sva, &free);
2965151497Sru				continue;
2966151497Sru			} else if (!pmap_demote_pde(pmap,
2967151497Sru			    &pmap->pm_pdir[pdirindex], sva)) {
2968151497Sru				/* The large page mapping was destroyed. */
2969151497Sru				continue;
2970151497Sru			}
2971151497Sru		}
2972151497Sru
2973151497Sru		/*
2974151497Sru		 * Limit our scan to either the end of the va represented
2975151497Sru		 * by the current page table page, or to the end of the
2976151497Sru		 * range being removed.
2977151497Sru		 */
2978151497Sru		if (pdnxt > eva)
2979151497Sru			pdnxt = eva;
2980151497Sru
2981151497Sru		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2982151497Sru		    sva += PAGE_SIZE) {
2983151497Sru			if (*pte == 0)
2984151497Sru				continue;
2985151497Sru
2986151497Sru			/*
2987151497Sru			 * The TLB entry for a PG_G mapping is invalidated
2988151497Sru			 * by pmap_remove_pte().
2989151497Sru			 */
2990151497Sru			if ((*pte & PG_G) == 0)
2991151497Sru				anyvalid = 1;
2992151497Sru			if (pmap_remove_pte(pmap, pte, sva, &free))
2993151497Sru				break;
2994151497Sru		}
2995151497Sru	}
2996151497Sruout:
2997151497Sru	sched_unpin();
2998151497Sru	if (anyvalid)
2999151497Sru		pmap_invalidate_all(pmap);
3000151497Sru	vm_page_unlock_queues();
3001151497Sru	PMAP_UNLOCK(pmap);
3002151497Sru	pmap_free_zero_pages(free);
3003151497Sru}
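
/*
 * Illustrative sketch, compiled out: pmap_remove() above walks the given
 * range one page-directory entry (NBPDR bytes, i.e. one 2 or 4MB region)
 * at a time.  The helper below isolates that stride computation,
 * including the wrap-around clamp; the function name is hypothetical and
 * not part of the pmap interface.
 */
#if 0
static vm_offset_t
pmap_next_pde_boundary_sketch(vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;

	/* Advance to the start of the next 2/4MB region. */
	pdnxt = (sva + NBPDR) & ~PDRMASK;
	/* The addition wrapped past the top of the address space. */
	if (pdnxt < sva)
		pdnxt = eva;
	/* Never scan past the end of the requested range. */
	if (pdnxt > eva)
		pdnxt = eva;
	return (pdnxt);
}
#endif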
3004151497Sru
3005151497Sru/*
3006151497Sru *	Routine:	pmap_remove_all
3007151497Sru *	Function:
3008151497Sru *		Removes this physical page from
3009151497Sru *		all physical maps in which it resides.
3010151497Sru *		Reflects back modify bits to the pager.
3011151497Sru *
3012151497Sru *	Notes:
3013151497Sru *		Original versions of this routine were very
3014151497Sru *		inefficient because they iteratively called
3015151497Sru *		pmap_remove (slow...)
3016151497Sru */
3017151497Sru
3018151497Sruvoid
3019151497Srupmap_remove_all(vm_page_t m)
3020151497Sru{
3021151497Sru	struct md_page *pvh;
3022151497Sru	pv_entry_t pv;
3023151497Sru	pmap_t pmap;
3024151497Sru	pt_entry_t *pte, tpte;
3025151497Sru	pd_entry_t *pde;
3026151497Sru	vm_offset_t va;
3027151497Sru	vm_page_t free;
3028151497Sru
3029151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3030151497Sru	    ("pmap_remove_all: page %p is not managed", m));
3031151497Sru	free = NULL;
3032151497Sru	vm_page_lock_queues();
3033151497Sru	sched_pin();
3034151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
3035151497Sru		goto small_mappings;
3036151497Sru	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3037151497Sru	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3038151497Sru		va = pv->pv_va;
3039151497Sru		pmap = PV_PMAP(pv);
3040151497Sru		PMAP_LOCK(pmap);
3041151497Sru		pde = pmap_pde(pmap, va);
3042151497Sru		(void)pmap_demote_pde(pmap, pde, va);
3043151497Sru		PMAP_UNLOCK(pmap);
3044151497Sru	}
3045151497Srusmall_mappings:
3046151497Sru	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3047151497Sru		pmap = PV_PMAP(pv);
3048151497Sru		PMAP_LOCK(pmap);
3049151497Sru		pmap->pm_stats.resident_count--;
3050151497Sru		pde = pmap_pde(pmap, pv->pv_va);
3051151497Sru		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3052151497Sru		    " a 4mpage in page %p's pv list", m));
3053151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
3054151497Sru		tpte = pte_load_clear(pte);
3055151497Sru		if (tpte & PG_W)
3056151497Sru			pmap->pm_stats.wired_count--;
3057151497Sru		if (tpte & PG_A)
3058151497Sru			vm_page_aflag_set(m, PGA_REFERENCED);
3059151497Sru
3060151497Sru		/*
3061151497Sru		 * Update the vm_page_t clean and reference bits.
3062151497Sru		 */
3063151497Sru		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3064151497Sru			vm_page_dirty(m);
3065151497Sru		pmap_unuse_pt(pmap, pv->pv_va, &free);
3066151497Sru		pmap_invalidate_page(pmap, pv->pv_va);
3067151497Sru		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3068151497Sru		free_pv_entry(pmap, pv);
3069151497Sru		PMAP_UNLOCK(pmap);
3070151497Sru	}
3071151497Sru	vm_page_aflag_clear(m, PGA_WRITEABLE);
3072151497Sru	sched_unpin();
3073151497Sru	vm_page_unlock_queues();
3074151497Sru	pmap_free_zero_pages(free);
3075151497Sru}
3076151497Sru
3077151497Sru/*
3078151497Sru * pmap_protect_pde: do the things to protect a 4mpage in a process
3079151497Sru */
3080151497Srustatic boolean_t
3081151497Srupmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3082151497Sru{
3083151497Sru	pd_entry_t newpde, oldpde;
3084151497Sru	vm_offset_t eva, va;
3085151497Sru	vm_page_t m;
3086151497Sru	boolean_t anychanged;
3087151497Sru
3088151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3089151497Sru	KASSERT((sva & PDRMASK) == 0,
3090151497Sru	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3091151497Sru	anychanged = FALSE;
3092151497Sruretry:
3093151497Sru	oldpde = newpde = *pde;
3094151497Sru	if (oldpde & PG_MANAGED) {
3095151497Sru		eva = sva + NBPDR;
3096151497Sru		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3097151497Sru		    va < eva; va += PAGE_SIZE, m++)
3098151497Sru			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3099151497Sru				vm_page_dirty(m);
3100151497Sru	}
3101151497Sru	if ((prot & VM_PROT_WRITE) == 0)
3102151497Sru		newpde &= ~(PG_RW | PG_M);
3103151497Sru#ifdef PAE
3104151497Sru	if ((prot & VM_PROT_EXECUTE) == 0)
3105151497Sru		newpde |= pg_nx;
3106151497Sru#endif
3107151497Sru	if (newpde != oldpde) {
3108151497Sru		if (!pde_cmpset(pde, oldpde, newpde))
3109151497Sru			goto retry;
3110151497Sru		if (oldpde & PG_G)
3111151497Sru			pmap_invalidate_page(pmap, sva);
3112151497Sru		else
3113151497Sru			anychanged = TRUE;
3114151497Sru	}
3115151497Sru	return (anychanged);
3116151497Sru}
3117151497Sru
3118151497Sru/*
3119151497Sru *	Set the physical protection on the
3120151497Sru *	specified range of this map as requested.
3121151497Sru */
3122151497Sruvoid
3123151497Srupmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3124151497Sru{
3125151497Sru	vm_offset_t pdnxt;
3126151497Sru	pd_entry_t ptpaddr;
3127151497Sru	pt_entry_t *pte;
3128151497Sru	boolean_t anychanged, pv_lists_locked;
3129151497Sru
3130151497Sru	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3131151497Sru		pmap_remove(pmap, sva, eva);
3132151497Sru		return;
3133151497Sru	}
3134151497Sru
3135151497Sru#ifdef PAE
3136151497Sru	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3137151497Sru	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3138151497Sru		return;
3139151497Sru#else
3140151497Sru	if (prot & VM_PROT_WRITE)
3141151497Sru		return;
3142151497Sru#endif
3143151497Sru
3144151497Sru	if (pmap_is_current(pmap))
3145151497Sru		pv_lists_locked = FALSE;
3146151497Sru	else {
3147151497Sru		pv_lists_locked = TRUE;
3148151497Sruresume:
3149151497Sru		vm_page_lock_queues();
3150151497Sru		sched_pin();
3151151497Sru	}
3152151497Sru	anychanged = FALSE;
3153151497Sru
3154151497Sru	PMAP_LOCK(pmap);
3155151497Sru	for (; sva < eva; sva = pdnxt) {
3156151497Sru		pt_entry_t obits, pbits;
3157151497Sru		u_int pdirindex;
3158151497Sru
3159151497Sru		pdnxt = (sva + NBPDR) & ~PDRMASK;
3160151497Sru		if (pdnxt < sva)
3161151497Sru			pdnxt = eva;
3162151497Sru
3163151497Sru		pdirindex = sva >> PDRSHIFT;
3164151497Sru		ptpaddr = pmap->pm_pdir[pdirindex];
3165151497Sru
3166151497Sru		/*
3167151497Sru		 * Weed out invalid mappings. Note: we assume that the page
3168151497Sru	 * directory table is always allocated and mapped in kernel virtual memory.
3169151497Sru		 */
3170151497Sru		if (ptpaddr == 0)
3171151497Sru			continue;
3172151497Sru
3173151497Sru		/*
3174151497Sru		 * Check for large page.
3175151497Sru		 */
3176151497Sru		if ((ptpaddr & PG_PS) != 0) {
3177151497Sru			/*
3178151497Sru			 * Are we protecting the entire large page?  If not,
3179151497Sru			 * demote the mapping and fall through.
3180151497Sru			 */
3181151497Sru			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3182151497Sru				/*
3183151497Sru				 * The TLB entry for a PG_G mapping is
3184151497Sru				 * invalidated by pmap_protect_pde().
3185151497Sru				 */
3186151497Sru				if (pmap_protect_pde(pmap,
3187151497Sru				    &pmap->pm_pdir[pdirindex], sva, prot))
3188151497Sru					anychanged = TRUE;
3189151497Sru				continue;
3190151497Sru			} else {
3191151497Sru				if (!pv_lists_locked) {
3192151497Sru					pv_lists_locked = TRUE;
3193151497Sru					if (!mtx_trylock(&vm_page_queue_mtx)) {
3194151497Sru						if (anychanged)
3195151497Sru							pmap_invalidate_all(
3196151497Sru							    pmap);
3197151497Sru						PMAP_UNLOCK(pmap);
3198151497Sru						goto resume;
3199151497Sru					}
3200151497Sru				}
3201151497Sru				if (!pmap_demote_pde(pmap,
3202151497Sru				    &pmap->pm_pdir[pdirindex], sva)) {
3203151497Sru					/*
3204151497Sru					 * The large page mapping was
3205151497Sru					 * destroyed.
3206151497Sru					 */
3207151497Sru					continue;
3208151497Sru				}
3209151497Sru			}
3210151497Sru		}
3211151497Sru
3212151497Sru		if (pdnxt > eva)
3213151497Sru			pdnxt = eva;
3214151497Sru
3215151497Sru		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3216151497Sru		    sva += PAGE_SIZE) {
3217151497Sru			vm_page_t m;
3218151497Sru
3219151497Sruretry:
3220151497Sru			/*
3221151497Sru			 * Regardless of whether a pte is 32 or 64 bits in
3222151497Sru			 * size, PG_RW, PG_A, and PG_M are among the least
3223151497Sru			 * significant 32 bits.
3224151497Sru			 */
3225151497Sru			obits = pbits = *pte;
3226151497Sru			if ((pbits & PG_V) == 0)
3227151497Sru				continue;
3228151497Sru
3229151497Sru			if ((prot & VM_PROT_WRITE) == 0) {
3230151497Sru				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3231151497Sru				    (PG_MANAGED | PG_M | PG_RW)) {
3232151497Sru					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3233151497Sru					vm_page_dirty(m);
3234151497Sru				}
3235151497Sru				pbits &= ~(PG_RW | PG_M);
3236151497Sru			}
3237151497Sru#ifdef PAE
3238151497Sru			if ((prot & VM_PROT_EXECUTE) == 0)
3239151497Sru				pbits |= pg_nx;
3240151497Sru#endif
3241151497Sru
3242151497Sru			if (pbits != obits) {
3243151497Sru#ifdef PAE
3244151497Sru				if (!atomic_cmpset_64(pte, obits, pbits))
3245151497Sru					goto retry;
3246151497Sru#else
3247151497Sru				if (!atomic_cmpset_int((u_int *)pte, obits,
3248151497Sru				    pbits))
3249151497Sru					goto retry;
3250151497Sru#endif
3251151497Sru				if (obits & PG_G)
3252151497Sru					pmap_invalidate_page(pmap, sva);
3253151497Sru				else
3254151497Sru					anychanged = TRUE;
3255151497Sru			}
3256151497Sru		}
3257151497Sru	}
3258151497Sru	if (anychanged)
3259151497Sru		pmap_invalidate_all(pmap);
3260151497Sru	if (pv_lists_locked) {
3261151497Sru		sched_unpin();
3262151497Sru		vm_page_unlock_queues();
3263151497Sru	}
3264151497Sru	PMAP_UNLOCK(pmap);
3265151497Sru}
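
/*
 * Illustrative sketch, compiled out: pmap_protect() above must use a
 * 64-bit compare-and-swap under PAE because pg_nx lives in the high
 * dword of the PTE.  When only PG_RW and PG_M need to change, a 32-bit
 * compare-and-swap on the low dword suffices, since those bits are
 * among the least significant 32 bits regardless of the PTE size; this
 * is the trick used by pmap_promote_pde() below.  The helper name is
 * hypothetical.
 */
#if 0
static void
pmap_clear_rw_sketch(pt_entry_t *pte)
{
	u_int lo;

	do {
		lo = *(u_int *)pte;
	} while (!atomic_cmpset_int((u_int *)pte, lo, lo & ~(PG_RW | PG_M)));
}
#endif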
3266151497Sru
3267151497Sru/*
3268151497Sru * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3269151497Sru * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3270151497Sru * For promotion to occur, two conditions must be met: (1) the 4KB page
3271151497Sru * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3272151497Sru * mappings must have identical characteristics.
3273151497Sru *
3274151497Sru * Managed (PG_MANAGED) mappings within the kernel address space are not
3275151497Sru * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3276151497Sru * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3277151497Sru * pmap.
3278151497Sru */
3279151497Srustatic void
3280151497Srupmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3281151497Sru{
3282151497Sru	pd_entry_t newpde;
3283151497Sru	pt_entry_t *firstpte, oldpte, pa, *pte;
3284151497Sru	vm_offset_t oldpteva;
3285151497Sru	vm_page_t mpte;
3286151497Sru
3287151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3288151497Sru
3289151497Sru	/*
3290151497Sru	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3291151497Sru	 * either invalid, unused, or does not map the first 4KB physical page
3292151497Sru	 * within a 2- or 4MB page.
3293151497Sru	 */
3294151497Sru	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3295151497Srusetpde:
3296151497Sru	newpde = *firstpte;
3297151497Sru	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3298151497Sru		pmap_pde_p_failures++;
3299151497Sru		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3300151497Sru		    " in pmap %p", va, pmap);
3301151497Sru		return;
3302151497Sru	}
3303151497Sru	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3304151497Sru		pmap_pde_p_failures++;
3305151497Sru		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3306151497Sru		    " in pmap %p", va, pmap);
3307151497Sru		return;
3308151497Sru	}
3309151497Sru	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3310151497Sru		/*
3311151497Sru		 * When PG_M is already clear, PG_RW can be cleared without
3312151497Sru		 * a TLB invalidation.
3313151497Sru		 */
3314151497Sru		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3315151497Sru		    ~PG_RW))
3316151497Sru			goto setpde;
3317151497Sru		newpde &= ~PG_RW;
3318151497Sru	}
3319151497Sru
3320151497Sru	/*
3321151497Sru	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3322151497Sru	 * PTE maps an unexpected 4KB physical page or does not have identical
3323151497Sru	 * characteristics to the first PTE.
3324151497Sru	 */
3325151497Sru	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3326151497Sru	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3327151497Srusetpte:
3328151497Sru		oldpte = *pte;
3329151497Sru		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3330151497Sru			pmap_pde_p_failures++;
3331151497Sru			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3332151497Sru			    " in pmap %p", va, pmap);
3333151497Sru			return;
3334151497Sru		}
3335151497Sru		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3336151497Sru			/*
3337151497Sru			 * When PG_M is already clear, PG_RW can be cleared
3338151497Sru			 * without a TLB invalidation.
3339151497Sru			 */
3340151497Sru			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3341151497Sru			    oldpte & ~PG_RW))
3342151497Sru				goto setpte;
3343151497Sru			oldpte &= ~PG_RW;
3344151497Sru			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3345151497Sru			    (va & ~PDRMASK);
3346151497Sru			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3347151497Sru			    " in pmap %p", oldpteva, pmap);
3348151497Sru		}
3349151497Sru		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3350151497Sru			pmap_pde_p_failures++;
3351151497Sru			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3352151497Sru			    " in pmap %p", va, pmap);
3353151497Sru			return;
3354151497Sru		}
3355151497Sru		pa -= PAGE_SIZE;
3356151497Sru	}
3357151497Sru
3358151497Sru	/*
3359151497Sru	 * Save the page table page in its current state until the PDE
3360151497Sru	 * mapping the superpage is demoted by pmap_demote_pde() or
3361151497Sru	 * destroyed by pmap_remove_pde().
3362151497Sru	 */
3363151497Sru	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3364151497Sru	KASSERT(mpte >= vm_page_array &&
3365151497Sru	    mpte < &vm_page_array[vm_page_array_size],
3366151497Sru	    ("pmap_promote_pde: page table page is out of range"));
3367151497Sru	KASSERT(mpte->pindex == va >> PDRSHIFT,
3368151497Sru	    ("pmap_promote_pde: page table page's pindex is wrong"));
3369151497Sru	pmap_insert_pt_page(pmap, mpte);
3370151497Sru
3371151497Sru	/*
3372151497Sru	 * Promote the pv entries.
3373151497Sru	 */
3374151497Sru	if ((newpde & PG_MANAGED) != 0)
3375151497Sru		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3376151497Sru
3377151497Sru	/*
3378151497Sru	 * Propagate the PAT index to its proper position.
3379151497Sru	 */
3380151497Sru	if ((newpde & PG_PTE_PAT) != 0)
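	/*
	 * In a 4KB PTE the PAT selector is PG_PTE_PAT, but that bit
	 * position doubles as PG_PS in a PDE, so a 2/4MB PDE carries
	 * its PAT selector in PG_PDE_PAT instead.  When PG_PTE_PAT is
	 * set, the XOR below clears it and sets PG_PDE_PAT.
	 */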
3381151497Sru		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3382151497Sru
3383151497Sru	/*
3384151497Sru	 * Map the superpage.
3385151497Sru	 */
3386151497Sru	if (workaround_erratum383)
3387151497Sru		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3388151497Sru	else if (pmap == kernel_pmap)
3389151497Sru		pmap_kenter_pde(va, PG_PS | newpde);
3390151497Sru	else
3391151497Sru		pde_store(pde, PG_PS | newpde);
3392151497Sru
3393151497Sru	pmap_pde_promotions++;
3394151497Sru	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3395151497Sru	    " in pmap %p", va, pmap);
3396151497Sru}
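
/*
 * Illustrative sketch, compiled out: the first test that
 * pmap_promote_pde() above applies to the PTP's first PTE, written as a
 * stand-alone predicate.  The PTE must be valid and accessed, and its
 * frame must begin a 2/4MB-aligned physical region, i.e. the frame bits
 * below the superpage boundary (PG_FRAME & PDRMASK) must all be zero.
 * The helper name is hypothetical.
 */
#if 0
static boolean_t
pmap_promotable_first_pte_sketch(pt_entry_t firstpte)
{

	return ((firstpte & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) ==
	    (PG_A | PG_V));
}
#endif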
3397151497Sru
3398151497Sru/*
3399151497Sru *	Insert the given physical page (p) at
3400151497Sru *	the specified virtual address (v) in the
3401151497Sru *	target physical map with the protection requested.
3402151497Sru *
3403151497Sru *	If specified, the page will be wired down, meaning
3404151497Sru *	that the related pte cannot be reclaimed.
3405151497Sru *
3406151497Sru *	NB:  This is the only routine which MAY NOT lazy-evaluate
3407151497Sru *	or lose information.  That is, this routine must actually
3408151497Sru *	insert this page into the given map NOW.
3409151497Sru */
3410151497Sruvoid
3411151497Srupmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3412151497Sru    vm_prot_t prot, boolean_t wired)
3413151497Sru{
3414151497Sru	pd_entry_t *pde;
3415151497Sru	pt_entry_t *pte;
3416151497Sru	pt_entry_t newpte, origpte;
3417151497Sru	pv_entry_t pv;
3418151497Sru	vm_paddr_t opa, pa;
3419151497Sru	vm_page_t mpte, om;
3420151497Sru	boolean_t invlva;
3421151497Sru
3422151497Sru	va = trunc_page(va);
3423151497Sru	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3424151497Sru	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3425151497Sru	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3426151497Sru	    va));
3427151497Sru	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3428151497Sru	    VM_OBJECT_LOCKED(m->object),
3429151497Sru	    ("pmap_enter: page %p is not busy", m));
3430151497Sru
3431151497Sru	mpte = NULL;
3432151497Sru
3433151497Sru	vm_page_lock_queues();
3434151497Sru	PMAP_LOCK(pmap);
3435151497Sru	sched_pin();
3436151497Sru
3437151497Sru	/*
3438151497Sru	 * In the case that a page table page is not
3439151497Sru	 * resident, we are creating it here.
3440151497Sru	 */
3441151497Sru	if (va < VM_MAXUSER_ADDRESS) {
3442151497Sru		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3443151497Sru	}
3444151497Sru
3445151497Sru	pde = pmap_pde(pmap, va);
3446151497Sru	if ((*pde & PG_PS) != 0)
3447151497Sru		panic("pmap_enter: attempted pmap_enter on 4MB page");
3448151497Sru	pte = pmap_pte_quick(pmap, va);
3449151497Sru
3450151497Sru	/*
3451151497Sru	 * Page directory table entry is not valid; we need a new PT page
3452151497Sru	 */
3453151497Sru	if (pte == NULL) {
3454151497Sru		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3455151497Sru			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3456151497Sru	}
3457151497Sru
3458151497Sru	pa = VM_PAGE_TO_PHYS(m);
3459151497Sru	om = NULL;
3460151497Sru	origpte = *pte;
3461151497Sru	opa = origpte & PG_FRAME;
3462151497Sru
3463151497Sru	/*
3464151497Sru	 * Mapping has not changed, must be protection or wiring change.
3465151497Sru	 */
3466151497Sru	if (origpte && (opa == pa)) {
3467151497Sru		/*
3468151497Sru		 * Wiring change, just update stats. We don't worry about
3469151497Sru		 * wiring PT pages as they remain resident as long as there
3470151497Sru		 * are valid mappings in them. Hence, if a user page is wired,
3471151497Sru		 * the PT page will be also.
3472151497Sru		 */
3473151497Sru		if (wired && ((origpte & PG_W) == 0))
3474151497Sru			pmap->pm_stats.wired_count++;
3475151497Sru		else if (!wired && (origpte & PG_W))
3476151497Sru			pmap->pm_stats.wired_count--;
3477151497Sru
3478151497Sru		/*
3479151497Sru		 * Remove extra pte reference
3480151497Sru		 */
3481151497Sru		if (mpte)
3482151497Sru			mpte->wire_count--;
3483151497Sru
3484151497Sru		if (origpte & PG_MANAGED) {
3485151497Sru			om = m;
3486151497Sru			pa |= PG_MANAGED;
3487151497Sru		}
3488151497Sru		goto validate;
3489151497Sru	}
3490151497Sru
3491151497Sru	pv = NULL;
3492151497Sru
3493151497Sru	/*
3494151497Sru	 * Mapping has changed, invalidate old range and fall through to
3495151497Sru	 * handle validating new mapping.
3496151497Sru	 */
3497151497Sru	if (opa) {
3498151497Sru		if (origpte & PG_W)
3499151497Sru			pmap->pm_stats.wired_count--;
3500151497Sru		if (origpte & PG_MANAGED) {
3501151497Sru			om = PHYS_TO_VM_PAGE(opa);
3502151497Sru			pv = pmap_pvh_remove(&om->md, pmap, va);
3503151497Sru		}
3504151497Sru		if (mpte != NULL) {
3505151497Sru			mpte->wire_count--;
3506151497Sru			KASSERT(mpte->wire_count > 0,
3507151497Sru			    ("pmap_enter: missing reference to page table page,"
3508151497Sru			     " va: 0x%x", va));
3509151497Sru		}
3510151497Sru	} else
3511151497Sru		pmap->pm_stats.resident_count++;
3512151497Sru
3513151497Sru	/*
3514151497Sru	 * Enter on the PV list if part of our managed memory.
3515151497Sru	 */
3516151497Sru	if ((m->oflags & VPO_UNMANAGED) == 0) {
3517151497Sru		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3518151497Sru		    ("pmap_enter: managed mapping within the clean submap"));
3519151497Sru		if (pv == NULL)
3520151497Sru			pv = get_pv_entry(pmap, FALSE);
3521151497Sru		pv->pv_va = va;
3522151497Sru		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3523151497Sru		pa |= PG_MANAGED;
3524151497Sru	} else if (pv != NULL)
3525151497Sru		free_pv_entry(pmap, pv);
3526151497Sru
3527151497Sru	/*
3528151497Sru	 * Increment counters
3529151497Sru	 */
3530151497Sru	if (wired)
3531151497Sru		pmap->pm_stats.wired_count++;
3532151497Sru
3533151497Sruvalidate:
3534151497Sru	/*
3535151497Sru	 * Now validate mapping with desired protection/wiring.
3536151497Sru	 */
3537151497Sru	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3538151497Sru	if ((prot & VM_PROT_WRITE) != 0) {
3539151497Sru		newpte |= PG_RW;
3540151497Sru		if ((newpte & PG_MANAGED) != 0)
3541151497Sru			vm_page_aflag_set(m, PGA_WRITEABLE);
3542151497Sru	}
3543151497Sru#ifdef PAE
3544151497Sru	if ((prot & VM_PROT_EXECUTE) == 0)
3545151497Sru		newpte |= pg_nx;
3546151497Sru#endif
3547151497Sru	if (wired)
3548151497Sru		newpte |= PG_W;
3549151497Sru	if (va < VM_MAXUSER_ADDRESS)
3550151497Sru		newpte |= PG_U;
3551151497Sru	if (pmap == kernel_pmap)
3552151497Sru		newpte |= pgeflag;
3553151497Sru
3554151497Sru	/*
3555151497Sru	 * if the mapping or permission bits are different, we need
3556151497Sru	 * to update the pte.
3557151497Sru	 */
3558151497Sru	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3559151497Sru		newpte |= PG_A;
3560151497Sru		if ((access & VM_PROT_WRITE) != 0)
3561151497Sru			newpte |= PG_M;
3562151497Sru		if (origpte & PG_V) {
3563151497Sru			invlva = FALSE;
3564151497Sru			origpte = pte_load_store(pte, newpte);
3565151497Sru			if (origpte & PG_A) {
3566151497Sru				if (origpte & PG_MANAGED)
3567151497Sru					vm_page_aflag_set(om, PGA_REFERENCED);
3568151497Sru				if (opa != VM_PAGE_TO_PHYS(m))
3569151497Sru					invlva = TRUE;
3570151497Sru#ifdef PAE
3571151497Sru				if ((origpte & PG_NX) == 0 &&
3572151497Sru				    (newpte & PG_NX) != 0)
3573151497Sru					invlva = TRUE;
3574151497Sru#endif
3575151497Sru			}
3576151497Sru			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3577151497Sru				if ((origpte & PG_MANAGED) != 0)
3578151497Sru					vm_page_dirty(om);
3579151497Sru				if ((prot & VM_PROT_WRITE) == 0)
3580151497Sru					invlva = TRUE;
3581151497Sru			}
3582151497Sru			if ((origpte & PG_MANAGED) != 0 &&
3583151497Sru			    TAILQ_EMPTY(&om->md.pv_list) &&
3584151497Sru			    ((om->flags & PG_FICTITIOUS) != 0 ||
3585151497Sru			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3586151497Sru				vm_page_aflag_clear(om, PGA_WRITEABLE);
3587151497Sru			if (invlva)
3588151497Sru				pmap_invalidate_page(pmap, va);
3589151497Sru		} else
3590151497Sru			pte_store(pte, newpte);
3591151497Sru	}
3592151497Sru
3593151497Sru	/*
3594151497Sru	 * If both the page table page and the reservation are fully
3595151497Sru	 * populated, then attempt promotion.
3596151497Sru	 */
3597151497Sru	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3598151497Sru	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3599151497Sru	    vm_reserv_level_iffullpop(m) == 0)
3600151497Sru		pmap_promote_pde(pmap, pde, va);
3601151497Sru
3602151497Sru	sched_unpin();
3603151497Sru	vm_page_unlock_queues();
3604151497Sru	PMAP_UNLOCK(pmap);
3605151497Sru}
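
/*
 * Illustrative sketch, compiled out: how the "validate:" section of
 * pmap_enter() above assembles a new PTE from the page, protection, and
 * wiring arguments.  The helper and its "kernel" parameter are
 * hypothetical simplifications; pmap_enter() itself derives PG_U and
 * pgeflag from the virtual address and the pmap.
 */
#if 0
static pt_entry_t
pmap_build_pte_sketch(vm_page_t m, vm_prot_t prot, boolean_t wired,
    boolean_t kernel)
{
	pt_entry_t newpte;

	/* Physical address, cache attributes, and validity. */
	newpte = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0) |
	    PG_V;
	if ((prot & VM_PROT_WRITE) != 0)
		newpte |= PG_RW;
#ifdef PAE
	if ((prot & VM_PROT_EXECUTE) == 0)
		newpte |= pg_nx;
#endif
	if (wired)
		newpte |= PG_W;
	if (kernel)
		newpte |= pgeflag;
	else
		newpte |= PG_U;
	return (newpte);
}
#endif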
3606151497Sru
3607151497Sru/*
3608151497Sru * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3609151497Sru * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3610151497Sru * blocking, (2) a mapping already exists at the specified virtual address, or
3611151497Sru * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3612151497Sru */
3613151497Srustatic boolean_t
3614151497Srupmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3615151497Sru{
3616151497Sru	pd_entry_t *pde, newpde;
3617151497Sru
3618151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3619151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3620151497Sru	pde = pmap_pde(pmap, va);
3621151497Sru	if (*pde != 0) {
3622151497Sru		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3623151497Sru		    " in pmap %p", va, pmap);
3624151497Sru		return (FALSE);
3625151497Sru	}
3626151497Sru	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3627151497Sru	    PG_PS | PG_V;
3628151497Sru	if ((m->oflags & VPO_UNMANAGED) == 0) {
3629151497Sru		newpde |= PG_MANAGED;
3630151497Sru
3631151497Sru		/*
3632151497Sru		 * Abort this mapping if its PV entry could not be created.
3633151497Sru		 */
3634151497Sru		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3635151497Sru			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3636151497Sru			    " in pmap %p", va, pmap);
3637151497Sru			return (FALSE);
3638151497Sru		}
3639151497Sru	}
3640151497Sru#ifdef PAE
3641151497Sru	if ((prot & VM_PROT_EXECUTE) == 0)
3642151497Sru		newpde |= pg_nx;
3643151497Sru#endif
3644151497Sru	if (va < VM_MAXUSER_ADDRESS)
3645151497Sru		newpde |= PG_U;
3646151497Sru
3647151497Sru	/*
3648151497Sru	 * Increment counters.
3649151497Sru	 */
3650151497Sru	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3651151497Sru
3652151497Sru	/*
3653151497Sru	 * Map the superpage.
3654151497Sru	 */
3655151497Sru	pde_store(pde, newpde);
3656151497Sru
3657151497Sru	pmap_pde_mappings++;
3658151497Sru	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3659151497Sru	    " in pmap %p", va, pmap);
3660151497Sru	return (TRUE);
3661151497Sru}
3662151497Sru
3663151497Sru/*
3664151497Sru * Maps a sequence of resident pages belonging to the same object.
3665151497Sru * The sequence begins with the given page m_start.  This page is
3666151497Sru * mapped at the given virtual address start.  Each subsequent page is
3667151497Sru * mapped at a virtual address that is offset from start by the same
3668151497Sru * amount as the page is offset from m_start within the object.  The
3669151497Sru * last page in the sequence is the page with the largest offset from
3670151497Sru * m_start that can be mapped at a virtual address less than the given
3671151497Sru * virtual address end.  Not every virtual page between start and end
3672151497Sru * is mapped; only those for which a resident page exists with the
3673151497Sru * corresponding offset from m_start are mapped.
3674151497Sru */
3675151497Sruvoid
3676151497Srupmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3677151497Sru    vm_page_t m_start, vm_prot_t prot)
3678151497Sru{
3679151497Sru	vm_offset_t va;
3680151497Sru	vm_page_t m, mpte;
3681151497Sru	vm_pindex_t diff, psize;
3682151497Sru
3683151497Sru	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3684151497Sru	psize = atop(end - start);
3685151497Sru	mpte = NULL;
3686151497Sru	m = m_start;
3687151497Sru	vm_page_lock_queues();
3688151497Sru	PMAP_LOCK(pmap);
3689151497Sru	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3690151497Sru		va = start + ptoa(diff);
3691151497Sru		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3692151497Sru		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3693151497Sru		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3694151497Sru		    pmap_enter_pde(pmap, va, m, prot))
3695151497Sru			m = &m[NBPDR / PAGE_SIZE - 1];
3696151497Sru		else
3697151497Sru			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3698151497Sru			    mpte);
3699151497Sru		m = TAILQ_NEXT(m, listq);
3700151497Sru	}
3701151497Sru	vm_page_unlock_queues();
3702151497Sru	PMAP_UNLOCK(pmap);
3703151497Sru}
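
/*
 * Illustrative sketch, compiled out: the conditions under which
 * pmap_enter_object() above attempts a single 2/4MB mapping instead of
 * a series of 4KB mappings, written as a stand-alone predicate.  The
 * helper name is hypothetical.
 */
#if 0
static boolean_t
pmap_want_pde_mapping_sketch(vm_offset_t va, vm_offset_t end, vm_page_t m)
{

	/*
	 * The virtual address must be superpage aligned, the whole
	 * superpage must fit below "end", the backing physical memory
	 * must be superpage aligned, PSE must be enabled, and the
	 * reservation behind "m" must be fully populated.
	 */
	return ((va & PDRMASK) == 0 && va + NBPDR <= end &&
	    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && pg_ps_enabled &&
	    vm_reserv_level_iffullpop(m) == 0);
}
#endif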
3704151497Sru
3705151497Sru/*
3706151497Sru * This code makes some *MAJOR* assumptions:
3707151497Sru * 1. The pmap is the current pmap and it exists.
3708151497Sru * 2. The mapping is not wired.
3709151497Sru * 3. Read access only.
3710151497Sru * 4. No page table pages.
3711151497Sru * It is, however, *MUCH* faster than pmap_enter...
3712151497Sru */
3713151497Sru
3714151497Sruvoid
3715151497Srupmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3716151497Sru{
3717151497Sru
3718151497Sru	vm_page_lock_queues();
3719151497Sru	PMAP_LOCK(pmap);
3720151497Sru	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3721151497Sru	vm_page_unlock_queues();
3722151497Sru	PMAP_UNLOCK(pmap);
3723151497Sru}
3724151497Sru
3725151497Srustatic vm_page_t
3726151497Srupmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3727151497Sru    vm_prot_t prot, vm_page_t mpte)
3728151497Sru{
3729151497Sru	pt_entry_t *pte;
3730151497Sru	vm_paddr_t pa;
3731151497Sru	vm_page_t free;
3732151497Sru
3733151497Sru	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3734151497Sru	    (m->oflags & VPO_UNMANAGED) != 0,
3735151497Sru	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3736151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3737151497Sru	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3738151497Sru
3739151497Sru	/*
3740151497Sru	 * In the case that a page table page is not
3741151497Sru	 * resident, we are creating it here.
3742151497Sru	 */
3743151497Sru	if (va < VM_MAXUSER_ADDRESS) {
3744151497Sru		u_int ptepindex;
3745151497Sru		pd_entry_t ptepa;
3746151497Sru
3747151497Sru		/*
3748151497Sru		 * Calculate pagetable page index
3749151497Sru		 */
3750151497Sru		ptepindex = va >> PDRSHIFT;
3751151497Sru		if (mpte && (mpte->pindex == ptepindex)) {
3752151497Sru			mpte->wire_count++;
3753151497Sru		} else {
3754151497Sru			/*
3755151497Sru			 * Get the page directory entry
3756151497Sru			 */
3757151497Sru			ptepa = pmap->pm_pdir[ptepindex];
3758151497Sru
3759151497Sru			/*
3760151497Sru			 * If the page table page is mapped, we just increment
3761151497Sru			 * the hold count, and activate it.
3762151497Sru			 */
3763151497Sru			if (ptepa) {
3764151497Sru				if (ptepa & PG_PS)
3765151497Sru					return (NULL);
3766151497Sru				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3767151497Sru				mpte->wire_count++;
3768151497Sru			} else {
3769151497Sru				mpte = _pmap_allocpte(pmap, ptepindex,
3770151497Sru				    M_NOWAIT);
3771151497Sru				if (mpte == NULL)
3772151497Sru					return (mpte);
3773151497Sru			}
3774151497Sru		}
3775151497Sru	} else {
3776151497Sru		mpte = NULL;
3777151497Sru	}
3778151497Sru
3779151497Sru	/*
3780151497Sru	 * This call to vtopte makes the assumption that we are
3781151497Sru	 * entering the page into the current pmap.  In order to support
3782151497Sru	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3783151497Sru	 * But that isn't as quick as vtopte.
3784151497Sru	 */
3785151497Sru	pte = vtopte(va);
3786151497Sru	if (*pte) {
3787151497Sru		if (mpte != NULL) {
3788151497Sru			mpte->wire_count--;
3789151497Sru			mpte = NULL;
3790151497Sru		}
3791151497Sru		return (mpte);
3792151497Sru	}
3793151497Sru
3794151497Sru	/*
3795151497Sru	 * Enter on the PV list if part of our managed memory.
3796151497Sru	 */
3797151497Sru	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3798151497Sru	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3799151497Sru		if (mpte != NULL) {
3800151497Sru			free = NULL;
3801151497Sru			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3802151497Sru				pmap_invalidate_page(pmap, va);
3803151497Sru				pmap_free_zero_pages(free);
3804151497Sru			}
3805151497Sru
3806151497Sru			mpte = NULL;
3807151497Sru		}
3808151497Sru		return (mpte);
3809151497Sru	}
3810151497Sru
3811151497Sru	/*
3812151497Sru	 * Increment counters
3813151497Sru	 */
3814151497Sru	pmap->pm_stats.resident_count++;
3815151497Sru
3816151497Sru	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3817151497Sru#ifdef PAE
3818151497Sru	if ((prot & VM_PROT_EXECUTE) == 0)
3819151497Sru		pa |= pg_nx;
3820151497Sru#endif
3821151497Sru
3822151497Sru	/*
3823151497Sru	 * Now validate mapping with RO protection
3824151497Sru	 */
3825151497Sru	if ((m->oflags & VPO_UNMANAGED) != 0)
3826151497Sru		pte_store(pte, pa | PG_V | PG_U);
3827151497Sru	else
3828151497Sru		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3829151497Sru	return (mpte);
3830151497Sru}
3831151497Sru
3832151497Sru/*
3833151497Sru * Make a temporary mapping for a physical address.  This is only intended
3834151497Sru * to be used for panic dumps.
3835151497Sru */
3836151497Sruvoid *
3837151497Srupmap_kenter_temporary(vm_paddr_t pa, int i)
3838151497Sru{
3839151497Sru	vm_offset_t va;
3840151497Sru
3841151497Sru	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3842151497Sru	pmap_kenter(va, pa);
3843151497Sru	invlpg(va);
3844151497Sru	return ((void *)crashdumpmap);
3845151497Sru}
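
/*
 * Illustrative sketch, compiled out: a hypothetical dump-time caller of
 * pmap_kenter_temporary().  Every call returns the base of crashdumpmap;
 * page "i" becomes readable at that base plus i * PAGE_SIZE.
 */
#if 0
static void
pmap_dump_window_sketch(vm_paddr_t pa)
{
	char *base;

	/* Map two consecutive physical pages into the dump window. */
	base = pmap_kenter_temporary(pa, 0);
	(void)pmap_kenter_temporary(pa + PAGE_SIZE, 1);

	/* The pages may now be copied out from base and base + PAGE_SIZE. */
	(void)base;
}
#endif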
3846151497Sru
3847151497Sru/*
3848151497Sru * This code maps large physical mmap regions into the
3849151497Sru * processor address space.  Note that some shortcuts
3850151497Sru * are taken, but the code works.
3851151497Sru */
3852151497Sruvoid
3853151497Srupmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3854151497Sru    vm_pindex_t pindex, vm_size_t size)
3855151497Sru{
3856151497Sru	pd_entry_t *pde;
3857151497Sru	vm_paddr_t pa, ptepa;
3858151497Sru	vm_page_t p;
3859151497Sru	int pat_mode;
3860151497Sru
3861151497Sru	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3862151497Sru	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3863151497Sru	    ("pmap_object_init_pt: non-device object"));
3864151497Sru	if (pseflag &&
3865151497Sru	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3866151497Sru		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3867151497Sru			return;
3868151497Sru		p = vm_page_lookup(object, pindex);
3869151497Sru		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3870151497Sru		    ("pmap_object_init_pt: invalid page %p", p));
3871151497Sru		pat_mode = p->md.pat_mode;
3872151497Sru
3873151497Sru		/*
3874151497Sru		 * Abort the mapping if the first page is not physically
3875151497Sru		 * aligned to a 2/4MB page boundary.
3876151497Sru		 */
3877151497Sru		ptepa = VM_PAGE_TO_PHYS(p);
3878151497Sru		if (ptepa & (NBPDR - 1))
3879151497Sru			return;
3880151497Sru
3881151497Sru		/*
3882151497Sru		 * Skip the first page.  Abort the mapping if the rest of
3883151497Sru		 * the pages are not physically contiguous or have differing
3884151497Sru		 * memory attributes.
3885151497Sru		 */
3886151497Sru		p = TAILQ_NEXT(p, listq);
3887151497Sru		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3888151497Sru		    pa += PAGE_SIZE) {
3889151497Sru			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3890151497Sru			    ("pmap_object_init_pt: invalid page %p", p));
3891151497Sru			if (pa != VM_PAGE_TO_PHYS(p) ||
3892151497Sru			    pat_mode != p->md.pat_mode)
3893151497Sru				return;
3894151497Sru			p = TAILQ_NEXT(p, listq);
3895151497Sru		}
3896151497Sru
3897151497Sru		/*
3898151497Sru		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3899151497Sru		 * "size" is a multiple of 2/4M, adding the PAT setting to
3900151497Sru		 * "pa" will not affect the termination of this loop.
3901151497Sru		 */
3902151497Sru		PMAP_LOCK(pmap);
3903151497Sru		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3904151497Sru		    size; pa += NBPDR) {
3905151497Sru			pde = pmap_pde(pmap, addr);
3906151497Sru			if (*pde == 0) {
3907151497Sru				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3908151497Sru				    PG_U | PG_RW | PG_V);
3909151497Sru				pmap->pm_stats.resident_count += NBPDR /
3910151497Sru				    PAGE_SIZE;
3911151497Sru				pmap_pde_mappings++;
3912151497Sru			}
3913151497Sru			/* Else continue on if the PDE is already valid. */
3914151497Sru			addr += NBPDR;
3915151497Sru		}
3916151497Sru		PMAP_UNLOCK(pmap);
3917151497Sru	}
3918151497Sru}
3919151497Sru
3920151497Sru/*
3921151497Sru *	Routine:	pmap_change_wiring
3922151497Sru *	Function:	Change the wiring attribute for a map/virtual-address
3923151497Sru *			pair.
3924151497Sru *	In/out conditions:
3925151497Sru *			The mapping must already exist in the pmap.
3926151497Sru */
3927151497Sruvoid
3928151497Srupmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3929151497Sru{
3930151497Sru	pd_entry_t *pde;
3931151497Sru	pt_entry_t *pte;
3932151497Sru	boolean_t are_queues_locked;
3933151497Sru
3934151497Sru	are_queues_locked = FALSE;
3935151497Sruretry:
3936151497Sru	PMAP_LOCK(pmap);
3937151497Sru	pde = pmap_pde(pmap, va);
3938151497Sru	if ((*pde & PG_PS) != 0) {
3939151497Sru		if (!wired != ((*pde & PG_W) == 0)) {
3940151497Sru			if (!are_queues_locked) {
3941151497Sru				are_queues_locked = TRUE;
3942151497Sru				if (!mtx_trylock(&vm_page_queue_mtx)) {
3943151497Sru					PMAP_UNLOCK(pmap);
3944151497Sru					vm_page_lock_queues();
3945151497Sru					goto retry;
3946151497Sru				}
3947151497Sru			}
3948151497Sru			if (!pmap_demote_pde(pmap, pde, va))
3949151497Sru				panic("pmap_change_wiring: demotion failed");
3950151497Sru		} else
3951151497Sru			goto out;
3952151497Sru	}
3953151497Sru	pte = pmap_pte(pmap, va);
3954151497Sru
3955151497Sru	if (wired && !pmap_pte_w(pte))
3956151497Sru		pmap->pm_stats.wired_count++;
3957151497Sru	else if (!wired && pmap_pte_w(pte))
3958151497Sru		pmap->pm_stats.wired_count--;
3959151497Sru
3960151497Sru	/*
3961151497Sru	 * Wiring is not a hardware characteristic so there is no need to
3962151497Sru	 * invalidate TLB.
3963151497Sru	 */
3964151497Sru	pmap_pte_set_w(pte, wired);
3965151497Sru	pmap_pte_release(pte);
3966151497Sruout:
3967151497Sru	if (are_queues_locked)
3968151497Sru		vm_page_unlock_queues();
3969151497Sru	PMAP_UNLOCK(pmap);
3970151497Sru}
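
/*
 * Illustrative sketch, compiled out: the lock acquisition pattern used
 * by pmap_change_wiring() above and pmap_protect() earlier.  The page
 * queues lock orders before the pmap lock, so when the pmap lock is
 * already held the queues lock is only try-locked; on failure both are
 * reacquired in the proper order.  The helper name is hypothetical.
 */
#if 0
static void
pmap_lock_with_queues_sketch(pmap_t pmap)
{
	boolean_t queues_locked;

	queues_locked = FALSE;
retry:
	PMAP_LOCK(pmap);
	if (!queues_locked) {
		if (!mtx_trylock(&vm_page_queue_mtx)) {
			/* Back out and take the locks in order. */
			PMAP_UNLOCK(pmap);
			vm_page_lock_queues();
			queues_locked = TRUE;
			goto retry;
		}
		queues_locked = TRUE;
	}
	/* Both the page queues lock and the pmap lock are now held. */
}
#endif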
3971151497Sru
3972151497Sru
3973151497Sru
3974151497Sru/*
3975151497Sru *	Copy the range specified by src_addr/len
3976151497Sru *	from the source map to the range dst_addr/len
3977151497Sru *	in the destination map.
3978151497Sru *
3979151497Sru *	This routine is only advisory and need not do anything.
3980151497Sru */
3981151497Sru
3982151497Sruvoid
3983151497Srupmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3984151497Sru    vm_offset_t src_addr)
3985151497Sru{
3986151497Sru	vm_page_t   free;
3987151497Sru	vm_offset_t addr;
3988151497Sru	vm_offset_t end_addr = src_addr + len;
3989151497Sru	vm_offset_t pdnxt;
3990151497Sru
3991151497Sru	if (dst_addr != src_addr)
3992151497Sru		return;
3993151497Sru
3994151497Sru	if (!pmap_is_current(src_pmap))
3995151497Sru		return;
3996151497Sru
3997151497Sru	vm_page_lock_queues();
3998151497Sru	if (dst_pmap < src_pmap) {
3999151497Sru		PMAP_LOCK(dst_pmap);
4000151497Sru		PMAP_LOCK(src_pmap);
4001151497Sru	} else {
4002151497Sru		PMAP_LOCK(src_pmap);
4003151497Sru		PMAP_LOCK(dst_pmap);
4004151497Sru	}
4005151497Sru	sched_pin();
4006151497Sru	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4007151497Sru		pt_entry_t *src_pte, *dst_pte;
4008151497Sru		vm_page_t dstmpte, srcmpte;
4009151497Sru		pd_entry_t srcptepaddr;
4010151497Sru		u_int ptepindex;
4011151497Sru
4012151497Sru		KASSERT(addr < UPT_MIN_ADDRESS,
4013151497Sru		    ("pmap_copy: invalid to pmap_copy page tables"));
4014151497Sru
4015151497Sru		pdnxt = (addr + NBPDR) & ~PDRMASK;
4016151497Sru		if (pdnxt < addr)
4017151497Sru			pdnxt = end_addr;
4018151497Sru		ptepindex = addr >> PDRSHIFT;
4019151497Sru
4020151497Sru		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4021151497Sru		if (srcptepaddr == 0)
4022151497Sru			continue;
4023151497Sru
4024151497Sru		if (srcptepaddr & PG_PS) {
4025151497Sru			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4026151497Sru			    ((srcptepaddr & PG_MANAGED) == 0 ||
4027151497Sru			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4028151497Sru			    PG_PS_FRAME))) {
4029151497Sru				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4030151497Sru				    ~PG_W;
4031151497Sru				dst_pmap->pm_stats.resident_count +=
4032151497Sru				    NBPDR / PAGE_SIZE;
4033151497Sru			}
4034151497Sru			continue;
4035151497Sru		}
4036151497Sru
4037151497Sru		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4038151497Sru		KASSERT(srcmpte->wire_count > 0,
4039151497Sru		    ("pmap_copy: source page table page is unused"));
4040151497Sru
4041151497Sru		if (pdnxt > end_addr)
4042151497Sru			pdnxt = end_addr;
4043151497Sru
4044151497Sru		src_pte = vtopte(addr);
4045151497Sru		while (addr < pdnxt) {
4046151497Sru			pt_entry_t ptetemp;
4047151497Sru			ptetemp = *src_pte;
4048151497Sru			/*
4049151497Sru			 * We only virtually copy managed pages.
4050151497Sru			 */
4051151497Sru			if ((ptetemp & PG_MANAGED) != 0) {
4052151497Sru				dstmpte = pmap_allocpte(dst_pmap, addr,
4053151497Sru				    M_NOWAIT);
4054151497Sru				if (dstmpte == NULL)
4055151497Sru					goto out;
4056151497Sru				dst_pte = pmap_pte_quick(dst_pmap, addr);
4057151497Sru				if (*dst_pte == 0 &&
4058151497Sru				    pmap_try_insert_pv_entry(dst_pmap, addr,
4059151497Sru				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4060151497Sru					/*
4061151497Sru					 * Clear the wired, modified, and
4062151497Sru					 * accessed (referenced) bits
4063151497Sru					 * during the copy.
4064151497Sru					 */
4065151497Sru					*dst_pte = ptetemp & ~(PG_W | PG_M |
4066151497Sru					    PG_A);
4067151497Sru					dst_pmap->pm_stats.resident_count++;
4068151497Sru	 			} else {
4069151497Sru					free = NULL;
4070151497Sru					if (pmap_unwire_pte_hold(dst_pmap,
4071151497Sru					    dstmpte, &free)) {
4072151497Sru						pmap_invalidate_page(dst_pmap,
4073151497Sru						    addr);
4074151497Sru						pmap_free_zero_pages(free);
4075151497Sru					}
4076151497Sru					goto out;
4077151497Sru				}
4078151497Sru				if (dstmpte->wire_count >= srcmpte->wire_count)
4079151497Sru					break;
4080151497Sru			}
4081151497Sru			addr += PAGE_SIZE;
4082151497Sru			src_pte++;
4083151497Sru		}
4084151497Sru	}
4085151497Sruout:
4086151497Sru	sched_unpin();
4087151497Sru	vm_page_unlock_queues();
4088151497Sru	PMAP_UNLOCK(src_pmap);
4089151497Sru	PMAP_UNLOCK(dst_pmap);
4090151497Sru}
4091151497Sru
4092151497Srustatic __inline void
4093151497Srupagezero(void *page)
4094151497Sru{
4095151497Sru#if defined(I686_CPU)
4096151497Sru	if (cpu_class == CPUCLASS_686) {
4097151497Sru#if defined(CPU_ENABLE_SSE)
4098151497Sru		if (cpu_feature & CPUID_SSE2)
4099151497Sru			sse2_pagezero(page);
4100151497Sru		else
4101151497Sru#endif
4102151497Sru			i686_pagezero(page);
4103151497Sru	} else
4104151497Sru#endif
4105151497Sru		bzero(page, PAGE_SIZE);
4106151497Sru}
4107151497Sru
4108151497Sru/*
4109151497Sru *	pmap_zero_page zeros the specified hardware page by mapping
4110151497Sru *	the page into KVM and using bzero to clear its contents.
4111151497Sru */
4112151497Sruvoid
4113151497Srupmap_zero_page(vm_page_t m)
4114151497Sru{
4115151497Sru	struct sysmaps *sysmaps;
4116151497Sru
4117151497Sru	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4118151497Sru	mtx_lock(&sysmaps->lock);
4119151497Sru	if (*sysmaps->CMAP2)
4120151497Sru		panic("pmap_zero_page: CMAP2 busy");
4121151497Sru	sched_pin();
4122151497Sru	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4123151497Sru	    pmap_cache_bits(m->md.pat_mode, 0);
4124151497Sru	invlcaddr(sysmaps->CADDR2);
4125151497Sru	pagezero(sysmaps->CADDR2);
4126151497Sru	*sysmaps->CMAP2 = 0;
4127151497Sru	sched_unpin();
4128151497Sru	mtx_unlock(&sysmaps->lock);
4129151497Sru}
4130151497Sru
4131151497Sru/*
4132151497Sru *	pmap_zero_page_area zeros the specified hardware page by mapping
4133151497Sru *	the page into KVM and using bzero to clear its contents.
4134151497Sru *
4135151497Sru *	off and size may not cover an area beyond a single hardware page.
4136151497Sru */
4137151497Sruvoid
4138151497Srupmap_zero_page_area(vm_page_t m, int off, int size)
4139151497Sru{
4140151497Sru	struct sysmaps *sysmaps;
4141151497Sru
4142151497Sru	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4143151497Sru	mtx_lock(&sysmaps->lock);
4144151497Sru	if (*sysmaps->CMAP2)
4145151497Sru		panic("pmap_zero_page_area: CMAP2 busy");
4146151497Sru	sched_pin();
4147151497Sru	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4148151497Sru	    pmap_cache_bits(m->md.pat_mode, 0);
4149151497Sru	invlcaddr(sysmaps->CADDR2);
4150151497Sru	if (off == 0 && size == PAGE_SIZE)
4151151497Sru		pagezero(sysmaps->CADDR2);
4152151497Sru	else
4153151497Sru		bzero((char *)sysmaps->CADDR2 + off, size);
4154151497Sru	*sysmaps->CMAP2 = 0;
4155151497Sru	sched_unpin();
4156151497Sru	mtx_unlock(&sysmaps->lock);
4157151497Sru}
4158151497Sru
4159151497Sru/*
4160151497Sru *	pmap_zero_page_idle zeros the specified hardware page by mapping
4161151497Sru *	the page into KVM and using bzero to clear its contents.  This
4162151497Sru *	is intended to be called from the vm_pagezero process only and
4163151497Sru *	outside of Giant.
4164151497Sru */
4165151497Sruvoid
4166151497Srupmap_zero_page_idle(vm_page_t m)
4167151497Sru{
4168151497Sru
4169151497Sru	if (*CMAP3)
4170151497Sru		panic("pmap_zero_page_idle: CMAP3 busy");
4171151497Sru	sched_pin();
4172151497Sru	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4173151497Sru	    pmap_cache_bits(m->md.pat_mode, 0);
4174151497Sru	invlcaddr(CADDR3);
4175151497Sru	pagezero(CADDR3);
4176151497Sru	*CMAP3 = 0;
4177151497Sru	sched_unpin();
4178151497Sru}
4179151497Sru
4180151497Sru/*
4181151497Sru *	pmap_copy_page copies the specified (machine independent)
4182151497Sru *	page by mapping the page into virtual memory and using
4183151497Sru *	bcopy to copy the page, one machine dependent page at a
4184151497Sru *	time.
4185151497Sru */
4186151497Sruvoid
4187151497Srupmap_copy_page(vm_page_t src, vm_page_t dst)
4188151497Sru{
4189151497Sru	struct sysmaps *sysmaps;
4190151497Sru
4191151497Sru	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4192151497Sru	mtx_lock(&sysmaps->lock);
4193151497Sru	if (*sysmaps->CMAP1)
4194151497Sru		panic("pmap_copy_page: CMAP1 busy");
4195151497Sru	if (*sysmaps->CMAP2)
4196151497Sru		panic("pmap_copy_page: CMAP2 busy");
4197151497Sru	sched_pin();
4198151497Sru	invlpg((u_int)sysmaps->CADDR1);
4199151497Sru	invlpg((u_int)sysmaps->CADDR2);
4200151497Sru	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4201151497Sru	    pmap_cache_bits(src->md.pat_mode, 0);
4202151497Sru	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4203151497Sru	    pmap_cache_bits(dst->md.pat_mode, 0);
4204151497Sru	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4205151497Sru	*sysmaps->CMAP1 = 0;
4206151497Sru	*sysmaps->CMAP2 = 0;
4207151497Sru	sched_unpin();
4208151497Sru	mtx_unlock(&sysmaps->lock);
4209151497Sru}
4210151497Sru
4211151497Sru/*
4212151497Sru * Returns true if the pmap's pv is one of the first
4213151497Sru * 16 pvs linked to from this page.  This count may
4214151497Sru * be changed upwards or downwards in the future; it
4215151497Sru * is only necessary that true be returned for a small
4216151497Sru * subset of pmaps for proper page aging.
4217151497Sru */
4218151497Sruboolean_t
4219151497Srupmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4220151497Sru{
4221151497Sru	struct md_page *pvh;
4222151497Sru	pv_entry_t pv;
4223151497Sru	int loops = 0;
4224151497Sru	boolean_t rv;
4225151497Sru
4226151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4227151497Sru	    ("pmap_page_exists_quick: page %p is not managed", m));
4228151497Sru	rv = FALSE;
4229151497Sru	vm_page_lock_queues();
4230151497Sru	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4231151497Sru		if (PV_PMAP(pv) == pmap) {
4232151497Sru			rv = TRUE;
4233151497Sru			break;
4234151497Sru		}
4235151497Sru		loops++;
4236151497Sru		if (loops >= 16)
4237151497Sru			break;
4238151497Sru	}
4239151497Sru	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4240151497Sru		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4241151497Sru		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4242151497Sru			if (PV_PMAP(pv) == pmap) {
4243151497Sru				rv = TRUE;
4244151497Sru				break;
4245151497Sru			}
4246151497Sru			loops++;
4247151497Sru			if (loops >= 16)
4248151497Sru				break;
4249151497Sru		}
4250151497Sru	}
4251151497Sru	vm_page_unlock_queues();
4252151497Sru	return (rv);
4253151497Sru}
4254151497Sru
4255151497Sru/*
4256151497Sru *	pmap_page_wired_mappings:
4257151497Sru *
4258151497Sru *	Return the number of managed mappings to the given physical page
4259151497Sru *	that are wired.
4260151497Sru */
4261151497Sruint
4262151497Srupmap_page_wired_mappings(vm_page_t m)
4263151497Sru{
4264151497Sru	int count;
4265151497Sru
4266151497Sru	count = 0;
4267151497Sru	if ((m->oflags & VPO_UNMANAGED) != 0)
4268151497Sru		return (count);
4269151497Sru	vm_page_lock_queues();
4270151497Sru	count = pmap_pvh_wired_mappings(&m->md, count);
4271151497Sru	if ((m->flags & PG_FICTITIOUS) == 0) {
4272151497Sru	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4273151497Sru	        count);
4274151497Sru	}
4275151497Sru	vm_page_unlock_queues();
4276151497Sru	return (count);
4277151497Sru}
4278151497Sru
4279151497Sru/*
4280151497Sru *	pmap_pvh_wired_mappings:
4281151497Sru *
4282151497Sru *	Return the updated number "count" of managed mappings that are wired.
4283151497Sru */
4284151497Srustatic int
4285151497Srupmap_pvh_wired_mappings(struct md_page *pvh, int count)
4286151497Sru{
4287151497Sru	pmap_t pmap;
4288151497Sru	pt_entry_t *pte;
4289151497Sru	pv_entry_t pv;
4290151497Sru
4291151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4292151497Sru	sched_pin();
4293151497Sru	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4294151497Sru		pmap = PV_PMAP(pv);
4295151497Sru		PMAP_LOCK(pmap);
4296151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4297151497Sru		if ((*pte & PG_W) != 0)
4298151497Sru			count++;
4299151497Sru		PMAP_UNLOCK(pmap);
4300151497Sru	}
4301151497Sru	sched_unpin();
4302151497Sru	return (count);
4303151497Sru}
4304151497Sru
4305151497Sru/*
4306151497Sru * Returns TRUE if the given page is mapped individually or as part of
4307151497Sru * a 4mpage.  Otherwise, returns FALSE.
4308151497Sru */
4309151497Sruboolean_t
4310151497Srupmap_page_is_mapped(vm_page_t m)
4311151497Sru{
4312151497Sru	boolean_t rv;
4313151497Sru
4314151497Sru	if ((m->oflags & VPO_UNMANAGED) != 0)
4315151497Sru		return (FALSE);
4316151497Sru	vm_page_lock_queues();
4317151497Sru	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4318151497Sru	    ((m->flags & PG_FICTITIOUS) == 0 &&
4319151497Sru	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4320151497Sru	vm_page_unlock_queues();
4321151497Sru	return (rv);
4322151497Sru}
4323151497Sru
4324151497Sru/*
4325151497Sru * Remove all pages from the specified address space;
4326151497Sru * this aids process exit speeds.  Also, this code
4327151497Sru * is special cased for current process only, but
4328151497Sru * can have the more generic (and slightly slower)
4329151497Sru * mode enabled.  This is much faster than pmap_remove
4330151497Sru * in the case of running down an entire address space.
4331151497Sru */
4332151497Sruvoid
4333151497Srupmap_remove_pages(pmap_t pmap)
4334151497Sru{
4335151497Sru	pt_entry_t *pte, tpte;
4336151497Sru	vm_page_t free = NULL;
4337151497Sru	vm_page_t m, mpte, mt;
4338151497Sru	pv_entry_t pv;
4339151497Sru	struct md_page *pvh;
4340151497Sru	struct pv_chunk *pc, *npc;
4341151497Sru	int field, idx;
4342151497Sru	int32_t bit;
4343151497Sru	uint32_t inuse, bitmask;
4344151497Sru	int allfree;
4345151497Sru
4346151497Sru	if (pmap != PCPU_GET(curpmap)) {
4347151497Sru		printf("warning: pmap_remove_pages called with non-current pmap\n");
4348151497Sru		return;
4349151497Sru	}
4350151497Sru	vm_page_lock_queues();
4351151497Sru	PMAP_LOCK(pmap);
4352151497Sru	sched_pin();
4353151497Sru	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4354151497Sru		allfree = 1;
4355151497Sru		for (field = 0; field < _NPCM; field++) {
4356151497Sru			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4357151497Sru			while (inuse != 0) {
4358151497Sru				bit = bsfl(inuse);
4359151497Sru				bitmask = 1UL << bit;
4360151497Sru				idx = field * 32 + bit;
4361151497Sru				pv = &pc->pc_pventry[idx];
4362151497Sru				inuse &= ~bitmask;
4363151497Sru
4364151497Sru				pte = pmap_pde(pmap, pv->pv_va);
4365151497Sru				tpte = *pte;
4366151497Sru				if ((tpte & PG_PS) == 0) {
4367151497Sru					pte = vtopte(pv->pv_va);
4368151497Sru					tpte = *pte & ~PG_PTE_PAT;
4369151497Sru				}
4370151497Sru
4371151497Sru				if (tpte == 0) {
4372151497Sru					printf(
4373151497Sru					    "TPTE at %p  IS ZERO @ VA %08x\n",
4374151497Sru					    pte, pv->pv_va);
4375151497Sru					panic("bad pte");
4376151497Sru				}
4377151497Sru
4378151497Sru/*
4379151497Sru * We cannot remove wired pages from a process' mapping at this time
4380151497Sru */
4381151497Sru				if (tpte & PG_W) {
4382151497Sru					allfree = 0;
4383151497Sru					continue;
4384151497Sru				}
4385151497Sru
4386151497Sru				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4387151497Sru				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4388151497Sru				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4389151497Sru				    m, (uintmax_t)m->phys_addr,
4390151497Sru				    (uintmax_t)tpte));
4391151497Sru
4392151497Sru				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4393151497Sru				    m < &vm_page_array[vm_page_array_size],
4394151497Sru				    ("pmap_remove_pages: bad tpte %#jx",
4395151497Sru				    (uintmax_t)tpte));
4396151497Sru
4397151497Sru				pte_clear(pte);
4398151497Sru
4399151497Sru				/*
4400151497Sru				 * Update the vm_page_t dirty state.
4401151497Sru				 */
4402151497Sru				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4403151497Sru					if ((tpte & PG_PS) != 0) {
4404151497Sru						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4405151497Sru							vm_page_dirty(mt);
4406151497Sru					} else
4407151497Sru						vm_page_dirty(m);
4408151497Sru				}
4409151497Sru
4410151497Sru				/* Mark free */
4411151497Sru				PV_STAT(pv_entry_frees++);
4412151497Sru				PV_STAT(pv_entry_spare++);
4413151497Sru				pv_entry_count--;
4414151497Sru				pc->pc_map[field] |= bitmask;
4415151497Sru				if ((tpte & PG_PS) != 0) {
4416151497Sru					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4417151497Sru					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4418151497Sru					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4419151497Sru					if (TAILQ_EMPTY(&pvh->pv_list)) {
4420151497Sru						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4421151497Sru							if (TAILQ_EMPTY(&mt->md.pv_list))
4422151497Sru								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4423151497Sru					}
4424151497Sru					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4425151497Sru					if (mpte != NULL) {
4426151497Sru						pmap_remove_pt_page(pmap, mpte);
4427151497Sru						pmap->pm_stats.resident_count--;
4428151497Sru						KASSERT(mpte->wire_count == NPTEPG,
4429151497Sru						    ("pmap_remove_pages: pte page wire count error"));
4430151497Sru						mpte->wire_count = 0;
4431151497Sru						pmap_add_delayed_free_list(mpte, &free, FALSE);
4432151497Sru						atomic_subtract_int(&cnt.v_wire_count, 1);
4433151497Sru					}
4434151497Sru				} else {
4435151497Sru					pmap->pm_stats.resident_count--;
4436151497Sru					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4437151497Sru					if (TAILQ_EMPTY(&m->md.pv_list) &&
4438151497Sru					    (m->flags & PG_FICTITIOUS) == 0) {
4439151497Sru						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4440151497Sru						if (TAILQ_EMPTY(&pvh->pv_list))
4441151497Sru							vm_page_aflag_clear(m, PGA_WRITEABLE);
4442151497Sru					}
4443151497Sru					pmap_unuse_pt(pmap, pv->pv_va, &free);
4444151497Sru				}
4445151497Sru			}
4446151497Sru		}
4447151497Sru		if (allfree) {
4448151497Sru			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4449151497Sru			free_pv_chunk(pc);
4450151497Sru		}
4451151497Sru	}
4452151497Sru	sched_unpin();
4453151497Sru	pmap_invalidate_all(pmap);
4454151497Sru	vm_page_unlock_queues();
4455151497Sru	PMAP_UNLOCK(pmap);
4456151497Sru	pmap_free_zero_pages(free);
4457151497Sru}
4458151497Sru
4459151497Sru/*
4460151497Sru *	pmap_is_modified:
4461151497Sru *
4462151497Sru *	Return whether or not the specified physical page was modified
4463151497Sru *	in any physical maps.
4464151497Sru */
4465151497Sruboolean_t
4466151497Srupmap_is_modified(vm_page_t m)
4467151497Sru{
4468151497Sru	boolean_t rv;
4469151497Sru
4470151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4471151497Sru	    ("pmap_is_modified: page %p is not managed", m));
4472151497Sru
4473151497Sru	/*
4474151497Sru	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4475151497Sru	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4476151497Sru	 * is clear, no PTEs can have PG_M set.
4477151497Sru	 */
4478151497Sru	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4479151497Sru	if ((m->oflags & VPO_BUSY) == 0 &&
4480151497Sru	    (m->aflags & PGA_WRITEABLE) == 0)
4481151497Sru		return (FALSE);
4482151497Sru	vm_page_lock_queues();
4483151497Sru	rv = pmap_is_modified_pvh(&m->md) ||
4484151497Sru	    ((m->flags & PG_FICTITIOUS) == 0 &&
4485151497Sru	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4486151497Sru	vm_page_unlock_queues();
4487151497Sru	return (rv);
4488151497Sru}
4489151497Sru
4490151497Sru/*
4491151497Sru * Returns TRUE if any of the given mappings were used to modify
4492151497Sru * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4493151497Sru * mappings are supported.
4494151497Sru */
4495151497Srustatic boolean_t
4496151497Srupmap_is_modified_pvh(struct md_page *pvh)
4497151497Sru{
4498151497Sru	pv_entry_t pv;
4499151497Sru	pt_entry_t *pte;
4500151497Sru	pmap_t pmap;
4501151497Sru	boolean_t rv;
4502151497Sru
4503151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4504151497Sru	rv = FALSE;
4505151497Sru	sched_pin();
4506151497Sru	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4507151497Sru		pmap = PV_PMAP(pv);
4508151497Sru		PMAP_LOCK(pmap);
4509151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4510151497Sru		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4511151497Sru		PMAP_UNLOCK(pmap);
4512151497Sru		if (rv)
4513151497Sru			break;
4514151497Sru	}
4515151497Sru	sched_unpin();
4516151497Sru	return (rv);
4517151497Sru}
4518151497Sru
4519151497Sru/*
4520151497Sru *	pmap_is_prefaultable:
4521151497Sru *
4522151497Sru *	Return whether or not the specified virtual address is eligible
4523151497Sru *	for prefault.
4524151497Sru */
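/*
 * A virtual address qualifies only when its page directory entry is valid,
 * is not a 2/4MB mapping, and the corresponding 4KB page table entry is
 * still empty, so that a mapping can be installed without allocating a
 * page table page.
 */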
4525151497Sruboolean_t
4526151497Srupmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4527151497Sru{
4528151497Sru	pd_entry_t *pde;
4529151497Sru	pt_entry_t *pte;
4530151497Sru	boolean_t rv;
4531151497Sru
4532151497Sru	rv = FALSE;
4533151497Sru	PMAP_LOCK(pmap);
4534151497Sru	pde = pmap_pde(pmap, addr);
4535151497Sru	if (*pde != 0 && (*pde & PG_PS) == 0) {
4536151497Sru		pte = vtopte(addr);
4537151497Sru		rv = *pte == 0;
4538151497Sru	}
4539151497Sru	PMAP_UNLOCK(pmap);
4540151497Sru	return (rv);
4541151497Sru}
4542151497Sru
4543151497Sru/*
4544151497Sru *	pmap_is_referenced:
4545151497Sru *
4546151497Sru *	Return whether or not the specified physical page was referenced
4547151497Sru *	in any physical maps.
4548151497Sru */
4549151497Sruboolean_t
4550151497Srupmap_is_referenced(vm_page_t m)
4551151497Sru{
4552151497Sru	boolean_t rv;
4553151497Sru
4554151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4555151497Sru	    ("pmap_is_referenced: page %p is not managed", m));
4556151497Sru	vm_page_lock_queues();
4557151497Sru	rv = pmap_is_referenced_pvh(&m->md) ||
4558151497Sru	    ((m->flags & PG_FICTITIOUS) == 0 &&
4559151497Sru	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4560151497Sru	vm_page_unlock_queues();
4561151497Sru	return (rv);
4562151497Sru}
4563151497Sru
4564151497Sru/*
4565151497Sru * Returns TRUE if any of the given mappings were referenced and FALSE
4566151497Sru * otherwise.  Both page and 4mpage mappings are supported.
4567151497Sru */
4568151497Srustatic boolean_t
4569151497Srupmap_is_referenced_pvh(struct md_page *pvh)
4570151497Sru{
4571151497Sru	pv_entry_t pv;
4572151497Sru	pt_entry_t *pte;
4573151497Sru	pmap_t pmap;
4574151497Sru	boolean_t rv;
4575151497Sru
4576151497Sru	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4577151497Sru	rv = FALSE;
4578151497Sru	sched_pin();
4579151497Sru	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4580151497Sru		pmap = PV_PMAP(pv);
4581151497Sru		PMAP_LOCK(pmap);
4582151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4583151497Sru		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4584151497Sru		PMAP_UNLOCK(pmap);
4585151497Sru		if (rv)
4586151497Sru			break;
4587151497Sru	}
4588151497Sru	sched_unpin();
4589151497Sru	return (rv);
4590151497Sru}
4591151497Sru
4592151497Sru/*
4593151497Sru * Clear the write and modified bits in each of the given page's mappings.
4594151497Sru */
4595151497Sruvoid
4596151497Srupmap_remove_write(vm_page_t m)
4597151497Sru{
4598151497Sru	struct md_page *pvh;
4599151497Sru	pv_entry_t next_pv, pv;
4600151497Sru	pmap_t pmap;
4601151497Sru	pd_entry_t *pde;
4602151497Sru	pt_entry_t oldpte, *pte;
4603151497Sru	vm_offset_t va;
4604151497Sru
4605151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4606151497Sru	    ("pmap_remove_write: page %p is not managed", m));
4607151497Sru
4608151497Sru	/*
4609151497Sru	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4610151497Sru	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4611151497Sru	 * is clear, no page table entries need updating.
4612151497Sru	 */
4613151497Sru	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4614151497Sru	if ((m->oflags & VPO_BUSY) == 0 &&
4615151497Sru	    (m->aflags & PGA_WRITEABLE) == 0)
4616151497Sru		return;
4617151497Sru	vm_page_lock_queues();
4618151497Sru	sched_pin();
4619151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
4620151497Sru		goto small_mappings;
4621151497Sru	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4622151497Sru	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4623151497Sru		va = pv->pv_va;
4624151497Sru		pmap = PV_PMAP(pv);
4625151497Sru		PMAP_LOCK(pmap);
4626151497Sru		pde = pmap_pde(pmap, va);
4627151497Sru		if ((*pde & PG_RW) != 0)
4628151497Sru			(void)pmap_demote_pde(pmap, pde, va);
4629151497Sru		PMAP_UNLOCK(pmap);
4630151497Sru	}
4631151497Srusmall_mappings:
4632151497Sru	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4633151497Sru		pmap = PV_PMAP(pv);
4634151497Sru		PMAP_LOCK(pmap);
4635151497Sru		pde = pmap_pde(pmap, pv->pv_va);
4636151497Sru		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4637151497Sru		    " a 4mpage in page %p's pv list", m));
4638151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4639151497Sruretry:
4640151497Sru		oldpte = *pte;
4641151497Sru		if ((oldpte & PG_RW) != 0) {
4642151497Sru			/*
4643151497Sru			 * Regardless of whether a pte is 32 or 64 bits
4644151497Sru			 * in size, PG_RW and PG_M are among the least
4645151497Sru			 * significant 32 bits.
4646151497Sru			 */
4647151497Sru			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4648151497Sru			    oldpte & ~(PG_RW | PG_M)))
4649151497Sru				goto retry;
4650151497Sru			if ((oldpte & PG_M) != 0)
4651151497Sru				vm_page_dirty(m);
4652151497Sru			pmap_invalidate_page(pmap, pv->pv_va);
4653151497Sru		}
4654151497Sru		PMAP_UNLOCK(pmap);
4655151497Sru	}
4656151497Sru	vm_page_aflag_clear(m, PGA_WRITEABLE);
4657151497Sru	sched_unpin();
4658151497Sru	vm_page_unlock_queues();
4659151497Sru}
4660151497Sru
4661151497Sru/*
4662151497Sru *	pmap_ts_referenced:
4663151497Sru *
4664151497Sru *	Return a count of reference bits for a page, clearing those bits.
4665151497Sru *	It is not necessary for every reference bit to be cleared, but it
4666151497Sru *	is necessary that 0 only be returned when there are truly no
4667151497Sru *	reference bits set.
4668151497Sru *
4669151497Sru *	XXX: The exact number of bits to check and clear is a matter that
4670151497Sru *	should be tested and standardized at some point in the future for
4671151497Sru *	optimal aging of shared pages.
4672151497Sru */
4673151497Sruint
4674151497Srupmap_ts_referenced(vm_page_t m)
4675151497Sru{
4676151497Sru	struct md_page *pvh;
4677151497Sru	pv_entry_t pv, pvf, pvn;
4678151497Sru	pmap_t pmap;
4679151497Sru	pd_entry_t oldpde, *pde;
4680151497Sru	pt_entry_t *pte;
4681151497Sru	vm_offset_t va;
4682151497Sru	int rtval = 0;
4683151497Sru
4684151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4685151497Sru	    ("pmap_ts_referenced: page %p is not managed", m));
4686151497Sru	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4687151497Sru	vm_page_lock_queues();
4688151497Sru	sched_pin();
4689151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
4690151497Sru		goto small_mappings;
4691151497Sru	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4692151497Sru		va = pv->pv_va;
4693151497Sru		pmap = PV_PMAP(pv);
4694151497Sru		PMAP_LOCK(pmap);
4695151497Sru		pde = pmap_pde(pmap, va);
4696151497Sru		oldpde = *pde;
4697151497Sru		if ((oldpde & PG_A) != 0) {
4698151497Sru			if (pmap_demote_pde(pmap, pde, va)) {
4699151497Sru				if ((oldpde & PG_W) == 0) {
4700151497Sru					/*
4701151497Sru					 * Remove the mapping to a single page
4702151497Sru					 * so that a subsequent access may
4703151497Sru					 * repromote.  Since the underlying
4704151497Sru					 * page table page is fully populated,
4705151497Sru					 * this removal never frees a page
4706151497Sru					 * table page.
4707151497Sru					 */
4708151497Sru					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4709151497Sru					    PG_PS_FRAME);
4710151497Sru					pmap_remove_page(pmap, va, NULL);
4711151497Sru					rtval++;
4712151497Sru					if (rtval > 4) {
4713151497Sru						PMAP_UNLOCK(pmap);
4714151497Sru						goto out;
4715151497Sru					}
4716151497Sru				}
4717151497Sru			}
4718151497Sru		}
4719151497Sru		PMAP_UNLOCK(pmap);
4720151497Sru	}
4721151497Srusmall_mappings:
4722151497Sru	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4723151497Sru		pvf = pv;
4724151497Sru		do {
4725151497Sru			pvn = TAILQ_NEXT(pv, pv_list);
4726151497Sru			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4727151497Sru			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4728151497Sru			pmap = PV_PMAP(pv);
4729151497Sru			PMAP_LOCK(pmap);
4730151497Sru			pde = pmap_pde(pmap, pv->pv_va);
4731151497Sru			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4732151497Sru			    " found a 4mpage in page %p's pv list", m));
4733151497Sru			pte = pmap_pte_quick(pmap, pv->pv_va);
4734151497Sru			if ((*pte & PG_A) != 0) {
4735151497Sru				atomic_clear_int((u_int *)pte, PG_A);
4736151497Sru				pmap_invalidate_page(pmap, pv->pv_va);
4737151497Sru				rtval++;
4738151497Sru				if (rtval > 4)
4739151497Sru					pvn = NULL;
4740151497Sru			}
4741151497Sru			PMAP_UNLOCK(pmap);
4742151497Sru		} while ((pv = pvn) != NULL && pv != pvf);
4743151497Sru	}
4744151497Sruout:
4745151497Sru	sched_unpin();
4746151497Sru	vm_page_unlock_queues();
4747151497Sru	return (rtval);
4748151497Sru}
4749151497Sru
4750151497Sru/*
4751151497Sru *	Clear the modify bits on the specified physical page.
4752151497Sru */
4753151497Sruvoid
4754151497Srupmap_clear_modify(vm_page_t m)
4755151497Sru{
4756151497Sru	struct md_page *pvh;
4757151497Sru	pv_entry_t next_pv, pv;
4758151497Sru	pmap_t pmap;
4759151497Sru	pd_entry_t oldpde, *pde;
4760151497Sru	pt_entry_t oldpte, *pte;
4761151497Sru	vm_offset_t va;
4762151497Sru
4763151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4764151497Sru	    ("pmap_clear_modify: page %p is not managed", m));
4765151497Sru	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4766151497Sru	KASSERT((m->oflags & VPO_BUSY) == 0,
4767151497Sru	    ("pmap_clear_modify: page %p is busy", m));
4768151497Sru
4769151497Sru	/*
4770151497Sru	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4771151497Sru	 * If the object containing the page is locked and the page is not
4772151497Sru	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4773151497Sru	 */
4774151497Sru	if ((m->aflags & PGA_WRITEABLE) == 0)
4775151497Sru		return;
4776151497Sru	vm_page_lock_queues();
4777151497Sru	sched_pin();
4778151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
4779151497Sru		goto small_mappings;
4780151497Sru	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4781151497Sru	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4782151497Sru		va = pv->pv_va;
4783151497Sru		pmap = PV_PMAP(pv);
4784151497Sru		PMAP_LOCK(pmap);
4785151497Sru		pde = pmap_pde(pmap, va);
4786151497Sru		oldpde = *pde;
4787151497Sru		if ((oldpde & PG_RW) != 0) {
4788151497Sru			if (pmap_demote_pde(pmap, pde, va)) {
4789151497Sru				if ((oldpde & PG_W) == 0) {
4790151497Sru					/*
4791151497Sru					 * Write protect the mapping to a
4792151497Sru					 * single page so that a subsequent
4793151497Sru					 * write access may repromote.
4794151497Sru					 */
4795151497Sru					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4796151497Sru					    PG_PS_FRAME);
4797151497Sru					pte = pmap_pte_quick(pmap, va);
4798151497Sru					oldpte = *pte;
4799151497Sru					if ((oldpte & PG_V) != 0) {
4800151497Sru						/*
4801151497Sru						 * Regardless of whether a pte is 32 or 64 bits
4802151497Sru						 * in size, PG_RW and PG_M are among the least
4803151497Sru						 * significant 32 bits.
4804151497Sru						 */
4805151497Sru						while (!atomic_cmpset_int((u_int *)pte,
4806151497Sru						    oldpte,
4807151497Sru						    oldpte & ~(PG_M | PG_RW)))
4808151497Sru							oldpte = *pte;
4809151497Sru						vm_page_dirty(m);
4810151497Sru						pmap_invalidate_page(pmap, va);
4811151497Sru					}
4812151497Sru				}
4813151497Sru			}
4814151497Sru		}
4815151497Sru		PMAP_UNLOCK(pmap);
4816151497Sru	}
4817151497Srusmall_mappings:
4818151497Sru	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4819151497Sru		pmap = PV_PMAP(pv);
4820151497Sru		PMAP_LOCK(pmap);
4821151497Sru		pde = pmap_pde(pmap, pv->pv_va);
4822151497Sru		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4823151497Sru		    " a 4mpage in page %p's pv list", m));
4824151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4825151497Sru		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4826151497Sru			/*
4827151497Sru			 * Regardless of whether a pte is 32 or 64 bits
4828151497Sru			 * in size, PG_M is among the least significant
4829151497Sru			 * 32 bits.
4830151497Sru			 */
4831151497Sru			atomic_clear_int((u_int *)pte, PG_M);
4832151497Sru			pmap_invalidate_page(pmap, pv->pv_va);
4833151497Sru		}
4834151497Sru		PMAP_UNLOCK(pmap);
4835151497Sru	}
4836151497Sru	sched_unpin();
4837151497Sru	vm_page_unlock_queues();
4838151497Sru}
4839151497Sru
4840151497Sru/*
4841151497Sru *	pmap_clear_reference:
4842151497Sru *
4843151497Sru *	Clear the reference bit on the specified physical page.
4844151497Sru */
4845151497Sruvoid
4846151497Srupmap_clear_reference(vm_page_t m)
4847151497Sru{
4848151497Sru	struct md_page *pvh;
4849151497Sru	pv_entry_t next_pv, pv;
4850151497Sru	pmap_t pmap;
4851151497Sru	pd_entry_t oldpde, *pde;
4852151497Sru	pt_entry_t *pte;
4853151497Sru	vm_offset_t va;
4854151497Sru
4855151497Sru	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4856151497Sru	    ("pmap_clear_reference: page %p is not managed", m));
4857151497Sru	vm_page_lock_queues();
4858151497Sru	sched_pin();
4859151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
4860151497Sru		goto small_mappings;
4861151497Sru	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4862151497Sru	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4863151497Sru		va = pv->pv_va;
4864151497Sru		pmap = PV_PMAP(pv);
4865151497Sru		PMAP_LOCK(pmap);
4866151497Sru		pde = pmap_pde(pmap, va);
4867151497Sru		oldpde = *pde;
4868151497Sru		if ((oldpde & PG_A) != 0) {
4869151497Sru			if (pmap_demote_pde(pmap, pde, va)) {
4870151497Sru				/*
4871151497Sru				 * Remove the mapping to a single page so
4872151497Sru				 * that a subsequent access may repromote.
4873151497Sru				 * Since the underlying page table page is
4874151497Sru				 * fully populated, this removal never frees
4875151497Sru				 * a page table page.
4876151497Sru				 */
4877151497Sru				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4878151497Sru				    PG_PS_FRAME);
4879151497Sru				pmap_remove_page(pmap, va, NULL);
4880151497Sru			}
4881151497Sru		}
4882151497Sru		PMAP_UNLOCK(pmap);
4883151497Sru	}
4884151497Srusmall_mappings:
4885151497Sru	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4886151497Sru		pmap = PV_PMAP(pv);
4887151497Sru		PMAP_LOCK(pmap);
4888151497Sru		pde = pmap_pde(pmap, pv->pv_va);
4889151497Sru		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4890151497Sru		    " a 4mpage in page %p's pv list", m));
4891151497Sru		pte = pmap_pte_quick(pmap, pv->pv_va);
4892151497Sru		if ((*pte & PG_A) != 0) {
4893151497Sru			/*
4894151497Sru			 * Regardless of whether a pte is 32 or 64 bits
4895151497Sru			 * in size, PG_A is among the least significant
4896151497Sru			 * 32 bits.
4897151497Sru			 */
4898151497Sru			atomic_clear_int((u_int *)pte, PG_A);
4899151497Sru			pmap_invalidate_page(pmap, pv->pv_va);
4900151497Sru		}
4901151497Sru		PMAP_UNLOCK(pmap);
4902151497Sru	}
4903151497Sru	sched_unpin();
4904151497Sru	vm_page_unlock_queues();
4905151497Sru}
4906151497Sru
4907151497Sru/*
4908151497Sru * Miscellaneous support routines follow
4909151497Sru */
4910151497Sru
4911151497Sru/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4912151497Srustatic __inline void
4913151497Srupmap_pte_attr(pt_entry_t *pte, int cache_bits)
4914151497Sru{
4915151497Sru	u_int opte, npte;
4916151497Sru
4917151497Sru	/*
4918151497Sru	 * The cache mode bits are all in the low 32-bits of the
4919151497Sru	 * PTE, so we can just spin on updating the low 32-bits.
4920151497Sru	 */
4921151497Sru	do {
4922151497Sru		opte = *(u_int *)pte;
4923151497Sru		npte = opte & ~PG_PTE_CACHE;
4924151497Sru		npte |= cache_bits;
4925151497Sru	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4926151497Sru}
4927151497Sru
4928151497Sru/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4929151497Srustatic __inline void
4930151497Srupmap_pde_attr(pd_entry_t *pde, int cache_bits)
4931151497Sru{
4932151497Sru	u_int opde, npde;
4933151497Sru
4934151497Sru	/*
4935151497Sru	 * The cache mode bits are all in the low 32-bits of the
4936151497Sru	 * PDE, so we can just spin on updating the low 32-bits.
4937151497Sru	 */
4938151497Sru	do {
4939151497Sru		opde = *(u_int *)pde;
4940151497Sru		npde = opde & ~PG_PDE_CACHE;
4941151497Sru		npde |= cache_bits;
4942151497Sru	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4943151497Sru}
4944151497Sru
4945151497Sru/*
4946151497Sru * Map a set of physical memory pages into the kernel virtual
4947151497Sru * address space. Return a pointer to where it is mapped. This
4948151497Sru * routine is intended to be used for mapping device memory,
4949151497Sru * NOT real memory.
4950151497Sru */
4951151497Sruvoid *
4952151497Srupmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4953151497Sru{
4954151497Sru	vm_offset_t va, offset;
4955151497Sru	vm_size_t tmpsize;
4956151497Sru
4957151497Sru	offset = pa & PAGE_MASK;
4958151497Sru	size = roundup(offset + size, PAGE_SIZE);
4959151497Sru	pa = pa & PG_FRAME;
4960151497Sru
4961151497Sru	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4962151497Sru		va = KERNBASE + pa;
4963151497Sru	else
4964151497Sru		va = kmem_alloc_nofault(kernel_map, size);
4965151497Sru	if (!va)
4966151497Sru		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4967151497Sru
4968151497Sru	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4969151497Sru		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4970151497Sru	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4971151497Sru	pmap_invalidate_cache_range(va, va + size);
4972151497Sru	return ((void *)(va + offset));
4973151497Sru}
4974151497Sru
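/*
 * Map device memory with the default (uncacheable) memory attribute.
 */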
4975151497Sruvoid *
4976151497Srupmap_mapdev(vm_paddr_t pa, vm_size_t size)
4977151497Sru{
4978151497Sru
4979151497Sru	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4980151497Sru}
4981151497Sru
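/*
 * Map firmware (BIOS) memory with the write-back memory attribute.
 */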
4982151497Sruvoid *
4983151497Srupmap_mapbios(vm_paddr_t pa, vm_size_t size)
4984151497Sru{
4985151497Sru
4986151497Sru	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4987151497Sru}
4988151497Sru
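/*
 * Unmap a region previously mapped by pmap_mapdev_attr() and release its
 * kernel virtual address space.  Addresses within the statically mapped
 * kernel region are left untouched.
 */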
4989151497Sruvoid
4990151497Srupmap_unmapdev(vm_offset_t va, vm_size_t size)
4991151497Sru{
4992151497Sru	vm_offset_t base, offset, tmpva;
4993151497Sru
4994151497Sru	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4995151497Sru		return;
4996151497Sru	base = trunc_page(va);
4997151497Sru	offset = va & PAGE_MASK;
4998151497Sru	size = roundup(offset + size, PAGE_SIZE);
4999151497Sru	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
5000151497Sru		pmap_kremove(tmpva);
5001151497Sru	pmap_invalidate_range(kernel_pmap, va, tmpva);
5002151497Sru	kmem_free(kernel_map, base, size);
5003151497Sru}
5004151497Sru
5005151497Sru/*
5006151497Sru * Sets the memory attribute for the specified page.
5007151497Sru */
5008151497Sruvoid
5009151497Srupmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5010151497Sru{
5011151497Sru
5012151497Sru	m->md.pat_mode = ma;
5013151497Sru	if ((m->flags & PG_FICTITIOUS) != 0)
5014151497Sru		return;
5015151497Sru
5016151497Sru	/*
5017151497Sru	 * If "m" is a normal page, flush it from the cache.
5018151497Sru	 * See pmap_invalidate_cache_range().
5019151497Sru	 *
5020151497Sru	 * First, try to find an existing mapping of the page by an sf
5021151497Sru	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
5022151497Sru	 * flushes the cache.
5023151497Sru	 */
5024151497Sru	if (sf_buf_invalidate_cache(m))
5025151497Sru		return;
5026151497Sru
5027151497Sru	/*
5028151497Sru	 * If the page is not mapped by an sf buffer and the CPU does not
5029151497Sru	 * support self-snoop, map the page transiently and do the
5030151497Sru	 * invalidation.  In the worst case, pmap_flush_page() flushes
5031151497Sru	 * the entire cache.
5032151497Sru	 */
5033151497Sru	if ((cpu_feature & CPUID_SS) == 0)
5034151497Sru		pmap_flush_page(m);
5035151497Sru}
5036151497Sru
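/*
 * Flush the given page from the CPU caches.  When CLFLUSH is supported,
 * the page is temporarily mapped through the per-CPU CMAP2 window and
 * flushed one cache line at a time; otherwise the entire cache is
 * invalidated.
 */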
5037151497Srustatic void
5038151497Srupmap_flush_page(vm_page_t m)
5039151497Sru{
5040151497Sru	struct sysmaps *sysmaps;
5041151497Sru	vm_offset_t sva, eva;
5042151497Sru
5043151497Sru	if ((cpu_feature & CPUID_CLFSH) != 0) {
5044151497Sru		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5045151497Sru		mtx_lock(&sysmaps->lock);
5046151497Sru		if (*sysmaps->CMAP2)
5047151497Sru			panic("pmap_flush_page: CMAP2 busy");
5048151497Sru		sched_pin();
5049151497Sru		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5050151497Sru		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5051151497Sru		invlcaddr(sysmaps->CADDR2);
5052151497Sru		sva = (vm_offset_t)sysmaps->CADDR2;
5053151497Sru		eva = sva + PAGE_SIZE;
5054151497Sru
5055151497Sru		/*
5056151497Sru		 * Use mfence despite the ordering implied by
5057151497Sru		 * mtx_{un,}lock() because clflush is not guaranteed
5058151497Sru		 * to be ordered by any other instruction.
5059151497Sru		 */
5060151497Sru		mfence();
5061151497Sru		for (; sva < eva; sva += cpu_clflush_line_size)
5062151497Sru			clflush(sva);
5063151497Sru		mfence();
5064151497Sru		*sysmaps->CMAP2 = 0;
5065151497Sru		sched_unpin();
5066151497Sru		mtx_unlock(&sysmaps->lock);
5067151497Sru	} else
5068151497Sru		pmap_invalidate_cache();
5069151497Sru}
5070151497Sru
5071151497Sru/*
5072151497Sru * Changes the specified virtual address range's memory type to that given by
5073151497Sru * the parameter "mode".  The specified virtual address range must be
5074151497Sru * completely contained within the kernel map.
5075151497Sru *
5076151497Sru * Returns zero if the change completed successfully, and either EINVAL or
5077151497Sru * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5078151497Sru * of the virtual address range was not mapped, and ENOMEM is returned if
5079151497Sru * there was insufficient memory available to complete the change.
5080151497Sru */
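/*
 * As an illustrative sketch only (not taken from an actual caller), a
 * driver that wants an existing device mapping to use write-combining
 * might do something like:
 *
 *	va = (vm_offset_t)pmap_mapdev(fb_pa, fb_size);
 *	error = pmap_change_attr(va, fb_size, PAT_WRITE_COMBINING);
 *
 * where "fb_pa" and "fb_size" stand for hypothetical frame buffer
 * parameters.
 */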
5081151497Sruint
5082151497Srupmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5083151497Sru{
5084151497Sru	vm_offset_t base, offset, tmpva;
5085151497Sru	pd_entry_t *pde;
5086151497Sru	pt_entry_t *pte;
5087151497Sru	int cache_bits_pte, cache_bits_pde;
5088151497Sru	boolean_t changed;
5089151497Sru
5090151497Sru	base = trunc_page(va);
5091151497Sru	offset = va & PAGE_MASK;
5092151497Sru	size = roundup(offset + size, PAGE_SIZE);
5093151497Sru
5094151497Sru	/*
5095151497Sru	 * Only supported on kernel virtual addresses above the recursive map.
5096151497Sru	 */
5097151497Sru	if (base < VM_MIN_KERNEL_ADDRESS)
5098151497Sru		return (EINVAL);
5099151497Sru
5100151497Sru	cache_bits_pde = pmap_cache_bits(mode, 1);
5101151497Sru	cache_bits_pte = pmap_cache_bits(mode, 0);
5102151497Sru	changed = FALSE;
5103151497Sru
5104151497Sru	/*
5105151497Sru	 * Pages that aren't mapped aren't supported.  Also break down
5106151497Sru	 * 2/4MB pages into 4KB pages if required.
5107151497Sru	 */
5108151497Sru	PMAP_LOCK(kernel_pmap);
5109151497Sru	for (tmpva = base; tmpva < base + size; ) {
5110151497Sru		pde = pmap_pde(kernel_pmap, tmpva);
5111151497Sru		if (*pde == 0) {
5112151497Sru			PMAP_UNLOCK(kernel_pmap);
5113151497Sru			return (EINVAL);
5114151497Sru		}
5115151497Sru		if (*pde & PG_PS) {
5116151497Sru			/*
5117151497Sru			 * If the current 2/4MB page already has
5118151497Sru			 * the required memory type, then we need not
5119151497Sru			 * demote this page.  Just increment tmpva to
5120151497Sru			 * the next 2/4MB page frame.
5121151497Sru			 */
5122151497Sru			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5123151497Sru				tmpva = trunc_4mpage(tmpva) + NBPDR;
5124151497Sru				continue;
5125151497Sru			}
5126151497Sru
5127151497Sru			/*
5128151497Sru			 * If the current offset aligns with a 2/4MB
5129151497Sru			 * page frame and there is at least 2/4MB left
5130151497Sru			 * within the range, then we need not break
5131151497Sru			 * down this page into 4KB pages.
5132151497Sru			 */
5133151497Sru			if ((tmpva & PDRMASK) == 0 &&
5134151497Sru			    tmpva + PDRMASK < base + size) {
5135151497Sru				tmpva += NBPDR;
5136151497Sru				continue;
5137151497Sru			}
5138151497Sru			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5139151497Sru				PMAP_UNLOCK(kernel_pmap);
5140151497Sru				return (ENOMEM);
5141151497Sru			}
5142151497Sru		}
5143151497Sru		pte = vtopte(tmpva);
5144151497Sru		if (*pte == 0) {
5145151497Sru			PMAP_UNLOCK(kernel_pmap);
5146151497Sru			return (EINVAL);
5147151497Sru		}
5148151497Sru		tmpva += PAGE_SIZE;
5149151497Sru	}
5150151497Sru	PMAP_UNLOCK(kernel_pmap);
5151151497Sru
5152151497Sru	/*
5153151497Sru	 * Ok, all the pages exist, so run through them updating their
5154151497Sru	 * cache mode if required.
5155151497Sru	 */
5156151497Sru	for (tmpva = base; tmpva < base + size; ) {
5157151497Sru		pde = pmap_pde(kernel_pmap, tmpva);
5158151497Sru		if (*pde & PG_PS) {
5159151497Sru			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5160151497Sru				pmap_pde_attr(pde, cache_bits_pde);
5161151497Sru				changed = TRUE;
5162151497Sru			}
5163151497Sru			tmpva = trunc_4mpage(tmpva) + NBPDR;
5164151497Sru		} else {
5165151497Sru			pte = vtopte(tmpva);
5166151497Sru			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5167151497Sru				pmap_pte_attr(pte, cache_bits_pte);
5168151497Sru				changed = TRUE;
5169151497Sru			}
5170151497Sru			tmpva += PAGE_SIZE;
5171151497Sru		}
5172151497Sru	}
5173151497Sru
5174151497Sru	/*
5175151497Sru	 * Flush the TLBs and CPU caches so that no stale translations or
5176151497Sru	 * data cached under the old memory attribute remain.
5177151497Sru	 */
5178151497Sru	if (changed) {
5179151497Sru		pmap_invalidate_range(kernel_pmap, base, tmpva);
5180151497Sru		pmap_invalidate_cache_range(base, tmpva);
5181151497Sru	}
5182151497Sru	return (0);
5183151497Sru}
5184151497Sru
5185151497Sru/*
5186151497Sru * Perform the pmap work for mincore(2).
5187151497Sru */
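/*
 * Returns a mask of MINCORE_* flags describing the mapping at "addr":
 * MINCORE_INCORE for a valid mapping, MINCORE_SUPER for a 2/4MB mapping,
 * and the MODIFIED/REFERENCED flags derived from PG_M and PG_A.  When the
 * page is managed and its state may also depend on other mappings, its
 * physical address is returned locked in "*locked_pa" for the caller to
 * release.
 */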
5188151497Sruint
5189151497Srupmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5190151497Sru{
5191151497Sru	pd_entry_t *pdep;
5192151497Sru	pt_entry_t *ptep, pte;
5193151497Sru	vm_paddr_t pa;
5194151497Sru	int val;
5195151497Sru
5196151497Sru	PMAP_LOCK(pmap);
5197151497Sruretry:
5198151497Sru	pdep = pmap_pde(pmap, addr);
5199151497Sru	if (*pdep != 0) {
5200151497Sru		if (*pdep & PG_PS) {
5201151497Sru			pte = *pdep;
5202151497Sru			/* Compute the physical address of the 4KB page. */
5203151497Sru			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5204151497Sru			    PG_FRAME;
5205151497Sru			val = MINCORE_SUPER;
5206151497Sru		} else {
5207151497Sru			ptep = pmap_pte(pmap, addr);
5208151497Sru			pte = *ptep;
5209151497Sru			pmap_pte_release(ptep);
5210151497Sru			pa = pte & PG_FRAME;
5211151497Sru			val = 0;
5212151497Sru		}
5213151497Sru	} else {
5214151497Sru		pte = 0;
5215151497Sru		pa = 0;
5216151497Sru		val = 0;
5217151497Sru	}
5218151497Sru	if ((pte & PG_V) != 0) {
5219151497Sru		val |= MINCORE_INCORE;
5220151497Sru		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5221151497Sru			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5222151497Sru		if ((pte & PG_A) != 0)
5223151497Sru			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5224151497Sru	}
5225151497Sru	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5226151497Sru	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5227151497Sru	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5228151497Sru		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5229151497Sru		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5230151497Sru			goto retry;
5231151497Sru	} else
5232151497Sru		PA_UNLOCK_COND(*locked_pa);
5233151497Sru	PMAP_UNLOCK(pmap);
5234151497Sru	return (val);
5235151497Sru}
5236151497Sru
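/*
 * Make the given thread's pmap the active pmap on the current CPU:
 * update the old and new pmaps' pm_active sets, record the new page
 * directory (or PDPT under PAE) in the PCB, and load it into %cr3.
 */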
5237151497Sruvoid
5238151497Srupmap_activate(struct thread *td)
5239151497Sru{
5240151497Sru	pmap_t	pmap, oldpmap;
5241151497Sru	u_int	cpuid;
5242151497Sru	u_int32_t  cr3;
5243151497Sru
5244151497Sru	critical_enter();
5245151497Sru	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5246151497Sru	oldpmap = PCPU_GET(curpmap);
5247151497Sru	cpuid = PCPU_GET(cpuid);
5248151497Sru#if defined(SMP)
5249151497Sru	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5250151497Sru	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5251151497Sru#else
5252151497Sru	CPU_CLR(cpuid, &oldpmap->pm_active);
5253151497Sru	CPU_SET(cpuid, &pmap->pm_active);
5254151497Sru#endif
5255151497Sru#ifdef PAE
5256151497Sru	cr3 = vtophys(pmap->pm_pdpt);
5257151497Sru#else
5258151497Sru	cr3 = vtophys(pmap->pm_pdir);
5259151497Sru#endif
5260151497Sru	/*
5261151497Sru	 * pmap_activate is for the current thread on the current cpu
5262151497Sru	 */
5263151497Sru	td->td_pcb->pcb_cr3 = cr3;
5264151497Sru	load_cr3(cr3);
5265151497Sru	PCPU_SET(curpmap, pmap);
5266151497Sru	critical_exit();
5267151497Sru}
5268151497Sru
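/*
 * On i386 the instruction cache is coherent with the data cache, so no
 * explicit synchronization is required here.
 */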
5269151497Sruvoid
5270151497Srupmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5271151497Sru{
5272151497Sru}
5273151497Sru
5274151497Sru/*
5275151497Sru *	Increase the starting virtual address of the given mapping if a
5276151497Sru *	different alignment might result in more superpage mappings.
5277151497Sru */
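/*
 * For example, with 4MB superpages (NBPDR == 4MB) and a 16MB mapping of
 * an object offset whose low-order bits are 2MB, the address is adjusted
 * so that (*addr & PDRMASK) equals that same 2MB offset, allowing the
 * bulk of the range to be promoted to superpages.
 */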
5278151497Sruvoid
5279151497Srupmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5280151497Sru    vm_offset_t *addr, vm_size_t size)
5281151497Sru{
5282151497Sru	vm_offset_t superpage_offset;
5283151497Sru
5284151497Sru	if (size < NBPDR)
5285151497Sru		return;
5286151497Sru	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5287151497Sru		offset += ptoa(object->pg_color);
5288151497Sru	superpage_offset = offset & PDRMASK;
5289151497Sru	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5290151497Sru	    (*addr & PDRMASK) == superpage_offset)
5291151497Sru		return;
5292151497Sru	if ((*addr & PDRMASK) < superpage_offset)
5293151497Sru		*addr = (*addr & ~PDRMASK) + superpage_offset;
5294151497Sru	else
5295151497Sru		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5296151497Sru}
5297151497Sru
5298151497Sru
5299151497Sru#if defined(PMAP_DEBUG)
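/*
 * Debug helper: walk the page tables of the process with the given pid
 * and print each valid user-space mapping.  Returns the number of PTEs
 * printed.
 */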
5300151497Sruint
pmap_pid_dump(int pid)
5301151497Sru{
5302151497Sru	pmap_t pmap;
5303151497Sru	struct proc *p;
5304151497Sru	int npte = 0;
5305151497Sru	int index;
5306151497Sru
5307151497Sru	sx_slock(&allproc_lock);
5308151497Sru	FOREACH_PROC_IN_SYSTEM(p) {
5309151497Sru		if (p->p_pid != pid)
5310151497Sru			continue;
5311151497Sru
5312151497Sru		if (p->p_vmspace) {
5313151497Sru			int i, j;
5314151497Sru			index = 0;
5315151497Sru			pmap = vmspace_pmap(p->p_vmspace);
5316151497Sru			for (i = 0; i < NPDEPTD; i++) {
5317151497Sru				pd_entry_t *pde;
5318151497Sru				pt_entry_t *pte;
5319151497Sru				vm_offset_t base = i << PDRSHIFT;
5320151497Sru
5321151497Sru				pde = &pmap->pm_pdir[i];
5322151497Sru				if (pde && pmap_pde_v(pde)) {
5323151497Sru					for (j = 0; j < NPTEPG; j++) {
5324151497Sru						vm_offset_t va = base + (j << PAGE_SHIFT);
5325151497Sru						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5326151497Sru							if (index) {
5327151497Sru								index = 0;
5328151497Sru								printf("\n");
5329151497Sru							}
5330151497Sru							sx_sunlock(&allproc_lock);
5331151497Sru							return (npte);
5332151497Sru						}
5333151497Sru						pte = pmap_pte(pmap, va);
5334151497Sru						if (pte && pmap_pte_v(pte)) {
5335151497Sru							pt_entry_t pa;
5336151497Sru							vm_page_t m;
5337151497Sru							pa = *pte;
5338151497Sru							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5339151497Sru							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5340151497Sru								va, pa, m->hold_count, m->wire_count, m->flags);
5341151497Sru							npte++;
5342151497Sru							index++;
5343151497Sru							if (index >= 2) {
5344151497Sru								index = 0;
5345151497Sru								printf("\n");
5346151497Sru							} else {
5347151497Sru								printf(" ");
5348151497Sru							}
5349151497Sru						}
5350151497Sru					}
5351151497Sru				}
5352151497Sru			}
5353151497Sru		}
5354151497Sru	}
5355151497Sru	sx_sunlock(&allproc_lock);
5356151497Sru	return (npte);
5357151497Sru}
5358151497Sru#endif
5359151497Sru
5360151497Sru#if defined(DEBUG)
5361151497Sru
5362151497Srustatic void	pads(pmap_t pm);
5363151497Sruvoid		pmap_pvdump(vm_paddr_t pa);
5364151497Sru
5365151497Sru/* print address space of pmap */
5366151497Srustatic void
5367151497Srupads(pmap_t pm)
5368151497Sru{
5369151497Sru	int i, j;
5370151497Sru	vm_offset_t va;
5371151497Sru	pt_entry_t *ptep;
5372151497Sru
5373151497Sru	if (pm == kernel_pmap)
5374151497Sru		return;
5375151497Sru	for (i = 0; i < NPDEPTD; i++)
5376151497Sru		if (pm->pm_pdir[i])
5377151497Sru			for (j = 0; j < NPTEPG; j++) {
5378151497Sru				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5379151497Sru				if (pm == kernel_pmap && va < KERNBASE)
5380151497Sru					continue;
5381151497Sru				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5382151497Sru					continue;
5383151497Sru				ptep = pmap_pte(pm, va);
5384151497Sru				if (pmap_pte_v(ptep))
5385151497Sru					printf("%x:%x ", va, *ptep);
5386151497Sru			}
5387151497Sru
5388151497Sru}
5389151497Sru
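/*
 * Debug helper: print every pv entry for the page at physical address
 * "pa", followed by a dump of each owning pmap's user address space.
 */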
5390151497Sruvoid
5391151497Srupmap_pvdump(vm_paddr_t pa)
5392151497Sru{
5393151497Sru	pv_entry_t pv;
5394151497Sru	pmap_t pmap;
5395151497Sru	vm_page_t m;
5396151497Sru
5397151497Sru	printf("pa %x", pa);
5398151497Sru	m = PHYS_TO_VM_PAGE(pa);
5399151497Sru	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5400151497Sru		pmap = PV_PMAP(pv);
5401151497Sru		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5402151497Sru		pads(pmap);
5403151497Sru	}
5404151497Sru	printf(" ");
5405151497Sru}
5406151497Sru#endif
5407151497Sru