1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: stable/11/sys/i386/i386/pmap.c 313148 2017-02-03 12:03:10Z kib $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	Since the information managed by this module is
84 *	also stored by the logical address mapping module,
85 *	this module may throw away valid virtual-to-physical
86 *	mappings at almost any time.  However, invalidations
87 *	of virtual-to-physical mappings must be done as
88 *	requested.
89 *
90 *	In order to cope with hardware architectures which
91 *	make virtual-to-physical map invalidations expensive,
92 *	this module may delay invalidation or protection
93 *	reduction operations until such time as they are actually
94 *	necessary.  This module is given full information as
95 *	to which processors are currently using which maps,
96 *	and as to when physical maps must be made correct.
97 */
98
99#include "opt_apic.h"
100#include "opt_cpu.h"
101#include "opt_pmap.h"
102#include "opt_smp.h"
103#include "opt_xbox.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/ktr.h>
109#include <sys/lock.h>
110#include <sys/malloc.h>
111#include <sys/mman.h>
112#include <sys/msgbuf.h>
113#include <sys/mutex.h>
114#include <sys/proc.h>
115#include <sys/rwlock.h>
116#include <sys/sf_buf.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/smp.h>
122
123#include <vm/vm.h>
124#include <vm/vm_param.h>
125#include <vm/vm_kern.h>
126#include <vm/vm_page.h>
127#include <vm/vm_map.h>
128#include <vm/vm_object.h>
129#include <vm/vm_extern.h>
130#include <vm/vm_pageout.h>
131#include <vm/vm_pager.h>
132#include <vm/vm_phys.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#ifdef DEV_APIC
138#include <sys/bus.h>
139#include <machine/intr_machdep.h>
140#include <x86/apicvar.h>
141#endif
142#include <machine/cpu.h>
143#include <machine/cputypes.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146#include <machine/specialreg.h>
147#ifdef SMP
148#include <machine/smp.h>
149#endif
150
151#ifdef XBOX
152#include <machine/xbox.h>
153#endif
154
155#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
156#define CPU_ENABLE_SSE
157#endif
158
159#ifndef PMAP_SHPGPERPROC
160#define PMAP_SHPGPERPROC 200
161#endif
162
163#if !defined(DIAGNOSTIC)
164#ifdef __GNUC_GNU_INLINE__
165#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
166#else
167#define PMAP_INLINE	extern inline
168#endif
169#else
170#define PMAP_INLINE
171#endif
172
173#ifdef PV_STATS
174#define PV_STAT(x)	do { x ; } while (0)
175#else
176#define PV_STAT(x)	do { } while (0)
177#endif
178
179#define	pa_index(pa)	((pa) >> PDRSHIFT)
180#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
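/*
 * Worked example (illustrative): pa_index() selects the 2/4MB superpage
 * frame containing "pa", and pa_to_pvh() returns the pv head entry shared
 * by every 4KB page inside that frame.  Assuming a non-PAE configuration,
 * where PDRSHIFT is 22 (4MB superpages):
 *
 *	pa_index(0x00c01000) == 0x00c01000 >> 22 == 3
 *	pa_to_pvh(0x00c01000) == &pv_table[3]
 */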
181
182/*
183 * Get PDEs and PTEs for user/kernel address space
184 */
185#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
186#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
187
188#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
189#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
190#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
191#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
192#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
193
194#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
195    atomic_clear_int((u_int *)(pte), PG_W))
196#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
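/*
 * Worked example (illustrative): pmap_pde() indexes the page directory by
 * the high-order bits of the virtual address.  Assuming a non-PAE
 * configuration, where PDRSHIFT is 22, the address 0xc0400000 selects
 *
 *	&pmap->pm_pdir[0xc0400000 >> 22] == &pmap->pm_pdir[0x301]
 *
 * i.e. the PDE covering the 4MB region that starts at 0xc0400000.
 */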
197
198struct pmap kernel_pmap_store;
199LIST_HEAD(pmaplist, pmap);
200static struct pmaplist allpmaps;
201static struct mtx allpmaps_lock;
202
203vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
204vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
205int pgeflag = 0;		/* PG_G or-in */
206int pseflag = 0;		/* PG_PS or-in */
207
208static int nkpt = NKPT;
209vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
210extern u_int32_t KERNend;
211extern u_int32_t KPTphys;
212
213#if defined(PAE) || defined(PAE_TABLES)
214pt_entry_t pg_nx;
215static uma_zone_t pdptzone;
216#endif
217
218static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
219
220static int pat_works = 1;
221SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
222    "Is page attribute table fully functional?");
223
224static int pg_ps_enabled = 1;
225SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
226    &pg_ps_enabled, 0, "Are large page mappings enabled?");
227
228#define	PAT_INDEX_SIZE	8
229static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
230
231/*
232 * pmap_mapdev() support prior to pmap initialization (e.g., for the console)
233 */
234#define	PMAP_PREINIT_MAPPING_COUNT	8
235static struct pmap_preinit_mapping {
236	vm_paddr_t	pa;
237	vm_offset_t	va;
238	vm_size_t	sz;
239	int		mode;
240} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
241static int pmap_initialized;
242
243static struct rwlock_padalign pvh_global_lock;
244
245/*
246 * Data for the pv entry allocation mechanism
247 */
248static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
249static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
250static struct md_page *pv_table;
251static int shpgperproc = PMAP_SHPGPERPROC;
252
253struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
254int pv_maxchunks;			/* How many chunks we have KVA for */
255vm_offset_t pv_vafree;			/* freelist stored in the PTE */
256
257/*
258 * All those kernel PT submaps that BSD is so fond of
259 */
260pt_entry_t *CMAP3;
261static pd_entry_t *KPTD;
262caddr_t ptvmmap = 0;
263caddr_t CADDR3;
264struct msgbuf *msgbufp = NULL;
265
266/*
267 * Crashdump maps.
268 */
269static caddr_t crashdumpmap;
270
271static pt_entry_t *PMAP1 = NULL, *PMAP2;
272static pt_entry_t *PADDR1 = NULL, *PADDR2;
273#ifdef SMP
274static int PMAP1cpu;
275static int PMAP1changedcpu;
276SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
277	   &PMAP1changedcpu, 0,
278	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
279#endif
280static int PMAP1changed;
281SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
282	   &PMAP1changed, 0,
283	   "Number of times pmap_pte_quick changed PMAP1");
284static int PMAP1unchanged;
285SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
286	   &PMAP1unchanged, 0,
287	   "Number of times pmap_pte_quick didn't change PMAP1");
288static struct mtx PMAP2mutex;
289
290static void	free_pv_chunk(struct pv_chunk *pc);
291static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
292static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
293static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
294static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
295static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
296static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
297static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
298		    vm_offset_t va);
299static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
300
301static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
302static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
303    vm_prot_t prot);
304static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
305    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
306static void pmap_flush_page(vm_page_t m);
307static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
308static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
309static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
310static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
311static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
312static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
313static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
314static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
315static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
316static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
317    vm_prot_t prot);
318static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
319static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
320    struct spglist *free);
321static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
322    struct spglist *free);
323static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
324static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
325    struct spglist *free);
326static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
327					vm_offset_t va);
328static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
329static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
330    vm_page_t m);
331static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
332    pd_entry_t newpde);
333static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
334
335static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
336
337static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
338static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
339static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
340static void pmap_pte_release(pt_entry_t *pte);
341static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
342#if defined(PAE) || defined(PAE_TABLES)
343static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
344    int wait);
345#endif
346static void pmap_set_pg(void);
347
348static __inline void pagezero(void *page);
349
350CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
351CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
352
353/*
354 * If you get an error here, then you set KVA_PAGES wrong! See the
355 * description of KVA_PAGES in sys/i386/include/pmap.h.  It must be a
356 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
357 */
358CTASSERT(KERNBASE % (1 << 24) == 0);
359
360/*
361 *	Bootstrap the system enough to run with virtual memory.
362 *
363 *	On the i386 this is called after mapping has already been enabled
364 *	and just syncs the pmap module with what has already been done.
365 *	[We can't call it easily with mapping off since the kernel is not
366 *	mapped with PA == VA, hence we would have to relocate every address
367 *	from the linked base (virtual) address "KERNBASE" to the actual
368 *	(physical) address starting relative to 0]
369 */
370void
371pmap_bootstrap(vm_paddr_t firstaddr)
372{
373	vm_offset_t va;
374	pt_entry_t *pte, *unused;
375	struct pcpu *pc;
376	int i;
377
378	/*
379	 * Add a physical memory segment (vm_phys_seg) corresponding to the
380	 * preallocated kernel page table pages so that vm_page structures
381	 * representing these pages will be created.  The vm_page structures
382	 * are required for promotion of the corresponding kernel virtual
383	 * addresses to superpage mappings.
384	 */
385	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
386
387	/*
388	 * Initialize the first available kernel virtual address.  However,
389	 * using "firstaddr" may waste a few pages of the kernel virtual
390	 * address space, because locore may not have mapped every physical
391	 * page that it allocated.  Preferably, locore would provide a first
392	 * unused virtual address in addition to "firstaddr".
393	 */
394	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
395
396	virtual_end = VM_MAX_KERNEL_ADDRESS;
397
398	/*
399	 * Initialize the kernel pmap (which is statically allocated).
400	 */
401	PMAP_LOCK_INIT(kernel_pmap);
402	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
403#if defined(PAE) || defined(PAE_TABLES)
404	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
405#endif
406	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
407	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
408
409 	/*
410	 * Initialize the global pv list lock.
411	 */
412	rw_init(&pvh_global_lock, "pmap pv global");
413
414	LIST_INIT(&allpmaps);
415
416	/*
417	 * Request a spin mutex so that changes to allpmaps cannot be
418	 * preempted by smp_rendezvous_cpus().  Otherwise,
419	 * pmap_update_pde_kernel() could access allpmaps while it is
420	 * being changed.
421	 */
422	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
423	mtx_lock_spin(&allpmaps_lock);
424	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
425	mtx_unlock_spin(&allpmaps_lock);
426
427	/*
428	 * Reserve some special page table entries/VA space for temporary
429	 * mapping of pages.
430	 */
431#define	SYSMAP(c, p, v, n)	\
432	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
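	/*
	 * Illustrative expansion of the macro above: the later use
	 *
	 *	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
	 *
	 * becomes
	 *
	 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;
	 *
	 * handing out one page of KVA ("v") together with a pointer to the
	 * PTE ("p") that will later be used to map it.
	 */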
433
434	va = virtual_avail;
435	pte = vtopte(va);
436
437
438	/*
439	 * Initialize temporary map objects on the current CPU for use
440	 * during early boot.
441	 * CMAP1/CMAP2 are used for zeroing and copying pages.
442	 * CMAP3 is used for the idle process page zeroing.
443	 */
444	pc = pcpu_find(curcpu);
445	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
446	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
447	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
448	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
449
450	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
451
452	/*
453	 * Crashdump maps.
454	 */
455	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
456
457	/*
458	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
459	 */
460	SYSMAP(caddr_t, unused, ptvmmap, 1)
461
462	/*
463	 * msgbufp is used to map the system message buffer.
464	 */
465	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
466
467	/*
468	 * KPTmap is used by pmap_kextract().
469	 *
470	 * KPTmap is first initialized by locore.  However, that initial
471	 * KPTmap can only support NKPT page table pages.  Here, a larger
472	 * KPTmap is created that can support KVA_PAGES page table pages.
473	 */
474	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
475
476	for (i = 0; i < NKPT; i++)
477		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
478
479	/*
480	 * Adjust the start of the KPTD and KPTmap so that the implementation
481	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
482	 */
483	KPTD -= KPTDI;
484	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
485
486	/*
487	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
488	 * respectively.
489	 */
490	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
491	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
492
493	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
494
495	virtual_avail = va;
496
497	/*
498	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
499	 * physical memory region that is used by the ACPI wakeup code.  This
500	 * mapping must not have PG_G set.
501	 */
502#ifdef XBOX
503	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
504	 * an early stage, we cannot yet neatly map video memory ... :-(
505	 * Better fixes are very welcome! */
506	if (!arch_i386_is_xbox)
507#endif
508	for (i = 1; i < NKPT; i++)
509		PTD[i] = 0;
510
511	/* Initialize the PAT MSR if present. */
512	pmap_init_pat();
513
514	/* Turn on PG_G on kernel page(s) */
515	pmap_set_pg();
516}
517
518static void
519pmap_init_reserved_pages(void)
520{
521	struct pcpu *pc;
522	vm_offset_t pages;
523	int i;
524
525	CPU_FOREACH(i) {
526		pc = pcpu_find(i);
527		/*
528		 * Skip if the mapping has already been initialized,
529		 * i.e. this is the BSP.
530		 */
531		if (pc->pc_cmap_addr1 != 0)
532			continue;
533		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
534		pages = kva_alloc(PAGE_SIZE * 3);
535		if (pages == 0)
536			panic("%s: unable to allocate KVA", __func__);
537		pc->pc_cmap_pte1 = vtopte(pages);
538		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
539		pc->pc_cmap_addr1 = (caddr_t)pages;
540		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
541		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
542	}
543}
544
545SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
546
547/*
548 * Setup the PAT MSR.
549 */
550void
551pmap_init_pat(void)
552{
553	int pat_table[PAT_INDEX_SIZE];
554	uint64_t pat_msr;
555	u_long cr0, cr4;
556	int i;
557
558	/* Set default PAT index table. */
559	for (i = 0; i < PAT_INDEX_SIZE; i++)
560		pat_table[i] = -1;
561	pat_table[PAT_WRITE_BACK] = 0;
562	pat_table[PAT_WRITE_THROUGH] = 1;
563	pat_table[PAT_UNCACHEABLE] = 3;
564	pat_table[PAT_WRITE_COMBINING] = 3;
565	pat_table[PAT_WRITE_PROTECTED] = 3;
566	pat_table[PAT_UNCACHED] = 3;
567
568	/* Bail if this CPU doesn't implement PAT. */
569	if ((cpu_feature & CPUID_PAT) == 0) {
570		for (i = 0; i < PAT_INDEX_SIZE; i++)
571			pat_index[i] = pat_table[i];
572		pat_works = 0;
573		return;
574	}
575
576	/*
577	 * Due to some Intel errata, we can only safely use the lower 4
578	 * PAT entries.
579	 *
580	 *   Intel Pentium III Processor Specification Update
581	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
582	 * or Mode C Paging)
583	 *
584 *   Intel Pentium IV Processor Specification Update
585	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
586	 */
587	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
588	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
589		pat_works = 0;
590
591	/* Initialize default PAT entries. */
592	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
593	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
594	    PAT_VALUE(2, PAT_UNCACHED) |
595	    PAT_VALUE(3, PAT_UNCACHEABLE) |
596	    PAT_VALUE(4, PAT_WRITE_BACK) |
597	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
598	    PAT_VALUE(6, PAT_UNCACHED) |
599	    PAT_VALUE(7, PAT_UNCACHEABLE);
600
601	if (pat_works) {
602		/*
603		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
604		 * Program 5 and 6 as WP and WC.
605		 * Leave 4 and 7 as WB and UC.
606		 */
607		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
608		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
609		    PAT_VALUE(6, PAT_WRITE_COMBINING);
610		pat_table[PAT_UNCACHED] = 2;
611		pat_table[PAT_WRITE_PROTECTED] = 5;
612		pat_table[PAT_WRITE_COMBINING] = 6;
613	} else {
614		/*
615		 * Just replace PAT Index 2 with WC instead of UC-.
616		 */
617		pat_msr &= ~PAT_MASK(2);
618		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
619		pat_table[PAT_WRITE_COMBINING] = 2;
620	}
621
622	/* Disable PGE. */
623	cr4 = rcr4();
624	load_cr4(cr4 & ~CR4_PGE);
625
626	/* Disable caches (CD = 1, NW = 0). */
627	cr0 = rcr0();
628	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
629
630	/* Flushes caches and TLBs. */
631	wbinvd();
632	invltlb();
633
634	/* Update PAT and index table. */
635	wrmsr(MSR_PAT, pat_msr);
636	for (i = 0; i < PAT_INDEX_SIZE; i++)
637		pat_index[i] = pat_table[i];
638
639	/* Flush caches and TLBs again. */
640	wbinvd();
641	invltlb();
642
643	/* Restore caches and PGE. */
644	load_cr0(cr0);
645	load_cr4(cr4);
646}
647
648/*
649 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
650 */
651static void
652pmap_set_pg(void)
653{
654	pt_entry_t *pte;
655	vm_offset_t va, endva;
656
657	if (pgeflag == 0)
658		return;
659
660	endva = KERNBASE + KERNend;
661
662	if (pseflag) {
663		va = KERNBASE + KERNLOAD;
664		while (va  < endva) {
665			pdir_pde(PTD, va) |= pgeflag;
666			invltlb();	/* Flush non-PG_G entries. */
667			va += NBPDR;
668		}
669	} else {
670		va = (vm_offset_t)btext;
671		while (va < endva) {
672			pte = vtopte(va);
673			if (*pte)
674				*pte |= pgeflag;
675			invltlb();	/* Flush non-PG_G entries. */
676			va += PAGE_SIZE;
677		}
678	}
679}
680
681/*
682 * Initialize a vm_page's machine-dependent fields.
683 */
684void
685pmap_page_init(vm_page_t m)
686{
687
688	TAILQ_INIT(&m->md.pv_list);
689	m->md.pat_mode = PAT_WRITE_BACK;
690}
691
692#if defined(PAE) || defined(PAE_TABLES)
693static void *
694pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
695{
696
697	/* Inform UMA that this allocator uses kernel_map/object. */
698	*flags = UMA_SLAB_KERNEL;
699	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
700	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
701}
702#endif
703
704/*
705 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
706 * Requirements:
707 *  - Must deal with pages in order to ensure that none of the PG_* bits
708 *    are ever set, PG_V in particular.
709 *  - Assumes we can write to ptes without pte_store() atomic ops, even
710 *    on PAE systems.  This should be ok.
711 *  - Assumes nothing will ever test these addresses for 0 to indicate
712 *    no mapping instead of correctly checking PG_V.
713 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
714 * Because PG_V is never set, there can be no mappings to invalidate.
715 */
716static vm_offset_t
717pmap_ptelist_alloc(vm_offset_t *head)
718{
719	pt_entry_t *pte;
720	vm_offset_t va;
721
722	va = *head;
723	if (va == 0)
724		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
725	pte = vtopte(va);
726	*head = *pte;
727	if (*head & PG_V)
728		panic("pmap_ptelist_alloc: va with PG_V set!");
729	*pte = 0;
730	return (va);
731}
732
733static void
734pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
735{
736	pt_entry_t *pte;
737
738	if (va & PG_V)
739		panic("pmap_ptelist_free: freeing va with PG_V set!");
740	pte = vtopte(va);
741	*pte = *head;		/* virtual! PG_V is 0 though */
742	*head = va;
743}
744
745static void
746pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
747{
748	int i;
749	vm_offset_t va;
750
751	*head = 0;
752	for (i = npages - 1; i >= 0; i--) {
753		va = (vm_offset_t)base + i * PAGE_SIZE;
754		pmap_ptelist_free(head, va);
755	}
756}
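
/*
 * Illustrative sketch of the resulting freelist: after
 * pmap_ptelist_init(&head, base, 3), the state is
 *
 *	head                          == base
 *	*vtopte(base)                 == base + PAGE_SIZE
 *	*vtopte(base + PAGE_SIZE)     == base + 2 * PAGE_SIZE
 *	*vtopte(base + 2 * PAGE_SIZE) == 0
 *
 * so pmap_ptelist_alloc() pops "base" first, and a zero link terminates
 * the list.  Because the stored values are page-aligned KVAs, PG_V is
 * never set, as required above.
 */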
757
758
759/*
760 *	Initialize the pmap module.
761 *	Called by vm_init, to initialize any structures that the pmap
762 *	system needs to map virtual memory.
763 */
764void
765pmap_init(void)
766{
767	struct pmap_preinit_mapping *ppim;
768	vm_page_t mpte;
769	vm_size_t s;
770	int i, pv_npg;
771
772	/*
773	 * Initialize the vm page array entries for the kernel pmap's
774	 * page table pages.
775	 */
776	for (i = 0; i < NKPT; i++) {
777		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
778		KASSERT(mpte >= vm_page_array &&
779		    mpte < &vm_page_array[vm_page_array_size],
780		    ("pmap_init: page table page is out of range"));
781		mpte->pindex = i + KPTDI;
782		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
783	}
784
785	/*
786	 * Initialize the address space (zone) for the pv entries.  Set a
787	 * high water mark so that the system can recover from excessive
788	 * numbers of pv entries.
789	 */
790	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
791	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
792	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
793	pv_entry_max = roundup(pv_entry_max, _NPCPV);
794	pv_entry_high_water = 9 * (pv_entry_max / 10);
795
796	/*
797	 * If the kernel is running on a virtual machine, then it must assume
798	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
799	 * be prepared for the hypervisor changing the vendor and family that
800	 * are reported by CPUID.  Consequently, the workaround for AMD Family
801	 * 10h Erratum 383 is enabled if the processor's feature set does not
802	 * include at least one feature that is only supported by older Intel
803	 * or newer AMD processors.
804	 */
805	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
806	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
807	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
808	    AMDID2_FMA4)) == 0)
809		workaround_erratum383 = 1;
810
811	/*
812	 * Are large page mappings supported and enabled?
813	 */
814	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
815	if (pseflag == 0)
816		pg_ps_enabled = 0;
817	else if (pg_ps_enabled) {
818		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
819		    ("pmap_init: can't assign to pagesizes[1]"));
820		pagesizes[1] = NBPDR;
821	}
822
823	/*
824	 * Calculate the size of the pv head table for superpages.
825	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
826	 */
827	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
828	    PAGE_SIZE) / NBPDR + 1;
829
830	/*
831	 * Allocate memory for the pv head table for superpages.
832	 */
833	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
834	s = round_page(s);
835	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
836	    M_WAITOK | M_ZERO);
837	for (i = 0; i < pv_npg; i++)
838		TAILQ_INIT(&pv_table[i].pv_list);
839
840	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
841	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
842	if (pv_chunkbase == NULL)
843		panic("pmap_init: not enough kvm for pv chunks");
844	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
845#if defined(PAE) || defined(PAE_TABLES)
846	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
847	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
848	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
849	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
850#endif
851
852	pmap_initialized = 1;
853	if (!bootverbose)
854		return;
855	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
856		ppim = pmap_preinit_mapping + i;
857		if (ppim->va == 0)
858			continue;
859		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
860		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
861	}
862}
863
864
865SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
866	"Max number of PV entries");
867SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
868	"Page share factor per proc");
869
870static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
871    "2/4MB page mapping counters");
872
873static u_long pmap_pde_demotions;
874SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
875    &pmap_pde_demotions, 0, "2/4MB page demotions");
876
877static u_long pmap_pde_mappings;
878SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
879    &pmap_pde_mappings, 0, "2/4MB page mappings");
880
881static u_long pmap_pde_p_failures;
882SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
883    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
884
885static u_long pmap_pde_promotions;
886SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
887    &pmap_pde_promotions, 0, "2/4MB page promotions");
888
889/***************************************************
890 * Low level helper routines.....
891 ***************************************************/
892
893/*
894 * Determine the appropriate bits to set in a PTE or PDE for a specified
895 * caching mode.
896 */
897int
898pmap_cache_bits(int mode, boolean_t is_pde)
899{
900	int cache_bits, pat_flag, pat_idx;
901
902	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
903		panic("Unknown caching mode %d\n", mode);
904
905	/* The PAT bit is different for PTE's and PDE's. */
906	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
907
908	/* Map the caching mode to a PAT index. */
909	pat_idx = pat_index[mode];
910
911	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
912	cache_bits = 0;
913	if (pat_idx & 0x4)
914		cache_bits |= pat_flag;
915	if (pat_idx & 0x2)
916		cache_bits |= PG_NC_PCD;
917	if (pat_idx & 0x1)
918		cache_bits |= PG_NC_PWT;
919	return (cache_bits);
920}
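
/*
 * Worked example (illustrative): with the PAT layout programmed by
 * pmap_init_pat() when pat_works is nonzero, PAT_WRITE_COMBINING maps to
 * PAT index 6 (binary 110), so for a PTE this routine returns
 *
 *	PG_PTE_PAT | PG_NC_PCD
 *
 * and for a PDE it returns PG_PDE_PAT | PG_NC_PCD instead.
 */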
921
922/*
923 * The caller is responsible for maintaining TLB consistency.
924 */
925static void
926pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
927{
928	pd_entry_t *pde;
929	pmap_t pmap;
930	boolean_t PTD_updated;
931
932	PTD_updated = FALSE;
933	mtx_lock_spin(&allpmaps_lock);
934	LIST_FOREACH(pmap, &allpmaps, pm_list) {
935		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
936		    PG_FRAME))
937			PTD_updated = TRUE;
938		pde = pmap_pde(pmap, va);
939		pde_store(pde, newpde);
940	}
941	mtx_unlock_spin(&allpmaps_lock);
942	KASSERT(PTD_updated,
943	    ("pmap_kenter_pde: current page table is not in allpmaps"));
944}
945
946/*
947 * After changing the page size for the specified virtual address in the page
948 * table, flush the corresponding entries from the processor's TLB.  Only the
949 * calling processor's TLB is affected.
950 *
951 * The calling thread must be pinned to a processor.
952 */
953static void
954pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
955{
956	u_long cr4;
957
958	if ((newpde & PG_PS) == 0)
959		/* Demotion: flush a specific 2MB page mapping. */
960		invlpg(va);
961	else if ((newpde & PG_G) == 0)
962		/*
963		 * Promotion: flush every 4KB page mapping from the TLB
964		 * because there are too many to flush individually.
965		 */
966		invltlb();
967	else {
968		/*
969		 * Promotion: flush every 4KB page mapping from the TLB,
970		 * including any global (PG_G) mappings.
971		 */
972		cr4 = rcr4();
973		load_cr4(cr4 & ~CR4_PGE);
974		/*
975		 * Although preemption at this point could be detrimental to
976		 * performance, it would not lead to an error.  PG_G is simply
977		 * ignored if CR4.PGE is clear.  Moreover, in case this block
978		 * is re-entered, the load_cr4() either above or below will
979		 * modify CR4.PGE flushing the TLB.
980		 */
981		load_cr4(cr4 | CR4_PGE);
982	}
983}
984
985void
986invltlb_glob(void)
987{
988	uint64_t cr4;
989
990	if (pgeflag == 0) {
991		invltlb();
992	} else {
993		cr4 = rcr4();
994		load_cr4(cr4 & ~CR4_PGE);
995		load_cr4(cr4 | CR4_PGE);
996	}
997}
998
999
1000#ifdef SMP
1001/*
1002 * For SMP, these functions have to use the IPI mechanism for coherence.
1003 *
1004 * N.B.: Before calling any of the following TLB invalidation functions,
1005 * the calling processor must ensure that all stores updating a non-
1006 * kernel page table are globally performed.  Otherwise, another
1007 * processor could cache an old, pre-update entry without being
1008 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1009 * active on another processor after its pm_active field is checked by
1010 * one of the following functions but before a store updating the page
1011 * table is globally performed. (2) The pmap becomes active on another
1012 * processor before its pm_active field is checked but due to
1013 * speculative loads one of the following functions still reads the
1014 * pmap as inactive on the other processor.
1015 *
1016 * The kernel page table is exempt because its pm_active field is
1017 * immutable.  The kernel page table is always active on every
1018 * processor.
1019 */
1020void
1021pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1022{
1023	cpuset_t *mask, other_cpus;
1024	u_int cpuid;
1025
1026	sched_pin();
1027	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1028		invlpg(va);
1029		mask = &all_cpus;
1030	} else {
1031		cpuid = PCPU_GET(cpuid);
1032		other_cpus = all_cpus;
1033		CPU_CLR(cpuid, &other_cpus);
1034		if (CPU_ISSET(cpuid, &pmap->pm_active))
1035			invlpg(va);
1036		CPU_AND(&other_cpus, &pmap->pm_active);
1037		mask = &other_cpus;
1038	}
1039	smp_masked_invlpg(*mask, va);
1040	sched_unpin();
1041}
1042
1043/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1044#define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
1045
1046void
1047pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1048{
1049	cpuset_t *mask, other_cpus;
1050	vm_offset_t addr;
1051	u_int cpuid;
1052
1053	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1054		pmap_invalidate_all(pmap);
1055		return;
1056	}
1057
1058	sched_pin();
1059	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1060		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1061			invlpg(addr);
1062		mask = &all_cpus;
1063	} else {
1064		cpuid = PCPU_GET(cpuid);
1065		other_cpus = all_cpus;
1066		CPU_CLR(cpuid, &other_cpus);
1067		if (CPU_ISSET(cpuid, &pmap->pm_active))
1068			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1069				invlpg(addr);
1070		CPU_AND(&other_cpus, &pmap->pm_active);
1071		mask = &other_cpus;
1072	}
1073	smp_masked_invlpg_range(*mask, sva, eva);
1074	sched_unpin();
1075}
1076
1077void
1078pmap_invalidate_all(pmap_t pmap)
1079{
1080	cpuset_t *mask, other_cpus;
1081	u_int cpuid;
1082
1083	sched_pin();
1084	if (pmap == kernel_pmap) {
1085		invltlb_glob();
1086		mask = &all_cpus;
1087	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
1088		invltlb();
1089		mask = &all_cpus;
1090	} else {
1091		cpuid = PCPU_GET(cpuid);
1092		other_cpus = all_cpus;
1093		CPU_CLR(cpuid, &other_cpus);
1094		if (CPU_ISSET(cpuid, &pmap->pm_active))
1095			invltlb();
1096		CPU_AND(&other_cpus, &pmap->pm_active);
1097		mask = &other_cpus;
1098	}
1099	smp_masked_invltlb(*mask, pmap);
1100	sched_unpin();
1101}
1102
1103void
1104pmap_invalidate_cache(void)
1105{
1106
1107	sched_pin();
1108	wbinvd();
1109	smp_cache_flush();
1110	sched_unpin();
1111}
1112
1113struct pde_action {
1114	cpuset_t invalidate;	/* processors that invalidate their TLB */
1115	vm_offset_t va;
1116	pd_entry_t *pde;
1117	pd_entry_t newpde;
1118	u_int store;		/* processor that updates the PDE */
1119};
1120
1121static void
1122pmap_update_pde_kernel(void *arg)
1123{
1124	struct pde_action *act = arg;
1125	pd_entry_t *pde;
1126	pmap_t pmap;
1127
1128	if (act->store == PCPU_GET(cpuid)) {
1129
1130		/*
1131		 * Elsewhere, this operation requires allpmaps_lock for
1132		 * synchronization.  Here, it does not because it is being
1133		 * performed in the context of an all_cpus rendezvous.
1134		 */
1135		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1136			pde = pmap_pde(pmap, act->va);
1137			pde_store(pde, act->newpde);
1138		}
1139	}
1140}
1141
1142static void
1143pmap_update_pde_user(void *arg)
1144{
1145	struct pde_action *act = arg;
1146
1147	if (act->store == PCPU_GET(cpuid))
1148		pde_store(act->pde, act->newpde);
1149}
1150
1151static void
1152pmap_update_pde_teardown(void *arg)
1153{
1154	struct pde_action *act = arg;
1155
1156	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1157		pmap_update_pde_invalidate(act->va, act->newpde);
1158}
1159
1160/*
1161 * Change the page size for the specified virtual address in a way that
1162 * prevents any possibility of the TLB ever having two entries that map the
1163 * same virtual address using different page sizes.  This is the recommended
1164 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1165 * machine check exception for a TLB state that is improperly diagnosed as a
1166 * hardware error.
1167 */
1168static void
1169pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1170{
1171	struct pde_action act;
1172	cpuset_t active, other_cpus;
1173	u_int cpuid;
1174
1175	sched_pin();
1176	cpuid = PCPU_GET(cpuid);
1177	other_cpus = all_cpus;
1178	CPU_CLR(cpuid, &other_cpus);
1179	if (pmap == kernel_pmap)
1180		active = all_cpus;
1181	else
1182		active = pmap->pm_active;
1183	if (CPU_OVERLAP(&active, &other_cpus)) {
1184		act.store = cpuid;
1185		act.invalidate = active;
1186		act.va = va;
1187		act.pde = pde;
1188		act.newpde = newpde;
1189		CPU_SET(cpuid, &active);
1190		smp_rendezvous_cpus(active,
1191		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1192		    pmap_update_pde_kernel : pmap_update_pde_user,
1193		    pmap_update_pde_teardown, &act);
1194	} else {
1195		if (pmap == kernel_pmap)
1196			pmap_kenter_pde(va, newpde);
1197		else
1198			pde_store(pde, newpde);
1199		if (CPU_ISSET(cpuid, &active))
1200			pmap_update_pde_invalidate(va, newpde);
1201	}
1202	sched_unpin();
1203}
1204#else /* !SMP */
1205/*
1206 * Normal, non-SMP, 486+ invalidation functions.
1207 * We inline these within pmap.c for speed.
1208 */
1209PMAP_INLINE void
1210pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1211{
1212
1213	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1214		invlpg(va);
1215}
1216
1217PMAP_INLINE void
1218pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1219{
1220	vm_offset_t addr;
1221
1222	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1223		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1224			invlpg(addr);
1225}
1226
1227PMAP_INLINE void
1228pmap_invalidate_all(pmap_t pmap)
1229{
1230
1231	if (pmap == kernel_pmap)
1232		invltlb_glob();
1233	else if (!CPU_EMPTY(&pmap->pm_active))
1234		invltlb();
1235}
1236
1237PMAP_INLINE void
1238pmap_invalidate_cache(void)
1239{
1240
1241	wbinvd();
1242}
1243
1244static void
1245pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1246{
1247
1248	if (pmap == kernel_pmap)
1249		pmap_kenter_pde(va, newpde);
1250	else
1251		pde_store(pde, newpde);
1252	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1253		pmap_update_pde_invalidate(va, newpde);
1254}
1255#endif /* !SMP */
1256
1257#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1258
1259void
1260pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1261{
1262
1263	if (force) {
1264		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1265	} else {
1266		KASSERT((sva & PAGE_MASK) == 0,
1267		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1268		KASSERT((eva & PAGE_MASK) == 0,
1269		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1270	}
1271
1272	if ((cpu_feature & CPUID_SS) != 0 && !force)
1273		; /* If "Self Snoop" is supported and allowed, do nothing. */
1274	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1275	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1276#ifdef DEV_APIC
1277		/*
1278		 * XXX: Some CPUs fault, hang, or trash the local APIC
1279		 * registers if we use CLFLUSH on the local APIC
1280		 * range.  The local APIC is always uncached, so we
1281		 * don't need to flush for that range anyway.
1282		 */
1283		if (pmap_kextract(sva) == lapic_paddr)
1284			return;
1285#endif
1286		/*
1287		 * Otherwise, do per-cache line flush.  Use the sfence
1288		 * instruction to ensure that previous stores are
1289		 * included in the write-back.  The processor
1290		 * propagates the flush to other processors in the cache
1291		 * coherence domain.
1292		 */
1293		sfence();
1294		for (; sva < eva; sva += cpu_clflush_line_size)
1295			clflushopt(sva);
1296		sfence();
1297	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1298	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1299#ifdef DEV_APIC
1300		if (pmap_kextract(sva) == lapic_paddr)
1301			return;
1302#endif
1303		/*
1304		 * Writes are ordered by CLFLUSH on Intel CPUs.
1305		 */
1306		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1307			mfence();
1308		for (; sva < eva; sva += cpu_clflush_line_size)
1309			clflush(sva);
1310		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1311			mfence();
1312	} else {
1313
1314		/*
1315		 * No targeted cache flush methods are supported by the CPU,
1316		 * or the supplied range is larger than 2MB.
1317		 * Globally invalidate the cache.
1318		 */
1319		pmap_invalidate_cache();
1320	}
1321}
1322
1323void
1324pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1325{
1326	int i;
1327
1328	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1329	    (cpu_feature & CPUID_CLFSH) == 0) {
1330		pmap_invalidate_cache();
1331	} else {
1332		for (i = 0; i < count; i++)
1333			pmap_flush_page(pages[i]);
1334	}
1335}
1336
1337/*
1338 * Are we current address space or kernel?
1339 */
1340static __inline int
1341pmap_is_current(pmap_t pmap)
1342{
1343
1344	return (pmap == kernel_pmap || pmap ==
1345	    vmspace_pmap(curthread->td_proc->p_vmspace));
1346}
1347
1348/*
1349 * If the given pmap is not the current or kernel pmap, the returned pte must
1350 * be released by passing it to pmap_pte_release().
1351 */
1352pt_entry_t *
1353pmap_pte(pmap_t pmap, vm_offset_t va)
1354{
1355	pd_entry_t newpf;
1356	pd_entry_t *pde;
1357
1358	pde = pmap_pde(pmap, va);
1359	if (*pde & PG_PS)
1360		return (pde);
1361	if (*pde != 0) {
1362		/* are we current address space or kernel? */
1363		if (pmap_is_current(pmap))
1364			return (vtopte(va));
1365		mtx_lock(&PMAP2mutex);
1366		newpf = *pde & PG_FRAME;
1367		if ((*PMAP2 & PG_FRAME) != newpf) {
1368			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1369			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1370		}
1371		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1372	}
1373	return (NULL);
1374}
1375
1376/*
1377 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1378 * being NULL.
1379 */
1380static __inline void
1381pmap_pte_release(pt_entry_t *pte)
1382{
1383
1384	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1385		mtx_unlock(&PMAP2mutex);
1386}
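
/*
 * Typical usage sketch for the pair above (illustrative only):
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		... examine or modify *pte ...
 *		pmap_pte_release(pte);
 *	}
 *
 * The release is a no-op unless pmap_pte() had to borrow the shared
 * PMAP2/PADDR2 window, in which case it drops PMAP2mutex.
 */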
1387
1388/*
1389 * NB:  The sequence of updating a page table followed by accesses to the
1390 * corresponding pages is subject to the situation described in the "AMD64
1391 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1392 * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1393 * right after modifying the PTE bits is crucial.
1394 */
1395static __inline void
1396invlcaddr(void *caddr)
1397{
1398
1399	invlpg((u_int)caddr);
1400}
1401
1402/*
1403 * Super fast pmap_pte routine best used when scanning
1404 * the pv lists.  This eliminates many coarse-grained
1405 * invltlb calls.  Note that many of the pv list
1406 * scans are across different pmaps.  It is very wasteful
1407 * to do an entire invltlb for checking a single mapping.
1408 *
1409 * If the given pmap is not the current pmap, pvh_global_lock
1410 * must be held and curthread pinned to a CPU.
1411 */
1412static pt_entry_t *
1413pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1414{
1415	pd_entry_t newpf;
1416	pd_entry_t *pde;
1417
1418	pde = pmap_pde(pmap, va);
1419	if (*pde & PG_PS)
1420		return (pde);
1421	if (*pde != 0) {
1422		/* are we current address space or kernel? */
1423		if (pmap_is_current(pmap))
1424			return (vtopte(va));
1425		rw_assert(&pvh_global_lock, RA_WLOCKED);
1426		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1427		newpf = *pde & PG_FRAME;
1428		if ((*PMAP1 & PG_FRAME) != newpf) {
1429			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1430#ifdef SMP
1431			PMAP1cpu = PCPU_GET(cpuid);
1432#endif
1433			invlcaddr(PADDR1);
1434			PMAP1changed++;
1435		} else
1436#ifdef SMP
1437		if (PMAP1cpu != PCPU_GET(cpuid)) {
1438			PMAP1cpu = PCPU_GET(cpuid);
1439			invlcaddr(PADDR1);
1440			PMAP1changedcpu++;
1441		} else
1442#endif
1443			PMAP1unchanged++;
1444		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1445	}
1446	return (0);
1447}
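
/*
 * Usage sketch (illustrative): a caller examining a mapping in a foreign
 * pmap while scanning pv lists is expected to do roughly
 *
 *	rw_wlock(&pvh_global_lock);
 *	sched_pin();
 *	PMAP_LOCK(pmap);
 *	pte = pmap_pte_quick(pmap, va);
 *	...
 *	PMAP_UNLOCK(pmap);
 *	sched_unpin();
 *	rw_wunlock(&pvh_global_lock);
 *
 * which satisfies the pinning and locking requirements asserted above.
 */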
1448
1449/*
1450 *	Routine:	pmap_extract
1451 *	Function:
1452 *		Extract the physical page address associated
1453 *		with the given map/virtual_address pair.
1454 */
1455vm_paddr_t
1456pmap_extract(pmap_t pmap, vm_offset_t va)
1457{
1458	vm_paddr_t rtval;
1459	pt_entry_t *pte;
1460	pd_entry_t pde;
1461
1462	rtval = 0;
1463	PMAP_LOCK(pmap);
1464	pde = pmap->pm_pdir[va >> PDRSHIFT];
1465	if (pde != 0) {
1466		if ((pde & PG_PS) != 0)
1467			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1468		else {
1469			pte = pmap_pte(pmap, va);
1470			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1471			pmap_pte_release(pte);
1472		}
1473	}
1474	PMAP_UNLOCK(pmap);
1475	return (rtval);
1476}
1477
1478/*
1479 *	Routine:	pmap_extract_and_hold
1480 *	Function:
1481 *		Atomically extract and hold the physical page
1482 *		with the given pmap and virtual address pair
1483 *		if that mapping permits the given protection.
1484 */
1485vm_page_t
1486pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1487{
1488	pd_entry_t pde;
1489	pt_entry_t pte, *ptep;
1490	vm_page_t m;
1491	vm_paddr_t pa;
1492
1493	pa = 0;
1494	m = NULL;
1495	PMAP_LOCK(pmap);
1496retry:
1497	pde = *pmap_pde(pmap, va);
1498	if (pde != 0) {
1499		if (pde & PG_PS) {
1500			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1501				if (vm_page_pa_tryrelock(pmap, (pde &
1502				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1503					goto retry;
1504				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1505				    (va & PDRMASK));
1506				vm_page_hold(m);
1507			}
1508		} else {
1509			ptep = pmap_pte(pmap, va);
1510			pte = *ptep;
1511			pmap_pte_release(ptep);
1512			if (pte != 0 &&
1513			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1514				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1515				    &pa))
1516					goto retry;
1517				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1518				vm_page_hold(m);
1519			}
1520		}
1521	}
1522	PA_UNLOCK_COND(pa);
1523	PMAP_UNLOCK(pmap);
1524	return (m);
1525}
1526
1527/***************************************************
1528 * Low level mapping routines.....
1529 ***************************************************/
1530
1531/*
1532 * Add a wired page to the kva.
1533 * Note: not SMP coherent.
1534 *
1535 * This function may be used before pmap_bootstrap() is called.
1536 */
1537PMAP_INLINE void
1538pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1539{
1540	pt_entry_t *pte;
1541
1542	pte = vtopte(va);
1543	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1544}
1545
1546static __inline void
1547pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1548{
1549	pt_entry_t *pte;
1550
1551	pte = vtopte(va);
1552	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1553}
1554
1555/*
1556 * Remove a page from the kernel pagetables.
1557 * Note: not SMP coherent.
1558 *
1559 * This function may be used before pmap_bootstrap() is called.
1560 */
1561PMAP_INLINE void
1562pmap_kremove(vm_offset_t va)
1563{
1564	pt_entry_t *pte;
1565
1566	pte = vtopte(va);
1567	pte_clear(pte);
1568}
1569
1570/*
1571 *	Used to map a range of physical addresses into kernel
1572 *	virtual address space.
1573 *
1574 *	The value passed in '*virt' is a suggested virtual address for
1575 *	the mapping. Architectures which can support a direct-mapped
1576 *	physical to virtual region can return the appropriate address
1577 *	within that region, leaving '*virt' unchanged. Other
1578 *	architectures should map the pages starting at '*virt' and
1579 *	update '*virt' with the first usable address after the mapped
1580 *	region.
1581 */
1582vm_offset_t
1583pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1584{
1585	vm_offset_t va, sva;
1586	vm_paddr_t superpage_offset;
1587	pd_entry_t newpde;
1588
1589	va = *virt;
1590	/*
1591	 * Does the physical address range's size and alignment permit at
1592	 * least one superpage mapping to be created?
1593	 */
1594	superpage_offset = start & PDRMASK;
1595	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1596		/*
1597		 * Increase the starting virtual address so that its alignment
1598		 * does not preclude the use of superpage mappings.
1599		 */
1600		if ((va & PDRMASK) < superpage_offset)
1601			va = (va & ~PDRMASK) + superpage_offset;
1602		else if ((va & PDRMASK) > superpage_offset)
1603			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1604	}
1605	sva = va;
1606	while (start < end) {
1607		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1608		    pseflag) {
1609			KASSERT((va & PDRMASK) == 0,
1610			    ("pmap_map: misaligned va %#x", va));
1611			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1612			pmap_kenter_pde(va, newpde);
1613			va += NBPDR;
1614			start += NBPDR;
1615		} else {
1616			pmap_kenter(va, start);
1617			va += PAGE_SIZE;
1618			start += PAGE_SIZE;
1619		}
1620	}
1621	pmap_invalidate_range(kernel_pmap, sva, va);
1622	*virt = va;
1623	return (sva);
1624}
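
/*
 * Worked example of the alignment fixup above (illustrative): assuming a
 * non-PAE kernel with PSE enabled and a sufficiently large range starting
 * at start = 0x00100000, an initial *virt of 0xc0000000 yields
 * superpage_offset = 0x00100000, so va is advanced to 0xc0100000.  From
 * then on, each 4MB-aligned, 4MB-sized chunk of the range is entered with
 * a single PG_PS mapping by pmap_kenter_pde(), and the remainder falls
 * back to 4KB pmap_kenter() mappings.
 */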
1625
1626
1627/*
1628 * Add a list of wired pages to the kva.  This routine is
1629 * only used for temporary
1630 * kernel mappings that do not need to have
1631 * page modification or references recorded.
1632 * Note that old mappings are simply written
1633 * over.  The page *must* be wired.
1634 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1635 */
1636void
1637pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1638{
1639	pt_entry_t *endpte, oldpte, pa, *pte;
1640	vm_page_t m;
1641
1642	oldpte = 0;
1643	pte = vtopte(sva);
1644	endpte = pte + count;
1645	while (pte < endpte) {
1646		m = *ma++;
1647		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1648		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1649			oldpte |= *pte;
1650			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1651		}
1652		pte++;
1653	}
1654	if (__predict_false((oldpte & PG_V) != 0))
1655		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1656		    PAGE_SIZE);
1657}
1658
1659/*
1660 * This routine tears out page mappings from the
1661 * kernel -- it is meant only for temporary mappings.
1662 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1663 */
1664void
1665pmap_qremove(vm_offset_t sva, int count)
1666{
1667	vm_offset_t va;
1668
1669	va = sva;
1670	while (count-- > 0) {
1671		pmap_kremove(va);
1672		va += PAGE_SIZE;
1673	}
1674	pmap_invalidate_range(kernel_pmap, sva, va);
1675}
1676
1677/***************************************************
1678 * Page table page management routines.....
1679 ***************************************************/
1680static __inline void
1681pmap_free_zero_pages(struct spglist *free)
1682{
1683	vm_page_t m;
1684
1685	while ((m = SLIST_FIRST(free)) != NULL) {
1686		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1687		/* Preserve the page's PG_ZERO setting. */
1688		vm_page_free_toq(m);
1689	}
1690}
1691
1692/*
1693 * Schedule the specified unused page table page to be freed.  Specifically,
1694 * add the page to the specified list of pages that will be released to the
1695 * physical memory manager after the TLB has been updated.
1696 */
1697static __inline void
1698pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1699    boolean_t set_PG_ZERO)
1700{
1701
1702	if (set_PG_ZERO)
1703		m->flags |= PG_ZERO;
1704	else
1705		m->flags &= ~PG_ZERO;
1706	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1707}
1708
1709/*
1710 * Inserts the specified page table page into the specified pmap's collection
1711 * of idle page table pages.  Each of a pmap's page table pages is responsible
1712 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1713 * ordered by this virtual address range.
1714 */
1715static __inline int
1716pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1717{
1718
1719	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1720	return (vm_radix_insert(&pmap->pm_root, mpte));
1721}
1722
1723/*
1724 * Looks for a page table page mapping the specified virtual address in the
1725 * specified pmap's collection of idle page table pages.  Returns NULL if there
1726 * is no page table page corresponding to the specified virtual address.
1727 */
1728static __inline vm_page_t
1729pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1730{
1731
1732	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1733	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
1734}
1735
1736/*
1737 * Removes the specified page table page from the specified pmap's collection
1738 * of idle page table pages.  The specified page table page must be a member of
1739 * the pmap's collection.
1740 */
1741static __inline void
1742pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1743{
1744
1745	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1746	vm_radix_remove(&pmap->pm_root, mpte->pindex);
1747}
1748
1749/*
1750 * Decrements a page table page's wire count, which is used to record the
1751 * number of valid page table entries within the page.  If the wire count
1752 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1753 * page table page was unmapped and FALSE otherwise.
1754 */
1755static inline boolean_t
1756pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1757{
1758
1759	--m->wire_count;
1760	if (m->wire_count == 0) {
1761		_pmap_unwire_ptp(pmap, m, free);
1762		return (TRUE);
1763	} else
1764		return (FALSE);
1765}
1766
1767static void
1768_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1769{
1770	vm_offset_t pteva;
1771
1772	/*
1773	 * unmap the page table page
1774	 */
1775	pmap->pm_pdir[m->pindex] = 0;
1776	--pmap->pm_stats.resident_count;
1777
1778	/*
1779	 * This is a release store so that the ordinary store unmapping
1780	 * the page table page is globally performed before TLB shoot-
1781	 * down is begun.
1782	 */
1783	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1784
1785	/*
1786	 * Do an invltlb to make the invalidated mapping
1787	 * take effect immediately.
1788	 */
1789	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1790	pmap_invalidate_page(pmap, pteva);
1791
1792	/*
1793	 * Put page on a list so that it is released after
1794	 * *ALL* TLB shootdown is done
1795	 */
1796	pmap_add_delayed_free_list(m, free, TRUE);
1797}
1798
1799/*
1800 * After removing a page table entry, this routine is used to
1801 * conditionally free the page, and manage the hold/wire counts.
1802 */
1803static int
1804pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1805{
1806	pd_entry_t ptepde;
1807	vm_page_t mpte;
1808
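	/*
	 * Page table pages for the kernel address space are never freed,
	 * so there is nothing to do for kernel mappings.
	 */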
1809	if (va >= VM_MAXUSER_ADDRESS)
1810		return (0);
1811	ptepde = *pmap_pde(pmap, va);
1812	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1813	return (pmap_unwire_ptp(pmap, mpte, free));
1814}
1815
1816/*
1817 * Initialize the pmap for the swapper process.
1818 */
1819void
1820pmap_pinit0(pmap_t pmap)
1821{
1822
1823	PMAP_LOCK_INIT(pmap);
1824	/*
1825	 * Since the page table directory is shared with the kernel pmap,
1826	 * which is already included in the list "allpmaps", this pmap does
1827	 * not need to be inserted into that list.
1828	 */
1829	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1830#if defined(PAE) || defined(PAE_TABLES)
1831	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1832#endif
1833	pmap->pm_root.rt_root = 0;
1834	CPU_ZERO(&pmap->pm_active);
1835	PCPU_SET(curpmap, pmap);
1836	TAILQ_INIT(&pmap->pm_pvchunk);
1837	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1838}
1839
1840/*
1841 * Initialize a preallocated and zeroed pmap structure,
1842 * such as one in a vmspace structure.
1843 */
1844int
1845pmap_pinit(pmap_t pmap)
1846{
1847	vm_page_t m, ptdpg[NPGPTD];
1848	vm_paddr_t pa;
1849	int i;
1850
1851	/*
1852	 * No need to allocate page table space yet, but we do need a
1853	 * valid page directory table.
1854	 */
1855	if (pmap->pm_pdir == NULL) {
1856		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1857		if (pmap->pm_pdir == NULL)
1858			return (0);
1859#if defined(PAE) || defined(PAE_TABLES)
1860		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1861		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1862		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1863		    ("pmap_pinit: pdpt misaligned"));
1864		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1865		    ("pmap_pinit: pdpt above 4g"));
1866#endif
1867		pmap->pm_root.rt_root = 0;
1868	}
1869	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1870	    ("pmap_pinit: pmap has reserved page table page(s)"));
1871
1872	/*
1873	 * allocate the page directory page(s)
1874	 */
1875	for (i = 0; i < NPGPTD;) {
1876		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1877		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1878		if (m == NULL)
1879			VM_WAIT;
1880		else {
1881			ptdpg[i++] = m;
1882		}
1883	}
1884
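	/*
	 * Map the newly allocated page directory page(s) at the KVA that
	 * is reserved for pm_pdir.
	 */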
1885	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1886
1887	for (i = 0; i < NPGPTD; i++)
1888		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1889			pagezero(pmap->pm_pdir + (i * NPDEPG));
1890
1891	mtx_lock_spin(&allpmaps_lock);
1892	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1893	/* Copy the kernel page table directory entries. */
1894	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1895	mtx_unlock_spin(&allpmaps_lock);
1896
1897	/* install the self-referential address mapping entries */
1898	for (i = 0; i < NPGPTD; i++) {
1899		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1900		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1901#if defined(PAE) || defined(PAE_TABLES)
1902		pmap->pm_pdpt[i] = pa | PG_V;
1903#endif
1904	}
1905
1906	CPU_ZERO(&pmap->pm_active);
1907	TAILQ_INIT(&pmap->pm_pvchunk);
1908	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1909
1910	return (1);
1911}
1912
1913/*
1914 * This routine is called when the page table page for the given page
1915 * directory index is not present; it allocates and installs a new one.
1916 */
1917static vm_page_t
1918_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1919{
1920	vm_paddr_t ptepa;
1921	vm_page_t m;
1922
1923	/*
1924	 * Allocate a page table page.
1925	 */
1926	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1927	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1928		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1929			PMAP_UNLOCK(pmap);
1930			rw_wunlock(&pvh_global_lock);
1931			VM_WAIT;
1932			rw_wlock(&pvh_global_lock);
1933			PMAP_LOCK(pmap);
1934		}
1935
1936		/*
1937		 * Indicate the need to retry.  While waiting, the page table
1938		 * page may have been allocated.
1939		 */
1940		return (NULL);
1941	}
1942	if ((m->flags & PG_ZERO) == 0)
1943		pmap_zero_page(m);
1944
1945	/*
1946	 * Map the page table page into the process address space, if
1947	 * it isn't already there.
1948	 */
1949
1950	pmap->pm_stats.resident_count++;
1951
1952	ptepa = VM_PAGE_TO_PHYS(m);
1953	pmap->pm_pdir[ptepindex] =
1954		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1955
1956	return (m);
1957}
1958
1959static vm_page_t
1960pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1961{
1962	u_int ptepindex;
1963	pd_entry_t ptepa;
1964	vm_page_t m;
1965
1966	/*
1967	 * Calculate the page table page index.
1968	 */
1969	ptepindex = va >> PDRSHIFT;
1970retry:
1971	/*
1972	 * Get the page directory entry
1973	 */
1974	ptepa = pmap->pm_pdir[ptepindex];
1975
1976	/*
1977	 * This supports switching from a 4MB page to a
1978	 * normal 4K page.
1979	 */
1980	if (ptepa & PG_PS) {
1981		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1982		ptepa = pmap->pm_pdir[ptepindex];
1983	}
1984
1985	/*
1986	 * If the page table page is already mapped, just increment
1987	 * its wire count.
1988	 */
1989	if (ptepa) {
1990		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1991		m->wire_count++;
1992	} else {
1993		/*
1994		 * The page table page is not mapped or has been
1995		 * deallocated; allocate a new one.
1996		 */
1997		m = _pmap_allocpte(pmap, ptepindex, flags);
1998		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
1999			goto retry;
2000	}
2001	return (m);
2002}
2003
2004
2005/***************************************************
2006 * Pmap allocation/deallocation routines.
2007 ***************************************************/
2008
2009/*
2010 * Release any resources held by the given physical map.
2011 * Called when a pmap initialized by pmap_pinit is being released.
2012 * Should only be called if the map contains no valid mappings.
2013 */
2014void
2015pmap_release(pmap_t pmap)
2016{
2017	vm_page_t m, ptdpg[NPGPTD];
2018	int i;
2019
2020	KASSERT(pmap->pm_stats.resident_count == 0,
2021	    ("pmap_release: pmap resident count %ld != 0",
2022	    pmap->pm_stats.resident_count));
2023	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2024	    ("pmap_release: pmap has reserved page table page(s)"));
2025	KASSERT(CPU_EMPTY(&pmap->pm_active),
2026	    ("releasing active pmap %p", pmap));
2027
2028	mtx_lock_spin(&allpmaps_lock);
2029	LIST_REMOVE(pmap, pm_list);
2030	mtx_unlock_spin(&allpmaps_lock);
2031
2032	for (i = 0; i < NPGPTD; i++)
2033		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2034		    PG_FRAME);
2035
2036	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2037	    sizeof(*pmap->pm_pdir));
2038
2039	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2040
2041	for (i = 0; i < NPGPTD; i++) {
2042		m = ptdpg[i];
2043#if defined(PAE) || defined(PAE_TABLES)
2044		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2045		    ("pmap_release: got wrong ptd page"));
2046#endif
2047		m->wire_count--;
2048		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2049		vm_page_free_zero(m);
2050	}
2051}
2052
2053static int
2054kvm_size(SYSCTL_HANDLER_ARGS)
2055{
2056	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2057
2058	return (sysctl_handle_long(oidp, &ksize, 0, req));
2059}
2060SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2061    0, 0, kvm_size, "IU", "Size of KVM");
2062
2063static int
2064kvm_free(SYSCTL_HANDLER_ARGS)
2065{
2066	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2067
2068	return (sysctl_handle_long(oidp, &kfree, 0, req));
2069}
2070SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2071    0, 0, kvm_free, "IU", "Amount of KVM free");
2072
2073/*
2074 * grow the number of kernel page table entries, if needed
2075 */
2076void
2077pmap_growkernel(vm_offset_t addr)
2078{
2079	vm_paddr_t ptppaddr;
2080	vm_page_t nkpg;
2081	pd_entry_t newpdir;
2082
2083	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2084	addr = roundup2(addr, NBPDR);
2085	if (addr - 1 >= kernel_map->max_offset)
2086		addr = kernel_map->max_offset;
2087	while (kernel_vm_end < addr) {
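		/*
		 * Skip page directory entries that already map a page
		 * table page and advance to the next NBPDR boundary.
		 */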
2088		if (pdir_pde(PTD, kernel_vm_end)) {
2089			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2090			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2091				kernel_vm_end = kernel_map->max_offset;
2092				break;
2093			}
2094			continue;
2095		}
2096
2097		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2098		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2099		    VM_ALLOC_ZERO);
2100		if (nkpg == NULL)
2101			panic("pmap_growkernel: no memory to grow kernel");
2102
2103		nkpt++;
2104
2105		if ((nkpg->flags & PG_ZERO) == 0)
2106			pmap_zero_page(nkpg);
2107		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2108		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2109		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2110
2111		pmap_kenter_pde(kernel_vm_end, newpdir);
2112		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2113		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2114			kernel_vm_end = kernel_map->max_offset;
2115			break;
2116		}
2117	}
2118}
2119
2120
2121/***************************************************
2122 * page management routines.
2123 ***************************************************/
2124
2125CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2126CTASSERT(_NPCM == 11);
2127CTASSERT(_NPCPV == 336);
2128
2129static __inline struct pv_chunk *
2130pv_to_chunk(pv_entry_t pv)
2131{
2132
2133	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2134}
2135
2136#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2137
2138#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2139#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2140
2141static const uint32_t pc_freemask[_NPCM] = {
2142	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2143	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2144	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2145	PC_FREE0_9, PC_FREE10
2146};
2147
2148SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2149	"Current number of pv entries");
2150
2151#ifdef PV_STATS
2152static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2153
2154SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2155	"Current number of pv entry chunks");
2156SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2157	"Current number of pv entry chunks allocated");
2158SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2159	"Current number of pv entry chunk frees");
2160SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2161	"Number of times tried to get a chunk page but failed.");
2162
2163static long pv_entry_frees, pv_entry_allocs;
2164static int pv_entry_spare;
2165
2166SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2167	"Current number of pv entry frees");
2168SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2169	"Current number of pv entry allocs");
2170SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2171	"Current number of spare pv entries");
2172#endif
2173
2174/*
2175 * We are in a serious low memory condition.  Resort to
2176 * drastic measures to free some pages so we can allocate
2177 * another pv entry chunk.
2178 */
2179static vm_page_t
2180pmap_pv_reclaim(pmap_t locked_pmap)
2181{
2182	struct pch newtail;
2183	struct pv_chunk *pc;
2184	struct md_page *pvh;
2185	pd_entry_t *pde;
2186	pmap_t pmap;
2187	pt_entry_t *pte, tpte;
2188	pv_entry_t pv;
2189	vm_offset_t va;
2190	vm_page_t m, m_pc;
2191	struct spglist free;
2192	uint32_t inuse;
2193	int bit, field, freed;
2194
2195	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2196	pmap = NULL;
2197	m_pc = NULL;
2198	SLIST_INIT(&free);
2199	TAILQ_INIT(&newtail);
2200	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2201	    SLIST_EMPTY(&free))) {
2202		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2203		if (pmap != pc->pc_pmap) {
2204			if (pmap != NULL) {
2205				pmap_invalidate_all(pmap);
2206				if (pmap != locked_pmap)
2207					PMAP_UNLOCK(pmap);
2208			}
2209			pmap = pc->pc_pmap;
2210			/* Avoid deadlock and lock recursion. */
2211			if (pmap > locked_pmap)
2212				PMAP_LOCK(pmap);
2213			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2214				pmap = NULL;
2215				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2216				continue;
2217			}
2218		}
2219
2220		/*
2221		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2222		 */
2223		freed = 0;
2224		for (field = 0; field < _NPCM; field++) {
2225			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2226			    inuse != 0; inuse &= ~(1UL << bit)) {
2227				bit = bsfl(inuse);
2228				pv = &pc->pc_pventry[field * 32 + bit];
2229				va = pv->pv_va;
2230				pde = pmap_pde(pmap, va);
2231				if ((*pde & PG_PS) != 0)
2232					continue;
2233				pte = pmap_pte(pmap, va);
2234				tpte = *pte;
2235				if ((tpte & PG_W) == 0)
2236					tpte = pte_load_clear(pte);
2237				pmap_pte_release(pte);
2238				if ((tpte & PG_W) != 0)
2239					continue;
2240				KASSERT(tpte != 0,
2241				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2242				    pmap, va));
2243				if ((tpte & PG_G) != 0)
2244					pmap_invalidate_page(pmap, va);
2245				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2246				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2247					vm_page_dirty(m);
2248				if ((tpte & PG_A) != 0)
2249					vm_page_aflag_set(m, PGA_REFERENCED);
2250				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2251				if (TAILQ_EMPTY(&m->md.pv_list) &&
2252				    (m->flags & PG_FICTITIOUS) == 0) {
2253					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2254					if (TAILQ_EMPTY(&pvh->pv_list)) {
2255						vm_page_aflag_clear(m,
2256						    PGA_WRITEABLE);
2257					}
2258				}
2259				pc->pc_map[field] |= 1UL << bit;
2260				pmap_unuse_pt(pmap, va, &free);
2261				freed++;
2262			}
2263		}
2264		if (freed == 0) {
2265			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2266			continue;
2267		}
2268		/* Every freed mapping is for a 4 KB page. */
2269		pmap->pm_stats.resident_count -= freed;
2270		PV_STAT(pv_entry_frees += freed);
2271		PV_STAT(pv_entry_spare += freed);
2272		pv_entry_count -= freed;
2273		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2274		for (field = 0; field < _NPCM; field++)
2275			if (pc->pc_map[field] != pc_freemask[field]) {
2276				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2277				    pc_list);
2278				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2279
2280				/*
2281				 * One freed pv entry in locked_pmap is
2282				 * sufficient.
2283				 */
2284				if (pmap == locked_pmap)
2285					goto out;
2286				break;
2287			}
2288		if (field == _NPCM) {
2289			PV_STAT(pv_entry_spare -= _NPCPV);
2290			PV_STAT(pc_chunk_count--);
2291			PV_STAT(pc_chunk_frees++);
2292			/* Entire chunk is free; return it. */
2293			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2294			pmap_qremove((vm_offset_t)pc, 1);
2295			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2296			break;
2297		}
2298	}
2299out:
2300	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2301	if (pmap != NULL) {
2302		pmap_invalidate_all(pmap);
2303		if (pmap != locked_pmap)
2304			PMAP_UNLOCK(pmap);
2305	}
2306	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2307		m_pc = SLIST_FIRST(&free);
2308		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2309		/* Recycle a freed page table page. */
2310		m_pc->wire_count = 1;
2311		atomic_add_int(&vm_cnt.v_wire_count, 1);
2312	}
2313	pmap_free_zero_pages(&free);
2314	return (m_pc);
2315}
2316
2317/*
2318 * free the pv_entry back to the free list
2319 */
2320static void
2321free_pv_entry(pmap_t pmap, pv_entry_t pv)
2322{
2323	struct pv_chunk *pc;
2324	int idx, field, bit;
2325
2326	rw_assert(&pvh_global_lock, RA_WLOCKED);
2327	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2328	PV_STAT(pv_entry_frees++);
2329	PV_STAT(pv_entry_spare++);
2330	pv_entry_count--;
2331	pc = pv_to_chunk(pv);
2332	idx = pv - &pc->pc_pventry[0];
2333	field = idx / 32;
2334	bit = idx % 32;
2335	pc->pc_map[field] |= 1ul << bit;
2336	for (idx = 0; idx < _NPCM; idx++)
2337		if (pc->pc_map[idx] != pc_freemask[idx]) {
2338			/*
2339			 * 98% of the time, pc is already at the head of the
2340			 * list.  If it isn't already, move it to the head.
2341			 */
2342			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2343			    pc)) {
2344				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2345				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2346				    pc_list);
2347			}
2348			return;
2349		}
2350	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2351	free_pv_chunk(pc);
2352}
2353
2354static void
2355free_pv_chunk(struct pv_chunk *pc)
2356{
2357	vm_page_t m;
2358
2359	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2360	PV_STAT(pv_entry_spare -= _NPCPV);
2361	PV_STAT(pc_chunk_count--);
2362	PV_STAT(pc_chunk_frees++);
2363	/* entire chunk is free, return it */
2364	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2365	pmap_qremove((vm_offset_t)pc, 1);
2366	vm_page_unwire(m, PQ_NONE);
2367	vm_page_free(m);
2368	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2369}
2370
2371/*
2372 * get a new pv_entry, allocating a block from the system
2373 * when needed.
2374 */
2375static pv_entry_t
2376get_pv_entry(pmap_t pmap, boolean_t try)
2377{
2378	static const struct timeval printinterval = { 60, 0 };
2379	static struct timeval lastprint;
2380	int bit, field;
2381	pv_entry_t pv;
2382	struct pv_chunk *pc;
2383	vm_page_t m;
2384
2385	rw_assert(&pvh_global_lock, RA_WLOCKED);
2386	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2387	PV_STAT(pv_entry_allocs++);
2388	pv_entry_count++;
2389	if (pv_entry_count > pv_entry_high_water)
2390		if (ratecheck(&lastprint, &printinterval))
2391			printf("Approaching the limit on PV entries, consider "
2392			    "increasing either the vm.pmap.shpgperproc or the "
2393			    "vm.pmap.pv_entry_max tunable.\n");
2394retry:
2395	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2396	if (pc != NULL) {
2397		for (field = 0; field < _NPCM; field++) {
2398			if (pc->pc_map[field]) {
2399				bit = bsfl(pc->pc_map[field]);
2400				break;
2401			}
2402		}
2403		if (field < _NPCM) {
2404			pv = &pc->pc_pventry[field * 32 + bit];
2405			pc->pc_map[field] &= ~(1ul << bit);
2406			/* If this was the last free entry, move the chunk to the tail. */
2407			for (field = 0; field < _NPCM; field++)
2408				if (pc->pc_map[field] != 0) {
2409					PV_STAT(pv_entry_spare--);
2410					return (pv);	/* not full, return */
2411				}
2412			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2413			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2414			PV_STAT(pv_entry_spare--);
2415			return (pv);
2416		}
2417	}
2418	/*
2419	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2420	 * global lock.  If "pv_vafree" is currently non-empty, it will
2421	 * remain non-empty until pmap_ptelist_alloc() completes.
2422	 */
2423	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2424	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2425		if (try) {
2426			pv_entry_count--;
2427			PV_STAT(pc_chunk_tryfail++);
2428			return (NULL);
2429		}
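		/*
		 * If pmap_pv_reclaim() does not return an entire freed
		 * chunk page, it may still have freed a pv entry in this
		 * pmap, so retry the scan of pm_pvchunk above.
		 */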
2430		m = pmap_pv_reclaim(pmap);
2431		if (m == NULL)
2432			goto retry;
2433	}
2434	PV_STAT(pc_chunk_count++);
2435	PV_STAT(pc_chunk_allocs++);
2436	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2437	pmap_qenter((vm_offset_t)pc, &m, 1);
2438	pc->pc_pmap = pmap;
2439	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2440	for (field = 1; field < _NPCM; field++)
2441		pc->pc_map[field] = pc_freemask[field];
2442	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2443	pv = &pc->pc_pventry[0];
2444	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2445	PV_STAT(pv_entry_spare += _NPCPV - 1);
2446	return (pv);
2447}
2448
2449static __inline pv_entry_t
2450pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2451{
2452	pv_entry_t pv;
2453
2454	rw_assert(&pvh_global_lock, RA_WLOCKED);
2455	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2456		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2457			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2458			break;
2459		}
2460	}
2461	return (pv);
2462}
2463
2464static void
2465pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2466{
2467	struct md_page *pvh;
2468	pv_entry_t pv;
2469	vm_offset_t va_last;
2470	vm_page_t m;
2471
2472	rw_assert(&pvh_global_lock, RA_WLOCKED);
2473	KASSERT((pa & PDRMASK) == 0,
2474	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2475
2476	/*
2477	 * Transfer the 4mpage's pv entry for this mapping to the first
2478	 * page's pv list.
2479	 */
2480	pvh = pa_to_pvh(pa);
2481	va = trunc_4mpage(va);
2482	pv = pmap_pvh_remove(pvh, pmap, va);
2483	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2484	m = PHYS_TO_VM_PAGE(pa);
2485	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2486	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2487	va_last = va + NBPDR - PAGE_SIZE;
2488	do {
2489		m++;
2490		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2491		    ("pmap_pv_demote_pde: page %p is not managed", m));
2492		va += PAGE_SIZE;
2493		pmap_insert_entry(pmap, va, m);
2494	} while (va < va_last);
2495}
2496
2497static void
2498pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2499{
2500	struct md_page *pvh;
2501	pv_entry_t pv;
2502	vm_offset_t va_last;
2503	vm_page_t m;
2504
2505	rw_assert(&pvh_global_lock, RA_WLOCKED);
2506	KASSERT((pa & PDRMASK) == 0,
2507	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2508
2509	/*
2510	 * Transfer the first page's pv entry for this mapping to the
2511	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2512	 * to get_pv_entry(), a transfer avoids the possibility that
2513	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2514	 * removes one of the mappings that is being promoted.
2515	 */
2516	m = PHYS_TO_VM_PAGE(pa);
2517	va = trunc_4mpage(va);
2518	pv = pmap_pvh_remove(&m->md, pmap, va);
2519	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2520	pvh = pa_to_pvh(pa);
2521	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2522	/* Free the remaining NPTEPG - 1 pv entries. */
2523	va_last = va + NBPDR - PAGE_SIZE;
2524	do {
2525		m++;
2526		va += PAGE_SIZE;
2527		pmap_pvh_free(&m->md, pmap, va);
2528	} while (va < va_last);
2529}
2530
2531static void
2532pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2533{
2534	pv_entry_t pv;
2535
2536	pv = pmap_pvh_remove(pvh, pmap, va);
2537	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2538	free_pv_entry(pmap, pv);
2539}
2540
2541static void
2542pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2543{
2544	struct md_page *pvh;
2545
2546	rw_assert(&pvh_global_lock, RA_WLOCKED);
2547	pmap_pvh_free(&m->md, pmap, va);
2548	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2549		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2550		if (TAILQ_EMPTY(&pvh->pv_list))
2551			vm_page_aflag_clear(m, PGA_WRITEABLE);
2552	}
2553}
2554
2555/*
2556 * Create a pv entry for the given page's mapping at
2557 * (pmap, va).
2558 */
2559static void
2560pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2561{
2562	pv_entry_t pv;
2563
2564	rw_assert(&pvh_global_lock, RA_WLOCKED);
2565	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2566	pv = get_pv_entry(pmap, FALSE);
2567	pv->pv_va = va;
2568	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2569}
2570
2571/*
2572 * Conditionally create a pv entry.
2573 */
2574static boolean_t
2575pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2576{
2577	pv_entry_t pv;
2578
2579	rw_assert(&pvh_global_lock, RA_WLOCKED);
2580	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2581	if (pv_entry_count < pv_entry_high_water &&
2582	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2583		pv->pv_va = va;
2584		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2585		return (TRUE);
2586	} else
2587		return (FALSE);
2588}
2589
2590/*
2591 * Conditionally create a pv entry for a superpage mapping.
2592 */
2593static boolean_t
2594pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2595{
2596	struct md_page *pvh;
2597	pv_entry_t pv;
2598
2599	rw_assert(&pvh_global_lock, RA_WLOCKED);
2600	if (pv_entry_count < pv_entry_high_water &&
2601	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2602		pv->pv_va = va;
2603		pvh = pa_to_pvh(pa);
2604		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2605		return (TRUE);
2606	} else
2607		return (FALSE);
2608}
2609
2610/*
2611 * Fills a page table page with mappings to consecutive physical pages.
2612 */
2613static void
2614pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2615{
2616	pt_entry_t *pte;
2617
2618	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2619		*pte = newpte;
2620		newpte += PAGE_SIZE;
2621	}
2622}
2623
2624/*
2625 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2626 * 2- or 4MB page mapping is invalidated.
2627 */
2628static boolean_t
2629pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2630{
2631	pd_entry_t newpde, oldpde;
2632	pt_entry_t *firstpte, newpte;
2633	vm_paddr_t mptepa;
2634	vm_page_t mpte;
2635	struct spglist free;
2636
2637	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2638	oldpde = *pde;
2639	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2640	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2641	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
2642	    NULL)
2643		pmap_remove_pt_page(pmap, mpte);
2644	else {
2645		KASSERT((oldpde & PG_W) == 0,
2646		    ("pmap_demote_pde: page table page for a wired mapping"
2647		    " is missing"));
2648
2649		/*
2650		 * Invalidate the 2- or 4MB page mapping and return
2651		 * "failure" if the mapping was never accessed or the
2652		 * allocation of the new page table page fails.
2653		 */
2654		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2655		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2656		    VM_ALLOC_WIRED)) == NULL) {
2657			SLIST_INIT(&free);
2658			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2659			pmap_invalidate_page(pmap, trunc_4mpage(va));
2660			pmap_free_zero_pages(&free);
2661			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2662			    " in pmap %p", va, pmap);
2663			return (FALSE);
2664		}
2665		if (va < VM_MAXUSER_ADDRESS)
2666			pmap->pm_stats.resident_count++;
2667	}
2668	mptepa = VM_PAGE_TO_PHYS(mpte);
2669
2670	/*
2671	 * If the page mapping is in the kernel's address space, then the
2672	 * KPTmap can provide access to the page table page.  Otherwise,
2673	 * temporarily map the page table page (mpte) into the kernel's
2674	 * address space at either PADDR1 or PADDR2.
2675	 */
2676	if (va >= KERNBASE)
2677		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2678	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2679		if ((*PMAP1 & PG_FRAME) != mptepa) {
2680			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2681#ifdef SMP
2682			PMAP1cpu = PCPU_GET(cpuid);
2683#endif
2684			invlcaddr(PADDR1);
2685			PMAP1changed++;
2686		} else
2687#ifdef SMP
2688		if (PMAP1cpu != PCPU_GET(cpuid)) {
2689			PMAP1cpu = PCPU_GET(cpuid);
2690			invlcaddr(PADDR1);
2691			PMAP1changedcpu++;
2692		} else
2693#endif
2694			PMAP1unchanged++;
2695		firstpte = PADDR1;
2696	} else {
2697		mtx_lock(&PMAP2mutex);
2698		if ((*PMAP2 & PG_FRAME) != mptepa) {
2699			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2700			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2701		}
2702		firstpte = PADDR2;
2703	}
2704	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2705	KASSERT((oldpde & PG_A) != 0,
2706	    ("pmap_demote_pde: oldpde is missing PG_A"));
2707	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2708	    ("pmap_demote_pde: oldpde is missing PG_M"));
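	/*
	 * The PAT bit occupies a different position in a PDE (PG_PDE_PAT)
	 * than in a PTE (PG_PTE_PAT), so relocate it if it is set.
	 */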
2709	newpte = oldpde & ~PG_PS;
2710	if ((newpte & PG_PDE_PAT) != 0)
2711		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2712
2713	/*
2714	 * If the page table page is new, initialize it.
2715	 */
2716	if (mpte->wire_count == 1) {
2717		mpte->wire_count = NPTEPG;
2718		pmap_fill_ptp(firstpte, newpte);
2719	}
2720	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2721	    ("pmap_demote_pde: firstpte and newpte map different physical"
2722	    " addresses"));
2723
2724	/*
2725	 * If the mapping has changed attributes, update the page table
2726	 * entries.
2727	 */
2728	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2729		pmap_fill_ptp(firstpte, newpte);
2730
2731	/*
2732	 * Demote the mapping.  This pmap is locked.  The old PDE has
2733	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2734	 * set.  Thus, there is no danger of a race with another
2735	 * processor changing the setting of PG_A and/or PG_M between
2736	 * the read above and the store below.
2737	 */
2738	if (workaround_erratum383)
2739		pmap_update_pde(pmap, va, pde, newpde);
2740	else if (pmap == kernel_pmap)
2741		pmap_kenter_pde(va, newpde);
2742	else
2743		pde_store(pde, newpde);
2744	if (firstpte == PADDR2)
2745		mtx_unlock(&PMAP2mutex);
2746
2747	/*
2748	 * Invalidate the recursive mapping of the page table page.
2749	 */
2750	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2751
2752	/*
2753	 * Demote the pv entry.  This depends on the earlier demotion
2754	 * of the mapping.  Specifically, the (re)creation of a per-
2755	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2756	 * which might reclaim a newly (re)created per-page pv entry
2757	 * and destroy the associated mapping.  In order to destroy
2758	 * the mapping, the PDE must have already changed from mapping
2759	 * the 2- or 4MB page to referencing the page table page.
2760	 */
2761	if ((oldpde & PG_MANAGED) != 0)
2762		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2763
2764	pmap_pde_demotions++;
2765	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2766	    " in pmap %p", va, pmap);
2767	return (TRUE);
2768}
2769
2770/*
2771 * Removes a 2- or 4MB page mapping from the kernel pmap.
2772 */
2773static void
2774pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2775{
2776	pd_entry_t newpde;
2777	vm_paddr_t mptepa;
2778	vm_page_t mpte;
2779
2780	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2781	mpte = pmap_lookup_pt_page(pmap, va);
2782	if (mpte == NULL)
2783		panic("pmap_remove_kernel_pde: Missing pt page.");
2784
2785	pmap_remove_pt_page(pmap, mpte);
2786	mptepa = VM_PAGE_TO_PHYS(mpte);
2787	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2788
2789	/*
2790	 * Initialize the page table page.
2791	 */
2792	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2793
2794	/*
2795	 * Remove the mapping.
2796	 */
2797	if (workaround_erratum383)
2798		pmap_update_pde(pmap, va, pde, newpde);
2799	else
2800		pmap_kenter_pde(va, newpde);
2801
2802	/*
2803	 * Invalidate the recursive mapping of the page table page.
2804	 */
2805	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2806}
2807
2808/*
2809 * pmap_remove_pde: unmap a superpage within the given pmap.
2810 */
2811static void
2812pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2813    struct spglist *free)
2814{
2815	struct md_page *pvh;
2816	pd_entry_t oldpde;
2817	vm_offset_t eva, va;
2818	vm_page_t m, mpte;
2819
2820	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2821	KASSERT((sva & PDRMASK) == 0,
2822	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2823	oldpde = pte_load_clear(pdq);
2824	if (oldpde & PG_W)
2825		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2826
2827	/*
2828	 * Machines that don't support invlpg also don't support
2829	 * PG_G.
2830	 */
2831	if (oldpde & PG_G)
2832		pmap_invalidate_page(kernel_pmap, sva);
2833	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2834	if (oldpde & PG_MANAGED) {
2835		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2836		pmap_pvh_free(pvh, pmap, sva);
2837		eva = sva + NBPDR;
2838		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2839		    va < eva; va += PAGE_SIZE, m++) {
2840			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2841				vm_page_dirty(m);
2842			if (oldpde & PG_A)
2843				vm_page_aflag_set(m, PGA_REFERENCED);
2844			if (TAILQ_EMPTY(&m->md.pv_list) &&
2845			    TAILQ_EMPTY(&pvh->pv_list))
2846				vm_page_aflag_clear(m, PGA_WRITEABLE);
2847		}
2848	}
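	/*
	 * For the kernel pmap, the superpage is replaced by a mapping of
	 * a zero-filled page table page; for a user pmap, the page table
	 * page, if any, is scheduled to be freed.
	 */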
2849	if (pmap == kernel_pmap) {
2850		pmap_remove_kernel_pde(pmap, pdq, sva);
2851	} else {
2852		mpte = pmap_lookup_pt_page(pmap, sva);
2853		if (mpte != NULL) {
2854			pmap_remove_pt_page(pmap, mpte);
2855			pmap->pm_stats.resident_count--;
2856			KASSERT(mpte->wire_count == NPTEPG,
2857			    ("pmap_remove_pde: pte page wire count error"));
2858			mpte->wire_count = 0;
2859			pmap_add_delayed_free_list(mpte, free, FALSE);
2860			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2861		}
2862	}
2863}
2864
2865/*
2866 * pmap_remove_pte: unmap a single 4KB page within the given pmap.
2867 */
2868static int
2869pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2870    struct spglist *free)
2871{
2872	pt_entry_t oldpte;
2873	vm_page_t m;
2874
2875	rw_assert(&pvh_global_lock, RA_WLOCKED);
2876	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2877	oldpte = pte_load_clear(ptq);
2878	KASSERT(oldpte != 0,
2879	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2880	if (oldpte & PG_W)
2881		pmap->pm_stats.wired_count -= 1;
2882	/*
2883	 * Machines that don't support invlpg also don't support
2884	 * PG_G.
2885	 */
2886	if (oldpte & PG_G)
2887		pmap_invalidate_page(kernel_pmap, va);
2888	pmap->pm_stats.resident_count -= 1;
2889	if (oldpte & PG_MANAGED) {
2890		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2891		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2892			vm_page_dirty(m);
2893		if (oldpte & PG_A)
2894			vm_page_aflag_set(m, PGA_REFERENCED);
2895		pmap_remove_entry(pmap, m, va);
2896	}
2897	return (pmap_unuse_pt(pmap, va, free));
2898}
2899
2900/*
2901 * Remove a single page from a process address space
2902 */
2903static void
2904pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2905{
2906	pt_entry_t *pte;
2907
2908	rw_assert(&pvh_global_lock, RA_WLOCKED);
2909	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2910	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2911	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2912		return;
2913	pmap_remove_pte(pmap, pte, va, free);
2914	pmap_invalidate_page(pmap, va);
2915}
2916
2917/*
2918 *	Remove the given range of addresses from the specified map.
2919 *
2920 *	It is assumed that the start and end are properly
2921 *	rounded to the page size.
2922 */
2923void
2924pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2925{
2926	vm_offset_t pdnxt;
2927	pd_entry_t ptpaddr;
2928	pt_entry_t *pte;
2929	struct spglist free;
2930	int anyvalid;
2931
2932	/*
2933	 * Perform an unsynchronized read.  This is, however, safe.
2934	 */
2935	if (pmap->pm_stats.resident_count == 0)
2936		return;
2937
2938	anyvalid = 0;
2939	SLIST_INIT(&free);
2940
2941	rw_wlock(&pvh_global_lock);
2942	sched_pin();
2943	PMAP_LOCK(pmap);
2944
2945	/*
2946	 * Special handling for removing a single page: it is a very
2947	 * common operation, so it is worth short-circuiting the
2948	 * general code below.
2949	 */
2950	if ((sva + PAGE_SIZE == eva) &&
2951	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2952		pmap_remove_page(pmap, sva, &free);
2953		goto out;
2954	}
2955
2956	for (; sva < eva; sva = pdnxt) {
2957		u_int pdirindex;
2958
2959		/*
2960		 * Calculate index for next page table.
2961		 */
2962		pdnxt = (sva + NBPDR) & ~PDRMASK;
2963		if (pdnxt < sva)
2964			pdnxt = eva;
2965		if (pmap->pm_stats.resident_count == 0)
2966			break;
2967
2968		pdirindex = sva >> PDRSHIFT;
2969		ptpaddr = pmap->pm_pdir[pdirindex];
2970
2971		/*
2972		 * Weed out invalid mappings.  Note: we assume that the page
2973		 * directory table is always allocated and resides in kernel
		 * virtual address space.
2974		 */
2975		if (ptpaddr == 0)
2976			continue;
2977
2978		/*
2979		 * Check for large page.
2980		 */
2981		if ((ptpaddr & PG_PS) != 0) {
2982			/*
2983			 * Are we removing the entire large page?  If not,
2984			 * demote the mapping and fall through.
2985			 */
2986			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2987				/*
2988				 * The TLB entry for a PG_G mapping is
2989				 * invalidated by pmap_remove_pde().
2990				 */
2991				if ((ptpaddr & PG_G) == 0)
2992					anyvalid = 1;
2993				pmap_remove_pde(pmap,
2994				    &pmap->pm_pdir[pdirindex], sva, &free);
2995				continue;
2996			} else if (!pmap_demote_pde(pmap,
2997			    &pmap->pm_pdir[pdirindex], sva)) {
2998				/* The large page mapping was destroyed. */
2999				continue;
3000			}
3001		}
3002
3003		/*
3004		 * Limit our scan to either the end of the va represented
3005		 * by the current page table page, or to the end of the
3006		 * range being removed.
3007		 */
3008		if (pdnxt > eva)
3009			pdnxt = eva;
3010
3011		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3012		    sva += PAGE_SIZE) {
3013			if (*pte == 0)
3014				continue;
3015
3016			/*
3017			 * The TLB entry for a PG_G mapping is invalidated
3018			 * by pmap_remove_pte().
3019			 */
3020			if ((*pte & PG_G) == 0)
3021				anyvalid = 1;
3022			if (pmap_remove_pte(pmap, pte, sva, &free))
3023				break;
3024		}
3025	}
3026out:
3027	sched_unpin();
3028	if (anyvalid)
3029		pmap_invalidate_all(pmap);
3030	rw_wunlock(&pvh_global_lock);
3031	PMAP_UNLOCK(pmap);
3032	pmap_free_zero_pages(&free);
3033}
3034
3035/*
3036 *	Routine:	pmap_remove_all
3037 *	Function:
3038 *		Removes this physical page from
3039 *		all physical maps in which it resides.
3040 *		Reflects back modify bits to the pager.
3041 *
3042 *	Notes:
3043 *		Original versions of this routine were very
3044 *		inefficient because they iteratively called
3045 *		pmap_remove (slow...)
3046 */
3047
3048void
3049pmap_remove_all(vm_page_t m)
3050{
3051	struct md_page *pvh;
3052	pv_entry_t pv;
3053	pmap_t pmap;
3054	pt_entry_t *pte, tpte;
3055	pd_entry_t *pde;
3056	vm_offset_t va;
3057	struct spglist free;
3058
3059	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3060	    ("pmap_remove_all: page %p is not managed", m));
3061	SLIST_INIT(&free);
3062	rw_wlock(&pvh_global_lock);
3063	sched_pin();
3064	if ((m->flags & PG_FICTITIOUS) != 0)
3065		goto small_mappings;
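	/*
	 * First, demote any 2- or 4MB mappings of the page so that the
	 * loop below only has to handle 4KB mappings.
	 */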
3066	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3067	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3068		va = pv->pv_va;
3069		pmap = PV_PMAP(pv);
3070		PMAP_LOCK(pmap);
3071		pde = pmap_pde(pmap, va);
3072		(void)pmap_demote_pde(pmap, pde, va);
3073		PMAP_UNLOCK(pmap);
3074	}
3075small_mappings:
3076	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3077		pmap = PV_PMAP(pv);
3078		PMAP_LOCK(pmap);
3079		pmap->pm_stats.resident_count--;
3080		pde = pmap_pde(pmap, pv->pv_va);
3081		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3082		    " a 4mpage in page %p's pv list", m));
3083		pte = pmap_pte_quick(pmap, pv->pv_va);
3084		tpte = pte_load_clear(pte);
3085		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3086		    pmap, pv->pv_va));
3087		if (tpte & PG_W)
3088			pmap->pm_stats.wired_count--;
3089		if (tpte & PG_A)
3090			vm_page_aflag_set(m, PGA_REFERENCED);
3091
3092		/*
3093		 * Update the vm_page_t clean and reference bits.
3094		 */
3095		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3096			vm_page_dirty(m);
3097		pmap_unuse_pt(pmap, pv->pv_va, &free);
3098		pmap_invalidate_page(pmap, pv->pv_va);
3099		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3100		free_pv_entry(pmap, pv);
3101		PMAP_UNLOCK(pmap);
3102	}
3103	vm_page_aflag_clear(m, PGA_WRITEABLE);
3104	sched_unpin();
3105	rw_wunlock(&pvh_global_lock);
3106	pmap_free_zero_pages(&free);
3107}
3108
3109/*
3110 * pmap_protect_pde: set the protection on a 4mpage mapping within a pmap
3111 */
3112static boolean_t
3113pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3114{
3115	pd_entry_t newpde, oldpde;
3116	vm_offset_t eva, va;
3117	vm_page_t m;
3118	boolean_t anychanged;
3119
3120	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3121	KASSERT((sva & PDRMASK) == 0,
3122	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3123	anychanged = FALSE;
3124retry:
3125	oldpde = newpde = *pde;
3126	if (oldpde & PG_MANAGED) {
3127		eva = sva + NBPDR;
3128		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3129		    va < eva; va += PAGE_SIZE, m++)
3130			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3131				vm_page_dirty(m);
3132	}
3133	if ((prot & VM_PROT_WRITE) == 0)
3134		newpde &= ~(PG_RW | PG_M);
3135#if defined(PAE) || defined(PAE_TABLES)
3136	if ((prot & VM_PROT_EXECUTE) == 0)
3137		newpde |= pg_nx;
3138#endif
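	/*
	 * If the PDE changed, update it atomically; retry if another
	 * processor modified the PDE concurrently.
	 */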
3139	if (newpde != oldpde) {
3140		if (!pde_cmpset(pde, oldpde, newpde))
3141			goto retry;
3142		if (oldpde & PG_G)
3143			pmap_invalidate_page(pmap, sva);
3144		else
3145			anychanged = TRUE;
3146	}
3147	return (anychanged);
3148}
3149
3150/*
3151 *	Set the physical protection on the
3152 *	specified range of this map as requested.
3153 */
3154void
3155pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3156{
3157	vm_offset_t pdnxt;
3158	pd_entry_t ptpaddr;
3159	pt_entry_t *pte;
3160	boolean_t anychanged, pv_lists_locked;
3161
3162	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3163	if (prot == VM_PROT_NONE) {
3164		pmap_remove(pmap, sva, eva);
3165		return;
3166	}
3167
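	/*
	 * If write access (and, with PAE/NX, execute access) is not being
	 * revoked, then there is nothing to do.
	 */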
3168#if defined(PAE) || defined(PAE_TABLES)
3169	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3170	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3171		return;
3172#else
3173	if (prot & VM_PROT_WRITE)
3174		return;
3175#endif
3176
3177	if (pmap_is_current(pmap))
3178		pv_lists_locked = FALSE;
3179	else {
3180		pv_lists_locked = TRUE;
3181resume:
3182		rw_wlock(&pvh_global_lock);
3183		sched_pin();
3184	}
3185	anychanged = FALSE;
3186
3187	PMAP_LOCK(pmap);
3188	for (; sva < eva; sva = pdnxt) {
3189		pt_entry_t obits, pbits;
3190		u_int pdirindex;
3191
3192		pdnxt = (sva + NBPDR) & ~PDRMASK;
3193		if (pdnxt < sva)
3194			pdnxt = eva;
3195
3196		pdirindex = sva >> PDRSHIFT;
3197		ptpaddr = pmap->pm_pdir[pdirindex];
3198
3199		/*
3200		 * Weed out invalid mappings.  Note: we assume that the page
3201		 * directory table is always allocated and resides in kernel
		 * virtual address space.
3202		 */
3203		if (ptpaddr == 0)
3204			continue;
3205
3206		/*
3207		 * Check for large page.
3208		 */
3209		if ((ptpaddr & PG_PS) != 0) {
3210			/*
3211			 * Are we protecting the entire large page?  If not,
3212			 * demote the mapping and fall through.
3213			 */
3214			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3215				/*
3216				 * The TLB entry for a PG_G mapping is
3217				 * invalidated by pmap_protect_pde().
3218				 */
3219				if (pmap_protect_pde(pmap,
3220				    &pmap->pm_pdir[pdirindex], sva, prot))
3221					anychanged = TRUE;
3222				continue;
3223			} else {
3224				if (!pv_lists_locked) {
3225					pv_lists_locked = TRUE;
3226					if (!rw_try_wlock(&pvh_global_lock)) {
3227						if (anychanged)
3228							pmap_invalidate_all(
3229							    pmap);
3230						PMAP_UNLOCK(pmap);
3231						goto resume;
3232					}
3233					sched_pin();
3234				}
3235				if (!pmap_demote_pde(pmap,
3236				    &pmap->pm_pdir[pdirindex], sva)) {
3237					/*
3238					 * The large page mapping was
3239					 * destroyed.
3240					 */
3241					continue;
3242				}
3243			}
3244		}
3245
3246		if (pdnxt > eva)
3247			pdnxt = eva;
3248
3249		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3250		    sva += PAGE_SIZE) {
3251			vm_page_t m;
3252
3253retry:
3254			/*
3255			 * Regardless of whether a pte is 32 or 64 bits in
3256			 * size, PG_RW, PG_A, and PG_M are among the least
3257			 * significant 32 bits.
3258			 */
3259			obits = pbits = *pte;
3260			if ((pbits & PG_V) == 0)
3261				continue;
3262
3263			if ((prot & VM_PROT_WRITE) == 0) {
3264				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3265				    (PG_MANAGED | PG_M | PG_RW)) {
3266					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3267					vm_page_dirty(m);
3268				}
3269				pbits &= ~(PG_RW | PG_M);
3270			}
3271#if defined(PAE) || defined(PAE_TABLES)
3272			if ((prot & VM_PROT_EXECUTE) == 0)
3273				pbits |= pg_nx;
3274#endif
3275
3276			if (pbits != obits) {
3277#if defined(PAE) || defined(PAE_TABLES)
3278				if (!atomic_cmpset_64(pte, obits, pbits))
3279					goto retry;
3280#else
3281				if (!atomic_cmpset_int((u_int *)pte, obits,
3282				    pbits))
3283					goto retry;
3284#endif
3285				if (obits & PG_G)
3286					pmap_invalidate_page(pmap, sva);
3287				else
3288					anychanged = TRUE;
3289			}
3290		}
3291	}
3292	if (anychanged)
3293		pmap_invalidate_all(pmap);
3294	if (pv_lists_locked) {
3295		sched_unpin();
3296		rw_wunlock(&pvh_global_lock);
3297	}
3298	PMAP_UNLOCK(pmap);
3299}
3300
3301/*
3302 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3303 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3304 * For promotion to occur, two conditions must be met: (1) the 4KB page
3305 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3306 * mappings must have identical characteristics.
3307 *
3308 * Managed (PG_MANAGED) mappings within the kernel address space are not
3309 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3310 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3311 * pmap.
3312 */
3313static void
3314pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3315{
3316	pd_entry_t newpde;
3317	pt_entry_t *firstpte, oldpte, pa, *pte;
3318	vm_offset_t oldpteva;
3319	vm_page_t mpte;
3320
3321	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3322
3323	/*
3324	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3325	 * either invalid, unused, or does not map the first 4KB physical page
3326	 * within a 2- or 4MB page.
3327	 */
3328	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3329setpde:
3330	newpde = *firstpte;
3331	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3332		pmap_pde_p_failures++;
3333		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3334		    " in pmap %p", va, pmap);
3335		return;
3336	}
3337	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3338		pmap_pde_p_failures++;
3339		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3340		    " in pmap %p", va, pmap);
3341		return;
3342	}
3343	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3344		/*
3345		 * When PG_M is already clear, PG_RW can be cleared without
3346		 * a TLB invalidation.
3347		 */
3348		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3349		    ~PG_RW))
3350			goto setpde;
3351		newpde &= ~PG_RW;
3352	}
3353
3354	/*
3355	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3356	 * PTE maps an unexpected 4KB physical page or does not have identical
3357	 * characteristics to the first PTE.
3358	 */
3359	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3360	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3361setpte:
3362		oldpte = *pte;
3363		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3364			pmap_pde_p_failures++;
3365			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3366			    " in pmap %p", va, pmap);
3367			return;
3368		}
3369		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3370			/*
3371			 * When PG_M is already clear, PG_RW can be cleared
3372			 * without a TLB invalidation.
3373			 */
3374			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3375			    oldpte & ~PG_RW))
3376				goto setpte;
3377			oldpte &= ~PG_RW;
3378			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3379			    (va & ~PDRMASK);
3380			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3381			    " in pmap %p", oldpteva, pmap);
3382		}
3383		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3384			pmap_pde_p_failures++;
3385			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3386			    " in pmap %p", va, pmap);
3387			return;
3388		}
3389		pa -= PAGE_SIZE;
3390	}
3391
3392	/*
3393	 * Save the page table page in its current state until the PDE
3394	 * mapping the superpage is demoted by pmap_demote_pde() or
3395	 * destroyed by pmap_remove_pde().
3396	 */
3397	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3398	KASSERT(mpte >= vm_page_array &&
3399	    mpte < &vm_page_array[vm_page_array_size],
3400	    ("pmap_promote_pde: page table page is out of range"));
3401	KASSERT(mpte->pindex == va >> PDRSHIFT,
3402	    ("pmap_promote_pde: page table page's pindex is wrong"));
3403	if (pmap_insert_pt_page(pmap, mpte)) {
3404		pmap_pde_p_failures++;
3405		CTR2(KTR_PMAP,
3406		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
3407		    pmap);
3408		return;
3409	}
3410
3411	/*
3412	 * Promote the pv entries.
3413	 */
3414	if ((newpde & PG_MANAGED) != 0)
3415		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3416
3417	/*
3418	 * Propagate the PAT index to its proper position.
3419	 */
3420	if ((newpde & PG_PTE_PAT) != 0)
3421		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3422
3423	/*
3424	 * Map the superpage.
3425	 */
3426	if (workaround_erratum383)
3427		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3428	else if (pmap == kernel_pmap)
3429		pmap_kenter_pde(va, PG_PS | newpde);
3430	else
3431		pde_store(pde, PG_PS | newpde);
3432
3433	pmap_pde_promotions++;
3434	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3435	    " in pmap %p", va, pmap);
3436}
3437
3438/*
3439 *	Insert the given physical page (p) at
3440 *	the specified virtual address (v) in the
3441 *	target physical map with the protection requested.
3442 *
3443 *	If specified, the page will be wired down, meaning
3444 *	that the related pte can not be reclaimed.
3445 *
3446 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3447 *	or lose information.  That is, this routine must actually
3448 *	insert this page into the given map NOW.
3449 */
3450int
3451pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3452    u_int flags, int8_t psind)
3453{
3454	pd_entry_t *pde;
3455	pt_entry_t *pte;
3456	pt_entry_t newpte, origpte;
3457	pv_entry_t pv;
3458	vm_paddr_t opa, pa;
3459	vm_page_t mpte, om;
3460	boolean_t invlva, wired;
3461
3462	va = trunc_page(va);
3463	mpte = NULL;
3464	wired = (flags & PMAP_ENTER_WIRED) != 0;
3465
3466	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3467	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3468	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3469	    va));
3470	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3471		VM_OBJECT_ASSERT_LOCKED(m->object);
3472
3473	rw_wlock(&pvh_global_lock);
3474	PMAP_LOCK(pmap);
3475	sched_pin();
3476
3477	pde = pmap_pde(pmap, va);
3478	if (va < VM_MAXUSER_ADDRESS) {
3479		/*
3480		 * va is for UVA.
3481		 * In the case that a page table page is not resident,
3482		 * we are creating it here.  pmap_allocpte() handles
3483		 * demotion.
3484		 */
3485		mpte = pmap_allocpte(pmap, va, flags);
3486		if (mpte == NULL) {
3487			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3488			    ("pmap_allocpte failed with sleep allowed"));
3489			sched_unpin();
3490			rw_wunlock(&pvh_global_lock);
3491			PMAP_UNLOCK(pmap);
3492			return (KERN_RESOURCE_SHORTAGE);
3493		}
3494	} else {
3495		/*
3496		 * va is for KVA, so pmap_demote_pde() will never fail
3497		 * to install a page table page.  PG_V is also
3498		 * asserted by pmap_demote_pde().
3499		 */
3500		KASSERT(pde != NULL && (*pde & PG_V) != 0,
3501		    ("KVA %#x invalid pde pdir %#jx", va,
3502		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
3503		if ((*pde & PG_PS) != 0)
3504			pmap_demote_pde(pmap, pde, va);
3505	}
3506	pte = pmap_pte_quick(pmap, va);
3507
3508	/*
3509	 * Page Directory table entry is not valid, which should not
3510	 * happen.  We should have either allocated the page table
3511	 * page or demoted the existing mapping above.
3512	 */
3513	if (pte == NULL) {
3514		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3515		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3516	}
3517
3518	pa = VM_PAGE_TO_PHYS(m);
3519	om = NULL;
3520	origpte = *pte;
3521	opa = origpte & PG_FRAME;
3522
3523	/*
3524	 * The mapping has not changed; this must be a protection or
	 * wiring change.
3525	 */
3526	if (origpte && (opa == pa)) {
3527		/*
3528		 * Wiring change, just update stats. We don't worry about
3529		 * wiring PT pages as they remain resident as long as there
3530		 * are valid mappings in them. Hence, if a user page is wired,
3531		 * the PT page will be also.
3532		 */
3533		if (wired && ((origpte & PG_W) == 0))
3534			pmap->pm_stats.wired_count++;
3535		else if (!wired && (origpte & PG_W))
3536			pmap->pm_stats.wired_count--;
3537
3538		/*
3539		 * Remove extra pte reference
3540		 */
3541		if (mpte)
3542			mpte->wire_count--;
3543
3544		if (origpte & PG_MANAGED) {
3545			om = m;
3546			pa |= PG_MANAGED;
3547		}
3548		goto validate;
3549	}
3550
3551	pv = NULL;
3552
3553	/*
3554	 * The mapping has changed; invalidate the old range and fall
3555	 * through to validate the new mapping.
3556	 */
3557	if (opa) {
3558		if (origpte & PG_W)
3559			pmap->pm_stats.wired_count--;
3560		if (origpte & PG_MANAGED) {
3561			om = PHYS_TO_VM_PAGE(opa);
3562			pv = pmap_pvh_remove(&om->md, pmap, va);
3563		}
3564		if (mpte != NULL) {
3565			mpte->wire_count--;
3566			KASSERT(mpte->wire_count > 0,
3567			    ("pmap_enter: missing reference to page table page,"
3568			     " va: 0x%x", va));
3569		}
3570	} else
3571		pmap->pm_stats.resident_count++;
3572
3573	/*
3574	 * Enter on the PV list if part of our managed memory.
3575	 */
3576	if ((m->oflags & VPO_UNMANAGED) == 0) {
3577		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3578		    ("pmap_enter: managed mapping within the clean submap"));
3579		if (pv == NULL)
3580			pv = get_pv_entry(pmap, FALSE);
3581		pv->pv_va = va;
3582		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3583		pa |= PG_MANAGED;
3584	} else if (pv != NULL)
3585		free_pv_entry(pmap, pv);
3586
3587	/*
3588	 * Increment counters
3589	 */
3590	if (wired)
3591		pmap->pm_stats.wired_count++;
3592
3593validate:
3594	/*
3595	 * Now validate mapping with desired protection/wiring.
3596	 */
3597	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3598	if ((prot & VM_PROT_WRITE) != 0) {
3599		newpte |= PG_RW;
3600		if ((newpte & PG_MANAGED) != 0)
3601			vm_page_aflag_set(m, PGA_WRITEABLE);
3602	}
3603#if defined(PAE) || defined(PAE_TABLES)
3604	if ((prot & VM_PROT_EXECUTE) == 0)
3605		newpte |= pg_nx;
3606#endif
3607	if (wired)
3608		newpte |= PG_W;
3609	if (va < VM_MAXUSER_ADDRESS)
3610		newpte |= PG_U;
3611	if (pmap == kernel_pmap)
3612		newpte |= pgeflag;
3613
3614	/*
3615	 * if the mapping or permission bits are different, we need
3616	 * to update the pte.
3617	 */
3618	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3619		newpte |= PG_A;
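		/*
		 * The low bits of "flags" encode the access type that
		 * triggered this call; preset PG_M when that access is a
		 * write.
		 */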
3620		if ((flags & VM_PROT_WRITE) != 0)
3621			newpte |= PG_M;
3622		if (origpte & PG_V) {
3623			invlva = FALSE;
3624			origpte = pte_load_store(pte, newpte);
3625			if (origpte & PG_A) {
3626				if (origpte & PG_MANAGED)
3627					vm_page_aflag_set(om, PGA_REFERENCED);
3628				if (opa != VM_PAGE_TO_PHYS(m))
3629					invlva = TRUE;
3630#if defined(PAE) || defined(PAE_TABLES)
3631				if ((origpte & PG_NX) == 0 &&
3632				    (newpte & PG_NX) != 0)
3633					invlva = TRUE;
3634#endif
3635			}
3636			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3637				if ((origpte & PG_MANAGED) != 0)
3638					vm_page_dirty(om);
3639				if ((prot & VM_PROT_WRITE) == 0)
3640					invlva = TRUE;
3641			}
3642			if ((origpte & PG_MANAGED) != 0 &&
3643			    TAILQ_EMPTY(&om->md.pv_list) &&
3644			    ((om->flags & PG_FICTITIOUS) != 0 ||
3645			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3646				vm_page_aflag_clear(om, PGA_WRITEABLE);
3647			if (invlva)
3648				pmap_invalidate_page(pmap, va);
3649		} else
3650			pte_store(pte, newpte);
3651	}
3652
3653	/*
3654	 * If both the page table page and the reservation are fully
3655	 * populated, then attempt promotion.
3656	 */
3657	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3658	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3659	    vm_reserv_level_iffullpop(m) == 0)
3660		pmap_promote_pde(pmap, pde, va);
3661
3662	sched_unpin();
3663	rw_wunlock(&pvh_global_lock);
3664	PMAP_UNLOCK(pmap);
3665	return (KERN_SUCCESS);
3666}
3667
3668/*
3669 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3670 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3671 * blocking, (2) a mapping already exists at the specified virtual address, or
3672 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3673 */
3674static boolean_t
3675pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3676{
3677	pd_entry_t *pde, newpde;
3678
3679	rw_assert(&pvh_global_lock, RA_WLOCKED);
3680	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3681	pde = pmap_pde(pmap, va);
3682	if (*pde != 0) {
3683		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3684		    " in pmap %p", va, pmap);
3685		return (FALSE);
3686	}
3687	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3688	    PG_PS | PG_V;
3689	if ((m->oflags & VPO_UNMANAGED) == 0) {
3690		newpde |= PG_MANAGED;
3691
3692		/*
3693		 * Abort this mapping if its PV entry could not be created.
3694		 */
3695		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3696			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3697			    " in pmap %p", va, pmap);
3698			return (FALSE);
3699		}
3700	}
3701#if defined(PAE) || defined(PAE_TABLES)
3702	if ((prot & VM_PROT_EXECUTE) == 0)
3703		newpde |= pg_nx;
3704#endif
3705	if (va < VM_MAXUSER_ADDRESS)
3706		newpde |= PG_U;
3707
3708	/*
3709	 * Increment counters.
3710	 */
3711	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3712
3713	/*
3714	 * Map the superpage.
3715	 */
3716	pde_store(pde, newpde);
3717
3718	pmap_pde_mappings++;
3719	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3720	    " in pmap %p", va, pmap);
3721	return (TRUE);
3722}
3723
3724/*
3725 * Maps a sequence of resident pages belonging to the same object.
3726 * The sequence begins with the given page m_start.  This page is
3727 * mapped at the given virtual address start.  Each subsequent page is
3728 * mapped at a virtual address that is offset from start by the same
3729 * amount as the page is offset from m_start within the object.  The
3730 * last page in the sequence is the page with the largest offset from
3731 * m_start that can be mapped at a virtual address less than the given
3732 * virtual address end.  Not every virtual page between start and end
3733 * is mapped; only those for which a resident page exists with the
3734 * corresponding offset from m_start are mapped.
3735 */
3736void
3737pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3738    vm_page_t m_start, vm_prot_t prot)
3739{
3740	vm_offset_t va;
3741	vm_page_t m, mpte;
3742	vm_pindex_t diff, psize;
3743
3744	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3745
3746	psize = atop(end - start);
3747	mpte = NULL;
3748	m = m_start;
3749	rw_wlock(&pvh_global_lock);
3750	PMAP_LOCK(pmap);
3751	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3752		va = start + ptoa(diff);
3753		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3754		    m->psind == 1 && pg_ps_enabled &&
3755		    pmap_enter_pde(pmap, va, m, prot))
3756			m = &m[NBPDR / PAGE_SIZE - 1];
3757		else
3758			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3759			    mpte);
3760		m = TAILQ_NEXT(m, listq);
3761	}
3762	rw_wunlock(&pvh_global_lock);
3763	PMAP_UNLOCK(pmap);
3764}
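
/*
 * Example (an illustrative sketch, assuming 4MB superpages and
 * pg_ps_enabled): a fully resident 16MB object mapped at a 4MB-aligned
 * "start" whose pages belong to fully populated reservations
 * (psind == 1) is entered with just four page directory entries via
 * pmap_enter_pde().  Pages that are missing, misaligned, or not part
 * of a full reservation fall back to ordinary 4KB mappings created by
 * pmap_enter_quick_locked().
 */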
3765
3766/*
3767 * This code makes some *MAJOR* assumptions:
3768 * 1. The current pmap and the given pmap exist.
3769 * 2. The mapping is not wired.
3770 * 3. Read access only.
3771 * 4. No page table pages are needed.
3772 * It is *MUCH* faster than pmap_enter...
3773 */
3774
3775void
3776pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3777{
3778
3779	rw_wlock(&pvh_global_lock);
3780	PMAP_LOCK(pmap);
3781	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3782	rw_wunlock(&pvh_global_lock);
3783	PMAP_UNLOCK(pmap);
3784}
3785
3786static vm_page_t
3787pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3788    vm_prot_t prot, vm_page_t mpte)
3789{
3790	pt_entry_t *pte;
3791	vm_paddr_t pa;
3792	struct spglist free;
3793
3794	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3795	    (m->oflags & VPO_UNMANAGED) != 0,
3796	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3797	rw_assert(&pvh_global_lock, RA_WLOCKED);
3798	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3799
3800	/*
3801	 * In the case that a page table page is not
3802	 * resident, we are creating it here.
3803	 */
3804	if (va < VM_MAXUSER_ADDRESS) {
3805		u_int ptepindex;
3806		pd_entry_t ptepa;
3807
3808		/*
3809		 * Calculate pagetable page index
3810		 */
3811		ptepindex = va >> PDRSHIFT;
3812		if (mpte && (mpte->pindex == ptepindex)) {
3813			mpte->wire_count++;
3814		} else {
3815			/*
3816			 * Get the page directory entry
3817			 */
3818			ptepa = pmap->pm_pdir[ptepindex];
3819
3820			/*
3821			 * If the page table page is mapped, we just increment
3822			 * the hold count, and activate it.
3823			 */
3824			if (ptepa) {
3825				if (ptepa & PG_PS)
3826					return (NULL);
3827				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3828				mpte->wire_count++;
3829			} else {
3830				mpte = _pmap_allocpte(pmap, ptepindex,
3831				    PMAP_ENTER_NOSLEEP);
3832				if (mpte == NULL)
3833					return (mpte);
3834			}
3835		}
3836	} else {
3837		mpte = NULL;
3838	}
3839
3840	/*
3841	 * This call to vtopte makes the assumption that we are
3842	 * entering the page into the current pmap.  In order to support
3843	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3844	 * But that isn't as quick as vtopte.
3845	 */
3846	pte = vtopte(va);
3847	if (*pte) {
3848		if (mpte != NULL) {
3849			mpte->wire_count--;
3850			mpte = NULL;
3851		}
3852		return (mpte);
3853	}
3854
3855	/*
3856	 * Enter on the PV list if part of our managed memory.
3857	 */
3858	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3859	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3860		if (mpte != NULL) {
3861			SLIST_INIT(&free);
3862			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3863				pmap_invalidate_page(pmap, va);
3864				pmap_free_zero_pages(&free);
3865			}
3866
3867			mpte = NULL;
3868		}
3869		return (mpte);
3870	}
3871
3872	/*
3873	 * Increment counters
3874	 */
3875	pmap->pm_stats.resident_count++;
3876
3877	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3878#if defined(PAE) || defined(PAE_TABLES)
3879	if ((prot & VM_PROT_EXECUTE) == 0)
3880		pa |= pg_nx;
3881#endif
3882
3883	/*
3884	 * Now validate mapping with RO protection
3885	 */
3886	if ((m->oflags & VPO_UNMANAGED) != 0)
3887		pte_store(pte, pa | PG_V | PG_U);
3888	else
3889		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3890	return (mpte);
3891}
3892
3893/*
3894 * Make a temporary mapping for a physical address.  This is only intended
3895 * to be used for panic dumps.
3896 */
3897void *
3898pmap_kenter_temporary(vm_paddr_t pa, int i)
3899{
3900	vm_offset_t va;
3901
3902	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3903	pmap_kenter(va, pa);
3904	invlpg(va);
3905	return ((void *)crashdumpmap);
3906}
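
/*
 * Usage sketch (illustrative only, not part of the kernel's dump
 * path): a dump routine can map a short run of physical pages back to
 * back in the crashdump window and copy from the returned base:
 *
 *	va = pmap_kenter_temporary(pa, 0);
 *	(void)pmap_kenter_temporary(pa + PAGE_SIZE, 1);
 *	bcopy(va, buf, 2 * PAGE_SIZE);
 *
 * "buf" is assumed to be a caller-supplied buffer; each call maps one
 * page at crashdumpmap + i * PAGE_SIZE and returns the window base.
 */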
3907
3908/*
3909 * This code maps large physical mmap regions into the
3910 * processor address space.  Note that some shortcuts
3911 * are taken, but the code works.
3912 */
3913void
3914pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3915    vm_pindex_t pindex, vm_size_t size)
3916{
3917	pd_entry_t *pde;
3918	vm_paddr_t pa, ptepa;
3919	vm_page_t p;
3920	int pat_mode;
3921
3922	VM_OBJECT_ASSERT_WLOCKED(object);
3923	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3924	    ("pmap_object_init_pt: non-device object"));
3925	if (pseflag &&
3926	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3927		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3928			return;
3929		p = vm_page_lookup(object, pindex);
3930		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3931		    ("pmap_object_init_pt: invalid page %p", p));
3932		pat_mode = p->md.pat_mode;
3933
3934		/*
3935		 * Abort the mapping if the first page is not physically
3936		 * aligned to a 2/4MB page boundary.
3937		 */
3938		ptepa = VM_PAGE_TO_PHYS(p);
3939		if (ptepa & (NBPDR - 1))
3940			return;
3941
3942		/*
3943		 * Skip the first page.  Abort the mapping if the rest of
3944		 * the pages are not physically contiguous or have differing
3945		 * memory attributes.
3946		 */
3947		p = TAILQ_NEXT(p, listq);
3948		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3949		    pa += PAGE_SIZE) {
3950			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3951			    ("pmap_object_init_pt: invalid page %p", p));
3952			if (pa != VM_PAGE_TO_PHYS(p) ||
3953			    pat_mode != p->md.pat_mode)
3954				return;
3955			p = TAILQ_NEXT(p, listq);
3956		}
3957
3958		/*
3959		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3960		 * "size" is a multiple of 2/4M, adding the PAT setting to
3961		 * "pa" will not affect the termination of this loop.
3962		 */
3963		PMAP_LOCK(pmap);
3964		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3965		    size; pa += NBPDR) {
3966			pde = pmap_pde(pmap, addr);
3967			if (*pde == 0) {
3968				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3969				    PG_U | PG_RW | PG_V);
3970				pmap->pm_stats.resident_count += NBPDR /
3971				    PAGE_SIZE;
3972				pmap_pde_mappings++;
3973			}
3974			/* Else continue on if the PDE is already valid. */
3975			addr += NBPDR;
3976		}
3977		PMAP_UNLOCK(pmap);
3978	}
3979}
3980
3981/*
3982 *	Clear the wired attribute from the mappings for the specified range of
3983 *	addresses in the given pmap.  Every valid mapping within that range
3984 *	must have the wired attribute set.  In contrast, invalid mappings
3985 *	cannot have the wired attribute set, so they are ignored.
3986 *
3987 *	The wired attribute of the page table entry is not a hardware feature,
3988 *	so there is no need to invalidate any TLB entries.
3989 */
3990void
3991pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3992{
3993	vm_offset_t pdnxt;
3994	pd_entry_t *pde;
3995	pt_entry_t *pte;
3996	boolean_t pv_lists_locked;
3997
3998	if (pmap_is_current(pmap))
3999		pv_lists_locked = FALSE;
4000	else {
4001		pv_lists_locked = TRUE;
4002resume:
4003		rw_wlock(&pvh_global_lock);
4004		sched_pin();
4005	}
4006	PMAP_LOCK(pmap);
4007	for (; sva < eva; sva = pdnxt) {
4008		pdnxt = (sva + NBPDR) & ~PDRMASK;
4009		if (pdnxt < sva)
4010			pdnxt = eva;
4011		pde = pmap_pde(pmap, sva);
4012		if ((*pde & PG_V) == 0)
4013			continue;
4014		if ((*pde & PG_PS) != 0) {
4015			if ((*pde & PG_W) == 0)
4016				panic("pmap_unwire: pde %#jx is missing PG_W",
4017				    (uintmax_t)*pde);
4018
4019			/*
4020			 * Are we unwiring the entire large page?  If not,
4021			 * demote the mapping and fall through.
4022			 */
4023			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4024				/*
4025				 * Regardless of whether a pde (or pte) is 32
4026				 * or 64 bits in size, PG_W is among the least
4027				 * significant 32 bits.
4028				 */
4029				atomic_clear_int((u_int *)pde, PG_W);
4030				pmap->pm_stats.wired_count -= NBPDR /
4031				    PAGE_SIZE;
4032				continue;
4033			} else {
4034				if (!pv_lists_locked) {
4035					pv_lists_locked = TRUE;
4036					if (!rw_try_wlock(&pvh_global_lock)) {
4037						PMAP_UNLOCK(pmap);
4038						/* Repeat sva. */
4039						goto resume;
4040					}
4041					sched_pin();
4042				}
4043				if (!pmap_demote_pde(pmap, pde, sva))
4044					panic("pmap_unwire: demotion failed");
4045			}
4046		}
4047		if (pdnxt > eva)
4048			pdnxt = eva;
4049		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4050		    sva += PAGE_SIZE) {
4051			if ((*pte & PG_V) == 0)
4052				continue;
4053			if ((*pte & PG_W) == 0)
4054				panic("pmap_unwire: pte %#jx is missing PG_W",
4055				    (uintmax_t)*pte);
4056
4057			/*
4058			 * PG_W must be cleared atomically.  Although the pmap
4059			 * lock synchronizes access to PG_W, another processor
4060			 * could be setting PG_M and/or PG_A concurrently.
4061			 *
4062			 * PG_W is among the least significant 32 bits.
4063			 */
4064			atomic_clear_int((u_int *)pte, PG_W);
4065			pmap->pm_stats.wired_count--;
4066		}
4067	}
4068	if (pv_lists_locked) {
4069		sched_unpin();
4070		rw_wunlock(&pvh_global_lock);
4071	}
4072	PMAP_UNLOCK(pmap);
4073}
4074
4075
4076/*
4077 *	Copy the range specified by src_addr/len
4078 *	from the source map to the range dst_addr/len
4079 *	in the destination map.
4080 *
4081 *	This routine is only advisory and need not do anything.
4082 */
4083
4084void
4085pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4086    vm_offset_t src_addr)
4087{
4088	struct spglist free;
4089	vm_offset_t addr;
4090	vm_offset_t end_addr = src_addr + len;
4091	vm_offset_t pdnxt;
4092
4093	if (dst_addr != src_addr)
4094		return;
4095
4096	if (!pmap_is_current(src_pmap))
4097		return;
4098
4099	rw_wlock(&pvh_global_lock);
4100	if (dst_pmap < src_pmap) {
4101		PMAP_LOCK(dst_pmap);
4102		PMAP_LOCK(src_pmap);
4103	} else {
4104		PMAP_LOCK(src_pmap);
4105		PMAP_LOCK(dst_pmap);
4106	}
4107	sched_pin();
4108	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4109		pt_entry_t *src_pte, *dst_pte;
4110		vm_page_t dstmpte, srcmpte;
4111		pd_entry_t srcptepaddr;
4112		u_int ptepindex;
4113
4114		KASSERT(addr < UPT_MIN_ADDRESS,
4115		    ("pmap_copy: invalid to pmap_copy page tables"));
4116
4117		pdnxt = (addr + NBPDR) & ~PDRMASK;
4118		if (pdnxt < addr)
4119			pdnxt = end_addr;
4120		ptepindex = addr >> PDRSHIFT;
4121
4122		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4123		if (srcptepaddr == 0)
4124			continue;
4125
4126		if (srcptepaddr & PG_PS) {
4127			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4128				continue;
4129			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4130			    ((srcptepaddr & PG_MANAGED) == 0 ||
4131			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4132			    PG_PS_FRAME))) {
4133				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4134				    ~PG_W;
4135				dst_pmap->pm_stats.resident_count +=
4136				    NBPDR / PAGE_SIZE;
4137				pmap_pde_mappings++;
4138			}
4139			continue;
4140		}
4141
4142		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4143		KASSERT(srcmpte->wire_count > 0,
4144		    ("pmap_copy: source page table page is unused"));
4145
4146		if (pdnxt > end_addr)
4147			pdnxt = end_addr;
4148
4149		src_pte = vtopte(addr);
4150		while (addr < pdnxt) {
4151			pt_entry_t ptetemp;
4152			ptetemp = *src_pte;
4153			/*
4154			 * We only copy mappings of managed pages.
4155			 */
4156			if ((ptetemp & PG_MANAGED) != 0) {
4157				dstmpte = pmap_allocpte(dst_pmap, addr,
4158				    PMAP_ENTER_NOSLEEP);
4159				if (dstmpte == NULL)
4160					goto out;
4161				dst_pte = pmap_pte_quick(dst_pmap, addr);
4162				if (*dst_pte == 0 &&
4163				    pmap_try_insert_pv_entry(dst_pmap, addr,
4164				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4165					/*
4166					 * Clear the wired, modified, and
4167					 * accessed (referenced) bits
4168					 * during the copy.
4169					 */
4170					*dst_pte = ptetemp & ~(PG_W | PG_M |
4171					    PG_A);
4172					dst_pmap->pm_stats.resident_count++;
4173	 			} else {
4174					SLIST_INIT(&free);
4175					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4176					    &free)) {
4177						pmap_invalidate_page(dst_pmap,
4178						    addr);
4179						pmap_free_zero_pages(&free);
4180					}
4181					goto out;
4182				}
4183				if (dstmpte->wire_count >= srcmpte->wire_count)
4184					break;
4185			}
4186			addr += PAGE_SIZE;
4187			src_pte++;
4188		}
4189	}
4190out:
4191	sched_unpin();
4192	rw_wunlock(&pvh_global_lock);
4193	PMAP_UNLOCK(src_pmap);
4194	PMAP_UNLOCK(dst_pmap);
4195}
4196
4197static __inline void
4198pagezero(void *page)
4199{
4200#if defined(I686_CPU)
4201	if (cpu_class == CPUCLASS_686) {
4202#if defined(CPU_ENABLE_SSE)
4203		if (cpu_feature & CPUID_SSE2)
4204			sse2_pagezero(page);
4205		else
4206#endif
4207			i686_pagezero(page);
4208	} else
4209#endif
4210		bzero(page, PAGE_SIZE);
4211}
4212
4213/*
4214 *	pmap_zero_page zeros the specified hardware page by mapping
4215 *	the page into KVM and using bzero to clear its contents.
4216 */
4217void
4218pmap_zero_page(vm_page_t m)
4219{
4220	pt_entry_t *cmap_pte2;
4221	struct pcpu *pc;
4222
4223	sched_pin();
4224	pc = pcpu_find(curcpu);
4225	cmap_pte2 = pc->pc_cmap_pte2;
4226	mtx_lock(&pc->pc_cmap_lock);
4227	if (*cmap_pte2)
4228		panic("pmap_zero_page: CMAP2 busy");
4229	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4230	    pmap_cache_bits(m->md.pat_mode, 0);
4231	invlcaddr(pc->pc_cmap_addr2);
4232	pagezero(pc->pc_cmap_addr2);
4233	*cmap_pte2 = 0;
4234
4235	/*
4236	 * Unpin the thread before releasing the lock.  Otherwise the thread
4237	 * could be rescheduled while still bound to the current CPU, only
4238	 * to unpin itself immediately upon resuming execution.
4239	 */
4240	sched_unpin();
4241	mtx_unlock(&pc->pc_cmap_lock);
4242}
4243
4244/*
4245 *	pmap_zero_page_area zeros the specified hardware page by mapping
4246 *	the page into KVM and using bzero to clear its contents.
4247 *
4248 *	off and size may not cover an area beyond a single hardware page.
4249 */
4250void
4251pmap_zero_page_area(vm_page_t m, int off, int size)
4252{
4253	pt_entry_t *cmap_pte2;
4254	struct pcpu *pc;
4255
4256	sched_pin();
4257	pc = pcpu_find(curcpu);
4258	cmap_pte2 = pc->pc_cmap_pte2;
4259	mtx_lock(&pc->pc_cmap_lock);
4260	if (*cmap_pte2)
4261		panic("pmap_zero_page_area: CMAP2 busy");
4262	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4263	    pmap_cache_bits(m->md.pat_mode, 0);
4264	invlcaddr(pc->pc_cmap_addr2);
4265	if (off == 0 && size == PAGE_SIZE)
4266		pagezero(pc->pc_cmap_addr2);
4267	else
4268		bzero(pc->pc_cmap_addr2 + off, size);
4269	*cmap_pte2 = 0;
4270	sched_unpin();
4271	mtx_unlock(&pc->pc_cmap_lock);
4272}
4273
4274/*
4275 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4276 *	the page into KVM and using bzero to clear its contents.  This
4277 *	is intended to be called from the vm_pagezero process only and
4278 *	outside of Giant.
4279 */
4280void
4281pmap_zero_page_idle(vm_page_t m)
4282{
4283
4284	if (*CMAP3)
4285		panic("pmap_zero_page_idle: CMAP3 busy");
4286	sched_pin();
4287	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4288	    pmap_cache_bits(m->md.pat_mode, 0);
4289	invlcaddr(CADDR3);
4290	pagezero(CADDR3);
4291	*CMAP3 = 0;
4292	sched_unpin();
4293}
4294
4295/*
4296 *	pmap_copy_page copies the specified (machine independent)
4297 *	page by mapping the page into virtual memory and using
4298 *	bcopy to copy the page, one machine dependent page at a
4299 *	time.
4300 */
4301void
4302pmap_copy_page(vm_page_t src, vm_page_t dst)
4303{
4304	pt_entry_t *cmap_pte1, *cmap_pte2;
4305	struct pcpu *pc;
4306
4307	sched_pin();
4308	pc = pcpu_find(curcpu);
4309	cmap_pte1 = pc->pc_cmap_pte1;
4310	cmap_pte2 = pc->pc_cmap_pte2;
4311	mtx_lock(&pc->pc_cmap_lock);
4312	if (*cmap_pte1)
4313		panic("pmap_copy_page: CMAP1 busy");
4314	if (*cmap_pte2)
4315		panic("pmap_copy_page: CMAP2 busy");
4316	*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4317	    pmap_cache_bits(src->md.pat_mode, 0);
4318	invlcaddr(pc->pc_cmap_addr1);
4319	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4320	    pmap_cache_bits(dst->md.pat_mode, 0);
4321	invlcaddr(pc->pc_cmap_addr2);
4322	bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
4323	*cmap_pte1 = 0;
4324	*cmap_pte2 = 0;
4325	sched_unpin();
4326	mtx_unlock(&pc->pc_cmap_lock);
4327}
4328
4329int unmapped_buf_allowed = 1;
4330
4331void
4332pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4333    vm_offset_t b_offset, int xfersize)
4334{
4335	vm_page_t a_pg, b_pg;
4336	char *a_cp, *b_cp;
4337	vm_offset_t a_pg_offset, b_pg_offset;
4338	pt_entry_t *cmap_pte1, *cmap_pte2;
4339	struct pcpu *pc;
4340	int cnt;
4341
4342	sched_pin();
4343	pc = pcpu_find(curcpu);
4344	cmap_pte1 = pc->pc_cmap_pte1;
4345	cmap_pte2 = pc->pc_cmap_pte2;
4346	mtx_lock(&pc->pc_cmap_lock);
4347	if (*cmap_pte1 != 0)
4348		panic("pmap_copy_pages: CMAP1 busy");
4349	if (*cmap_pte2 != 0)
4350		panic("pmap_copy_pages: CMAP2 busy");
4351	while (xfersize > 0) {
4352		a_pg = ma[a_offset >> PAGE_SHIFT];
4353		a_pg_offset = a_offset & PAGE_MASK;
4354		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4355		b_pg = mb[b_offset >> PAGE_SHIFT];
4356		b_pg_offset = b_offset & PAGE_MASK;
4357		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4358		*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4359		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4360		invlcaddr(pc->pc_cmap_addr1);
4361		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4362		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4363		invlcaddr(pc->pc_cmap_addr2);
4364		a_cp = pc->pc_cmap_addr1 + a_pg_offset;
4365		b_cp = pc->pc_cmap_addr2 + b_pg_offset;
4366		bcopy(a_cp, b_cp, cnt);
4367		a_offset += cnt;
4368		b_offset += cnt;
4369		xfersize -= cnt;
4370	}
4371	*cmap_pte1 = 0;
4372	*cmap_pte2 = 0;
4373	sched_unpin();
4374	mtx_unlock(&pc->pc_cmap_lock);
4375}
4376
4377/*
4378 * Returns true if the pmap's pv is one of the first
4379 * 16 pvs linked to from this page.  This count may
4380 * be changed upwards or downwards in the future; it
4381 * is only necessary that true be returned for a small
4382 * subset of pmaps for proper page aging.
4383 */
4384boolean_t
4385pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4386{
4387	struct md_page *pvh;
4388	pv_entry_t pv;
4389	int loops = 0;
4390	boolean_t rv;
4391
4392	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4393	    ("pmap_page_exists_quick: page %p is not managed", m));
4394	rv = FALSE;
4395	rw_wlock(&pvh_global_lock);
4396	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4397		if (PV_PMAP(pv) == pmap) {
4398			rv = TRUE;
4399			break;
4400		}
4401		loops++;
4402		if (loops >= 16)
4403			break;
4404	}
4405	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4406		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4407		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4408			if (PV_PMAP(pv) == pmap) {
4409				rv = TRUE;
4410				break;
4411			}
4412			loops++;
4413			if (loops >= 16)
4414				break;
4415		}
4416	}
4417	rw_wunlock(&pvh_global_lock);
4418	return (rv);
4419}
4420
4421/*
4422 *	pmap_page_wired_mappings:
4423 *
4424 *	Return the number of managed mappings to the given physical page
4425 *	that are wired.
4426 */
4427int
4428pmap_page_wired_mappings(vm_page_t m)
4429{
4430	int count;
4431
4432	count = 0;
4433	if ((m->oflags & VPO_UNMANAGED) != 0)
4434		return (count);
4435	rw_wlock(&pvh_global_lock);
4436	count = pmap_pvh_wired_mappings(&m->md, count);
4437	if ((m->flags & PG_FICTITIOUS) == 0) {
4438		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4439		    count);
4440	}
4441	rw_wunlock(&pvh_global_lock);
4442	return (count);
4443}
4444
4445/*
4446 *	pmap_pvh_wired_mappings:
4447 *
4448 *	Return the updated number "count" of managed mappings that are wired.
4449 */
4450static int
4451pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4452{
4453	pmap_t pmap;
4454	pt_entry_t *pte;
4455	pv_entry_t pv;
4456
4457	rw_assert(&pvh_global_lock, RA_WLOCKED);
4458	sched_pin();
4459	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4460		pmap = PV_PMAP(pv);
4461		PMAP_LOCK(pmap);
4462		pte = pmap_pte_quick(pmap, pv->pv_va);
4463		if ((*pte & PG_W) != 0)
4464			count++;
4465		PMAP_UNLOCK(pmap);
4466	}
4467	sched_unpin();
4468	return (count);
4469}
4470
4471/*
4472 * Returns TRUE if the given page is mapped individually or as part of
4473 * a 4mpage.  Otherwise, returns FALSE.
4474 */
4475boolean_t
4476pmap_page_is_mapped(vm_page_t m)
4477{
4478	boolean_t rv;
4479
4480	if ((m->oflags & VPO_UNMANAGED) != 0)
4481		return (FALSE);
4482	rw_wlock(&pvh_global_lock);
4483	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4484	    ((m->flags & PG_FICTITIOUS) == 0 &&
4485	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4486	rw_wunlock(&pvh_global_lock);
4487	return (rv);
4488}
4489
4490/*
4491 * Remove all pages from the specified address space;
4492 * this aids process exit speeds.  Also, this code is
4493 * special cased for the current process only, but it
4494 * can have the more generic (and slightly slower)
4495 * mode enabled.  This is much faster than pmap_remove
4496 * in the case of running down an entire address space.
4497 */
4498void
4499pmap_remove_pages(pmap_t pmap)
4500{
4501	pt_entry_t *pte, tpte;
4502	vm_page_t m, mpte, mt;
4503	pv_entry_t pv;
4504	struct md_page *pvh;
4505	struct pv_chunk *pc, *npc;
4506	struct spglist free;
4507	int field, idx;
4508	int32_t bit;
4509	uint32_t inuse, bitmask;
4510	int allfree;
4511
4512	if (pmap != PCPU_GET(curpmap)) {
4513		printf("warning: pmap_remove_pages called with non-current pmap\n");
4514		return;
4515	}
4516	SLIST_INIT(&free);
4517	rw_wlock(&pvh_global_lock);
4518	PMAP_LOCK(pmap);
4519	sched_pin();
4520	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4521		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4522		    pc->pc_pmap));
4523		allfree = 1;
4524		for (field = 0; field < _NPCM; field++) {
4525			inuse = ~pc->pc_map[field] & pc_freemask[field];
4526			while (inuse != 0) {
4527				bit = bsfl(inuse);
4528				bitmask = 1UL << bit;
4529				idx = field * 32 + bit;
4530				pv = &pc->pc_pventry[idx];
4531				inuse &= ~bitmask;
4532
4533				pte = pmap_pde(pmap, pv->pv_va);
4534				tpte = *pte;
4535				if ((tpte & PG_PS) == 0) {
4536					pte = vtopte(pv->pv_va);
4537					tpte = *pte & ~PG_PTE_PAT;
4538				}
4539
4540				if (tpte == 0) {
4541					printf(
4542					    "TPTE at %p  IS ZERO @ VA %08x\n",
4543					    pte, pv->pv_va);
4544					panic("bad pte");
4545				}
4546
4547/*
4548 * We cannot remove wired pages from a process' mapping at this time
4549 */
4550				if (tpte & PG_W) {
4551					allfree = 0;
4552					continue;
4553				}
4554
4555				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4556				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4557				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4558				    m, (uintmax_t)m->phys_addr,
4559				    (uintmax_t)tpte));
4560
4561				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4562				    m < &vm_page_array[vm_page_array_size],
4563				    ("pmap_remove_pages: bad tpte %#jx",
4564				    (uintmax_t)tpte));
4565
4566				pte_clear(pte);
4567
4568				/*
4569				 * Update the vm_page_t clean/reference bits.
4570				 */
4571				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4572					if ((tpte & PG_PS) != 0) {
4573						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4574							vm_page_dirty(mt);
4575					} else
4576						vm_page_dirty(m);
4577				}
4578
4579				/* Mark free */
4580				PV_STAT(pv_entry_frees++);
4581				PV_STAT(pv_entry_spare++);
4582				pv_entry_count--;
4583				pc->pc_map[field] |= bitmask;
4584				if ((tpte & PG_PS) != 0) {
4585					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4586					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4587					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4588					if (TAILQ_EMPTY(&pvh->pv_list)) {
4589						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4590							if (TAILQ_EMPTY(&mt->md.pv_list))
4591								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4592					}
4593					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4594					if (mpte != NULL) {
4595						pmap_remove_pt_page(pmap, mpte);
4596						pmap->pm_stats.resident_count--;
4597						KASSERT(mpte->wire_count == NPTEPG,
4598						    ("pmap_remove_pages: pte page wire count error"));
4599						mpte->wire_count = 0;
4600						pmap_add_delayed_free_list(mpte, &free, FALSE);
4601						atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4602					}
4603				} else {
4604					pmap->pm_stats.resident_count--;
4605					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4606					if (TAILQ_EMPTY(&m->md.pv_list) &&
4607					    (m->flags & PG_FICTITIOUS) == 0) {
4608						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4609						if (TAILQ_EMPTY(&pvh->pv_list))
4610							vm_page_aflag_clear(m, PGA_WRITEABLE);
4611					}
4612					pmap_unuse_pt(pmap, pv->pv_va, &free);
4613				}
4614			}
4615		}
4616		if (allfree) {
4617			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4618			free_pv_chunk(pc);
4619		}
4620	}
4621	sched_unpin();
4622	pmap_invalidate_all(pmap);
4623	rw_wunlock(&pvh_global_lock);
4624	PMAP_UNLOCK(pmap);
4625	pmap_free_zero_pages(&free);
4626}
4627
4628/*
4629 *	pmap_is_modified:
4630 *
4631 *	Return whether or not the specified physical page was modified
4632 *	in any physical maps.
4633 */
4634boolean_t
4635pmap_is_modified(vm_page_t m)
4636{
4637	boolean_t rv;
4638
4639	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4640	    ("pmap_is_modified: page %p is not managed", m));
4641
4642	/*
4643	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4644	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4645	 * is clear, no PTEs can have PG_M set.
4646	 */
4647	VM_OBJECT_ASSERT_WLOCKED(m->object);
4648	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4649		return (FALSE);
4650	rw_wlock(&pvh_global_lock);
4651	rv = pmap_is_modified_pvh(&m->md) ||
4652	    ((m->flags & PG_FICTITIOUS) == 0 &&
4653	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4654	rw_wunlock(&pvh_global_lock);
4655	return (rv);
4656}
4657
4658/*
4659 * Returns TRUE if any of the given mappings were used to modify
4660 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4661 * mappings are supported.
4662 */
4663static boolean_t
4664pmap_is_modified_pvh(struct md_page *pvh)
4665{
4666	pv_entry_t pv;
4667	pt_entry_t *pte;
4668	pmap_t pmap;
4669	boolean_t rv;
4670
4671	rw_assert(&pvh_global_lock, RA_WLOCKED);
4672	rv = FALSE;
4673	sched_pin();
4674	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4675		pmap = PV_PMAP(pv);
4676		PMAP_LOCK(pmap);
4677		pte = pmap_pte_quick(pmap, pv->pv_va);
4678		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4679		PMAP_UNLOCK(pmap);
4680		if (rv)
4681			break;
4682	}
4683	sched_unpin();
4684	return (rv);
4685}
4686
4687/*
4688 *	pmap_is_prefaultable:
4689 *
4690 *	Return whether or not the specified virtual address is eligible
4691 *	for prefault.
4692 */
4693boolean_t
4694pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4695{
4696	pd_entry_t *pde;
4697	pt_entry_t *pte;
4698	boolean_t rv;
4699
4700	rv = FALSE;
4701	PMAP_LOCK(pmap);
4702	pde = pmap_pde(pmap, addr);
4703	if (*pde != 0 && (*pde & PG_PS) == 0) {
4704		pte = vtopte(addr);
4705		rv = *pte == 0;
4706	}
4707	PMAP_UNLOCK(pmap);
4708	return (rv);
4709}
4710
4711/*
4712 *	pmap_is_referenced:
4713 *
4714 *	Return whether or not the specified physical page was referenced
4715 *	in any physical maps.
4716 */
4717boolean_t
4718pmap_is_referenced(vm_page_t m)
4719{
4720	boolean_t rv;
4721
4722	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4723	    ("pmap_is_referenced: page %p is not managed", m));
4724	rw_wlock(&pvh_global_lock);
4725	rv = pmap_is_referenced_pvh(&m->md) ||
4726	    ((m->flags & PG_FICTITIOUS) == 0 &&
4727	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4728	rw_wunlock(&pvh_global_lock);
4729	return (rv);
4730}
4731
4732/*
4733 * Returns TRUE if any of the given mappings were referenced and FALSE
4734 * otherwise.  Both page and 4mpage mappings are supported.
4735 */
4736static boolean_t
4737pmap_is_referenced_pvh(struct md_page *pvh)
4738{
4739	pv_entry_t pv;
4740	pt_entry_t *pte;
4741	pmap_t pmap;
4742	boolean_t rv;
4743
4744	rw_assert(&pvh_global_lock, RA_WLOCKED);
4745	rv = FALSE;
4746	sched_pin();
4747	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4748		pmap = PV_PMAP(pv);
4749		PMAP_LOCK(pmap);
4750		pte = pmap_pte_quick(pmap, pv->pv_va);
4751		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4752		PMAP_UNLOCK(pmap);
4753		if (rv)
4754			break;
4755	}
4756	sched_unpin();
4757	return (rv);
4758}
4759
4760/*
4761 * Clear the write and modified bits in each of the given page's mappings.
4762 */
4763void
4764pmap_remove_write(vm_page_t m)
4765{
4766	struct md_page *pvh;
4767	pv_entry_t next_pv, pv;
4768	pmap_t pmap;
4769	pd_entry_t *pde;
4770	pt_entry_t oldpte, *pte;
4771	vm_offset_t va;
4772
4773	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4774	    ("pmap_remove_write: page %p is not managed", m));
4775
4776	/*
4777	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4778	 * set by another thread while the object is locked.  Thus,
4779	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4780	 */
4781	VM_OBJECT_ASSERT_WLOCKED(m->object);
4782	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4783		return;
4784	rw_wlock(&pvh_global_lock);
4785	sched_pin();
4786	if ((m->flags & PG_FICTITIOUS) != 0)
4787		goto small_mappings;
4788	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4789	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4790		va = pv->pv_va;
4791		pmap = PV_PMAP(pv);
4792		PMAP_LOCK(pmap);
4793		pde = pmap_pde(pmap, va);
4794		if ((*pde & PG_RW) != 0)
4795			(void)pmap_demote_pde(pmap, pde, va);
4796		PMAP_UNLOCK(pmap);
4797	}
4798small_mappings:
4799	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4800		pmap = PV_PMAP(pv);
4801		PMAP_LOCK(pmap);
4802		pde = pmap_pde(pmap, pv->pv_va);
4803		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4804		    " a 4mpage in page %p's pv list", m));
4805		pte = pmap_pte_quick(pmap, pv->pv_va);
4806retry:
4807		oldpte = *pte;
4808		if ((oldpte & PG_RW) != 0) {
4809			/*
4810			 * Regardless of whether a pte is 32 or 64 bits
4811			 * in size, PG_RW and PG_M are among the least
4812			 * significant 32 bits.
4813			 */
4814			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4815			    oldpte & ~(PG_RW | PG_M)))
4816				goto retry;
4817			if ((oldpte & PG_M) != 0)
4818				vm_page_dirty(m);
4819			pmap_invalidate_page(pmap, pv->pv_va);
4820		}
4821		PMAP_UNLOCK(pmap);
4822	}
4823	vm_page_aflag_clear(m, PGA_WRITEABLE);
4824	sched_unpin();
4825	rw_wunlock(&pvh_global_lock);
4826}
4827
4828#define	PMAP_TS_REFERENCED_MAX	5
4829
4830/*
4831 *	pmap_ts_referenced:
4832 *
4833 *	Return a count of reference bits for a page, clearing those bits.
4834 *	It is not necessary for every reference bit to be cleared, but it
4835 *	is necessary that 0 only be returned when there are truly no
4836 *	reference bits set.
4837 *
4838 *	XXX: The exact number of bits to check and clear is a matter that
4839 *	should be tested and standardized at some point in the future for
4840 *	optimal aging of shared pages.
4841 *
4842 *	As an optimization, update the page's dirty field if a modified bit is
4843 *	found while counting reference bits.  This opportunistic update can be
4844 *	performed at low cost and can eliminate the need for some future calls
4845 *	to pmap_is_modified().  However, since this function stops after
4846 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4847 *	dirty pages.  Those dirty pages will only be detected by a future call
4848 *	to pmap_is_modified().
4849 */
4850int
4851pmap_ts_referenced(vm_page_t m)
4852{
4853	struct md_page *pvh;
4854	pv_entry_t pv, pvf;
4855	pmap_t pmap;
4856	pd_entry_t *pde;
4857	pt_entry_t *pte;
4858	vm_paddr_t pa;
4859	int rtval = 0;
4860
4861	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4862	    ("pmap_ts_referenced: page %p is not managed", m));
4863	pa = VM_PAGE_TO_PHYS(m);
4864	pvh = pa_to_pvh(pa);
4865	rw_wlock(&pvh_global_lock);
4866	sched_pin();
4867	if ((m->flags & PG_FICTITIOUS) != 0 ||
4868	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4869		goto small_mappings;
4870	pv = pvf;
4871	do {
4872		pmap = PV_PMAP(pv);
4873		PMAP_LOCK(pmap);
4874		pde = pmap_pde(pmap, pv->pv_va);
4875		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4876			/*
4877			 * Although "*pde" is mapping a 2/4MB page, because
4878			 * this function is called at a 4KB page granularity,
4879			 * we only update the 4KB page under test.
4880			 */
4881			vm_page_dirty(m);
4882		}
4883		if ((*pde & PG_A) != 0) {
4884			/*
4885			 * Since this reference bit is shared by either 1024
4886			 * or 512 4KB pages, it should not be cleared every
4887			 * time it is tested.  Apply a simple "hash" function
4888			 * on the physical page number, the virtual superpage
4889			 * number, and the pmap address to select one 4KB page
4890			 * out of the 1024 or 512 on which testing the
4891			 * reference bit will result in clearing that bit.
4892			 * This function is designed to avoid the selection of
4893			 * the same 4KB page for every 2- or 4MB page mapping.
4894			 *
4895			 * On demotion, a mapping that hasn't been referenced
4896			 * is simply destroyed.  To avoid the possibility of a
4897			 * subsequent page fault on a demoted wired mapping,
4898			 * always leave its reference bit set.  Moreover,
4899			 * since the superpage is wired, the current state of
4900			 * its reference bit won't affect page replacement.
4901			 */
4902			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4903			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4904			    (*pde & PG_W) == 0) {
4905				atomic_clear_int((u_int *)pde, PG_A);
4906				pmap_invalidate_page(pmap, pv->pv_va);
4907			}
4908			rtval++;
4909		}
4910		PMAP_UNLOCK(pmap);
4911		/* Rotate the PV list if it has more than one entry. */
4912		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4913			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4914			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4915		}
4916		if (rtval >= PMAP_TS_REFERENCED_MAX)
4917			goto out;
4918	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4919small_mappings:
4920	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4921		goto out;
4922	pv = pvf;
4923	do {
4924		pmap = PV_PMAP(pv);
4925		PMAP_LOCK(pmap);
4926		pde = pmap_pde(pmap, pv->pv_va);
4927		KASSERT((*pde & PG_PS) == 0,
4928		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4929		    m));
4930		pte = pmap_pte_quick(pmap, pv->pv_va);
4931		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4932			vm_page_dirty(m);
4933		if ((*pte & PG_A) != 0) {
4934			atomic_clear_int((u_int *)pte, PG_A);
4935			pmap_invalidate_page(pmap, pv->pv_va);
4936			rtval++;
4937		}
4938		PMAP_UNLOCK(pmap);
4939		/* Rotate the PV list if it has more than one entry. */
4940		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4941			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4942			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4943		}
4944	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4945	    PMAP_TS_REFERENCED_MAX);
4946out:
4947	sched_unpin();
4948	rw_wunlock(&pvh_global_lock);
4949	return (rtval);
4950}
4951
4952/*
4953 *	Apply the given advice to the specified range of addresses within the
4954 *	given pmap.  Depending on the advice, clear the referenced and/or
4955 *	modified flags in each mapping and set the mapped page's dirty field.
4956 */
4957void
4958pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4959{
4960	pd_entry_t oldpde, *pde;
4961	pt_entry_t *pte;
4962	vm_offset_t va, pdnxt;
4963	vm_page_t m;
4964	boolean_t anychanged, pv_lists_locked;
4965
4966	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4967		return;
4968	if (pmap_is_current(pmap))
4969		pv_lists_locked = FALSE;
4970	else {
4971		pv_lists_locked = TRUE;
4972resume:
4973		rw_wlock(&pvh_global_lock);
4974		sched_pin();
4975	}
4976	anychanged = FALSE;
4977	PMAP_LOCK(pmap);
4978	for (; sva < eva; sva = pdnxt) {
4979		pdnxt = (sva + NBPDR) & ~PDRMASK;
4980		if (pdnxt < sva)
4981			pdnxt = eva;
4982		pde = pmap_pde(pmap, sva);
4983		oldpde = *pde;
4984		if ((oldpde & PG_V) == 0)
4985			continue;
4986		else if ((oldpde & PG_PS) != 0) {
4987			if ((oldpde & PG_MANAGED) == 0)
4988				continue;
4989			if (!pv_lists_locked) {
4990				pv_lists_locked = TRUE;
4991				if (!rw_try_wlock(&pvh_global_lock)) {
4992					if (anychanged)
4993						pmap_invalidate_all(pmap);
4994					PMAP_UNLOCK(pmap);
4995					goto resume;
4996				}
4997				sched_pin();
4998			}
4999			if (!pmap_demote_pde(pmap, pde, sva)) {
5000				/*
5001				 * The large page mapping was destroyed.
5002				 */
5003				continue;
5004			}
5005
5006			/*
5007			 * Unless the page mappings are wired, remove the
5008			 * mapping to a single page so that a subsequent
5009			 * access may repromote.  Since the underlying page
5010			 * table page is fully populated, this removal never
5011			 * frees a page table page.
5012			 */
5013			if ((oldpde & PG_W) == 0) {
5014				pte = pmap_pte_quick(pmap, sva);
5015				KASSERT((*pte & PG_V) != 0,
5016				    ("pmap_advise: invalid PTE"));
5017				pmap_remove_pte(pmap, pte, sva, NULL);
5018				anychanged = TRUE;
5019			}
5020		}
5021		if (pdnxt > eva)
5022			pdnxt = eva;
5023		va = pdnxt;
5024		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
5025		    sva += PAGE_SIZE) {
5026			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
5027				goto maybe_invlrng;
5028			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5029				if (advice == MADV_DONTNEED) {
5030					/*
5031					 * Future calls to pmap_is_modified()
5032					 * can be avoided by making the page
5033					 * dirty now.
5034					 */
5035					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5036					vm_page_dirty(m);
5037				}
5038				atomic_clear_int((u_int *)pte, PG_M | PG_A);
5039			} else if ((*pte & PG_A) != 0)
5040				atomic_clear_int((u_int *)pte, PG_A);
5041			else
5042				goto maybe_invlrng;
5043			if ((*pte & PG_G) != 0) {
5044				if (va == pdnxt)
5045					va = sva;
5046			} else
5047				anychanged = TRUE;
5048			continue;
5049maybe_invlrng:
5050			if (va != pdnxt) {
5051				pmap_invalidate_range(pmap, va, sva);
5052				va = pdnxt;
5053			}
5054		}
5055		if (va != pdnxt)
5056			pmap_invalidate_range(pmap, va, sva);
5057	}
5058	if (anychanged)
5059		pmap_invalidate_all(pmap);
5060	if (pv_lists_locked) {
5061		sched_unpin();
5062		rw_wunlock(&pvh_global_lock);
5063	}
5064	PMAP_UNLOCK(pmap);
5065}
5066
5067/*
5068 *	Clear the modify bits on the specified physical page.
5069 */
5070void
5071pmap_clear_modify(vm_page_t m)
5072{
5073	struct md_page *pvh;
5074	pv_entry_t next_pv, pv;
5075	pmap_t pmap;
5076	pd_entry_t oldpde, *pde;
5077	pt_entry_t oldpte, *pte;
5078	vm_offset_t va;
5079
5080	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5081	    ("pmap_clear_modify: page %p is not managed", m));
5082	VM_OBJECT_ASSERT_WLOCKED(m->object);
5083	KASSERT(!vm_page_xbusied(m),
5084	    ("pmap_clear_modify: page %p is exclusive busied", m));
5085
5086	/*
5087	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5088	 * If the object containing the page is locked and the page is not
5089	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5090	 */
5091	if ((m->aflags & PGA_WRITEABLE) == 0)
5092		return;
5093	rw_wlock(&pvh_global_lock);
5094	sched_pin();
5095	if ((m->flags & PG_FICTITIOUS) != 0)
5096		goto small_mappings;
5097	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5098	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5099		va = pv->pv_va;
5100		pmap = PV_PMAP(pv);
5101		PMAP_LOCK(pmap);
5102		pde = pmap_pde(pmap, va);
5103		oldpde = *pde;
5104		if ((oldpde & PG_RW) != 0) {
5105			if (pmap_demote_pde(pmap, pde, va)) {
5106				if ((oldpde & PG_W) == 0) {
5107					/*
5108					 * Write protect the mapping to a
5109					 * single page so that a subsequent
5110					 * write access may repromote.
5111					 */
5112					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5113					    PG_PS_FRAME);
5114					pte = pmap_pte_quick(pmap, va);
5115					oldpte = *pte;
5116					if ((oldpte & PG_V) != 0) {
5117						/*
5118						 * Regardless of whether a pte is 32 or 64 bits
5119						 * in size, PG_RW and PG_M are among the least
5120						 * significant 32 bits.
5121						 */
5122						while (!atomic_cmpset_int((u_int *)pte,
5123						    oldpte,
5124						    oldpte & ~(PG_M | PG_RW)))
5125							oldpte = *pte;
5126						vm_page_dirty(m);
5127						pmap_invalidate_page(pmap, va);
5128					}
5129				}
5130			}
5131		}
5132		PMAP_UNLOCK(pmap);
5133	}
5134small_mappings:
5135	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5136		pmap = PV_PMAP(pv);
5137		PMAP_LOCK(pmap);
5138		pde = pmap_pde(pmap, pv->pv_va);
5139		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5140		    " a 4mpage in page %p's pv list", m));
5141		pte = pmap_pte_quick(pmap, pv->pv_va);
5142		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5143			/*
5144			 * Regardless of whether a pte is 32 or 64 bits
5145			 * in size, PG_M is among the least significant
5146			 * 32 bits.
5147			 */
5148			atomic_clear_int((u_int *)pte, PG_M);
5149			pmap_invalidate_page(pmap, pv->pv_va);
5150		}
5151		PMAP_UNLOCK(pmap);
5152	}
5153	sched_unpin();
5154	rw_wunlock(&pvh_global_lock);
5155}
5156
5157/*
5158 * Miscellaneous support routines follow
5159 */
5160
5161/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5162static __inline void
5163pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5164{
5165	u_int opte, npte;
5166
5167	/*
5168	 * The cache mode bits are all in the low 32-bits of the
5169	 * PTE, so we can just spin on updating the low 32-bits.
5170	 */
5171	do {
5172		opte = *(u_int *)pte;
5173		npte = opte & ~PG_PTE_CACHE;
5174		npte |= cache_bits;
5175	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5176}
5177
5178/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5179static __inline void
5180pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5181{
5182	u_int opde, npde;
5183
5184	/*
5185	 * The cache mode bits are all in the low 32-bits of the
5186	 * PDE, so we can just spin on updating the low 32-bits.
5187	 */
5188	do {
5189		opde = *(u_int *)pde;
5190		npde = opde & ~PG_PDE_CACHE;
5191		npde |= cache_bits;
5192	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5193}
5194
5195/*
5196 * Map a set of physical memory pages into the kernel virtual
5197 * address space. Return a pointer to where it is mapped. This
5198 * routine is intended to be used for mapping device memory,
5199 * NOT real memory.
5200 */
5201void *
5202pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5203{
5204	struct pmap_preinit_mapping *ppim;
5205	vm_offset_t va, offset;
5206	vm_size_t tmpsize;
5207	int i;
5208
5209	offset = pa & PAGE_MASK;
5210	size = round_page(offset + size);
5211	pa = pa & PG_FRAME;
5212
5213	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5214		va = KERNBASE + pa;
5215	else if (!pmap_initialized) {
5216		va = 0;
5217		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5218			ppim = pmap_preinit_mapping + i;
5219			if (ppim->va == 0) {
5220				ppim->pa = pa;
5221				ppim->sz = size;
5222				ppim->mode = mode;
5223				ppim->va = virtual_avail;
5224				virtual_avail += size;
5225				va = ppim->va;
5226				break;
5227			}
5228		}
5229		if (va == 0)
5230			panic("%s: too many preinit mappings", __func__);
5231	} else {
5232		/*
5233		 * If we have a preinit mapping, re-use it.
5234		 */
5235		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5236			ppim = pmap_preinit_mapping + i;
5237			if (ppim->pa == pa && ppim->sz == size &&
5238			    ppim->mode == mode)
5239				return ((void *)(ppim->va + offset));
5240		}
5241		va = kva_alloc(size);
5242		if (va == 0)
5243			panic("%s: Couldn't allocate KVA", __func__);
5244	}
5245	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5246		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5247	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5248	pmap_invalidate_cache_range(va, va + size, FALSE);
5249	return ((void *)(va + offset));
5250}
5251
5252void *
5253pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5254{
5255
5256	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5257}
5258
5259void *
5260pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5261{
5262
5263	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5264}
5265
5266void
5267pmap_unmapdev(vm_offset_t va, vm_size_t size)
5268{
5269	struct pmap_preinit_mapping *ppim;
5270	vm_offset_t offset;
5271	int i;
5272
5273	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5274		return;
5275	offset = va & PAGE_MASK;
5276	size = round_page(offset + size);
5277	va = trunc_page(va);
5278	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5279		ppim = pmap_preinit_mapping + i;
5280		if (ppim->va == va && ppim->sz == size) {
5281			if (pmap_initialized)
5282				return;
5283			ppim->pa = 0;
5284			ppim->va = 0;
5285			ppim->sz = 0;
5286			ppim->mode = 0;
5287			if (va + size == virtual_avail)
5288				virtual_avail = va;
5289			return;
5290		}
5291	}
5292	if (pmap_initialized)
5293		kva_free(va, size);
5294}
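
/*
 * Usage sketch (illustrative only): a driver that needs a kernel
 * mapping of a device register window typically pairs these routines:
 *
 *	regs = pmap_mapdev(pa, size);
 *	...				use the device through "regs"
 *	pmap_unmapdev((vm_offset_t)regs, size);
 *
 * "pa" and "size" are assumed to come from bus resource discovery.
 * pmap_mapdev() maps the range uncacheable; pmap_mapbios() maps it
 * write-back, which suits firmware tables in ordinary RAM.
 */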
5295
5296/*
5297 * Sets the memory attribute for the specified page.
5298 */
5299void
5300pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5301{
5302
5303	m->md.pat_mode = ma;
5304	if ((m->flags & PG_FICTITIOUS) != 0)
5305		return;
5306
5307	/*
5308	 * If "m" is a normal page, flush it from the cache.
5309	 * See pmap_invalidate_cache_range().
5310	 *
5311	 * First, try to find an existing mapping of the page by sf
5312	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
5313	 * flushes the cache.
5314	 */
5315	if (sf_buf_invalidate_cache(m))
5316		return;
5317
5318	/*
5319	 * If the page is not mapped by an sf buffer and the CPU does not
5320	 * support self-snoop, map the page transiently and perform the
5321	 * invalidation.  In the worst case, the whole cache is flushed by
5322	 * pmap_invalidate_cache_range().
5323	 */
5324	if ((cpu_feature & CPUID_SS) == 0)
5325		pmap_flush_page(m);
5326}
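
/*
 * Usage sketch (illustrative only): a caller that must access a page
 * uncached can set the attribute before creating any mappings of it:
 *
 *	pmap_page_set_memattr(m, VM_MEMATTR_UNCACHEABLE);
 *
 * Later mappings of "m" pick up the new mode from m->md.pat_mode, as
 * pmap_enter() and the kernel mapping routines consult it through
 * pmap_cache_bits().
 */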
5327
5328static void
5329pmap_flush_page(vm_page_t m)
5330{
5331	pt_entry_t *cmap_pte2;
5332	struct pcpu *pc;
5333	vm_offset_t sva, eva;
5334	bool useclflushopt;
5335
5336	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
5337	if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
5338		sched_pin();
5339		pc = pcpu_find(curcpu);
5340		cmap_pte2 = pc->pc_cmap_pte2;
5341		mtx_lock(&pc->pc_cmap_lock);
5342		if (*cmap_pte2)
5343			panic("pmap_flush_page: CMAP2 busy");
5344		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5345		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5346		invlcaddr(pc->pc_cmap_addr2);
5347		sva = (vm_offset_t)pc->pc_cmap_addr2;
5348		eva = sva + PAGE_SIZE;
5349
5350		/*
5351		 * Use mfence or sfence despite the ordering implied by
5352		 * mtx_{un,}lock() because clflush on non-Intel CPUs
5353		 * and clflushopt are not guaranteed to be ordered by
5354		 * any other instruction.
5355		 */
5356		if (useclflushopt)
5357			sfence();
5358		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5359			mfence();
5360		for (; sva < eva; sva += cpu_clflush_line_size) {
5361			if (useclflushopt)
5362				clflushopt(sva);
5363			else
5364				clflush(sva);
5365		}
5366		if (useclflushopt)
5367			sfence();
5368		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5369			mfence();
5370		*cmap_pte2 = 0;
5371		sched_unpin();
5372		mtx_unlock(&pc->pc_cmap_lock);
5373	} else
5374		pmap_invalidate_cache();
5375}
5376
5377/*
5378 * Changes the specified virtual address range's memory type to that given by
5379 * the parameter "mode".  The specified virtual address range must be
5380 * completely contained within the kernel map.
5381 *
5382 * Returns zero if the change completed successfully, and either EINVAL or
5383 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5384 * of the virtual address range was not mapped, and ENOMEM is returned if
5385 * there was insufficient memory available to complete the change.
5386 */
5387int
5388pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5389{
5390	vm_offset_t base, offset, tmpva;
5391	pd_entry_t *pde;
5392	pt_entry_t *pte;
5393	int cache_bits_pte, cache_bits_pde;
5394	boolean_t changed;
5395
5396	base = trunc_page(va);
5397	offset = va & PAGE_MASK;
5398	size = round_page(offset + size);
5399
5400	/*
5401	 * Only supported on kernel virtual addresses above the recursive map.
5402	 */
5403	if (base < VM_MIN_KERNEL_ADDRESS)
5404		return (EINVAL);
5405
5406	cache_bits_pde = pmap_cache_bits(mode, 1);
5407	cache_bits_pte = pmap_cache_bits(mode, 0);
5408	changed = FALSE;
5409
5410	/*
5411	 * Pages that aren't mapped aren't supported.  Also break down
5412	 * 2/4MB pages into 4KB pages if required.
5413	 */
5414	PMAP_LOCK(kernel_pmap);
5415	for (tmpva = base; tmpva < base + size; ) {
5416		pde = pmap_pde(kernel_pmap, tmpva);
5417		if (*pde == 0) {
5418			PMAP_UNLOCK(kernel_pmap);
5419			return (EINVAL);
5420		}
5421		if (*pde & PG_PS) {
5422			/*
5423			 * If the current 2/4MB page already has
5424			 * the required memory type, then we need not
5425			 * demote this page.  Just increment tmpva to
5426			 * the next 2/4MB page frame.
5427			 */
5428			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5429				tmpva = trunc_4mpage(tmpva) + NBPDR;
5430				continue;
5431			}
5432
5433			/*
5434			 * If the current offset aligns with a 2/4MB
5435			 * page frame and there is at least 2/4MB left
5436			 * within the range, then we need not break
5437			 * down this page into 4KB pages.
5438			 */
5439			if ((tmpva & PDRMASK) == 0 &&
5440			    tmpva + PDRMASK < base + size) {
5441				tmpva += NBPDR;
5442				continue;
5443			}
5444			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5445				PMAP_UNLOCK(kernel_pmap);
5446				return (ENOMEM);
5447			}
5448		}
5449		pte = vtopte(tmpva);
5450		if (*pte == 0) {
5451			PMAP_UNLOCK(kernel_pmap);
5452			return (EINVAL);
5453		}
5454		tmpva += PAGE_SIZE;
5455	}
5456	PMAP_UNLOCK(kernel_pmap);
5457
5458	/*
5459	 * Ok, all the pages exist, so run through them updating their
5460	 * cache mode if required.
5461	 */
5462	for (tmpva = base; tmpva < base + size; ) {
5463		pde = pmap_pde(kernel_pmap, tmpva);
5464		if (*pde & PG_PS) {
5465			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5466				pmap_pde_attr(pde, cache_bits_pde);
5467				changed = TRUE;
5468			}
5469			tmpva = trunc_4mpage(tmpva) + NBPDR;
5470		} else {
5471			pte = vtopte(tmpva);
5472			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5473				pmap_pte_attr(pte, cache_bits_pte);
5474				changed = TRUE;
5475			}
5476			tmpva += PAGE_SIZE;
5477		}
5478	}
5479
5480	/*
5481	 * Flush CPU caches to make sure any data isn't cached that
5482	 * shouldn't be, etc.
5483	 */
5484	if (changed) {
5485		pmap_invalidate_range(kernel_pmap, base, tmpva);
5486		pmap_invalidate_cache_range(base, tmpva, FALSE);
5487	}
5488	return (0);
5489}
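
/*
 * Usage sketch (illustrative only): a graphics driver might switch an
 * already mapped framebuffer to write-combining:
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *
 * "fb_va" and "fb_size" are assumed to describe a kernel mapping that
 * was established earlier, for example by pmap_mapdev_attr().  A
 * return of EINVAL or ENOMEM means the memory type was not changed.
 */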
5490
5491/*
5492 * perform the pmap work for mincore
5493 */
5494int
5495pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5496{
5497	pd_entry_t *pdep;
5498	pt_entry_t *ptep, pte;
5499	vm_paddr_t pa;
5500	int val;
5501
5502	PMAP_LOCK(pmap);
5503retry:
5504	pdep = pmap_pde(pmap, addr);
5505	if (*pdep != 0) {
5506		if (*pdep & PG_PS) {
5507			pte = *pdep;
5508			/* Compute the physical address of the 4KB page. */
5509			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5510			    PG_FRAME;
5511			val = MINCORE_SUPER;
5512		} else {
5513			ptep = pmap_pte(pmap, addr);
5514			pte = *ptep;
5515			pmap_pte_release(ptep);
5516			pa = pte & PG_FRAME;
5517			val = 0;
5518		}
5519	} else {
5520		pte = 0;
5521		pa = 0;
5522		val = 0;
5523	}
5524	if ((pte & PG_V) != 0) {
5525		val |= MINCORE_INCORE;
5526		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5527			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5528		if ((pte & PG_A) != 0)
5529			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5530	}
5531	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5532	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5533	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5534		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5535		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5536			goto retry;
5537	} else
5538		PA_UNLOCK_COND(*locked_pa);
5539	PMAP_UNLOCK(pmap);
5540	return (val);
5541}
5542
5543void
5544pmap_activate(struct thread *td)
5545{
5546	pmap_t	pmap, oldpmap;
5547	u_int	cpuid;
5548	u_int32_t  cr3;
5549
5550	critical_enter();
5551	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5552	oldpmap = PCPU_GET(curpmap);
5553	cpuid = PCPU_GET(cpuid);
5554#if defined(SMP)
5555	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5556	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5557#else
5558	CPU_CLR(cpuid, &oldpmap->pm_active);
5559	CPU_SET(cpuid, &pmap->pm_active);
5560#endif
5561#if defined(PAE) || defined(PAE_TABLES)
5562	cr3 = vtophys(pmap->pm_pdpt);
5563#else
5564	cr3 = vtophys(pmap->pm_pdir);
5565#endif
5566	/*
5567	 * pmap_activate is for the current thread on the current cpu
5568	 */
5569	td->td_pcb->pcb_cr3 = cr3;
5570	load_cr3(cr3);
5571	PCPU_SET(curpmap, pmap);
5572	critical_exit();
5573}
5574
5575void
5576pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5577{
5578}
5579
5580/*
5581 *	Increase the starting virtual address of the given mapping if a
5582 *	different alignment might result in more superpage mappings.
5583 */
5584void
5585pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5586    vm_offset_t *addr, vm_size_t size)
5587{
5588	vm_offset_t superpage_offset;
5589
5590	if (size < NBPDR)
5591		return;
5592	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5593		offset += ptoa(object->pg_color);
5594	superpage_offset = offset & PDRMASK;
5595	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5596	    (*addr & PDRMASK) == superpage_offset)
5597		return;
5598	if ((*addr & PDRMASK) < superpage_offset)
5599		*addr = (*addr & ~PDRMASK) + superpage_offset;
5600	else
5601		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5602}
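
/*
 * Worked example (illustrative only, assuming 4MB superpages, so
 * NBPDR == 4MB and PDRMASK == 0x3fffff): for an 8MB mapping with
 * offset == 0x00500000 and a proposed *addr == 0x20000000,
 * superpage_offset is 0x00100000, so *addr is advanced to 0x20100000.
 * Virtual addresses and object offsets then agree modulo 4MB, which
 * is the precondition for later promotion of the fully populated,
 * 4MB-aligned portions of the mapping.
 */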
5603
5604vm_offset_t
5605pmap_quick_enter_page(vm_page_t m)
5606{
5607	vm_offset_t qaddr;
5608	pt_entry_t *pte;
5609
5610	critical_enter();
5611	qaddr = PCPU_GET(qmap_addr);
5612	pte = vtopte(qaddr);
5613
5614	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
5615	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
5616	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
5617	invlpg(qaddr);
5618
5619	return (qaddr);
5620}
5621
5622void
5623pmap_quick_remove_page(vm_offset_t addr)
5624{
5625	vm_offset_t qaddr;
5626	pt_entry_t *pte;
5627
5628	qaddr = PCPU_GET(qmap_addr);
5629	pte = vtopte(qaddr);
5630
5631	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
5632	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
5633
5634	*pte = 0;
5635	critical_exit();
5636}
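
/*
 * Usage sketch (illustrative only): the quick-map pair brackets a
 * short copy to or from an arbitrary physical page without touching
 * the pmap locks:
 *
 *	va = pmap_quick_enter_page(m);
 *	bcopy(src, (void *)(va + off), len);
 *	pmap_quick_remove_page(va);
 *
 * "src", "off", and "len" are assumed to be supplied by the caller.
 * The thread runs in a critical section between the two calls, so the
 * window must be released promptly.
 */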
5637
5638#if defined(PMAP_DEBUG)
5639int pmap_pid_dump(int pid)
5640{
5641	pmap_t pmap;
5642	struct proc *p;
5643	int npte = 0;
5644	int index;
5645
5646	sx_slock(&allproc_lock);
5647	FOREACH_PROC_IN_SYSTEM(p) {
5648		if (p->p_pid != pid)
5649			continue;
5650
5651		if (p->p_vmspace) {
5652			int i,j;
5653			index = 0;
5654			pmap = vmspace_pmap(p->p_vmspace);
5655			for (i = 0; i < NPDEPTD; i++) {
5656				pd_entry_t *pde;
5657				pt_entry_t *pte;
5658				vm_offset_t base = i << PDRSHIFT;
5659
5660				pde = &pmap->pm_pdir[i];
5661				if (pde && pmap_pde_v(pde)) {
5662					for (j = 0; j < NPTEPG; j++) {
5663						vm_offset_t va = base + (j << PAGE_SHIFT);
5664						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5665							if (index) {
5666								index = 0;
5667								printf("\n");
5668							}
5669							sx_sunlock(&allproc_lock);
5670							return (npte);
5671						}
5672						pte = pmap_pte(pmap, va);
5673						if (pte && pmap_pte_v(pte)) {
5674							pt_entry_t pa;
5675							vm_page_t m;
5676							pa = *pte;
5677							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5678							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5679								va, pa, m->hold_count, m->wire_count, m->flags);
5680							npte++;
5681							index++;
5682							if (index >= 2) {
5683								index = 0;
5684								printf("\n");
5685							} else {
5686								printf(" ");
5687							}
5688						}
5689					}
5690				}
5691			}
5692		}
5693	}
5694	sx_sunlock(&allproc_lock);
5695	return (npte);
5696}
5697#endif
5698