pmap.c revision 269072
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 269072 2014-07-24 16:29:44Z kib $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidate or reduced protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#include <machine/intr_machdep.h>
138#include <machine/apicvar.h>
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148static __inline boolean_t
149pmap_emulate_ad_bits(pmap_t pmap)
150{
151
152	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
153}
154
155static __inline pt_entry_t
156pmap_valid_bit(pmap_t pmap)
157{
158	pt_entry_t mask;
159
160	switch (pmap->pm_type) {
161	case PT_X86:
162		mask = X86_PG_V;
163		break;
164	case PT_EPT:
165		if (pmap_emulate_ad_bits(pmap))
166			mask = EPT_PG_EMUL_V;
167		else
168			mask = EPT_PG_READ;
169		break;
170	default:
171		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
172	}
173
174	return (mask);
175}
176
177static __inline pt_entry_t
178pmap_rw_bit(pmap_t pmap)
179{
180	pt_entry_t mask;
181
182	switch (pmap->pm_type) {
183	case PT_X86:
184		mask = X86_PG_RW;
185		break;
186	case PT_EPT:
187		if (pmap_emulate_ad_bits(pmap))
188			mask = EPT_PG_EMUL_RW;
189		else
190			mask = EPT_PG_WRITE;
191		break;
192	default:
193		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
194	}
195
196	return (mask);
197}
198
199static __inline pt_entry_t
200pmap_global_bit(pmap_t pmap)
201{
202	pt_entry_t mask;
203
204	switch (pmap->pm_type) {
205	case PT_X86:
206		mask = X86_PG_G;
207		break;
208	case PT_EPT:
209		mask = 0;
210		break;
211	default:
212		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
213	}
214
215	return (mask);
216}
217
218static __inline pt_entry_t
219pmap_accessed_bit(pmap_t pmap)
220{
221	pt_entry_t mask;
222
223	switch (pmap->pm_type) {
224	case PT_X86:
225		mask = X86_PG_A;
226		break;
227	case PT_EPT:
228		if (pmap_emulate_ad_bits(pmap))
229			mask = EPT_PG_READ;
230		else
231			mask = EPT_PG_A;
232		break;
233	default:
234		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
235	}
236
237	return (mask);
238}
239
240static __inline pt_entry_t
241pmap_modified_bit(pmap_t pmap)
242{
243	pt_entry_t mask;
244
245	switch (pmap->pm_type) {
246	case PT_X86:
247		mask = X86_PG_M;
248		break;
249	case PT_EPT:
250		if (pmap_emulate_ad_bits(pmap))
251			mask = EPT_PG_WRITE;
252		else
253			mask = EPT_PG_M;
254		break;
255	default:
256		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
257	}
258
259	return (mask);
260}
261
262#if !defined(DIAGNOSTIC)
263#ifdef __GNUC_GNU_INLINE__
264#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
265#else
266#define PMAP_INLINE	extern inline
267#endif
268#else
269#define PMAP_INLINE
270#endif
271
272#ifdef PV_STATS
273#define PV_STAT(x)	do { x ; } while (0)
274#else
275#define PV_STAT(x)	do { } while (0)
276#endif
277
278#define	pa_index(pa)	((pa) >> PDRSHIFT)
279#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
280
281#define	NPV_LIST_LOCKS	MAXCPU
282
283#define	PHYS_TO_PV_LIST_LOCK(pa)	\
284			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
285
286#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
287	struct rwlock **_lockp = (lockp);		\
288	struct rwlock *_new_lock;			\
289							\
290	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
291	if (_new_lock != *_lockp) {			\
292		if (*_lockp != NULL)			\
293			rw_wunlock(*_lockp);		\
294		*_lockp = _new_lock;			\
295		rw_wlock(*_lockp);			\
296	}						\
297} while (0)
298
299#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
300			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
301
302#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
303	struct rwlock **_lockp = (lockp);		\
304							\
305	if (*_lockp != NULL) {				\
306		rw_wunlock(*_lockp);			\
307		*_lockp = NULL;				\
308	}						\
309} while (0)
310
311#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
312			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
313
314struct pmap kernel_pmap_store;
315
316vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318
319int nkpt;
320SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
321    "Number of kernel page table pages allocated on bootup");
322
323static int ndmpdp;
324vm_paddr_t dmaplimit;
325vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
326pt_entry_t pg_nx;
327
328static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
329
330static int pat_works = 1;
331SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
332    "Is page attribute table fully functional?");
333
334static int pg_ps_enabled = 1;
335SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
336    "Are large page mappings enabled?");
337
338#define	PAT_INDEX_SIZE	8
339static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
340
341static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
342static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
343u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
344u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
345
346static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
347static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
348static int		ndmpdpphys;	/* number of DMPDPphys pages */
349
350static struct rwlock_padalign pvh_global_lock;
351
352/*
353 * Data for the pv entry allocation mechanism
354 */
355static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
356static struct mtx pv_chunks_mutex;
357static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
358static struct md_page *pv_table;
359
360/*
361 * All those kernel PT submaps that BSD is so fond of
362 */
363pt_entry_t *CMAP1 = 0;
364caddr_t CADDR1 = 0;
365
366static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
367
368static struct unrhdr pcid_unr;
369static struct mtx pcid_mtx;
370int pmap_pcid_enabled = 0;
371SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
372    0, "Is TLB Context ID enabled ?");
373int invpcid_works = 0;
374SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
375    "Is the invpcid instruction available ?");
376
377static int
378pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
379{
380	int i;
381	uint64_t res;
382
383	res = 0;
384	CPU_FOREACH(i) {
385		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
386	}
387	return (sysctl_handle_64(oidp, &res, 0, req));
388}
389SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
390    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
391    "Count of saved TLB context on switch");
392
393/* pmap_copy_pages() over non-DMAP */
394static struct mtx cpage_lock;
395static vm_offset_t cpage_a;
396static vm_offset_t cpage_b;
397
398/*
399 * Crashdump maps.
400 */
401static caddr_t crashdumpmap;
402
403static void	free_pv_chunk(struct pv_chunk *pc);
404static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
405static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
406static int	popcnt_pc_map_elem(uint64_t elem);
407static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
408static void	reserve_pv_entries(pmap_t pmap, int needed,
409		    struct rwlock **lockp);
410static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
411		    struct rwlock **lockp);
412static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
413		    struct rwlock **lockp);
414static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
415		    struct rwlock **lockp);
416static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
417static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
418		    vm_offset_t va);
419
420static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
421static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
422static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
423    vm_offset_t va, struct rwlock **lockp);
424static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
425    vm_offset_t va);
426static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
427    vm_prot_t prot, struct rwlock **lockp);
428static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
429    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
430static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
431static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
432static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
433static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
434static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
435static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
436    struct rwlock **lockp);
437static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
438    vm_prot_t prot);
439static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
440static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
441    struct spglist *free, struct rwlock **lockp);
442static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
443    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
444static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
445static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
446    struct spglist *free);
447static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
448    vm_page_t m, struct rwlock **lockp);
449static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
450    pd_entry_t newpde);
451static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
452
453static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
454		struct rwlock **lockp);
455static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
456		struct rwlock **lockp);
457static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
458		struct rwlock **lockp);
459
460static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
461    struct spglist *free);
462static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
463static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
464
465/*
466 * Move the kernel virtual free pointer to the next
467 * 2MB.  This is used to help improve performance
468 * by using a large (2MB) page for much of the kernel
469 * (.text, .data, .bss)
470 */
471static vm_offset_t
472pmap_kmem_choose(vm_offset_t addr)
473{
474	vm_offset_t newaddr = addr;
475
476	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
477	return (newaddr);
478}
479
480/********************/
481/* Inline functions */
482/********************/
483
484/* Return a non-clipped PD index for a given VA */
485static __inline vm_pindex_t
486pmap_pde_pindex(vm_offset_t va)
487{
488	return (va >> PDRSHIFT);
489}
490
491
492/* Return various clipped indexes for a given VA */
493static __inline vm_pindex_t
494pmap_pte_index(vm_offset_t va)
495{
496
497	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
498}
499
500static __inline vm_pindex_t
501pmap_pde_index(vm_offset_t va)
502{
503
504	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
505}
506
507static __inline vm_pindex_t
508pmap_pdpe_index(vm_offset_t va)
509{
510
511	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
512}
513
514static __inline vm_pindex_t
515pmap_pml4e_index(vm_offset_t va)
516{
517
518	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
519}
520
521/* Return a pointer to the PML4 slot that corresponds to a VA */
522static __inline pml4_entry_t *
523pmap_pml4e(pmap_t pmap, vm_offset_t va)
524{
525
526	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
527}
528
529/* Return a pointer to the PDP slot that corresponds to a VA */
530static __inline pdp_entry_t *
531pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
532{
533	pdp_entry_t *pdpe;
534
535	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
536	return (&pdpe[pmap_pdpe_index(va)]);
537}
538
539/* Return a pointer to the PDP slot that corresponds to a VA */
540static __inline pdp_entry_t *
541pmap_pdpe(pmap_t pmap, vm_offset_t va)
542{
543	pml4_entry_t *pml4e;
544	pt_entry_t PG_V;
545
546	PG_V = pmap_valid_bit(pmap);
547	pml4e = pmap_pml4e(pmap, va);
548	if ((*pml4e & PG_V) == 0)
549		return (NULL);
550	return (pmap_pml4e_to_pdpe(pml4e, va));
551}
552
553/* Return a pointer to the PD slot that corresponds to a VA */
554static __inline pd_entry_t *
555pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
556{
557	pd_entry_t *pde;
558
559	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
560	return (&pde[pmap_pde_index(va)]);
561}
562
563/* Return a pointer to the PD slot that corresponds to a VA */
564static __inline pd_entry_t *
565pmap_pde(pmap_t pmap, vm_offset_t va)
566{
567	pdp_entry_t *pdpe;
568	pt_entry_t PG_V;
569
570	PG_V = pmap_valid_bit(pmap);
571	pdpe = pmap_pdpe(pmap, va);
572	if (pdpe == NULL || (*pdpe & PG_V) == 0)
573		return (NULL);
574	return (pmap_pdpe_to_pde(pdpe, va));
575}
576
577/* Return a pointer to the PT slot that corresponds to a VA */
578static __inline pt_entry_t *
579pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
580{
581	pt_entry_t *pte;
582
583	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
584	return (&pte[pmap_pte_index(va)]);
585}
586
587/* Return a pointer to the PT slot that corresponds to a VA */
588static __inline pt_entry_t *
589pmap_pte(pmap_t pmap, vm_offset_t va)
590{
591	pd_entry_t *pde;
592	pt_entry_t PG_V;
593
594	PG_V = pmap_valid_bit(pmap);
595	pde = pmap_pde(pmap, va);
596	if (pde == NULL || (*pde & PG_V) == 0)
597		return (NULL);
598	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
599		return ((pt_entry_t *)pde);
600	return (pmap_pde_to_pte(pde, va));
601}
602
603static __inline void
604pmap_resident_count_inc(pmap_t pmap, int count)
605{
606
607	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
608	pmap->pm_stats.resident_count += count;
609}
610
611static __inline void
612pmap_resident_count_dec(pmap_t pmap, int count)
613{
614
615	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
616	KASSERT(pmap->pm_stats.resident_count >= count,
617	    ("pmap %p resident count underflow %ld %d", pmap,
618	    pmap->pm_stats.resident_count, count));
619	pmap->pm_stats.resident_count -= count;
620}
621
622PMAP_INLINE pt_entry_t *
623vtopte(vm_offset_t va)
624{
625	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
626
627	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
628
629	return (PTmap + ((va >> PAGE_SHIFT) & mask));
630}
631
632static __inline pd_entry_t *
633vtopde(vm_offset_t va)
634{
635	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
636
637	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
638
639	return (PDmap + ((va >> PDRSHIFT) & mask));
640}
641
642static u_int64_t
643allocpages(vm_paddr_t *firstaddr, int n)
644{
645	u_int64_t ret;
646
647	ret = *firstaddr;
648	bzero((void *)ret, n * PAGE_SIZE);
649	*firstaddr += n * PAGE_SIZE;
650	return (ret);
651}
652
653CTASSERT(powerof2(NDMPML4E));
654
655/* number of kernel PDP slots */
656#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
657
658static void
659nkpt_init(vm_paddr_t addr)
660{
661	int pt_pages;
662
663#ifdef NKPT
664	pt_pages = NKPT;
665#else
666	pt_pages = howmany(addr, 1 << PDRSHIFT);
667	pt_pages += NKPDPE(pt_pages);
668
669	/*
670	 * Add some slop beyond the bare minimum required for bootstrapping
671	 * the kernel.
672	 *
673	 * This is quite important when allocating KVA for kernel modules.
674	 * The modules are required to be linked in the negative 2GB of
675	 * the address space.  If we run out of KVA in this region then
676	 * pmap_growkernel() will need to allocate page table pages to map
677	 * the entire 512GB of KVA space which is an unnecessary tax on
678	 * physical memory.
679	 */
680	pt_pages += 8;		/* 16MB additional slop for kernel modules */
681#endif
682	nkpt = pt_pages;
683}
684
685static void
686create_pagetables(vm_paddr_t *firstaddr)
687{
688	int i, j, ndm1g, nkpdpe;
689	pt_entry_t *pt_p;
690	pd_entry_t *pd_p;
691	pdp_entry_t *pdp_p;
692	pml4_entry_t *p4_p;
693
694	/* Allocate page table pages for the direct map */
695	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
696	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
697		ndmpdp = 4;
698	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
699	if (ndmpdpphys > NDMPML4E) {
700		/*
701		 * Each NDMPML4E allows 512 GB, so limit to that,
702		 * and then readjust ndmpdp and ndmpdpphys.
703		 */
704		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
705		Maxmem = atop(NDMPML4E * NBPML4);
706		ndmpdpphys = NDMPML4E;
707		ndmpdp = NDMPML4E * NPDEPG;
708	}
709	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
710	ndm1g = 0;
711	if ((amd_feature & AMDID_PAGE1GB) != 0)
712		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
713	if (ndm1g < ndmpdp)
714		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
715	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
716
717	/* Allocate pages */
718	KPML4phys = allocpages(firstaddr, 1);
719	KPDPphys = allocpages(firstaddr, NKPML4E);
720
721	/*
722	 * Allocate the initial number of kernel page table pages required to
723	 * bootstrap.  We defer this until after all memory-size dependent
724	 * allocations are done (e.g. direct map), so that we don't have to
725	 * build in too much slop in our estimate.
726	 *
727	 * Note that when NKPML4E > 1, we have an empty page underneath
728	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
729	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
730	 */
731	nkpt_init(*firstaddr);
732	nkpdpe = NKPDPE(nkpt);
733
734	KPTphys = allocpages(firstaddr, nkpt);
735	KPDphys = allocpages(firstaddr, nkpdpe);
736
737	/* Fill in the underlying page table pages */
738	/* Nominally read-only (but really R/W) from zero to physfree */
739	/* XXX not fully used, underneath 2M pages */
740	pt_p = (pt_entry_t *)KPTphys;
741	for (i = 0; ptoa(i) < *firstaddr; i++)
742		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
743
744	/* Now map the page tables at their location within PTmap */
745	pd_p = (pd_entry_t *)KPDphys;
746	for (i = 0; i < nkpt; i++)
747		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
748
749	/* Map from zero to end of allocations under 2M pages */
750	/* This replaces some of the KPTphys entries above */
751	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
752		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
753		    X86_PG_G;
754
755	/* And connect up the PD to the PDP (leaving room for L4 pages) */
756	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
757	for (i = 0; i < nkpdpe; i++)
758		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
759		    PG_U;
760
761	/*
762	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
763	 * the end of physical memory is not aligned to a 1GB page boundary,
764	 * then the residual physical memory is mapped with 2MB pages.  Later,
765	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
766	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
767	 * that are partially used.
768	 */
769	pd_p = (pd_entry_t *)DMPDphys;
770	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
771		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
772		/* Preset PG_M and PG_A because demotion expects it. */
773		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
774		    X86_PG_M | X86_PG_A;
775	}
776	pdp_p = (pdp_entry_t *)DMPDPphys;
777	for (i = 0; i < ndm1g; i++) {
778		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
779		/* Preset PG_M and PG_A because demotion expects it. */
780		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
781		    X86_PG_M | X86_PG_A;
782	}
783	for (j = 0; i < ndmpdp; i++, j++) {
784		pdp_p[i] = DMPDphys + ptoa(j);
785		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
786	}
787
788	/* And recursively map PML4 to itself in order to get PTmap */
789	p4_p = (pml4_entry_t *)KPML4phys;
790	p4_p[PML4PML4I] = KPML4phys;
791	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
792
793	/* Connect the Direct Map slot(s) up to the PML4. */
794	for (i = 0; i < ndmpdpphys; i++) {
795		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
796		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
797	}
798
799	/* Connect the KVA slots up to the PML4 */
800	for (i = 0; i < NKPML4E; i++) {
801		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
802		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
803	}
804}
805
806/*
807 *	Bootstrap the system enough to run with virtual memory.
808 *
809 *	On amd64 this is called after mapping has already been enabled
810 *	and just syncs the pmap module with what has already been done.
811 *	[We can't call it easily with mapping off since the kernel is not
812 *	mapped with PA == VA, hence we would have to relocate every address
813 *	from the linked base (virtual) address "KERNBASE" to the actual
814 *	(physical) address starting relative to 0]
815 */
816void
817pmap_bootstrap(vm_paddr_t *firstaddr)
818{
819	vm_offset_t va;
820	pt_entry_t *pte;
821
822	/*
823	 * Create an initial set of page tables to run the kernel in.
824	 */
825	create_pagetables(firstaddr);
826
827	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
828	virtual_avail = pmap_kmem_choose(virtual_avail);
829
830	virtual_end = VM_MAX_KERNEL_ADDRESS;
831
832
833	/* XXX do %cr0 as well */
834	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
835	load_cr3(KPML4phys);
836	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
837		load_cr4(rcr4() | CR4_SMEP);
838
839	/*
840	 * Initialize the kernel pmap (which is statically allocated).
841	 */
842	PMAP_LOCK_INIT(kernel_pmap);
843	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
844	kernel_pmap->pm_cr3 = KPML4phys;
845	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
846	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
847	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
848	kernel_pmap->pm_flags = pmap_flags;
849
850 	/*
851	 * Initialize the global pv list lock.
852	 */
853	rw_init(&pvh_global_lock, "pmap pv global");
854
855	/*
856	 * Reserve some special page table entries/VA space for temporary
857	 * mapping of pages.
858	 */
859#define	SYSMAP(c, p, v, n)	\
860	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
861
862	va = virtual_avail;
863	pte = vtopte(va);
864
865	/*
866	 * Crashdump maps.  The first page is reused as CMAP1 for the
867	 * memory test.
868	 */
869	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
870	CADDR1 = crashdumpmap;
871
872	virtual_avail = va;
873
874	/* Initialize the PAT MSR. */
875	pmap_init_pat();
876
877	/* Initialize TLB Context Id. */
878	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
879	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
880		load_cr4(rcr4() | CR4_PCIDE);
881		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
882		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
883		/* Check for INVPCID support */
884		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
885		    != 0;
886		kernel_pmap->pm_pcid = 0;
887#ifndef SMP
888		pmap_pcid_enabled = 0;
889#endif
890	} else
891		pmap_pcid_enabled = 0;
892}
893
894/*
895 * Setup the PAT MSR.
896 */
897void
898pmap_init_pat(void)
899{
900	int pat_table[PAT_INDEX_SIZE];
901	uint64_t pat_msr;
902	u_long cr0, cr4;
903	int i;
904
905	/* Bail if this CPU doesn't implement PAT. */
906	if ((cpu_feature & CPUID_PAT) == 0)
907		panic("no PAT??");
908
909	/* Set default PAT index table. */
910	for (i = 0; i < PAT_INDEX_SIZE; i++)
911		pat_table[i] = -1;
912	pat_table[PAT_WRITE_BACK] = 0;
913	pat_table[PAT_WRITE_THROUGH] = 1;
914	pat_table[PAT_UNCACHEABLE] = 3;
915	pat_table[PAT_WRITE_COMBINING] = 3;
916	pat_table[PAT_WRITE_PROTECTED] = 3;
917	pat_table[PAT_UNCACHED] = 3;
918
919	/* Initialize default PAT entries. */
920	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
921	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
922	    PAT_VALUE(2, PAT_UNCACHED) |
923	    PAT_VALUE(3, PAT_UNCACHEABLE) |
924	    PAT_VALUE(4, PAT_WRITE_BACK) |
925	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
926	    PAT_VALUE(6, PAT_UNCACHED) |
927	    PAT_VALUE(7, PAT_UNCACHEABLE);
928
929	if (pat_works) {
930		/*
931		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
932		 * Program 5 and 6 as WP and WC.
933		 * Leave 4 and 7 as WB and UC.
934		 */
935		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
936		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
937		    PAT_VALUE(6, PAT_WRITE_COMBINING);
938		pat_table[PAT_UNCACHED] = 2;
939		pat_table[PAT_WRITE_PROTECTED] = 5;
940		pat_table[PAT_WRITE_COMBINING] = 6;
941	} else {
942		/*
943		 * Just replace PAT Index 2 with WC instead of UC-.
944		 */
945		pat_msr &= ~PAT_MASK(2);
946		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
947		pat_table[PAT_WRITE_COMBINING] = 2;
948	}
949
950	/* Disable PGE. */
951	cr4 = rcr4();
952	load_cr4(cr4 & ~CR4_PGE);
953
954	/* Disable caches (CD = 1, NW = 0). */
955	cr0 = rcr0();
956	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
957
958	/* Flushes caches and TLBs. */
959	wbinvd();
960	invltlb();
961
962	/* Update PAT and index table. */
963	wrmsr(MSR_PAT, pat_msr);
964	for (i = 0; i < PAT_INDEX_SIZE; i++)
965		pat_index[i] = pat_table[i];
966
967	/* Flush caches and TLBs again. */
968	wbinvd();
969	invltlb();
970
971	/* Restore caches and PGE. */
972	load_cr0(cr0);
973	load_cr4(cr4);
974}
975
976/*
977 *	Initialize a vm_page's machine-dependent fields.
978 */
979void
980pmap_page_init(vm_page_t m)
981{
982
983	TAILQ_INIT(&m->md.pv_list);
984	m->md.pat_mode = PAT_WRITE_BACK;
985}
986
987/*
988 *	Initialize the pmap module.
989 *	Called by vm_init, to initialize any structures that the pmap
990 *	system needs to map virtual memory.
991 */
992void
993pmap_init(void)
994{
995	vm_page_t mpte;
996	vm_size_t s;
997	int i, pv_npg;
998
999	/*
1000	 * Initialize the vm page array entries for the kernel pmap's
1001	 * page table pages.
1002	 */
1003	for (i = 0; i < nkpt; i++) {
1004		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1005		KASSERT(mpte >= vm_page_array &&
1006		    mpte < &vm_page_array[vm_page_array_size],
1007		    ("pmap_init: page table page is out of range"));
1008		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1009		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1010	}
1011
1012	/*
1013	 * If the kernel is running on a virtual machine, then it must assume
1014	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1015	 * be prepared for the hypervisor changing the vendor and family that
1016	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1017	 * 10h Erratum 383 is enabled if the processor's feature set does not
1018	 * include at least one feature that is only supported by older Intel
1019	 * or newer AMD processors.
1020	 */
1021	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1022	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1023	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1024	    AMDID2_FMA4)) == 0)
1025		workaround_erratum383 = 1;
1026
1027	/*
1028	 * Are large page mappings enabled?
1029	 */
1030	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1031	if (pg_ps_enabled) {
1032		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1033		    ("pmap_init: can't assign to pagesizes[1]"));
1034		pagesizes[1] = NBPDR;
1035	}
1036
1037	/*
1038	 * Initialize the pv chunk list mutex.
1039	 */
1040	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1041
1042	/*
1043	 * Initialize the pool of pv list locks.
1044	 */
1045	for (i = 0; i < NPV_LIST_LOCKS; i++)
1046		rw_init(&pv_list_locks[i], "pmap pv list");
1047
1048	/*
1049	 * Calculate the size of the pv head table for superpages.
1050	 */
1051	for (i = 0; phys_avail[i + 1]; i += 2);
1052	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
1053
1054	/*
1055	 * Allocate memory for the pv head table for superpages.
1056	 */
1057	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1058	s = round_page(s);
1059	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1060	    M_WAITOK | M_ZERO);
1061	for (i = 0; i < pv_npg; i++)
1062		TAILQ_INIT(&pv_table[i].pv_list);
1063
1064	mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
1065	cpage_a = kva_alloc(PAGE_SIZE);
1066	cpage_b = kva_alloc(PAGE_SIZE);
1067}
1068
1069static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1070    "2MB page mapping counters");
1071
1072static u_long pmap_pde_demotions;
1073SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1074    &pmap_pde_demotions, 0, "2MB page demotions");
1075
1076static u_long pmap_pde_mappings;
1077SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1078    &pmap_pde_mappings, 0, "2MB page mappings");
1079
1080static u_long pmap_pde_p_failures;
1081SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1082    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1083
1084static u_long pmap_pde_promotions;
1085SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1086    &pmap_pde_promotions, 0, "2MB page promotions");
1087
1088static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1089    "1GB page mapping counters");
1090
1091static u_long pmap_pdpe_demotions;
1092SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1093    &pmap_pdpe_demotions, 0, "1GB page demotions");
1094
1095/***************************************************
1096 * Low level helper routines.....
1097 ***************************************************/
1098
1099static pt_entry_t
1100pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1101{
1102	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1103
1104	switch (pmap->pm_type) {
1105	case PT_X86:
1106		/* Verify that both PAT bits are not set at the same time */
1107		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1108		    ("Invalid PAT bits in entry %#lx", entry));
1109
1110		/* Swap the PAT bits if one of them is set */
1111		if ((entry & x86_pat_bits) != 0)
1112			entry ^= x86_pat_bits;
1113		break;
1114	case PT_EPT:
1115		/*
1116		 * Nothing to do - the memory attributes are represented
1117		 * the same way for regular pages and superpages.
1118		 */
1119		break;
1120	default:
1121		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1122	}
1123
1124	return (entry);
1125}
1126
1127/*
1128 * Determine the appropriate bits to set in a PTE or PDE for a specified
1129 * caching mode.
1130 */
1131static int
1132pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1133{
1134	int cache_bits, pat_flag, pat_idx;
1135
1136	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1137		panic("Unknown caching mode %d\n", mode);
1138
1139	switch (pmap->pm_type) {
1140	case PT_X86:
1141		/* The PAT bit is different for PTE's and PDE's. */
1142		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1143
1144		/* Map the caching mode to a PAT index. */
1145		pat_idx = pat_index[mode];
1146
1147		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1148		cache_bits = 0;
1149		if (pat_idx & 0x4)
1150			cache_bits |= pat_flag;
1151		if (pat_idx & 0x2)
1152			cache_bits |= PG_NC_PCD;
1153		if (pat_idx & 0x1)
1154			cache_bits |= PG_NC_PWT;
1155		break;
1156
1157	case PT_EPT:
1158		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1159		break;
1160
1161	default:
1162		panic("unsupported pmap type %d", pmap->pm_type);
1163	}
1164
1165	return (cache_bits);
1166}
1167
1168static int
1169pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1170{
1171	int mask;
1172
1173	switch (pmap->pm_type) {
1174	case PT_X86:
1175		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1176		break;
1177	case PT_EPT:
1178		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1179		break;
1180	default:
1181		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1182	}
1183
1184	return (mask);
1185}
1186
1187static __inline boolean_t
1188pmap_ps_enabled(pmap_t pmap)
1189{
1190
1191	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1192}
1193
1194static void
1195pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1196{
1197
1198	switch (pmap->pm_type) {
1199	case PT_X86:
1200		break;
1201	case PT_EPT:
1202		/*
1203		 * XXX
1204		 * This is a little bogus since the generation number is
1205		 * supposed to be bumped up when a region of the address
1206		 * space is invalidated in the page tables.
1207		 *
1208		 * In this case the old PDE entry is valid but yet we want
1209		 * to make sure that any mappings using the old entry are
1210		 * invalidated in the TLB.
1211		 *
1212		 * The reason this works as expected is because we rendezvous
1213		 * "all" host cpus and force any vcpu context to exit as a
1214		 * side-effect.
1215		 */
1216		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1217		break;
1218	default:
1219		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1220	}
1221	pde_store(pde, newpde);
1222}
1223
1224/*
1225 * After changing the page size for the specified virtual address in the page
1226 * table, flush the corresponding entries from the processor's TLB.  Only the
1227 * calling processor's TLB is affected.
1228 *
1229 * The calling thread must be pinned to a processor.
1230 */
1231static void
1232pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1233{
1234	pt_entry_t PG_G;
1235
1236	if (pmap->pm_type == PT_EPT)
1237		return;
1238
1239	KASSERT(pmap->pm_type == PT_X86,
1240	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1241
1242	PG_G = pmap_global_bit(pmap);
1243
1244	if ((newpde & PG_PS) == 0)
1245		/* Demotion: flush a specific 2MB page mapping. */
1246		invlpg(va);
1247	else if ((newpde & PG_G) == 0)
1248		/*
1249		 * Promotion: flush every 4KB page mapping from the TLB
1250		 * because there are too many to flush individually.
1251		 */
1252		invltlb();
1253	else {
1254		/*
1255		 * Promotion: flush every 4KB page mapping from the TLB,
1256		 * including any global (PG_G) mappings.
1257		 */
1258		invltlb_globpcid();
1259	}
1260}
1261#ifdef SMP
1262
1263static void
1264pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1265{
1266	struct invpcid_descr d;
1267	uint64_t cr3;
1268
1269	if (invpcid_works) {
1270		d.pcid = pmap->pm_pcid;
1271		d.pad = 0;
1272		d.addr = va;
1273		invpcid(&d, INVPCID_ADDR);
1274		return;
1275	}
1276
1277	cr3 = rcr3();
1278	critical_enter();
1279	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1280	invlpg(va);
1281	load_cr3(cr3 | CR3_PCID_SAVE);
1282	critical_exit();
1283}
1284
1285/*
1286 * For SMP, these functions have to use the IPI mechanism for coherence.
1287 *
1288 * N.B.: Before calling any of the following TLB invalidation functions,
1289 * the calling processor must ensure that all stores updating a non-
1290 * kernel page table are globally performed.  Otherwise, another
1291 * processor could cache an old, pre-update entry without being
1292 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1293 * active on another processor after its pm_active field is checked by
1294 * one of the following functions but before a store updating the page
1295 * table is globally performed. (2) The pmap becomes active on another
1296 * processor before its pm_active field is checked but due to
1297 * speculative loads one of the following functions stills reads the
1298 * pmap as inactive on the other processor.
1299 *
1300 * The kernel page table is exempt because its pm_active field is
1301 * immutable.  The kernel page table is always active on every
1302 * processor.
1303 */
1304
1305/*
1306 * Interrupt the cpus that are executing in the guest context.
1307 * This will force the vcpu to exit and the cached EPT mappings
1308 * will be invalidated by the host before the next vmresume.
1309 */
1310static __inline void
1311pmap_invalidate_ept(pmap_t pmap)
1312{
1313	int ipinum;
1314
1315	sched_pin();
1316	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1317	    ("pmap_invalidate_ept: absurd pm_active"));
1318
1319	/*
1320	 * The TLB mappings associated with a vcpu context are not
1321	 * flushed each time a different vcpu is chosen to execute.
1322	 *
1323	 * This is in contrast with a process's vtop mappings that
1324	 * are flushed from the TLB on each context switch.
1325	 *
1326	 * Therefore we need to do more than just a TLB shootdown on
1327	 * the active cpus in 'pmap->pm_active'. To do this we keep
1328	 * track of the number of invalidations performed on this pmap.
1329	 *
1330	 * Each vcpu keeps a cache of this counter and compares it
1331	 * just before a vmresume. If the counter is out-of-date an
1332	 * invept will be done to flush stale mappings from the TLB.
1333	 */
1334	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1335
1336	/*
1337	 * Force the vcpu to exit and trap back into the hypervisor.
1338	 */
1339	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1340	ipi_selected(pmap->pm_active, ipinum);
1341	sched_unpin();
1342}
1343
1344void
1345pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1346{
1347	cpuset_t other_cpus;
1348	u_int cpuid;
1349
1350	if (pmap->pm_type == PT_EPT) {
1351		pmap_invalidate_ept(pmap);
1352		return;
1353	}
1354
1355	KASSERT(pmap->pm_type == PT_X86,
1356	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1357
1358	sched_pin();
1359	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1360		if (!pmap_pcid_enabled) {
1361			invlpg(va);
1362		} else {
1363			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1364				if (pmap == PCPU_GET(curpmap))
1365					invlpg(va);
1366				else
1367					pmap_invalidate_page_pcid(pmap, va);
1368			} else {
1369				invltlb_globpcid();
1370			}
1371		}
1372		smp_invlpg(pmap, va);
1373	} else {
1374		cpuid = PCPU_GET(cpuid);
1375		other_cpus = all_cpus;
1376		CPU_CLR(cpuid, &other_cpus);
1377		if (CPU_ISSET(cpuid, &pmap->pm_active))
1378			invlpg(va);
1379		else if (pmap_pcid_enabled) {
1380			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1381				pmap_invalidate_page_pcid(pmap, va);
1382			else
1383				invltlb_globpcid();
1384		}
1385		if (pmap_pcid_enabled)
1386			CPU_AND(&other_cpus, &pmap->pm_save);
1387		else
1388			CPU_AND(&other_cpus, &pmap->pm_active);
1389		if (!CPU_EMPTY(&other_cpus))
1390			smp_masked_invlpg(other_cpus, pmap, va);
1391	}
1392	sched_unpin();
1393}
1394
1395static void
1396pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1397{
1398	struct invpcid_descr d;
1399	uint64_t cr3;
1400	vm_offset_t addr;
1401
1402	if (invpcid_works) {
1403		d.pcid = pmap->pm_pcid;
1404		d.pad = 0;
1405		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1406			d.addr = addr;
1407			invpcid(&d, INVPCID_ADDR);
1408		}
1409		return;
1410	}
1411
1412	cr3 = rcr3();
1413	critical_enter();
1414	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1415	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1416		invlpg(addr);
1417	load_cr3(cr3 | CR3_PCID_SAVE);
1418	critical_exit();
1419}
1420
1421void
1422pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1423{
1424	cpuset_t other_cpus;
1425	vm_offset_t addr;
1426	u_int cpuid;
1427
1428	if (pmap->pm_type == PT_EPT) {
1429		pmap_invalidate_ept(pmap);
1430		return;
1431	}
1432
1433	KASSERT(pmap->pm_type == PT_X86,
1434	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1435
1436	sched_pin();
1437	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1438		if (!pmap_pcid_enabled) {
1439			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1440				invlpg(addr);
1441		} else {
1442			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1443				if (pmap == PCPU_GET(curpmap)) {
1444					for (addr = sva; addr < eva;
1445					    addr += PAGE_SIZE)
1446						invlpg(addr);
1447				} else {
1448					pmap_invalidate_range_pcid(pmap,
1449					    sva, eva);
1450				}
1451			} else {
1452				invltlb_globpcid();
1453			}
1454		}
1455		smp_invlpg_range(pmap, sva, eva);
1456	} else {
1457		cpuid = PCPU_GET(cpuid);
1458		other_cpus = all_cpus;
1459		CPU_CLR(cpuid, &other_cpus);
1460		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1461			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1462				invlpg(addr);
1463		} else if (pmap_pcid_enabled) {
1464			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1465				pmap_invalidate_range_pcid(pmap, sva, eva);
1466			else
1467				invltlb_globpcid();
1468		}
1469		if (pmap_pcid_enabled)
1470			CPU_AND(&other_cpus, &pmap->pm_save);
1471		else
1472			CPU_AND(&other_cpus, &pmap->pm_active);
1473		if (!CPU_EMPTY(&other_cpus))
1474			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1475	}
1476	sched_unpin();
1477}
1478
1479void
1480pmap_invalidate_all(pmap_t pmap)
1481{
1482	cpuset_t other_cpus;
1483	struct invpcid_descr d;
1484	uint64_t cr3;
1485	u_int cpuid;
1486
1487	if (pmap->pm_type == PT_EPT) {
1488		pmap_invalidate_ept(pmap);
1489		return;
1490	}
1491
1492	KASSERT(pmap->pm_type == PT_X86,
1493	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1494
1495	sched_pin();
1496	cpuid = PCPU_GET(cpuid);
1497	if (pmap == kernel_pmap ||
1498	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1499	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1500		if (invpcid_works) {
1501			bzero(&d, sizeof(d));
1502			invpcid(&d, INVPCID_CTXGLOB);
1503		} else {
1504			invltlb_globpcid();
1505		}
1506		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1507			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1508		smp_invltlb(pmap);
1509	} else {
1510		other_cpus = all_cpus;
1511		CPU_CLR(cpuid, &other_cpus);
1512
1513		/*
1514		 * This logic is duplicated in the Xinvltlb shootdown
1515		 * IPI handler.
1516		 */
1517		if (pmap_pcid_enabled) {
1518			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1519				if (invpcid_works) {
1520					d.pcid = pmap->pm_pcid;
1521					d.pad = 0;
1522					d.addr = 0;
1523					invpcid(&d, INVPCID_CTX);
1524				} else {
1525					cr3 = rcr3();
1526					critical_enter();
1527
1528					/*
1529					 * Bit 63 is clear, pcid TLB
1530					 * entries are invalidated.
1531					 */
1532					load_cr3(pmap->pm_cr3);
1533					load_cr3(cr3 | CR3_PCID_SAVE);
1534					critical_exit();
1535				}
1536			} else {
1537				invltlb_globpcid();
1538			}
1539		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1540			invltlb();
1541		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1542			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1543		if (pmap_pcid_enabled)
1544			CPU_AND(&other_cpus, &pmap->pm_save);
1545		else
1546			CPU_AND(&other_cpus, &pmap->pm_active);
1547		if (!CPU_EMPTY(&other_cpus))
1548			smp_masked_invltlb(other_cpus, pmap);
1549	}
1550	sched_unpin();
1551}
1552
1553void
1554pmap_invalidate_cache(void)
1555{
1556
1557	sched_pin();
1558	wbinvd();
1559	smp_cache_flush();
1560	sched_unpin();
1561}
1562
1563struct pde_action {
1564	cpuset_t invalidate;	/* processors that invalidate their TLB */
1565	pmap_t pmap;
1566	vm_offset_t va;
1567	pd_entry_t *pde;
1568	pd_entry_t newpde;
1569	u_int store;		/* processor that updates the PDE */
1570};
1571
1572static void
1573pmap_update_pde_action(void *arg)
1574{
1575	struct pde_action *act = arg;
1576
1577	if (act->store == PCPU_GET(cpuid))
1578		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1579}
1580
1581static void
1582pmap_update_pde_teardown(void *arg)
1583{
1584	struct pde_action *act = arg;
1585
1586	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1587		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1588}
1589
1590/*
1591 * Change the page size for the specified virtual address in a way that
1592 * prevents any possibility of the TLB ever having two entries that map the
1593 * same virtual address using different page sizes.  This is the recommended
1594 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1595 * machine check exception for a TLB state that is improperly diagnosed as a
1596 * hardware error.
1597 */
1598static void
1599pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1600{
1601	struct pde_action act;
1602	cpuset_t active, other_cpus;
1603	u_int cpuid;
1604
1605	sched_pin();
1606	cpuid = PCPU_GET(cpuid);
1607	other_cpus = all_cpus;
1608	CPU_CLR(cpuid, &other_cpus);
1609	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
1610		active = all_cpus;
1611	else {
1612		active = pmap->pm_active;
1613		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1614	}
1615	if (CPU_OVERLAP(&active, &other_cpus)) {
1616		act.store = cpuid;
1617		act.invalidate = active;
1618		act.va = va;
1619		act.pmap = pmap;
1620		act.pde = pde;
1621		act.newpde = newpde;
1622		CPU_SET(cpuid, &active);
1623		smp_rendezvous_cpus(active,
1624		    smp_no_rendevous_barrier, pmap_update_pde_action,
1625		    pmap_update_pde_teardown, &act);
1626	} else {
1627		pmap_update_pde_store(pmap, pde, newpde);
1628		if (CPU_ISSET(cpuid, &active))
1629			pmap_update_pde_invalidate(pmap, va, newpde);
1630	}
1631	sched_unpin();
1632}
1633#else /* !SMP */
1634/*
1635 * Normal, non-SMP, invalidation functions.
1636 * We inline these within pmap.c for speed.
1637 */
1638PMAP_INLINE void
1639pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1640{
1641
1642	switch (pmap->pm_type) {
1643	case PT_X86:
1644		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1645			invlpg(va);
1646		break;
1647	case PT_EPT:
1648		pmap->pm_eptgen++;
1649		break;
1650	default:
1651		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1652	}
1653}
1654
1655PMAP_INLINE void
1656pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1657{
1658	vm_offset_t addr;
1659
1660	switch (pmap->pm_type) {
1661	case PT_X86:
1662		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1663			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1664				invlpg(addr);
1665		break;
1666	case PT_EPT:
1667		pmap->pm_eptgen++;
1668		break;
1669	default:
1670		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1671	}
1672}
1673
1674PMAP_INLINE void
1675pmap_invalidate_all(pmap_t pmap)
1676{
1677
1678	switch (pmap->pm_type) {
1679	case PT_X86:
1680		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1681			invltlb();
1682		break;
1683	case PT_EPT:
1684		pmap->pm_eptgen++;
1685		break;
1686	default:
1687		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1688	}
1689}
1690
1691PMAP_INLINE void
1692pmap_invalidate_cache(void)
1693{
1694
1695	wbinvd();
1696}
1697
1698static void
1699pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1700{
1701
1702	pmap_update_pde_store(pmap, pde, newpde);
1703	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1704		pmap_update_pde_invalidate(pmap, va, newpde);
1705	else
1706		CPU_ZERO(&pmap->pm_save);
1707}
1708#endif /* !SMP */
1709
1710#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1711
1712void
1713pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1714{
1715
1716	KASSERT((sva & PAGE_MASK) == 0,
1717	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1718	KASSERT((eva & PAGE_MASK) == 0,
1719	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1720
1721	if (cpu_feature & CPUID_SS)
1722		; /* If "Self Snoop" is supported, do nothing. */
1723	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1724	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1725
1726		/*
1727		 * XXX: Some CPUs fault, hang, or trash the local APIC
1728		 * registers if we use CLFLUSH on the local APIC
1729		 * range.  The local APIC is always uncached, so we
1730		 * don't need to flush for that range anyway.
1731		 */
1732		if (pmap_kextract(sva) == lapic_paddr)
1733			return;
1734
1735		/*
1736		 * Otherwise, do per-cache line flush.  Use the mfence
1737		 * instruction to insure that previous stores are
1738		 * included in the write-back.  The processor
1739		 * propagates flush to other processors in the cache
1740		 * coherence domain.
1741		 */
1742		mfence();
1743		for (; sva < eva; sva += cpu_clflush_line_size)
1744			clflush(sva);
1745		mfence();
1746	} else {
1747
1748		/*
1749		 * No targeted cache flush methods are supported by CPU,
1750		 * or the supplied range is bigger than 2MB.
1751		 * Globally invalidate cache.
1752		 */
1753		pmap_invalidate_cache();
1754	}
1755}
1756
1757/*
1758 * Remove the specified set of pages from the data and instruction caches.
1759 *
1760 * In contrast to pmap_invalidate_cache_range(), this function does not
1761 * rely on the CPU's self-snoop feature, because it is intended for use
1762 * when moving pages into a different cache domain.
1763 */
1764void
1765pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1766{
1767	vm_offset_t daddr, eva;
1768	int i;
1769
1770	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1771	    (cpu_feature & CPUID_CLFSH) == 0)
1772		pmap_invalidate_cache();
1773	else {
1774		mfence();
1775		for (i = 0; i < count; i++) {
1776			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1777			eva = daddr + PAGE_SIZE;
1778			for (; daddr < eva; daddr += cpu_clflush_line_size)
1779				clflush(daddr);
1780		}
1781		mfence();
1782	}
1783}
1784
1785/*
1786 *	Routine:	pmap_extract
1787 *	Function:
1788 *		Extract the physical page address associated
1789 *		with the given map/virtual_address pair.
1790 */
1791vm_paddr_t
1792pmap_extract(pmap_t pmap, vm_offset_t va)
1793{
1794	pdp_entry_t *pdpe;
1795	pd_entry_t *pde;
1796	pt_entry_t *pte, PG_V;
1797	vm_paddr_t pa;
1798
1799	pa = 0;
1800	PG_V = pmap_valid_bit(pmap);
1801	PMAP_LOCK(pmap);
1802	pdpe = pmap_pdpe(pmap, va);
1803	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1804		if ((*pdpe & PG_PS) != 0)
1805			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1806		else {
1807			pde = pmap_pdpe_to_pde(pdpe, va);
1808			if ((*pde & PG_V) != 0) {
1809				if ((*pde & PG_PS) != 0) {
1810					pa = (*pde & PG_PS_FRAME) |
1811					    (va & PDRMASK);
1812				} else {
1813					pte = pmap_pde_to_pte(pde, va);
1814					pa = (*pte & PG_FRAME) |
1815					    (va & PAGE_MASK);
1816				}
1817			}
1818		}
1819	}
1820	PMAP_UNLOCK(pmap);
1821	return (pa);
1822}
1823
1824/*
1825 *	Routine:	pmap_extract_and_hold
1826 *	Function:
1827 *		Atomically extract and hold the physical page
1828 *		with the given pmap and virtual address pair
1829 *		if that mapping permits the given protection.
1830 */
1831vm_page_t
1832pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1833{
1834	pd_entry_t pde, *pdep;
1835	pt_entry_t pte, PG_RW, PG_V;
1836	vm_paddr_t pa;
1837	vm_page_t m;
1838
1839	pa = 0;
1840	m = NULL;
1841	PG_RW = pmap_rw_bit(pmap);
1842	PG_V = pmap_valid_bit(pmap);
1843	PMAP_LOCK(pmap);
1844retry:
1845	pdep = pmap_pde(pmap, va);
1846	if (pdep != NULL && (pde = *pdep)) {
1847		if (pde & PG_PS) {
1848			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1849				if (vm_page_pa_tryrelock(pmap, (pde &
1850				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1851					goto retry;
1852				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1853				    (va & PDRMASK));
1854				vm_page_hold(m);
1855			}
1856		} else {
1857			pte = *pmap_pde_to_pte(pdep, va);
1858			if ((pte & PG_V) &&
1859			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1860				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1861				    &pa))
1862					goto retry;
1863				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1864				vm_page_hold(m);
1865			}
1866		}
1867	}
1868	PA_UNLOCK_COND(pa);
1869	PMAP_UNLOCK(pmap);
1870	return (m);
1871}
1872
1873vm_paddr_t
1874pmap_kextract(vm_offset_t va)
1875{
1876	pd_entry_t pde;
1877	vm_paddr_t pa;
1878
1879	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1880		pa = DMAP_TO_PHYS(va);
1881	} else {
1882		pde = *vtopde(va);
1883		if (pde & PG_PS) {
1884			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1885		} else {
1886			/*
1887			 * Beware of a concurrent promotion that changes the
1888			 * PDE at this point!  For example, vtopte() must not
1889			 * be used to access the PTE because it would use the
1890			 * new PDE.  It is, however, safe to use the old PDE
1891			 * because the page table page is preserved by the
1892			 * promotion.
1893			 */
1894			pa = *pmap_pde_to_pte(&pde, va);
1895			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1896		}
1897	}
1898	return (pa);
1899}
1900
1901/***************************************************
1902 * Low level mapping routines.....
1903 ***************************************************/
1904
1905/*
1906 * Add a wired page to the kva.
1907 * Note: not SMP coherent.
1908 */
1909PMAP_INLINE void
1910pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1911{
1912	pt_entry_t *pte;
1913
1914	pte = vtopte(va);
1915	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1916}
1917
1918static __inline void
1919pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1920{
1921	pt_entry_t *pte;
1922	int cache_bits;
1923
1924	pte = vtopte(va);
1925	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1926	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1927}
1928
1929/*
1930 * Remove a page from the kernel pagetables.
1931 * Note: not SMP coherent.
1932 */
1933PMAP_INLINE void
1934pmap_kremove(vm_offset_t va)
1935{
1936	pt_entry_t *pte;
1937
1938	pte = vtopte(va);
1939	pte_clear(pte);
1940}
1941
1942/*
1943 *	Used to map a range of physical addresses into kernel
1944 *	virtual address space.
1945 *
1946 *	The value passed in '*virt' is a suggested virtual address for
1947 *	the mapping. Architectures which can support a direct-mapped
1948 *	physical to virtual region can return the appropriate address
1949 *	within that region, leaving '*virt' unchanged. Other
1950 *	architectures should map the pages starting at '*virt' and
1951 *	update '*virt' with the first usable address after the mapped
1952 *	region.
1953 */
1954vm_offset_t
1955pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1956{
1957	return PHYS_TO_DMAP(start);
1958}
1959
1960
1961/*
1962 * Add a list of wired pages to the kva
1963 * this routine is only used for temporary
1964 * kernel mappings that do not need to have
1965 * page modification or references recorded.
1966 * Note that old mappings are simply written
1967 * over.  The page *must* be wired.
1968 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1969 */
1970void
1971pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1972{
1973	pt_entry_t *endpte, oldpte, pa, *pte;
1974	vm_page_t m;
1975	int cache_bits;
1976
1977	oldpte = 0;
1978	pte = vtopte(sva);
1979	endpte = pte + count;
1980	while (pte < endpte) {
1981		m = *ma++;
1982		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
1983		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
1984		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
1985			oldpte |= *pte;
1986			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
1987		}
1988		pte++;
1989	}
1990	if (__predict_false((oldpte & X86_PG_V) != 0))
1991		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1992		    PAGE_SIZE);
1993}
1994
1995/*
1996 * This routine tears out page mappings from the
1997 * kernel -- it is meant only for temporary mappings.
1998 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1999 */
2000void
2001pmap_qremove(vm_offset_t sva, int count)
2002{
2003	vm_offset_t va;
2004
2005	va = sva;
2006	while (count-- > 0) {
2007		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2008		pmap_kremove(va);
2009		va += PAGE_SIZE;
2010	}
2011	pmap_invalidate_range(kernel_pmap, sva, va);
2012}
2013
2014/***************************************************
2015 * Page table page management routines.....
2016 ***************************************************/
2017static __inline void
2018pmap_free_zero_pages(struct spglist *free)
2019{
2020	vm_page_t m;
2021
2022	while ((m = SLIST_FIRST(free)) != NULL) {
2023		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2024		/* Preserve the page's PG_ZERO setting. */
2025		vm_page_free_toq(m);
2026	}
2027}
2028
2029/*
2030 * Schedule the specified unused page table page to be freed.  Specifically,
2031 * add the page to the specified list of pages that will be released to the
2032 * physical memory manager after the TLB has been updated.
2033 */
2034static __inline void
2035pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2036    boolean_t set_PG_ZERO)
2037{
2038
2039	if (set_PG_ZERO)
2040		m->flags |= PG_ZERO;
2041	else
2042		m->flags &= ~PG_ZERO;
2043	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2044}
2045
2046/*
2047 * Inserts the specified page table page into the specified pmap's collection
2048 * of idle page table pages.  Each of a pmap's page table pages is responsible
2049 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2050 * ordered by this virtual address range.
2051 */
2052static __inline int
2053pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2054{
2055
2056	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2057	return (vm_radix_insert(&pmap->pm_root, mpte));
2058}
2059
2060/*
2061 * Looks for a page table page mapping the specified virtual address in the
2062 * specified pmap's collection of idle page table pages.  Returns NULL if there
2063 * is no page table page corresponding to the specified virtual address.
2064 */
2065static __inline vm_page_t
2066pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2067{
2068
2069	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2070	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2071}
2072
2073/*
2074 * Removes the specified page table page from the specified pmap's collection
2075 * of idle page table pages.  The specified page table page must be a member of
2076 * the pmap's collection.
2077 */
2078static __inline void
2079pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2080{
2081
2082	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2083	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2084}
2085
2086/*
2087 * Decrements a page table page's wire count, which is used to record the
2088 * number of valid page table entries within the page.  If the wire count
2089 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2090 * page table page was unmapped and FALSE otherwise.
2091 */
2092static inline boolean_t
2093pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2094{
2095
2096	--m->wire_count;
2097	if (m->wire_count == 0) {
2098		_pmap_unwire_ptp(pmap, va, m, free);
2099		return (TRUE);
2100	} else
2101		return (FALSE);
2102}
2103
2104static void
2105_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2106{
2107
2108	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2109	/*
2110	 * unmap the page table page
2111	 */
2112	if (m->pindex >= (NUPDE + NUPDPE)) {
2113		/* PDP page */
2114		pml4_entry_t *pml4;
2115		pml4 = pmap_pml4e(pmap, va);
2116		*pml4 = 0;
2117	} else if (m->pindex >= NUPDE) {
2118		/* PD page */
2119		pdp_entry_t *pdp;
2120		pdp = pmap_pdpe(pmap, va);
2121		*pdp = 0;
2122	} else {
2123		/* PTE page */
2124		pd_entry_t *pd;
2125		pd = pmap_pde(pmap, va);
2126		*pd = 0;
2127	}
2128	pmap_resident_count_dec(pmap, 1);
2129	if (m->pindex < NUPDE) {
2130		/* We just released a PT, unhold the matching PD */
2131		vm_page_t pdpg;
2132
2133		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2134		pmap_unwire_ptp(pmap, va, pdpg, free);
2135	}
2136	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2137		/* We just released a PD, unhold the matching PDP */
2138		vm_page_t pdppg;
2139
2140		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2141		pmap_unwire_ptp(pmap, va, pdppg, free);
2142	}
2143
2144	/*
2145	 * This is a release store so that the ordinary store unmapping
2146	 * the page table page is globally performed before TLB shoot-
2147	 * down is begun.
2148	 */
2149	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2150
2151	/*
2152	 * Put page on a list so that it is released after
2153	 * *ALL* TLB shootdown is done
2154	 */
2155	pmap_add_delayed_free_list(m, free, TRUE);
2156}
2157
2158/*
2159 * After removing a page table entry, this routine is used to
2160 * conditionally free the page, and manage the hold/wire counts.
2161 */
2162static int
2163pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2164    struct spglist *free)
2165{
2166	vm_page_t mpte;
2167
2168	if (va >= VM_MAXUSER_ADDRESS)
2169		return (0);
2170	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2171	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2172	return (pmap_unwire_ptp(pmap, va, mpte, free));
2173}
2174
2175void
2176pmap_pinit0(pmap_t pmap)
2177{
2178
2179	PMAP_LOCK_INIT(pmap);
2180	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2181	pmap->pm_cr3 = KPML4phys;
2182	pmap->pm_root.rt_root = 0;
2183	CPU_ZERO(&pmap->pm_active);
2184	CPU_ZERO(&pmap->pm_save);
2185	PCPU_SET(curpmap, pmap);
2186	TAILQ_INIT(&pmap->pm_pvchunk);
2187	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2188	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2189	pmap->pm_flags = pmap_flags;
2190}
2191
2192/*
2193 * Initialize a preallocated and zeroed pmap structure,
2194 * such as one in a vmspace structure.
2195 */
2196int
2197pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2198{
2199	vm_page_t pml4pg;
2200	vm_paddr_t pml4phys;
2201	int i;
2202
2203	/*
2204	 * allocate the page directory page
2205	 */
2206	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2207	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2208		VM_WAIT;
2209
2210	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2211	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2212	pmap->pm_pcid = -1;
2213	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2214
2215	if ((pml4pg->flags & PG_ZERO) == 0)
2216		pagezero(pmap->pm_pml4);
2217
2218	/*
2219	 * Do not install the host kernel mappings in the nested page
2220	 * tables. These mappings are meaningless in the guest physical
2221	 * address space.
2222	 */
2223	if ((pmap->pm_type = pm_type) == PT_X86) {
2224		pmap->pm_cr3 = pml4phys;
2225
2226		/* Wire in kernel global address entries. */
2227		for (i = 0; i < NKPML4E; i++) {
2228			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2229			    X86_PG_RW | X86_PG_V | PG_U;
2230		}
2231		for (i = 0; i < ndmpdpphys; i++) {
2232			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2233			    X86_PG_RW | X86_PG_V | PG_U;
2234		}
2235
2236		/* install self-referential address mapping entry(s) */
2237		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2238		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2239
2240		if (pmap_pcid_enabled) {
2241			pmap->pm_pcid = alloc_unr(&pcid_unr);
2242			if (pmap->pm_pcid != -1)
2243				pmap->pm_cr3 |= pmap->pm_pcid;
2244		}
2245	}
2246
2247	pmap->pm_root.rt_root = 0;
2248	CPU_ZERO(&pmap->pm_active);
2249	TAILQ_INIT(&pmap->pm_pvchunk);
2250	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2251	pmap->pm_flags = flags;
2252	pmap->pm_eptgen = 0;
2253	CPU_ZERO(&pmap->pm_save);
2254
2255	return (1);
2256}
2257
2258int
2259pmap_pinit(pmap_t pmap)
2260{
2261
2262	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2263}
2264
2265/*
2266 * This routine is called if the desired page table page does not exist.
2267 *
2268 * If page table page allocation fails, this routine may sleep before
2269 * returning NULL.  It sleeps only if a lock pointer was given.
2270 *
2271 * Note: If a page allocation fails at page table level two or three,
2272 * one or two pages may be held during the wait, only to be released
2273 * afterwards.  This conservative approach is easily argued to avoid
2274 * race conditions.
2275 */
2276static vm_page_t
2277_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2278{
2279	vm_page_t m, pdppg, pdpg;
2280	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2281
2282	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2283
2284	PG_A = pmap_accessed_bit(pmap);
2285	PG_M = pmap_modified_bit(pmap);
2286	PG_V = pmap_valid_bit(pmap);
2287	PG_RW = pmap_rw_bit(pmap);
2288
2289	/*
2290	 * Allocate a page table page.
2291	 */
2292	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2293	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2294		if (lockp != NULL) {
2295			RELEASE_PV_LIST_LOCK(lockp);
2296			PMAP_UNLOCK(pmap);
2297			rw_runlock(&pvh_global_lock);
2298			VM_WAIT;
2299			rw_rlock(&pvh_global_lock);
2300			PMAP_LOCK(pmap);
2301		}
2302
2303		/*
2304		 * Indicate the need to retry.  While waiting, the page table
2305		 * page may have been allocated.
2306		 */
2307		return (NULL);
2308	}
2309	if ((m->flags & PG_ZERO) == 0)
2310		pmap_zero_page(m);
2311
2312	/*
2313	 * Map the pagetable page into the process address space, if
2314	 * it isn't already there.
2315	 */
2316
2317	if (ptepindex >= (NUPDE + NUPDPE)) {
2318		pml4_entry_t *pml4;
2319		vm_pindex_t pml4index;
2320
2321		/* Wire up a new PDPE page */
2322		pml4index = ptepindex - (NUPDE + NUPDPE);
2323		pml4 = &pmap->pm_pml4[pml4index];
2324		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2325
2326	} else if (ptepindex >= NUPDE) {
2327		vm_pindex_t pml4index;
2328		vm_pindex_t pdpindex;
2329		pml4_entry_t *pml4;
2330		pdp_entry_t *pdp;
2331
2332		/* Wire up a new PDE page */
2333		pdpindex = ptepindex - NUPDE;
2334		pml4index = pdpindex >> NPML4EPGSHIFT;
2335
2336		pml4 = &pmap->pm_pml4[pml4index];
2337		if ((*pml4 & PG_V) == 0) {
2338			/* Have to allocate a new pdp, recurse */
2339			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2340			    lockp) == NULL) {
2341				--m->wire_count;
2342				atomic_subtract_int(&cnt.v_wire_count, 1);
2343				vm_page_free_zero(m);
2344				return (NULL);
2345			}
2346		} else {
2347			/* Add reference to pdp page */
2348			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2349			pdppg->wire_count++;
2350		}
2351		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2352
2353		/* Now find the pdp page */
2354		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2355		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2356
2357	} else {
2358		vm_pindex_t pml4index;
2359		vm_pindex_t pdpindex;
2360		pml4_entry_t *pml4;
2361		pdp_entry_t *pdp;
2362		pd_entry_t *pd;
2363
2364		/* Wire up a new PTE page */
2365		pdpindex = ptepindex >> NPDPEPGSHIFT;
2366		pml4index = pdpindex >> NPML4EPGSHIFT;
2367
2368		/* First, find the pdp and check that its valid. */
2369		pml4 = &pmap->pm_pml4[pml4index];
2370		if ((*pml4 & PG_V) == 0) {
2371			/* Have to allocate a new pd, recurse */
2372			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2373			    lockp) == NULL) {
2374				--m->wire_count;
2375				atomic_subtract_int(&cnt.v_wire_count, 1);
2376				vm_page_free_zero(m);
2377				return (NULL);
2378			}
2379			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2380			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2381		} else {
2382			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2383			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2384			if ((*pdp & PG_V) == 0) {
2385				/* Have to allocate a new pd, recurse */
2386				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2387				    lockp) == NULL) {
2388					--m->wire_count;
2389					atomic_subtract_int(&cnt.v_wire_count,
2390					    1);
2391					vm_page_free_zero(m);
2392					return (NULL);
2393				}
2394			} else {
2395				/* Add reference to the pd page */
2396				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2397				pdpg->wire_count++;
2398			}
2399		}
2400		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2401
2402		/* Now we know where the page directory page is */
2403		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2404		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2405	}
2406
2407	pmap_resident_count_inc(pmap, 1);
2408
2409	return (m);
2410}
2411
2412static vm_page_t
2413pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2414{
2415	vm_pindex_t pdpindex, ptepindex;
2416	pdp_entry_t *pdpe, PG_V;
2417	vm_page_t pdpg;
2418
2419	PG_V = pmap_valid_bit(pmap);
2420
2421retry:
2422	pdpe = pmap_pdpe(pmap, va);
2423	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2424		/* Add a reference to the pd page. */
2425		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2426		pdpg->wire_count++;
2427	} else {
2428		/* Allocate a pd page. */
2429		ptepindex = pmap_pde_pindex(va);
2430		pdpindex = ptepindex >> NPDPEPGSHIFT;
2431		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2432		if (pdpg == NULL && lockp != NULL)
2433			goto retry;
2434	}
2435	return (pdpg);
2436}
2437
2438static vm_page_t
2439pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2440{
2441	vm_pindex_t ptepindex;
2442	pd_entry_t *pd, PG_V;
2443	vm_page_t m;
2444
2445	PG_V = pmap_valid_bit(pmap);
2446
2447	/*
2448	 * Calculate pagetable page index
2449	 */
2450	ptepindex = pmap_pde_pindex(va);
2451retry:
2452	/*
2453	 * Get the page directory entry
2454	 */
2455	pd = pmap_pde(pmap, va);
2456
2457	/*
2458	 * This supports switching from a 2MB page to a
2459	 * normal 4K page.
2460	 */
2461	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2462		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2463			/*
2464			 * Invalidation of the 2MB page mapping may have caused
2465			 * the deallocation of the underlying PD page.
2466			 */
2467			pd = NULL;
2468		}
2469	}
2470
2471	/*
2472	 * If the page table page is mapped, we just increment the
2473	 * hold count, and activate it.
2474	 */
2475	if (pd != NULL && (*pd & PG_V) != 0) {
2476		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2477		m->wire_count++;
2478	} else {
2479		/*
2480		 * Here if the pte page isn't mapped, or if it has been
2481		 * deallocated.
2482		 */
2483		m = _pmap_allocpte(pmap, ptepindex, lockp);
2484		if (m == NULL && lockp != NULL)
2485			goto retry;
2486	}
2487	return (m);
2488}
2489
2490
2491/***************************************************
2492 * Pmap allocation/deallocation routines.
2493 ***************************************************/
2494
2495/*
2496 * Release any resources held by the given physical map.
2497 * Called when a pmap initialized by pmap_pinit is being released.
2498 * Should only be called if the map contains no valid mappings.
2499 */
2500void
2501pmap_release(pmap_t pmap)
2502{
2503	vm_page_t m;
2504	int i;
2505
2506	KASSERT(pmap->pm_stats.resident_count == 0,
2507	    ("pmap_release: pmap resident count %ld != 0",
2508	    pmap->pm_stats.resident_count));
2509	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2510	    ("pmap_release: pmap has reserved page table page(s)"));
2511
2512	if (pmap_pcid_enabled) {
2513		/*
2514		 * Invalidate any left TLB entries, to allow the reuse
2515		 * of the pcid.
2516		 */
2517		pmap_invalidate_all(pmap);
2518	}
2519
2520	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2521
2522	for (i = 0; i < NKPML4E; i++)	/* KVA */
2523		pmap->pm_pml4[KPML4BASE + i] = 0;
2524	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2525		pmap->pm_pml4[DMPML4I + i] = 0;
2526	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2527
2528	m->wire_count--;
2529	atomic_subtract_int(&cnt.v_wire_count, 1);
2530	vm_page_free_zero(m);
2531	if (pmap->pm_pcid != -1)
2532		free_unr(&pcid_unr, pmap->pm_pcid);
2533}
2534
2535static int
2536kvm_size(SYSCTL_HANDLER_ARGS)
2537{
2538	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2539
2540	return sysctl_handle_long(oidp, &ksize, 0, req);
2541}
2542SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2543    0, 0, kvm_size, "LU", "Size of KVM");
2544
2545static int
2546kvm_free(SYSCTL_HANDLER_ARGS)
2547{
2548	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2549
2550	return sysctl_handle_long(oidp, &kfree, 0, req);
2551}
2552SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2553    0, 0, kvm_free, "LU", "Amount of KVM free");
2554
2555/*
2556 * grow the number of kernel page table entries, if needed
2557 */
2558void
2559pmap_growkernel(vm_offset_t addr)
2560{
2561	vm_paddr_t paddr;
2562	vm_page_t nkpg;
2563	pd_entry_t *pde, newpdir;
2564	pdp_entry_t *pdpe;
2565
2566	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2567
2568	/*
2569	 * Return if "addr" is within the range of kernel page table pages
2570	 * that were preallocated during pmap bootstrap.  Moreover, leave
2571	 * "kernel_vm_end" and the kernel page table as they were.
2572	 *
2573	 * The correctness of this action is based on the following
2574	 * argument: vm_map_findspace() allocates contiguous ranges of the
2575	 * kernel virtual address space.  It calls this function if a range
2576	 * ends after "kernel_vm_end".  If the kernel is mapped between
2577	 * "kernel_vm_end" and "addr", then the range cannot begin at
2578	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2579	 * than the kernel.  Thus, there is no immediate need to allocate
2580	 * any new kernel page table pages between "kernel_vm_end" and
2581	 * "KERNBASE".
2582	 */
2583	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2584		return;
2585
2586	addr = roundup2(addr, NBPDR);
2587	if (addr - 1 >= kernel_map->max_offset)
2588		addr = kernel_map->max_offset;
2589	while (kernel_vm_end < addr) {
2590		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2591		if ((*pdpe & X86_PG_V) == 0) {
2592			/* We need a new PDP entry */
2593			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2594			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2595			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2596			if (nkpg == NULL)
2597				panic("pmap_growkernel: no memory to grow kernel");
2598			if ((nkpg->flags & PG_ZERO) == 0)
2599				pmap_zero_page(nkpg);
2600			paddr = VM_PAGE_TO_PHYS(nkpg);
2601			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2602			    X86_PG_A | X86_PG_M);
2603			continue; /* try again */
2604		}
2605		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2606		if ((*pde & X86_PG_V) != 0) {
2607			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2608			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2609				kernel_vm_end = kernel_map->max_offset;
2610				break;
2611			}
2612			continue;
2613		}
2614
2615		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2616		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2617		    VM_ALLOC_ZERO);
2618		if (nkpg == NULL)
2619			panic("pmap_growkernel: no memory to grow kernel");
2620		if ((nkpg->flags & PG_ZERO) == 0)
2621			pmap_zero_page(nkpg);
2622		paddr = VM_PAGE_TO_PHYS(nkpg);
2623		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2624		pde_store(pde, newpdir);
2625
2626		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2627		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2628			kernel_vm_end = kernel_map->max_offset;
2629			break;
2630		}
2631	}
2632}
2633
2634
2635/***************************************************
2636 * page management routines.
2637 ***************************************************/
2638
2639CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2640CTASSERT(_NPCM == 3);
2641CTASSERT(_NPCPV == 168);
2642
2643static __inline struct pv_chunk *
2644pv_to_chunk(pv_entry_t pv)
2645{
2646
2647	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2648}
2649
2650#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2651
2652#define	PC_FREE0	0xfffffffffffffffful
2653#define	PC_FREE1	0xfffffffffffffffful
2654#define	PC_FREE2	0x000000fffffffffful
2655
2656static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2657
2658#ifdef PV_STATS
2659static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2660
2661SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2662	"Current number of pv entry chunks");
2663SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2664	"Current number of pv entry chunks allocated");
2665SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2666	"Current number of pv entry chunks frees");
2667SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2668	"Number of times tried to get a chunk page but failed.");
2669
2670static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2671static int pv_entry_spare;
2672
2673SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2674	"Current number of pv entry frees");
2675SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2676	"Current number of pv entry allocs");
2677SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2678	"Current number of pv entries");
2679SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2680	"Current number of spare pv entries");
2681#endif
2682
2683/*
2684 * We are in a serious low memory condition.  Resort to
2685 * drastic measures to free some pages so we can allocate
2686 * another pv entry chunk.
2687 *
2688 * Returns NULL if PV entries were reclaimed from the specified pmap.
2689 *
2690 * We do not, however, unmap 2mpages because subsequent accesses will
2691 * allocate per-page pv entries until repromotion occurs, thereby
2692 * exacerbating the shortage of free pv entries.
2693 */
2694static vm_page_t
2695reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2696{
2697	struct pch new_tail;
2698	struct pv_chunk *pc;
2699	struct md_page *pvh;
2700	pd_entry_t *pde;
2701	pmap_t pmap;
2702	pt_entry_t *pte, tpte;
2703	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2704	pv_entry_t pv;
2705	vm_offset_t va;
2706	vm_page_t m, m_pc;
2707	struct spglist free;
2708	uint64_t inuse;
2709	int bit, field, freed;
2710
2711	rw_assert(&pvh_global_lock, RA_LOCKED);
2712	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2713	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2714	pmap = NULL;
2715	m_pc = NULL;
2716	PG_G = PG_A = PG_M = PG_RW = 0;
2717	SLIST_INIT(&free);
2718	TAILQ_INIT(&new_tail);
2719	mtx_lock(&pv_chunks_mutex);
2720	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2721		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2722		mtx_unlock(&pv_chunks_mutex);
2723		if (pmap != pc->pc_pmap) {
2724			if (pmap != NULL) {
2725				pmap_invalidate_all(pmap);
2726				if (pmap != locked_pmap)
2727					PMAP_UNLOCK(pmap);
2728			}
2729			pmap = pc->pc_pmap;
2730			/* Avoid deadlock and lock recursion. */
2731			if (pmap > locked_pmap) {
2732				RELEASE_PV_LIST_LOCK(lockp);
2733				PMAP_LOCK(pmap);
2734			} else if (pmap != locked_pmap &&
2735			    !PMAP_TRYLOCK(pmap)) {
2736				pmap = NULL;
2737				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2738				mtx_lock(&pv_chunks_mutex);
2739				continue;
2740			}
2741			PG_G = pmap_global_bit(pmap);
2742			PG_A = pmap_accessed_bit(pmap);
2743			PG_M = pmap_modified_bit(pmap);
2744			PG_RW = pmap_rw_bit(pmap);
2745		}
2746
2747		/*
2748		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2749		 */
2750		freed = 0;
2751		for (field = 0; field < _NPCM; field++) {
2752			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2753			    inuse != 0; inuse &= ~(1UL << bit)) {
2754				bit = bsfq(inuse);
2755				pv = &pc->pc_pventry[field * 64 + bit];
2756				va = pv->pv_va;
2757				pde = pmap_pde(pmap, va);
2758				if ((*pde & PG_PS) != 0)
2759					continue;
2760				pte = pmap_pde_to_pte(pde, va);
2761				if ((*pte & PG_W) != 0)
2762					continue;
2763				tpte = pte_load_clear(pte);
2764				if ((tpte & PG_G) != 0)
2765					pmap_invalidate_page(pmap, va);
2766				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2767				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2768					vm_page_dirty(m);
2769				if ((tpte & PG_A) != 0)
2770					vm_page_aflag_set(m, PGA_REFERENCED);
2771				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2772				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2773				m->md.pv_gen++;
2774				if (TAILQ_EMPTY(&m->md.pv_list) &&
2775				    (m->flags & PG_FICTITIOUS) == 0) {
2776					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2777					if (TAILQ_EMPTY(&pvh->pv_list)) {
2778						vm_page_aflag_clear(m,
2779						    PGA_WRITEABLE);
2780					}
2781				}
2782				pc->pc_map[field] |= 1UL << bit;
2783				pmap_unuse_pt(pmap, va, *pde, &free);
2784				freed++;
2785			}
2786		}
2787		if (freed == 0) {
2788			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2789			mtx_lock(&pv_chunks_mutex);
2790			continue;
2791		}
2792		/* Every freed mapping is for a 4 KB page. */
2793		pmap_resident_count_dec(pmap, freed);
2794		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2795		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2796		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2797		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2798		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2799		    pc->pc_map[2] == PC_FREE2) {
2800			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2801			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2802			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2803			/* Entire chunk is free; return it. */
2804			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2805			dump_drop_page(m_pc->phys_addr);
2806			mtx_lock(&pv_chunks_mutex);
2807			break;
2808		}
2809		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2810		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2811		mtx_lock(&pv_chunks_mutex);
2812		/* One freed pv entry in locked_pmap is sufficient. */
2813		if (pmap == locked_pmap)
2814			break;
2815	}
2816	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2817	mtx_unlock(&pv_chunks_mutex);
2818	if (pmap != NULL) {
2819		pmap_invalidate_all(pmap);
2820		if (pmap != locked_pmap)
2821			PMAP_UNLOCK(pmap);
2822	}
2823	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2824		m_pc = SLIST_FIRST(&free);
2825		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2826		/* Recycle a freed page table page. */
2827		m_pc->wire_count = 1;
2828		atomic_add_int(&cnt.v_wire_count, 1);
2829	}
2830	pmap_free_zero_pages(&free);
2831	return (m_pc);
2832}
2833
2834/*
2835 * free the pv_entry back to the free list
2836 */
2837static void
2838free_pv_entry(pmap_t pmap, pv_entry_t pv)
2839{
2840	struct pv_chunk *pc;
2841	int idx, field, bit;
2842
2843	rw_assert(&pvh_global_lock, RA_LOCKED);
2844	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2845	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2846	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2847	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2848	pc = pv_to_chunk(pv);
2849	idx = pv - &pc->pc_pventry[0];
2850	field = idx / 64;
2851	bit = idx % 64;
2852	pc->pc_map[field] |= 1ul << bit;
2853	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2854	    pc->pc_map[2] != PC_FREE2) {
2855		/* 98% of the time, pc is already at the head of the list. */
2856		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2857			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2858			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2859		}
2860		return;
2861	}
2862	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2863	free_pv_chunk(pc);
2864}
2865
2866static void
2867free_pv_chunk(struct pv_chunk *pc)
2868{
2869	vm_page_t m;
2870
2871	mtx_lock(&pv_chunks_mutex);
2872 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2873	mtx_unlock(&pv_chunks_mutex);
2874	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2875	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2876	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2877	/* entire chunk is free, return it */
2878	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2879	dump_drop_page(m->phys_addr);
2880	vm_page_unwire(m, 0);
2881	vm_page_free(m);
2882}
2883
2884/*
2885 * Returns a new PV entry, allocating a new PV chunk from the system when
2886 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2887 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2888 * returned.
2889 *
2890 * The given PV list lock may be released.
2891 */
2892static pv_entry_t
2893get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2894{
2895	int bit, field;
2896	pv_entry_t pv;
2897	struct pv_chunk *pc;
2898	vm_page_t m;
2899
2900	rw_assert(&pvh_global_lock, RA_LOCKED);
2901	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2902	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2903retry:
2904	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2905	if (pc != NULL) {
2906		for (field = 0; field < _NPCM; field++) {
2907			if (pc->pc_map[field]) {
2908				bit = bsfq(pc->pc_map[field]);
2909				break;
2910			}
2911		}
2912		if (field < _NPCM) {
2913			pv = &pc->pc_pventry[field * 64 + bit];
2914			pc->pc_map[field] &= ~(1ul << bit);
2915			/* If this was the last item, move it to tail */
2916			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2917			    pc->pc_map[2] == 0) {
2918				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2919				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2920				    pc_list);
2921			}
2922			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2923			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2924			return (pv);
2925		}
2926	}
2927	/* No free items, allocate another chunk */
2928	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2929	    VM_ALLOC_WIRED);
2930	if (m == NULL) {
2931		if (lockp == NULL) {
2932			PV_STAT(pc_chunk_tryfail++);
2933			return (NULL);
2934		}
2935		m = reclaim_pv_chunk(pmap, lockp);
2936		if (m == NULL)
2937			goto retry;
2938	}
2939	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2940	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2941	dump_add_page(m->phys_addr);
2942	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2943	pc->pc_pmap = pmap;
2944	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2945	pc->pc_map[1] = PC_FREE1;
2946	pc->pc_map[2] = PC_FREE2;
2947	mtx_lock(&pv_chunks_mutex);
2948	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2949	mtx_unlock(&pv_chunks_mutex);
2950	pv = &pc->pc_pventry[0];
2951	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2952	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2953	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2954	return (pv);
2955}
2956
2957/*
2958 * Returns the number of one bits within the given PV chunk map element.
2959 */
2960static int
2961popcnt_pc_map_elem(uint64_t elem)
2962{
2963	int count;
2964
2965	/*
2966	 * This simple method of counting the one bits performs well because
2967	 * the given element typically contains more zero bits than one bits.
2968	 */
2969	count = 0;
2970	for (; elem != 0; elem &= elem - 1)
2971		count++;
2972	return (count);
2973}
2974
2975/*
2976 * Ensure that the number of spare PV entries in the specified pmap meets or
2977 * exceeds the given count, "needed".
2978 *
2979 * The given PV list lock may be released.
2980 */
2981static void
2982reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2983{
2984	struct pch new_tail;
2985	struct pv_chunk *pc;
2986	int avail, free;
2987	vm_page_t m;
2988
2989	rw_assert(&pvh_global_lock, RA_LOCKED);
2990	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2991	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2992
2993	/*
2994	 * Newly allocated PV chunks must be stored in a private list until
2995	 * the required number of PV chunks have been allocated.  Otherwise,
2996	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2997	 * contrast, these chunks must be added to the pmap upon allocation.
2998	 */
2999	TAILQ_INIT(&new_tail);
3000retry:
3001	avail = 0;
3002	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3003		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
3004			free = popcnt_pc_map_elem(pc->pc_map[0]);
3005			free += popcnt_pc_map_elem(pc->pc_map[1]);
3006			free += popcnt_pc_map_elem(pc->pc_map[2]);
3007		} else {
3008			free = popcntq(pc->pc_map[0]);
3009			free += popcntq(pc->pc_map[1]);
3010			free += popcntq(pc->pc_map[2]);
3011		}
3012		if (free == 0)
3013			break;
3014		avail += free;
3015		if (avail >= needed)
3016			break;
3017	}
3018	for (; avail < needed; avail += _NPCPV) {
3019		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3020		    VM_ALLOC_WIRED);
3021		if (m == NULL) {
3022			m = reclaim_pv_chunk(pmap, lockp);
3023			if (m == NULL)
3024				goto retry;
3025		}
3026		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3027		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3028		dump_add_page(m->phys_addr);
3029		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3030		pc->pc_pmap = pmap;
3031		pc->pc_map[0] = PC_FREE0;
3032		pc->pc_map[1] = PC_FREE1;
3033		pc->pc_map[2] = PC_FREE2;
3034		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3035		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3036		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3037	}
3038	if (!TAILQ_EMPTY(&new_tail)) {
3039		mtx_lock(&pv_chunks_mutex);
3040		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3041		mtx_unlock(&pv_chunks_mutex);
3042	}
3043}
3044
3045/*
3046 * First find and then remove the pv entry for the specified pmap and virtual
3047 * address from the specified pv list.  Returns the pv entry if found and NULL
3048 * otherwise.  This operation can be performed on pv lists for either 4KB or
3049 * 2MB page mappings.
3050 */
3051static __inline pv_entry_t
3052pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3053{
3054	pv_entry_t pv;
3055
3056	rw_assert(&pvh_global_lock, RA_LOCKED);
3057	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3058		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3059			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3060			pvh->pv_gen++;
3061			break;
3062		}
3063	}
3064	return (pv);
3065}
3066
3067/*
3068 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3069 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3070 * entries for each of the 4KB page mappings.
3071 */
3072static void
3073pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3074    struct rwlock **lockp)
3075{
3076	struct md_page *pvh;
3077	struct pv_chunk *pc;
3078	pv_entry_t pv;
3079	vm_offset_t va_last;
3080	vm_page_t m;
3081	int bit, field;
3082
3083	rw_assert(&pvh_global_lock, RA_LOCKED);
3084	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3085	KASSERT((pa & PDRMASK) == 0,
3086	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3087	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3088
3089	/*
3090	 * Transfer the 2mpage's pv entry for this mapping to the first
3091	 * page's pv list.  Once this transfer begins, the pv list lock
3092	 * must not be released until the last pv entry is reinstantiated.
3093	 */
3094	pvh = pa_to_pvh(pa);
3095	va = trunc_2mpage(va);
3096	pv = pmap_pvh_remove(pvh, pmap, va);
3097	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3098	m = PHYS_TO_VM_PAGE(pa);
3099	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3100	m->md.pv_gen++;
3101	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3102	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3103	va_last = va + NBPDR - PAGE_SIZE;
3104	for (;;) {
3105		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3106		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3107		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3108		for (field = 0; field < _NPCM; field++) {
3109			while (pc->pc_map[field]) {
3110				bit = bsfq(pc->pc_map[field]);
3111				pc->pc_map[field] &= ~(1ul << bit);
3112				pv = &pc->pc_pventry[field * 64 + bit];
3113				va += PAGE_SIZE;
3114				pv->pv_va = va;
3115				m++;
3116				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3117			    ("pmap_pv_demote_pde: page %p is not managed", m));
3118				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3119				m->md.pv_gen++;
3120				if (va == va_last)
3121					goto out;
3122			}
3123		}
3124		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3125		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3126	}
3127out:
3128	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3129		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3130		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3131	}
3132	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3133	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3134}
3135
3136/*
3137 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3138 * replace the many pv entries for the 4KB page mappings by a single pv entry
3139 * for the 2MB page mapping.
3140 */
3141static void
3142pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3143    struct rwlock **lockp)
3144{
3145	struct md_page *pvh;
3146	pv_entry_t pv;
3147	vm_offset_t va_last;
3148	vm_page_t m;
3149
3150	rw_assert(&pvh_global_lock, RA_LOCKED);
3151	KASSERT((pa & PDRMASK) == 0,
3152	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3153	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3154
3155	/*
3156	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3157	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3158	 * a transfer avoids the possibility that get_pv_entry() calls
3159	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3160	 * mappings that is being promoted.
3161	 */
3162	m = PHYS_TO_VM_PAGE(pa);
3163	va = trunc_2mpage(va);
3164	pv = pmap_pvh_remove(&m->md, pmap, va);
3165	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3166	pvh = pa_to_pvh(pa);
3167	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3168	pvh->pv_gen++;
3169	/* Free the remaining NPTEPG - 1 pv entries. */
3170	va_last = va + NBPDR - PAGE_SIZE;
3171	do {
3172		m++;
3173		va += PAGE_SIZE;
3174		pmap_pvh_free(&m->md, pmap, va);
3175	} while (va < va_last);
3176}
3177
3178/*
3179 * First find and then destroy the pv entry for the specified pmap and virtual
3180 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3181 * page mappings.
3182 */
3183static void
3184pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3185{
3186	pv_entry_t pv;
3187
3188	pv = pmap_pvh_remove(pvh, pmap, va);
3189	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3190	free_pv_entry(pmap, pv);
3191}
3192
3193/*
3194 * Conditionally create the PV entry for a 4KB page mapping if the required
3195 * memory can be allocated without resorting to reclamation.
3196 */
3197static boolean_t
3198pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3199    struct rwlock **lockp)
3200{
3201	pv_entry_t pv;
3202
3203	rw_assert(&pvh_global_lock, RA_LOCKED);
3204	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3205	/* Pass NULL instead of the lock pointer to disable reclamation. */
3206	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3207		pv->pv_va = va;
3208		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3209		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3210		m->md.pv_gen++;
3211		return (TRUE);
3212	} else
3213		return (FALSE);
3214}
3215
3216/*
3217 * Conditionally create the PV entry for a 2MB page mapping if the required
3218 * memory can be allocated without resorting to reclamation.
3219 */
3220static boolean_t
3221pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3222    struct rwlock **lockp)
3223{
3224	struct md_page *pvh;
3225	pv_entry_t pv;
3226
3227	rw_assert(&pvh_global_lock, RA_LOCKED);
3228	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3229	/* Pass NULL instead of the lock pointer to disable reclamation. */
3230	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3231		pv->pv_va = va;
3232		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3233		pvh = pa_to_pvh(pa);
3234		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3235		pvh->pv_gen++;
3236		return (TRUE);
3237	} else
3238		return (FALSE);
3239}
3240
3241/*
3242 * Fills a page table page with mappings to consecutive physical pages.
3243 */
3244static void
3245pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3246{
3247	pt_entry_t *pte;
3248
3249	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3250		*pte = newpte;
3251		newpte += PAGE_SIZE;
3252	}
3253}
3254
3255/*
3256 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3257 * mapping is invalidated.
3258 */
3259static boolean_t
3260pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3261{
3262	struct rwlock *lock;
3263	boolean_t rv;
3264
3265	lock = NULL;
3266	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3267	if (lock != NULL)
3268		rw_wunlock(lock);
3269	return (rv);
3270}
3271
3272static boolean_t
3273pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3274    struct rwlock **lockp)
3275{
3276	pd_entry_t newpde, oldpde;
3277	pt_entry_t *firstpte, newpte;
3278	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3279	vm_paddr_t mptepa;
3280	vm_page_t mpte;
3281	struct spglist free;
3282	int PG_PTE_CACHE;
3283
3284	PG_G = pmap_global_bit(pmap);
3285	PG_A = pmap_accessed_bit(pmap);
3286	PG_M = pmap_modified_bit(pmap);
3287	PG_RW = pmap_rw_bit(pmap);
3288	PG_V = pmap_valid_bit(pmap);
3289	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3290
3291	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3292	oldpde = *pde;
3293	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3294	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3295	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3296	    NULL)
3297		pmap_remove_pt_page(pmap, mpte);
3298	else {
3299		KASSERT((oldpde & PG_W) == 0,
3300		    ("pmap_demote_pde: page table page for a wired mapping"
3301		    " is missing"));
3302
3303		/*
3304		 * Invalidate the 2MB page mapping and return "failure" if the
3305		 * mapping was never accessed or the allocation of the new
3306		 * page table page fails.  If the 2MB page mapping belongs to
3307		 * the direct map region of the kernel's address space, then
3308		 * the page allocation request specifies the highest possible
3309		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3310		 * normal.  Page table pages are preallocated for every other
3311		 * part of the kernel address space, so the direct map region
3312		 * is the only part of the kernel address space that must be
3313		 * handled here.
3314		 */
3315		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3316		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3317		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3318		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3319			SLIST_INIT(&free);
3320			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3321			    lockp);
3322			pmap_invalidate_page(pmap, trunc_2mpage(va));
3323			pmap_free_zero_pages(&free);
3324			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3325			    " in pmap %p", va, pmap);
3326			return (FALSE);
3327		}
3328		if (va < VM_MAXUSER_ADDRESS)
3329			pmap_resident_count_inc(pmap, 1);
3330	}
3331	mptepa = VM_PAGE_TO_PHYS(mpte);
3332	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3333	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3334	KASSERT((oldpde & PG_A) != 0,
3335	    ("pmap_demote_pde: oldpde is missing PG_A"));
3336	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3337	    ("pmap_demote_pde: oldpde is missing PG_M"));
3338	newpte = oldpde & ~PG_PS;
3339	newpte = pmap_swap_pat(pmap, newpte);
3340
3341	/*
3342	 * If the page table page is new, initialize it.
3343	 */
3344	if (mpte->wire_count == 1) {
3345		mpte->wire_count = NPTEPG;
3346		pmap_fill_ptp(firstpte, newpte);
3347	}
3348	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3349	    ("pmap_demote_pde: firstpte and newpte map different physical"
3350	    " addresses"));
3351
3352	/*
3353	 * If the mapping has changed attributes, update the page table
3354	 * entries.
3355	 */
3356	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3357		pmap_fill_ptp(firstpte, newpte);
3358
3359	/*
3360	 * The spare PV entries must be reserved prior to demoting the
3361	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3362	 * of the PDE and the PV lists will be inconsistent, which can result
3363	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3364	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3365	 * PV entry for the 2MB page mapping that is being demoted.
3366	 */
3367	if ((oldpde & PG_MANAGED) != 0)
3368		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3369
3370	/*
3371	 * Demote the mapping.  This pmap is locked.  The old PDE has
3372	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3373	 * set.  Thus, there is no danger of a race with another
3374	 * processor changing the setting of PG_A and/or PG_M between
3375	 * the read above and the store below.
3376	 */
3377	if (workaround_erratum383)
3378		pmap_update_pde(pmap, va, pde, newpde);
3379	else
3380		pde_store(pde, newpde);
3381
3382	/*
3383	 * Invalidate a stale recursive mapping of the page table page.
3384	 */
3385	if (va >= VM_MAXUSER_ADDRESS)
3386		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3387
3388	/*
3389	 * Demote the PV entry.
3390	 */
3391	if ((oldpde & PG_MANAGED) != 0)
3392		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3393
3394	atomic_add_long(&pmap_pde_demotions, 1);
3395	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3396	    " in pmap %p", va, pmap);
3397	return (TRUE);
3398}
3399
3400/*
3401 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3402 */
3403static void
3404pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3405{
3406	pd_entry_t newpde;
3407	vm_paddr_t mptepa;
3408	vm_page_t mpte;
3409
3410	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3411	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3412	mpte = pmap_lookup_pt_page(pmap, va);
3413	if (mpte == NULL)
3414		panic("pmap_remove_kernel_pde: Missing pt page.");
3415
3416	pmap_remove_pt_page(pmap, mpte);
3417	mptepa = VM_PAGE_TO_PHYS(mpte);
3418	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3419
3420	/*
3421	 * Initialize the page table page.
3422	 */
3423	pagezero((void *)PHYS_TO_DMAP(mptepa));
3424
3425	/*
3426	 * Demote the mapping.
3427	 */
3428	if (workaround_erratum383)
3429		pmap_update_pde(pmap, va, pde, newpde);
3430	else
3431		pde_store(pde, newpde);
3432
3433	/*
3434	 * Invalidate a stale recursive mapping of the page table page.
3435	 */
3436	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3437}
3438
3439/*
3440 * pmap_remove_pde: do the things to unmap a superpage in a process
3441 */
3442static int
3443pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3444    struct spglist *free, struct rwlock **lockp)
3445{
3446	struct md_page *pvh;
3447	pd_entry_t oldpde;
3448	vm_offset_t eva, va;
3449	vm_page_t m, mpte;
3450	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3451
3452	PG_G = pmap_global_bit(pmap);
3453	PG_A = pmap_accessed_bit(pmap);
3454	PG_M = pmap_modified_bit(pmap);
3455	PG_RW = pmap_rw_bit(pmap);
3456
3457	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3458	KASSERT((sva & PDRMASK) == 0,
3459	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3460	oldpde = pte_load_clear(pdq);
3461	if (oldpde & PG_W)
3462		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3463
3464	/*
3465	 * Machines that don't support invlpg, also don't support
3466	 * PG_G.
3467	 */
3468	if (oldpde & PG_G)
3469		pmap_invalidate_page(kernel_pmap, sva);
3470	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3471	if (oldpde & PG_MANAGED) {
3472		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3473		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3474		pmap_pvh_free(pvh, pmap, sva);
3475		eva = sva + NBPDR;
3476		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3477		    va < eva; va += PAGE_SIZE, m++) {
3478			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3479				vm_page_dirty(m);
3480			if (oldpde & PG_A)
3481				vm_page_aflag_set(m, PGA_REFERENCED);
3482			if (TAILQ_EMPTY(&m->md.pv_list) &&
3483			    TAILQ_EMPTY(&pvh->pv_list))
3484				vm_page_aflag_clear(m, PGA_WRITEABLE);
3485		}
3486	}
3487	if (pmap == kernel_pmap) {
3488		pmap_remove_kernel_pde(pmap, pdq, sva);
3489	} else {
3490		mpte = pmap_lookup_pt_page(pmap, sva);
3491		if (mpte != NULL) {
3492			pmap_remove_pt_page(pmap, mpte);
3493			pmap_resident_count_dec(pmap, 1);
3494			KASSERT(mpte->wire_count == NPTEPG,
3495			    ("pmap_remove_pde: pte page wire count error"));
3496			mpte->wire_count = 0;
3497			pmap_add_delayed_free_list(mpte, free, FALSE);
3498			atomic_subtract_int(&cnt.v_wire_count, 1);
3499		}
3500	}
3501	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3502}
3503
3504/*
3505 * pmap_remove_pte: do the things to unmap a page in a process
3506 */
3507static int
3508pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3509    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3510{
3511	struct md_page *pvh;
3512	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3513	vm_page_t m;
3514
3515	PG_A = pmap_accessed_bit(pmap);
3516	PG_M = pmap_modified_bit(pmap);
3517	PG_RW = pmap_rw_bit(pmap);
3518
3519	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3520	oldpte = pte_load_clear(ptq);
3521	if (oldpte & PG_W)
3522		pmap->pm_stats.wired_count -= 1;
3523	pmap_resident_count_dec(pmap, 1);
3524	if (oldpte & PG_MANAGED) {
3525		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3526		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3527			vm_page_dirty(m);
3528		if (oldpte & PG_A)
3529			vm_page_aflag_set(m, PGA_REFERENCED);
3530		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3531		pmap_pvh_free(&m->md, pmap, va);
3532		if (TAILQ_EMPTY(&m->md.pv_list) &&
3533		    (m->flags & PG_FICTITIOUS) == 0) {
3534			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3535			if (TAILQ_EMPTY(&pvh->pv_list))
3536				vm_page_aflag_clear(m, PGA_WRITEABLE);
3537		}
3538	}
3539	return (pmap_unuse_pt(pmap, va, ptepde, free));
3540}
3541
3542/*
3543 * Remove a single page from a process address space
3544 */
3545static void
3546pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3547    struct spglist *free)
3548{
3549	struct rwlock *lock;
3550	pt_entry_t *pte, PG_V;
3551
3552	PG_V = pmap_valid_bit(pmap);
3553	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3554	if ((*pde & PG_V) == 0)
3555		return;
3556	pte = pmap_pde_to_pte(pde, va);
3557	if ((*pte & PG_V) == 0)
3558		return;
3559	lock = NULL;
3560	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3561	if (lock != NULL)
3562		rw_wunlock(lock);
3563	pmap_invalidate_page(pmap, va);
3564}
3565
3566/*
3567 *	Remove the given range of addresses from the specified map.
3568 *
3569 *	It is assumed that the start and end are properly
3570 *	rounded to the page size.
3571 */
3572void
3573pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3574{
3575	struct rwlock *lock;
3576	vm_offset_t va, va_next;
3577	pml4_entry_t *pml4e;
3578	pdp_entry_t *pdpe;
3579	pd_entry_t ptpaddr, *pde;
3580	pt_entry_t *pte, PG_G, PG_V;
3581	struct spglist free;
3582	int anyvalid;
3583
3584	PG_G = pmap_global_bit(pmap);
3585	PG_V = pmap_valid_bit(pmap);
3586
3587	/*
3588	 * Perform an unsynchronized read.  This is, however, safe.
3589	 */
3590	if (pmap->pm_stats.resident_count == 0)
3591		return;
3592
3593	anyvalid = 0;
3594	SLIST_INIT(&free);
3595
3596	rw_rlock(&pvh_global_lock);
3597	PMAP_LOCK(pmap);
3598
3599	/*
3600	 * special handling of removing one page.  a very
3601	 * common operation and easy to short circuit some
3602	 * code.
3603	 */
3604	if (sva + PAGE_SIZE == eva) {
3605		pde = pmap_pde(pmap, sva);
3606		if (pde && (*pde & PG_PS) == 0) {
3607			pmap_remove_page(pmap, sva, pde, &free);
3608			goto out;
3609		}
3610	}
3611
3612	lock = NULL;
3613	for (; sva < eva; sva = va_next) {
3614
3615		if (pmap->pm_stats.resident_count == 0)
3616			break;
3617
3618		pml4e = pmap_pml4e(pmap, sva);
3619		if ((*pml4e & PG_V) == 0) {
3620			va_next = (sva + NBPML4) & ~PML4MASK;
3621			if (va_next < sva)
3622				va_next = eva;
3623			continue;
3624		}
3625
3626		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3627		if ((*pdpe & PG_V) == 0) {
3628			va_next = (sva + NBPDP) & ~PDPMASK;
3629			if (va_next < sva)
3630				va_next = eva;
3631			continue;
3632		}
3633
3634		/*
3635		 * Calculate index for next page table.
3636		 */
3637		va_next = (sva + NBPDR) & ~PDRMASK;
3638		if (va_next < sva)
3639			va_next = eva;
3640
3641		pde = pmap_pdpe_to_pde(pdpe, sva);
3642		ptpaddr = *pde;
3643
3644		/*
3645		 * Weed out invalid mappings.
3646		 */
3647		if (ptpaddr == 0)
3648			continue;
3649
3650		/*
3651		 * Check for large page.
3652		 */
3653		if ((ptpaddr & PG_PS) != 0) {
3654			/*
3655			 * Are we removing the entire large page?  If not,
3656			 * demote the mapping and fall through.
3657			 */
3658			if (sva + NBPDR == va_next && eva >= va_next) {
3659				/*
3660				 * The TLB entry for a PG_G mapping is
3661				 * invalidated by pmap_remove_pde().
3662				 */
3663				if ((ptpaddr & PG_G) == 0)
3664					anyvalid = 1;
3665				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3666				continue;
3667			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3668			    &lock)) {
3669				/* The large page mapping was destroyed. */
3670				continue;
3671			} else
3672				ptpaddr = *pde;
3673		}
3674
3675		/*
3676		 * Limit our scan to either the end of the va represented
3677		 * by the current page table page, or to the end of the
3678		 * range being removed.
3679		 */
3680		if (va_next > eva)
3681			va_next = eva;
3682
3683		va = va_next;
3684		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3685		    sva += PAGE_SIZE) {
3686			if (*pte == 0) {
3687				if (va != va_next) {
3688					pmap_invalidate_range(pmap, va, sva);
3689					va = va_next;
3690				}
3691				continue;
3692			}
3693			if ((*pte & PG_G) == 0)
3694				anyvalid = 1;
3695			else if (va == va_next)
3696				va = sva;
3697			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3698			    &lock)) {
3699				sva += PAGE_SIZE;
3700				break;
3701			}
3702		}
3703		if (va != va_next)
3704			pmap_invalidate_range(pmap, va, sva);
3705	}
3706	if (lock != NULL)
3707		rw_wunlock(lock);
3708out:
3709	if (anyvalid)
3710		pmap_invalidate_all(pmap);
3711	rw_runlock(&pvh_global_lock);
3712	PMAP_UNLOCK(pmap);
3713	pmap_free_zero_pages(&free);
3714}
3715
3716/*
3717 *	Routine:	pmap_remove_all
3718 *	Function:
3719 *		Removes this physical page from
3720 *		all physical maps in which it resides.
3721 *		Reflects back modify bits to the pager.
3722 *
3723 *	Notes:
3724 *		Original versions of this routine were very
3725 *		inefficient because they iteratively called
3726 *		pmap_remove (slow...)
3727 */
3728
3729void
3730pmap_remove_all(vm_page_t m)
3731{
3732	struct md_page *pvh;
3733	pv_entry_t pv;
3734	pmap_t pmap;
3735	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3736	pd_entry_t *pde;
3737	vm_offset_t va;
3738	struct spglist free;
3739
3740	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3741	    ("pmap_remove_all: page %p is not managed", m));
3742	SLIST_INIT(&free);
3743	rw_wlock(&pvh_global_lock);
3744	if ((m->flags & PG_FICTITIOUS) != 0)
3745		goto small_mappings;
3746	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3747	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3748		pmap = PV_PMAP(pv);
3749		PMAP_LOCK(pmap);
3750		va = pv->pv_va;
3751		pde = pmap_pde(pmap, va);
3752		(void)pmap_demote_pde(pmap, pde, va);
3753		PMAP_UNLOCK(pmap);
3754	}
3755small_mappings:
3756	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3757		pmap = PV_PMAP(pv);
3758		PMAP_LOCK(pmap);
3759		PG_A = pmap_accessed_bit(pmap);
3760		PG_M = pmap_modified_bit(pmap);
3761		PG_RW = pmap_rw_bit(pmap);
3762		pmap_resident_count_dec(pmap, 1);
3763		pde = pmap_pde(pmap, pv->pv_va);
3764		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3765		    " a 2mpage in page %p's pv list", m));
3766		pte = pmap_pde_to_pte(pde, pv->pv_va);
3767		tpte = pte_load_clear(pte);
3768		if (tpte & PG_W)
3769			pmap->pm_stats.wired_count--;
3770		if (tpte & PG_A)
3771			vm_page_aflag_set(m, PGA_REFERENCED);
3772
3773		/*
3774		 * Update the vm_page_t clean and reference bits.
3775		 */
3776		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3777			vm_page_dirty(m);
3778		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3779		pmap_invalidate_page(pmap, pv->pv_va);
3780		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3781		m->md.pv_gen++;
3782		free_pv_entry(pmap, pv);
3783		PMAP_UNLOCK(pmap);
3784	}
3785	vm_page_aflag_clear(m, PGA_WRITEABLE);
3786	rw_wunlock(&pvh_global_lock);
3787	pmap_free_zero_pages(&free);
3788}
3789
3790/*
3791 * pmap_protect_pde: do the things to protect a 2mpage in a process
3792 */
3793static boolean_t
3794pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3795{
3796	pd_entry_t newpde, oldpde;
3797	vm_offset_t eva, va;
3798	vm_page_t m;
3799	boolean_t anychanged;
3800	pt_entry_t PG_G, PG_M, PG_RW;
3801
3802	PG_G = pmap_global_bit(pmap);
3803	PG_M = pmap_modified_bit(pmap);
3804	PG_RW = pmap_rw_bit(pmap);
3805
3806	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3807	KASSERT((sva & PDRMASK) == 0,
3808	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3809	anychanged = FALSE;
3810retry:
3811	oldpde = newpde = *pde;
3812	if (oldpde & PG_MANAGED) {
3813		eva = sva + NBPDR;
3814		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3815		    va < eva; va += PAGE_SIZE, m++)
3816			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3817				vm_page_dirty(m);
3818	}
3819	if ((prot & VM_PROT_WRITE) == 0)
3820		newpde &= ~(PG_RW | PG_M);
3821	if ((prot & VM_PROT_EXECUTE) == 0)
3822		newpde |= pg_nx;
3823	if (newpde != oldpde) {
3824		if (!atomic_cmpset_long(pde, oldpde, newpde))
3825			goto retry;
3826		if (oldpde & PG_G)
3827			pmap_invalidate_page(pmap, sva);
3828		else
3829			anychanged = TRUE;
3830	}
3831	return (anychanged);
3832}
3833
3834/*
3835 *	Set the physical protection on the
3836 *	specified range of this map as requested.
3837 */
3838void
3839pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3840{
3841	vm_offset_t va_next;
3842	pml4_entry_t *pml4e;
3843	pdp_entry_t *pdpe;
3844	pd_entry_t ptpaddr, *pde;
3845	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3846	boolean_t anychanged, pv_lists_locked;
3847
3848	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3849		pmap_remove(pmap, sva, eva);
3850		return;
3851	}
3852
3853	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3854	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3855		return;
3856
3857	PG_G = pmap_global_bit(pmap);
3858	PG_M = pmap_modified_bit(pmap);
3859	PG_V = pmap_valid_bit(pmap);
3860	PG_RW = pmap_rw_bit(pmap);
3861	pv_lists_locked = FALSE;
3862resume:
3863	anychanged = FALSE;
3864
3865	PMAP_LOCK(pmap);
3866	for (; sva < eva; sva = va_next) {
3867
3868		pml4e = pmap_pml4e(pmap, sva);
3869		if ((*pml4e & PG_V) == 0) {
3870			va_next = (sva + NBPML4) & ~PML4MASK;
3871			if (va_next < sva)
3872				va_next = eva;
3873			continue;
3874		}
3875
3876		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3877		if ((*pdpe & PG_V) == 0) {
3878			va_next = (sva + NBPDP) & ~PDPMASK;
3879			if (va_next < sva)
3880				va_next = eva;
3881			continue;
3882		}
3883
3884		va_next = (sva + NBPDR) & ~PDRMASK;
3885		if (va_next < sva)
3886			va_next = eva;
3887
3888		pde = pmap_pdpe_to_pde(pdpe, sva);
3889		ptpaddr = *pde;
3890
3891		/*
3892		 * Weed out invalid mappings.
3893		 */
3894		if (ptpaddr == 0)
3895			continue;
3896
3897		/*
3898		 * Check for large page.
3899		 */
3900		if ((ptpaddr & PG_PS) != 0) {
3901			/*
3902			 * Are we protecting the entire large page?  If not,
3903			 * demote the mapping and fall through.
3904			 */
3905			if (sva + NBPDR == va_next && eva >= va_next) {
3906				/*
3907				 * The TLB entry for a PG_G mapping is
3908				 * invalidated by pmap_protect_pde().
3909				 */
3910				if (pmap_protect_pde(pmap, pde, sva, prot))
3911					anychanged = TRUE;
3912				continue;
3913			} else {
3914				if (!pv_lists_locked) {
3915					pv_lists_locked = TRUE;
3916					if (!rw_try_rlock(&pvh_global_lock)) {
3917						if (anychanged)
3918							pmap_invalidate_all(
3919							    pmap);
3920						PMAP_UNLOCK(pmap);
3921						rw_rlock(&pvh_global_lock);
3922						goto resume;
3923					}
3924				}
3925				if (!pmap_demote_pde(pmap, pde, sva)) {
3926					/*
3927					 * The large page mapping was
3928					 * destroyed.
3929					 */
3930					continue;
3931				}
3932			}
3933		}
3934
3935		if (va_next > eva)
3936			va_next = eva;
3937
3938		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3939		    sva += PAGE_SIZE) {
3940			pt_entry_t obits, pbits;
3941			vm_page_t m;
3942
3943retry:
3944			obits = pbits = *pte;
3945			if ((pbits & PG_V) == 0)
3946				continue;
3947
3948			if ((prot & VM_PROT_WRITE) == 0) {
3949				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3950				    (PG_MANAGED | PG_M | PG_RW)) {
3951					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3952					vm_page_dirty(m);
3953				}
3954				pbits &= ~(PG_RW | PG_M);
3955			}
3956			if ((prot & VM_PROT_EXECUTE) == 0)
3957				pbits |= pg_nx;
3958
3959			if (pbits != obits) {
3960				if (!atomic_cmpset_long(pte, obits, pbits))
3961					goto retry;
3962				if (obits & PG_G)
3963					pmap_invalidate_page(pmap, sva);
3964				else
3965					anychanged = TRUE;
3966			}
3967		}
3968	}
3969	if (anychanged)
3970		pmap_invalidate_all(pmap);
3971	if (pv_lists_locked)
3972		rw_runlock(&pvh_global_lock);
3973	PMAP_UNLOCK(pmap);
3974}
3975
3976/*
3977 * Tries to promote the 512, contiguous 4KB page mappings that are within a
3978 * single page table page (PTP) to a single 2MB page mapping.  For promotion
3979 * to occur, two conditions must be met: (1) the 4KB page mappings must map
3980 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3981 * identical characteristics.
3982 */
3983static void
3984pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3985    struct rwlock **lockp)
3986{
3987	pd_entry_t newpde;
3988	pt_entry_t *firstpte, oldpte, pa, *pte;
3989	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
3990	vm_offset_t oldpteva;
3991	vm_page_t mpte;
3992	int PG_PTE_CACHE;
3993
3994	PG_A = pmap_accessed_bit(pmap);
3995	PG_G = pmap_global_bit(pmap);
3996	PG_M = pmap_modified_bit(pmap);
3997	PG_V = pmap_valid_bit(pmap);
3998	PG_RW = pmap_rw_bit(pmap);
3999	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4000
4001	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4002
4003	/*
4004	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4005	 * either invalid, unused, or does not map the first 4KB physical page
4006	 * within a 2MB page.
4007	 */
4008	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4009setpde:
4010	newpde = *firstpte;
4011	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4012		atomic_add_long(&pmap_pde_p_failures, 1);
4013		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4014		    " in pmap %p", va, pmap);
4015		return;
4016	}
4017	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4018		/*
4019		 * When PG_M is already clear, PG_RW can be cleared without
4020		 * a TLB invalidation.
4021		 */
4022		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4023			goto setpde;
4024		newpde &= ~PG_RW;
4025	}
4026
4027	/*
4028	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4029	 * PTE maps an unexpected 4KB physical page or does not have identical
4030	 * characteristics to the first PTE.
4031	 */
4032	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4033	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4034setpte:
4035		oldpte = *pte;
4036		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4037			atomic_add_long(&pmap_pde_p_failures, 1);
4038			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4039			    " in pmap %p", va, pmap);
4040			return;
4041		}
4042		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4043			/*
4044			 * When PG_M is already clear, PG_RW can be cleared
4045			 * without a TLB invalidation.
4046			 */
4047			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4048				goto setpte;
4049			oldpte &= ~PG_RW;
4050			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
4051			    (va & ~PDRMASK);
4052			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4053			    " in pmap %p", oldpteva, pmap);
4054		}
4055		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4056			atomic_add_long(&pmap_pde_p_failures, 1);
4057			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4058			    " in pmap %p", va, pmap);
4059			return;
4060		}
4061		pa -= PAGE_SIZE;
4062	}
4063
4064	/*
4065	 * Save the page table page in its current state until the PDE
4066	 * mapping the superpage is demoted by pmap_demote_pde() or
4067	 * destroyed by pmap_remove_pde().
4068	 */
4069	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4070	KASSERT(mpte >= vm_page_array &&
4071	    mpte < &vm_page_array[vm_page_array_size],
4072	    ("pmap_promote_pde: page table page is out of range"));
4073	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4074	    ("pmap_promote_pde: page table page's pindex is wrong"));
4075	if (pmap_insert_pt_page(pmap, mpte)) {
4076		atomic_add_long(&pmap_pde_p_failures, 1);
4077		CTR2(KTR_PMAP,
4078		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4079		    pmap);
4080		return;
4081	}
4082
4083	/*
4084	 * Promote the pv entries.
4085	 */
4086	if ((newpde & PG_MANAGED) != 0)
4087		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4088
4089	/*
4090	 * Propagate the PAT index to its proper position.
4091	 */
4092	newpde = pmap_swap_pat(pmap, newpde);
4093
4094	/*
4095	 * Map the superpage.
4096	 */
4097	if (workaround_erratum383)
4098		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4099	else
4100		pde_store(pde, PG_PS | newpde);
4101
4102	atomic_add_long(&pmap_pde_promotions, 1);
4103	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4104	    " in pmap %p", va, pmap);
4105}
4106
4107/*
4108 *	Insert the given physical page (p) at
4109 *	the specified virtual address (v) in the
4110 *	target physical map with the protection requested.
4111 *
4112 *	If specified, the page will be wired down, meaning
4113 *	that the related pte can not be reclaimed.
4114 *
4115 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4116 *	or lose information.  That is, this routine must actually
4117 *	insert this page into the given map NOW.
4118 */
4119void
4120pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
4121    vm_prot_t prot, boolean_t wired)
4122{
4123	struct rwlock *lock;
4124	pd_entry_t *pde;
4125	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4126	pt_entry_t newpte, origpte;
4127	pv_entry_t pv;
4128	vm_paddr_t opa, pa;
4129	vm_page_t mpte, om;
4130
4131	PG_A = pmap_accessed_bit(pmap);
4132	PG_G = pmap_global_bit(pmap);
4133	PG_M = pmap_modified_bit(pmap);
4134	PG_V = pmap_valid_bit(pmap);
4135	PG_RW = pmap_rw_bit(pmap);
4136
4137	va = trunc_page(va);
4138	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4139	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4140	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4141	    va));
4142	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4143	    va >= kmi.clean_eva,
4144	    ("pmap_enter: managed mapping within the clean submap"));
4145	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4146		VM_OBJECT_ASSERT_WLOCKED(m->object);
4147	pa = VM_PAGE_TO_PHYS(m);
4148	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4149	if ((access & VM_PROT_WRITE) != 0)
4150		newpte |= PG_M;
4151	if ((prot & VM_PROT_WRITE) != 0)
4152		newpte |= PG_RW;
4153	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4154	    ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
4155	if ((prot & VM_PROT_EXECUTE) == 0)
4156		newpte |= pg_nx;
4157	if (wired)
4158		newpte |= PG_W;
4159	if (va < VM_MAXUSER_ADDRESS)
4160		newpte |= PG_U;
4161	if (pmap == kernel_pmap)
4162		newpte |= PG_G;
4163	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4164
4165	/*
4166	 * Set modified bit gratuitously for writeable mappings if
4167	 * the page is unmanaged. We do not want to take a fault
4168	 * to do the dirty bit accounting for these mappings.
4169	 */
4170	if ((m->oflags & VPO_UNMANAGED) != 0) {
4171		if ((newpte & PG_RW) != 0)
4172			newpte |= PG_M;
4173	}
4174
4175	mpte = NULL;
4176
4177	lock = NULL;
4178	rw_rlock(&pvh_global_lock);
4179	PMAP_LOCK(pmap);
4180
4181	/*
4182	 * In the case that a page table page is not
4183	 * resident, we are creating it here.
4184	 */
4185retry:
4186	pde = pmap_pde(pmap, va);
4187	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4188	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4189		pte = pmap_pde_to_pte(pde, va);
4190		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4191			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4192			mpte->wire_count++;
4193		}
4194	} else if (va < VM_MAXUSER_ADDRESS) {
4195		/*
4196		 * Here if the pte page isn't mapped, or if it has been
4197		 * deallocated.
4198		 */
4199		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
4200		goto retry;
4201	} else
4202		panic("pmap_enter: invalid page directory va=%#lx", va);
4203
4204	origpte = *pte;
4205
4206	/*
4207	 * Is the specified virtual address already mapped?
4208	 */
4209	if ((origpte & PG_V) != 0) {
4210		/*
4211		 * Wiring change, just update stats. We don't worry about
4212		 * wiring PT pages as they remain resident as long as there
4213		 * are valid mappings in them. Hence, if a user page is wired,
4214		 * the PT page will be also.
4215		 */
4216		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4217			pmap->pm_stats.wired_count++;
4218		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4219			pmap->pm_stats.wired_count--;
4220
4221		/*
4222		 * Remove the extra PT page reference.
4223		 */
4224		if (mpte != NULL) {
4225			mpte->wire_count--;
4226			KASSERT(mpte->wire_count > 0,
4227			    ("pmap_enter: missing reference to page table page,"
4228			     " va: 0x%lx", va));
4229		}
4230
4231		/*
4232		 * Has the physical page changed?
4233		 */
4234		opa = origpte & PG_FRAME;
4235		if (opa == pa) {
4236			/*
4237			 * No, might be a protection or wiring change.
4238			 */
4239			if ((origpte & PG_MANAGED) != 0) {
4240				newpte |= PG_MANAGED;
4241				if ((newpte & PG_RW) != 0)
4242					vm_page_aflag_set(m, PGA_WRITEABLE);
4243			}
4244			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4245				goto unchanged;
4246			goto validate;
4247		}
4248	} else {
4249		/*
4250		 * Increment the counters.
4251		 */
4252		if ((newpte & PG_W) != 0)
4253			pmap->pm_stats.wired_count++;
4254		pmap_resident_count_inc(pmap, 1);
4255	}
4256
4257	/*
4258	 * Enter on the PV list if part of our managed memory.
4259	 */
4260	if ((m->oflags & VPO_UNMANAGED) == 0) {
4261		newpte |= PG_MANAGED;
4262		pv = get_pv_entry(pmap, &lock);
4263		pv->pv_va = va;
4264		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4265		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4266		m->md.pv_gen++;
4267		if ((newpte & PG_RW) != 0)
4268			vm_page_aflag_set(m, PGA_WRITEABLE);
4269	}
4270
4271	/*
4272	 * Update the PTE.
4273	 */
4274	if ((origpte & PG_V) != 0) {
4275validate:
4276		origpte = pte_load_store(pte, newpte);
4277		opa = origpte & PG_FRAME;
4278		if (opa != pa) {
4279			if ((origpte & PG_MANAGED) != 0) {
4280				om = PHYS_TO_VM_PAGE(opa);
4281				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4282				    PG_RW))
4283					vm_page_dirty(om);
4284				if ((origpte & PG_A) != 0)
4285					vm_page_aflag_set(om, PGA_REFERENCED);
4286				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4287				pmap_pvh_free(&om->md, pmap, va);
4288				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4289				    TAILQ_EMPTY(&om->md.pv_list) &&
4290				    ((om->flags & PG_FICTITIOUS) != 0 ||
4291				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4292					vm_page_aflag_clear(om, PGA_WRITEABLE);
4293			}
4294		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4295		    PG_RW)) == (PG_M | PG_RW)) {
4296			if ((origpte & PG_MANAGED) != 0)
4297				vm_page_dirty(m);
4298
4299			/*
4300			 * Although the PTE may still have PG_RW set, TLB
4301			 * invalidation may nonetheless be required because
4302			 * the PTE no longer has PG_M set.
4303			 */
4304		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4305			/*
4306			 * This PTE change does not require TLB invalidation.
4307			 */
4308			goto unchanged;
4309		}
4310		if ((origpte & PG_A) != 0)
4311			pmap_invalidate_page(pmap, va);
4312	} else
4313		pte_store(pte, newpte);
4314
4315unchanged:
4316
4317	/*
4318	 * If both the page table page and the reservation are fully
4319	 * populated, then attempt promotion.
4320	 */
4321	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4322	    pmap_ps_enabled(pmap) &&
4323	    (m->flags & PG_FICTITIOUS) == 0 &&
4324	    vm_reserv_level_iffullpop(m) == 0)
4325		pmap_promote_pde(pmap, pde, va, &lock);
4326
4327	if (lock != NULL)
4328		rw_wunlock(lock);
4329	rw_runlock(&pvh_global_lock);
4330	PMAP_UNLOCK(pmap);
4331}
4332
4333/*
4334 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4335 * otherwise.  Fails if (1) a page table page cannot be allocated without
4336 * blocking, (2) a mapping already exists at the specified virtual address, or
4337 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4338 */
4339static boolean_t
4340pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4341    struct rwlock **lockp)
4342{
4343	pd_entry_t *pde, newpde;
4344	pt_entry_t PG_V;
4345	vm_page_t mpde;
4346	struct spglist free;
4347
4348	PG_V = pmap_valid_bit(pmap);
4349	rw_assert(&pvh_global_lock, RA_LOCKED);
4350	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4351
4352	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4353		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4354		    " in pmap %p", va, pmap);
4355		return (FALSE);
4356	}
4357	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4358	pde = &pde[pmap_pde_index(va)];
4359	if ((*pde & PG_V) != 0) {
4360		KASSERT(mpde->wire_count > 1,
4361		    ("pmap_enter_pde: mpde's wire count is too low"));
4362		mpde->wire_count--;
4363		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4364		    " in pmap %p", va, pmap);
4365		return (FALSE);
4366	}
4367	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4368	    PG_PS | PG_V;
4369	if ((m->oflags & VPO_UNMANAGED) == 0) {
4370		newpde |= PG_MANAGED;
4371
4372		/*
4373		 * Abort this mapping if its PV entry could not be created.
4374		 */
4375		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4376		    lockp)) {
4377			SLIST_INIT(&free);
4378			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4379				pmap_invalidate_page(pmap, va);
4380				pmap_free_zero_pages(&free);
4381			}
4382			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4383			    " in pmap %p", va, pmap);
4384			return (FALSE);
4385		}
4386	}
4387	if ((prot & VM_PROT_EXECUTE) == 0)
4388		newpde |= pg_nx;
4389	if (va < VM_MAXUSER_ADDRESS)
4390		newpde |= PG_U;
4391
4392	/*
4393	 * Increment counters.
4394	 */
4395	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4396
4397	/*
4398	 * Map the superpage.
4399	 */
4400	pde_store(pde, newpde);
4401
4402	atomic_add_long(&pmap_pde_mappings, 1);
4403	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4404	    " in pmap %p", va, pmap);
4405	return (TRUE);
4406}
4407
4408/*
4409 * Maps a sequence of resident pages belonging to the same object.
4410 * The sequence begins with the given page m_start.  This page is
4411 * mapped at the given virtual address start.  Each subsequent page is
4412 * mapped at a virtual address that is offset from start by the same
4413 * amount as the page is offset from m_start within the object.  The
4414 * last page in the sequence is the page with the largest offset from
4415 * m_start that can be mapped at a virtual address less than the given
4416 * virtual address end.  Not every virtual page between start and end
4417 * is mapped; only those for which a resident page exists with the
4418 * corresponding offset from m_start are mapped.
4419 */
4420void
4421pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4422    vm_page_t m_start, vm_prot_t prot)
4423{
4424	struct rwlock *lock;
4425	vm_offset_t va;
4426	vm_page_t m, mpte;
4427	vm_pindex_t diff, psize;
4428
4429	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4430
4431	psize = atop(end - start);
4432	mpte = NULL;
4433	m = m_start;
4434	lock = NULL;
4435	rw_rlock(&pvh_global_lock);
4436	PMAP_LOCK(pmap);
4437	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4438		va = start + ptoa(diff);
4439		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4440		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4441		    pmap_enter_pde(pmap, va, m, prot, &lock))
4442			m = &m[NBPDR / PAGE_SIZE - 1];
4443		else
4444			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4445			    mpte, &lock);
4446		m = TAILQ_NEXT(m, listq);
4447	}
4448	if (lock != NULL)
4449		rw_wunlock(lock);
4450	rw_runlock(&pvh_global_lock);
4451	PMAP_UNLOCK(pmap);
4452}
4453
4454/*
4455 * this code makes some *MAJOR* assumptions:
4456 * 1. Current pmap & pmap exists.
4457 * 2. Not wired.
4458 * 3. Read access.
4459 * 4. No page table pages.
4460 * but is *MUCH* faster than pmap_enter...
4461 */
4462
4463void
4464pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4465{
4466	struct rwlock *lock;
4467
4468	lock = NULL;
4469	rw_rlock(&pvh_global_lock);
4470	PMAP_LOCK(pmap);
4471	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4472	if (lock != NULL)
4473		rw_wunlock(lock);
4474	rw_runlock(&pvh_global_lock);
4475	PMAP_UNLOCK(pmap);
4476}
4477
4478static vm_page_t
4479pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4480    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4481{
4482	struct spglist free;
4483	pt_entry_t *pte, PG_V;
4484	vm_paddr_t pa;
4485
4486	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4487	    (m->oflags & VPO_UNMANAGED) != 0,
4488	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4489	PG_V = pmap_valid_bit(pmap);
4490	rw_assert(&pvh_global_lock, RA_LOCKED);
4491	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4492
4493	/*
4494	 * In the case that a page table page is not
4495	 * resident, we are creating it here.
4496	 */
4497	if (va < VM_MAXUSER_ADDRESS) {
4498		vm_pindex_t ptepindex;
4499		pd_entry_t *ptepa;
4500
4501		/*
4502		 * Calculate pagetable page index
4503		 */
4504		ptepindex = pmap_pde_pindex(va);
4505		if (mpte && (mpte->pindex == ptepindex)) {
4506			mpte->wire_count++;
4507		} else {
4508			/*
4509			 * Get the page directory entry
4510			 */
4511			ptepa = pmap_pde(pmap, va);
4512
4513			/*
4514			 * If the page table page is mapped, we just increment
4515			 * the hold count, and activate it.  Otherwise, we
4516			 * attempt to allocate a page table page.  If this
4517			 * attempt fails, we don't retry.  Instead, we give up.
4518			 */
4519			if (ptepa && (*ptepa & PG_V) != 0) {
4520				if (*ptepa & PG_PS)
4521					return (NULL);
4522				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4523				mpte->wire_count++;
4524			} else {
4525				/*
4526				 * Pass NULL instead of the PV list lock
4527				 * pointer, because we don't intend to sleep.
4528				 */
4529				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4530				if (mpte == NULL)
4531					return (mpte);
4532			}
4533		}
4534		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4535		pte = &pte[pmap_pte_index(va)];
4536	} else {
4537		mpte = NULL;
4538		pte = vtopte(va);
4539	}
4540	if (*pte) {
4541		if (mpte != NULL) {
4542			mpte->wire_count--;
4543			mpte = NULL;
4544		}
4545		return (mpte);
4546	}
4547
4548	/*
4549	 * Enter on the PV list if part of our managed memory.
4550	 */
4551	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4552	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4553		if (mpte != NULL) {
4554			SLIST_INIT(&free);
4555			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4556				pmap_invalidate_page(pmap, va);
4557				pmap_free_zero_pages(&free);
4558			}
4559			mpte = NULL;
4560		}
4561		return (mpte);
4562	}
4563
4564	/*
4565	 * Increment counters
4566	 */
4567	pmap_resident_count_inc(pmap, 1);
4568
4569	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4570	if ((prot & VM_PROT_EXECUTE) == 0)
4571		pa |= pg_nx;
4572
4573	/*
4574	 * Now validate mapping with RO protection
4575	 */
4576	if ((m->oflags & VPO_UNMANAGED) != 0)
4577		pte_store(pte, pa | PG_V | PG_U);
4578	else
4579		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4580	return (mpte);
4581}
4582
4583/*
4584 * Make a temporary mapping for a physical address.  This is only intended
4585 * to be used for panic dumps.
4586 */
4587void *
4588pmap_kenter_temporary(vm_paddr_t pa, int i)
4589{
4590	vm_offset_t va;
4591
4592	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4593	pmap_kenter(va, pa);
4594	invlpg(va);
4595	return ((void *)crashdumpmap);
4596}
4597
4598/*
4599 * This code maps large physical mmap regions into the
4600 * processor address space.  Note that some shortcuts
4601 * are taken, but the code works.
4602 */
4603void
4604pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4605    vm_pindex_t pindex, vm_size_t size)
4606{
4607	pd_entry_t *pde;
4608	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4609	vm_paddr_t pa, ptepa;
4610	vm_page_t p, pdpg;
4611	int pat_mode;
4612
4613	PG_A = pmap_accessed_bit(pmap);
4614	PG_M = pmap_modified_bit(pmap);
4615	PG_V = pmap_valid_bit(pmap);
4616	PG_RW = pmap_rw_bit(pmap);
4617
4618	VM_OBJECT_ASSERT_WLOCKED(object);
4619	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4620	    ("pmap_object_init_pt: non-device object"));
4621	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4622		if (!pmap_ps_enabled(pmap))
4623			return;
4624		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4625			return;
4626		p = vm_page_lookup(object, pindex);
4627		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4628		    ("pmap_object_init_pt: invalid page %p", p));
4629		pat_mode = p->md.pat_mode;
4630
4631		/*
4632		 * Abort the mapping if the first page is not physically
4633		 * aligned to a 2MB page boundary.
4634		 */
4635		ptepa = VM_PAGE_TO_PHYS(p);
4636		if (ptepa & (NBPDR - 1))
4637			return;
4638
4639		/*
4640		 * Skip the first page.  Abort the mapping if the rest of
4641		 * the pages are not physically contiguous or have differing
4642		 * memory attributes.
4643		 */
4644		p = TAILQ_NEXT(p, listq);
4645		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4646		    pa += PAGE_SIZE) {
4647			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4648			    ("pmap_object_init_pt: invalid page %p", p));
4649			if (pa != VM_PAGE_TO_PHYS(p) ||
4650			    pat_mode != p->md.pat_mode)
4651				return;
4652			p = TAILQ_NEXT(p, listq);
4653		}
4654
4655		/*
4656		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4657		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4658		 * will not affect the termination of this loop.
4659		 */
4660		PMAP_LOCK(pmap);
4661		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4662		    pa < ptepa + size; pa += NBPDR) {
4663			pdpg = pmap_allocpde(pmap, addr, NULL);
4664			if (pdpg == NULL) {
4665				/*
4666				 * The creation of mappings below is only an
4667				 * optimization.  If a page directory page
4668				 * cannot be allocated without blocking,
4669				 * continue on to the next mapping rather than
4670				 * blocking.
4671				 */
4672				addr += NBPDR;
4673				continue;
4674			}
4675			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4676			pde = &pde[pmap_pde_index(addr)];
4677			if ((*pde & PG_V) == 0) {
4678				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4679				    PG_U | PG_RW | PG_V);
4680				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4681				atomic_add_long(&pmap_pde_mappings, 1);
4682			} else {
4683				/* Continue on if the PDE is already valid. */
4684				pdpg->wire_count--;
4685				KASSERT(pdpg->wire_count > 0,
4686				    ("pmap_object_init_pt: missing reference "
4687				    "to page directory page, va: 0x%lx", addr));
4688			}
4689			addr += NBPDR;
4690		}
4691		PMAP_UNLOCK(pmap);
4692	}
4693}
4694
4695/*
4696 *	Routine:	pmap_change_wiring
4697 *	Function:	Change the wiring attribute for a map/virtual-address
4698 *			pair.
4699 *	In/out conditions:
4700 *			The mapping must already exist in the pmap.
4701 */
4702void
4703pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
4704{
4705	pd_entry_t *pde;
4706	pt_entry_t *pte;
4707	boolean_t pv_lists_locked;
4708
4709	pv_lists_locked = FALSE;
4710
4711	/*
4712	 * Wiring is not a hardware characteristic so there is no need to
4713	 * invalidate TLB.
4714	 */
4715retry:
4716	PMAP_LOCK(pmap);
4717	pde = pmap_pde(pmap, va);
4718	if ((*pde & PG_PS) != 0) {
4719		if (!wired != ((*pde & PG_W) == 0)) {
4720			if (!pv_lists_locked) {
4721				pv_lists_locked = TRUE;
4722				if (!rw_try_rlock(&pvh_global_lock)) {
4723					PMAP_UNLOCK(pmap);
4724					rw_rlock(&pvh_global_lock);
4725					goto retry;
4726				}
4727			}
4728			if (!pmap_demote_pde(pmap, pde, va))
4729				panic("pmap_change_wiring: demotion failed");
4730		} else
4731			goto out;
4732	}
4733	pte = pmap_pde_to_pte(pde, va);
4734	if (wired && (*pte & PG_W) == 0) {
4735		pmap->pm_stats.wired_count++;
4736		atomic_set_long(pte, PG_W);
4737	} else if (!wired && (*pte & PG_W) != 0) {
4738		pmap->pm_stats.wired_count--;
4739		atomic_clear_long(pte, PG_W);
4740	}
4741out:
4742	if (pv_lists_locked)
4743		rw_runlock(&pvh_global_lock);
4744	PMAP_UNLOCK(pmap);
4745}
4746
4747/*
4748 *	Copy the range specified by src_addr/len
4749 *	from the source map to the range dst_addr/len
4750 *	in the destination map.
4751 *
4752 *	This routine is only advisory and need not do anything.
4753 */
4754
4755void
4756pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4757    vm_offset_t src_addr)
4758{
4759	struct rwlock *lock;
4760	struct spglist free;
4761	vm_offset_t addr;
4762	vm_offset_t end_addr = src_addr + len;
4763	vm_offset_t va_next;
4764	pt_entry_t PG_A, PG_M, PG_V;
4765
4766	if (dst_addr != src_addr)
4767		return;
4768
4769	if (dst_pmap->pm_type != src_pmap->pm_type)
4770		return;
4771
4772	/*
4773	 * EPT page table entries that require emulation of A/D bits are
4774	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4775	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4776	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4777	 * implementations flag an EPT misconfiguration for exec-only
4778	 * mappings we skip this function entirely for emulated pmaps.
4779	 */
4780	if (pmap_emulate_ad_bits(dst_pmap))
4781		return;
4782
4783	lock = NULL;
4784	rw_rlock(&pvh_global_lock);
4785	if (dst_pmap < src_pmap) {
4786		PMAP_LOCK(dst_pmap);
4787		PMAP_LOCK(src_pmap);
4788	} else {
4789		PMAP_LOCK(src_pmap);
4790		PMAP_LOCK(dst_pmap);
4791	}
4792
4793	PG_A = pmap_accessed_bit(dst_pmap);
4794	PG_M = pmap_modified_bit(dst_pmap);
4795	PG_V = pmap_valid_bit(dst_pmap);
4796
4797	for (addr = src_addr; addr < end_addr; addr = va_next) {
4798		pt_entry_t *src_pte, *dst_pte;
4799		vm_page_t dstmpde, dstmpte, srcmpte;
4800		pml4_entry_t *pml4e;
4801		pdp_entry_t *pdpe;
4802		pd_entry_t srcptepaddr, *pde;
4803
4804		KASSERT(addr < UPT_MIN_ADDRESS,
4805		    ("pmap_copy: invalid to pmap_copy page tables"));
4806
4807		pml4e = pmap_pml4e(src_pmap, addr);
4808		if ((*pml4e & PG_V) == 0) {
4809			va_next = (addr + NBPML4) & ~PML4MASK;
4810			if (va_next < addr)
4811				va_next = end_addr;
4812			continue;
4813		}
4814
4815		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4816		if ((*pdpe & PG_V) == 0) {
4817			va_next = (addr + NBPDP) & ~PDPMASK;
4818			if (va_next < addr)
4819				va_next = end_addr;
4820			continue;
4821		}
4822
4823		va_next = (addr + NBPDR) & ~PDRMASK;
4824		if (va_next < addr)
4825			va_next = end_addr;
4826
4827		pde = pmap_pdpe_to_pde(pdpe, addr);
4828		srcptepaddr = *pde;
4829		if (srcptepaddr == 0)
4830			continue;
4831
4832		if (srcptepaddr & PG_PS) {
4833			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4834				continue;
4835			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4836			if (dstmpde == NULL)
4837				break;
4838			pde = (pd_entry_t *)
4839			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4840			pde = &pde[pmap_pde_index(addr)];
4841			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4842			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4843			    PG_PS_FRAME, &lock))) {
4844				*pde = srcptepaddr & ~PG_W;
4845				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4846			} else
4847				dstmpde->wire_count--;
4848			continue;
4849		}
4850
4851		srcptepaddr &= PG_FRAME;
4852		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4853		KASSERT(srcmpte->wire_count > 0,
4854		    ("pmap_copy: source page table page is unused"));
4855
4856		if (va_next > end_addr)
4857			va_next = end_addr;
4858
4859		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4860		src_pte = &src_pte[pmap_pte_index(addr)];
4861		dstmpte = NULL;
4862		while (addr < va_next) {
4863			pt_entry_t ptetemp;
4864			ptetemp = *src_pte;
4865			/*
4866			 * we only virtual copy managed pages
4867			 */
4868			if ((ptetemp & PG_MANAGED) != 0) {
4869				if (dstmpte != NULL &&
4870				    dstmpte->pindex == pmap_pde_pindex(addr))
4871					dstmpte->wire_count++;
4872				else if ((dstmpte = pmap_allocpte(dst_pmap,
4873				    addr, NULL)) == NULL)
4874					goto out;
4875				dst_pte = (pt_entry_t *)
4876				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4877				dst_pte = &dst_pte[pmap_pte_index(addr)];
4878				if (*dst_pte == 0 &&
4879				    pmap_try_insert_pv_entry(dst_pmap, addr,
4880				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4881				    &lock)) {
4882					/*
4883					 * Clear the wired, modified, and
4884					 * accessed (referenced) bits
4885					 * during the copy.
4886					 */
4887					*dst_pte = ptetemp & ~(PG_W | PG_M |
4888					    PG_A);
4889					pmap_resident_count_inc(dst_pmap, 1);
4890				} else {
4891					SLIST_INIT(&free);
4892					if (pmap_unwire_ptp(dst_pmap, addr,
4893					    dstmpte, &free)) {
4894						pmap_invalidate_page(dst_pmap,
4895						    addr);
4896						pmap_free_zero_pages(&free);
4897					}
4898					goto out;
4899				}
4900				if (dstmpte->wire_count >= srcmpte->wire_count)
4901					break;
4902			}
4903			addr += PAGE_SIZE;
4904			src_pte++;
4905		}
4906	}
4907out:
4908	if (lock != NULL)
4909		rw_wunlock(lock);
4910	rw_runlock(&pvh_global_lock);
4911	PMAP_UNLOCK(src_pmap);
4912	PMAP_UNLOCK(dst_pmap);
4913}
4914
4915/*
4916 *	pmap_zero_page zeros the specified hardware page by mapping
4917 *	the page into KVM and using bzero to clear its contents.
4918 */
4919void
4920pmap_zero_page(vm_page_t m)
4921{
4922	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4923
4924	pagezero((void *)va);
4925}
4926
4927/*
4928 *	pmap_zero_page_area zeros the specified hardware page by mapping
4929 *	the page into KVM and using bzero to clear its contents.
4930 *
4931 *	off and size may not cover an area beyond a single hardware page.
4932 */
4933void
4934pmap_zero_page_area(vm_page_t m, int off, int size)
4935{
4936	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4937
4938	if (off == 0 && size == PAGE_SIZE)
4939		pagezero((void *)va);
4940	else
4941		bzero((char *)va + off, size);
4942}
4943
4944/*
4945 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4946 *	the page into KVM and using bzero to clear its contents.  This
4947 *	is intended to be called from the vm_pagezero process only and
4948 *	outside of Giant.
4949 */
4950void
4951pmap_zero_page_idle(vm_page_t m)
4952{
4953	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4954
4955	pagezero((void *)va);
4956}
4957
4958/*
4959 *	pmap_copy_page copies the specified (machine independent)
4960 *	page by mapping the page into virtual memory and using
4961 *	bcopy to copy the page, one machine dependent page at a
4962 *	time.
4963 */
4964void
4965pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4966{
4967	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4968	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4969
4970	pagecopy((void *)src, (void *)dst);
4971}
4972
4973int unmapped_buf_allowed = 1;
4974
4975void
4976pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4977    vm_offset_t b_offset, int xfersize)
4978{
4979	void *a_cp, *b_cp;
4980	vm_page_t m_a, m_b;
4981	vm_paddr_t p_a, p_b;
4982	pt_entry_t *pte;
4983	vm_offset_t a_pg_offset, b_pg_offset;
4984	int cnt;
4985	boolean_t pinned;
4986
4987	pinned = FALSE;
4988	while (xfersize > 0) {
4989		a_pg_offset = a_offset & PAGE_MASK;
4990		m_a = ma[a_offset >> PAGE_SHIFT];
4991		p_a = m_a->phys_addr;
4992		b_pg_offset = b_offset & PAGE_MASK;
4993		m_b = mb[b_offset >> PAGE_SHIFT];
4994		p_b = m_b->phys_addr;
4995		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4996		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4997		if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
4998		    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
4999			mtx_lock(&cpage_lock);
5000			sched_pin();
5001			pinned = TRUE;
5002			pte = vtopte(cpage_a);
5003			*pte = p_a | X86_PG_A | X86_PG_V |
5004			    pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
5005			invlpg(cpage_a);
5006			a_cp = (char *)cpage_a + a_pg_offset;
5007		} else {
5008			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5009		}
5010		if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
5011		    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
5012			if (!pinned) {
5013				mtx_lock(&cpage_lock);
5014				sched_pin();
5015				pinned = TRUE;
5016			}
5017			pte = vtopte(cpage_b);
5018			*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
5019			    X86_PG_V | pmap_cache_bits(kernel_pmap,
5020			    m_b->md.pat_mode, 0);
5021			invlpg(cpage_b);
5022			b_cp = (char *)cpage_b + b_pg_offset;
5023		} else {
5024			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5025		}
5026		bcopy(a_cp, b_cp, cnt);
5027		if (__predict_false(pinned)) {
5028			sched_unpin();
5029			mtx_unlock(&cpage_lock);
5030			pinned = FALSE;
5031		}
5032		a_offset += cnt;
5033		b_offset += cnt;
5034		xfersize -= cnt;
5035	}
5036}
5037
5038/*
5039 * Returns true if the pmap's pv is one of the first
5040 * 16 pvs linked to from this page.  This count may
5041 * be changed upwards or downwards in the future; it
5042 * is only necessary that true be returned for a small
5043 * subset of pmaps for proper page aging.
5044 */
5045boolean_t
5046pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5047{
5048	struct md_page *pvh;
5049	struct rwlock *lock;
5050	pv_entry_t pv;
5051	int loops = 0;
5052	boolean_t rv;
5053
5054	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5055	    ("pmap_page_exists_quick: page %p is not managed", m));
5056	rv = FALSE;
5057	rw_rlock(&pvh_global_lock);
5058	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5059	rw_rlock(lock);
5060	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5061		if (PV_PMAP(pv) == pmap) {
5062			rv = TRUE;
5063			break;
5064		}
5065		loops++;
5066		if (loops >= 16)
5067			break;
5068	}
5069	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5070		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5071		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5072			if (PV_PMAP(pv) == pmap) {
5073				rv = TRUE;
5074				break;
5075			}
5076			loops++;
5077			if (loops >= 16)
5078				break;
5079		}
5080	}
5081	rw_runlock(lock);
5082	rw_runlock(&pvh_global_lock);
5083	return (rv);
5084}
5085
5086/*
5087 *	pmap_page_wired_mappings:
5088 *
5089 *	Return the number of managed mappings to the given physical page
5090 *	that are wired.
5091 */
5092int
5093pmap_page_wired_mappings(vm_page_t m)
5094{
5095	struct rwlock *lock;
5096	struct md_page *pvh;
5097	pmap_t pmap;
5098	pt_entry_t *pte;
5099	pv_entry_t pv;
5100	int count, md_gen, pvh_gen;
5101
5102	if ((m->oflags & VPO_UNMANAGED) != 0)
5103		return (0);
5104	rw_rlock(&pvh_global_lock);
5105	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5106	rw_rlock(lock);
5107restart:
5108	count = 0;
5109	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5110		pmap = PV_PMAP(pv);
5111		if (!PMAP_TRYLOCK(pmap)) {
5112			md_gen = m->md.pv_gen;
5113			rw_runlock(lock);
5114			PMAP_LOCK(pmap);
5115			rw_rlock(lock);
5116			if (md_gen != m->md.pv_gen) {
5117				PMAP_UNLOCK(pmap);
5118				goto restart;
5119			}
5120		}
5121		pte = pmap_pte(pmap, pv->pv_va);
5122		if ((*pte & PG_W) != 0)
5123			count++;
5124		PMAP_UNLOCK(pmap);
5125	}
5126	if ((m->flags & PG_FICTITIOUS) == 0) {
5127		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5128		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5129			pmap = PV_PMAP(pv);
5130			if (!PMAP_TRYLOCK(pmap)) {
5131				md_gen = m->md.pv_gen;
5132				pvh_gen = pvh->pv_gen;
5133				rw_runlock(lock);
5134				PMAP_LOCK(pmap);
5135				rw_rlock(lock);
5136				if (md_gen != m->md.pv_gen ||
5137				    pvh_gen != pvh->pv_gen) {
5138					PMAP_UNLOCK(pmap);
5139					goto restart;
5140				}
5141			}
5142			pte = pmap_pde(pmap, pv->pv_va);
5143			if ((*pte & PG_W) != 0)
5144				count++;
5145			PMAP_UNLOCK(pmap);
5146		}
5147	}
5148	rw_runlock(lock);
5149	rw_runlock(&pvh_global_lock);
5150	return (count);
5151}
5152
5153/*
5154 * Returns TRUE if the given page is mapped individually or as part of
5155 * a 2mpage.  Otherwise, returns FALSE.
5156 */
5157boolean_t
5158pmap_page_is_mapped(vm_page_t m)
5159{
5160	struct rwlock *lock;
5161	boolean_t rv;
5162
5163	if ((m->oflags & VPO_UNMANAGED) != 0)
5164		return (FALSE);
5165	rw_rlock(&pvh_global_lock);
5166	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5167	rw_rlock(lock);
5168	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5169	    ((m->flags & PG_FICTITIOUS) == 0 &&
5170	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5171	rw_runlock(lock);
5172	rw_runlock(&pvh_global_lock);
5173	return (rv);
5174}
5175
5176/*
5177 * Destroy all managed, non-wired mappings in the given user-space
5178 * pmap.  This pmap cannot be active on any processor besides the
5179 * caller.
5180 *
5181 * This function cannot be applied to the kernel pmap.  Moreover, it
5182 * is not intended for general use.  It is only to be used during
5183 * process termination.  Consequently, it can be implemented in ways
5184 * that make it faster than pmap_remove().  First, it can more quickly
5185 * destroy mappings by iterating over the pmap's collection of PV
5186 * entries, rather than searching the page table.  Second, it doesn't
5187 * have to test and clear the page table entries atomically, because
5188 * no processor is currently accessing the user address space.  In
5189 * particular, a page table entry's dirty bit won't change state once
5190 * this function starts.
5191 */
5192void
5193pmap_remove_pages(pmap_t pmap)
5194{
5195	pd_entry_t ptepde;
5196	pt_entry_t *pte, tpte;
5197	pt_entry_t PG_M, PG_RW, PG_V;
5198	struct spglist free;
5199	vm_page_t m, mpte, mt;
5200	pv_entry_t pv;
5201	struct md_page *pvh;
5202	struct pv_chunk *pc, *npc;
5203	struct rwlock *lock;
5204	int64_t bit;
5205	uint64_t inuse, bitmask;
5206	int allfree, field, freed, idx;
5207	boolean_t superpage;
5208	vm_paddr_t pa;
5209
5210	/*
5211	 * Assert that the given pmap is only active on the current
5212	 * CPU.  Unfortunately, we cannot block another CPU from
5213	 * activating the pmap while this function is executing.
5214	 */
5215	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5216#ifdef INVARIANTS
5217	{
5218		cpuset_t other_cpus;
5219
5220		other_cpus = all_cpus;
5221		critical_enter();
5222		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5223		CPU_AND(&other_cpus, &pmap->pm_active);
5224		critical_exit();
5225		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5226	}
5227#endif
5228
5229	lock = NULL;
5230	PG_M = pmap_modified_bit(pmap);
5231	PG_V = pmap_valid_bit(pmap);
5232	PG_RW = pmap_rw_bit(pmap);
5233
5234	SLIST_INIT(&free);
5235	rw_rlock(&pvh_global_lock);
5236	PMAP_LOCK(pmap);
5237	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5238		allfree = 1;
5239		freed = 0;
5240		for (field = 0; field < _NPCM; field++) {
5241			inuse = ~pc->pc_map[field] & pc_freemask[field];
5242			while (inuse != 0) {
5243				bit = bsfq(inuse);
5244				bitmask = 1UL << bit;
5245				idx = field * 64 + bit;
5246				pv = &pc->pc_pventry[idx];
5247				inuse &= ~bitmask;
5248
5249				pte = pmap_pdpe(pmap, pv->pv_va);
5250				ptepde = *pte;
5251				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5252				tpte = *pte;
5253				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5254					superpage = FALSE;
5255					ptepde = tpte;
5256					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5257					    PG_FRAME);
5258					pte = &pte[pmap_pte_index(pv->pv_va)];
5259					tpte = *pte;
5260				} else {
5261					/*
5262					 * Keep track whether 'tpte' is a
5263					 * superpage explicitly instead of
5264					 * relying on PG_PS being set.
5265					 *
5266					 * This is because PG_PS is numerically
5267					 * identical to PG_PTE_PAT and thus a
5268					 * regular page could be mistaken for
5269					 * a superpage.
5270					 */
5271					superpage = TRUE;
5272				}
5273
5274				if ((tpte & PG_V) == 0) {
5275					panic("bad pte va %lx pte %lx",
5276					    pv->pv_va, tpte);
5277				}
5278
5279/*
5280 * We cannot remove wired pages from a process' mapping at this time
5281 */
5282				if (tpte & PG_W) {
5283					allfree = 0;
5284					continue;
5285				}
5286
5287				if (superpage)
5288					pa = tpte & PG_PS_FRAME;
5289				else
5290					pa = tpte & PG_FRAME;
5291
5292				m = PHYS_TO_VM_PAGE(pa);
5293				KASSERT(m->phys_addr == pa,
5294				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5295				    m, (uintmax_t)m->phys_addr,
5296				    (uintmax_t)tpte));
5297
5298				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5299				    m < &vm_page_array[vm_page_array_size],
5300				    ("pmap_remove_pages: bad tpte %#jx",
5301				    (uintmax_t)tpte));
5302
5303				pte_clear(pte);
5304
5305				/*
5306				 * Update the vm_page_t clean/reference bits.
5307				 */
5308				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5309					if (superpage) {
5310						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5311							vm_page_dirty(mt);
5312					} else
5313						vm_page_dirty(m);
5314				}
5315
5316				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5317
5318				/* Mark free */
5319				pc->pc_map[field] |= bitmask;
5320				if (superpage) {
5321					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5322					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5323					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5324					pvh->pv_gen++;
5325					if (TAILQ_EMPTY(&pvh->pv_list)) {
5326						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5327							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5328							    TAILQ_EMPTY(&mt->md.pv_list))
5329								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5330					}
5331					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5332					if (mpte != NULL) {
5333						pmap_remove_pt_page(pmap, mpte);
5334						pmap_resident_count_dec(pmap, 1);
5335						KASSERT(mpte->wire_count == NPTEPG,
5336						    ("pmap_remove_pages: pte page wire count error"));
5337						mpte->wire_count = 0;
5338						pmap_add_delayed_free_list(mpte, &free, FALSE);
5339						atomic_subtract_int(&cnt.v_wire_count, 1);
5340					}
5341				} else {
5342					pmap_resident_count_dec(pmap, 1);
5343					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5344					m->md.pv_gen++;
5345					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5346					    TAILQ_EMPTY(&m->md.pv_list) &&
5347					    (m->flags & PG_FICTITIOUS) == 0) {
5348						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5349						if (TAILQ_EMPTY(&pvh->pv_list))
5350							vm_page_aflag_clear(m, PGA_WRITEABLE);
5351					}
5352				}
5353				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5354				freed++;
5355			}
5356		}
5357		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5358		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5359		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5360		if (allfree) {
5361			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5362			free_pv_chunk(pc);
5363		}
5364	}
5365	if (lock != NULL)
5366		rw_wunlock(lock);
5367	pmap_invalidate_all(pmap);
5368	rw_runlock(&pvh_global_lock);
5369	PMAP_UNLOCK(pmap);
5370	pmap_free_zero_pages(&free);
5371}
5372
5373static boolean_t
5374pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5375{
5376	struct rwlock *lock;
5377	pv_entry_t pv;
5378	struct md_page *pvh;
5379	pt_entry_t *pte, mask;
5380	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5381	pmap_t pmap;
5382	int md_gen, pvh_gen;
5383	boolean_t rv;
5384
5385	rv = FALSE;
5386	rw_rlock(&pvh_global_lock);
5387	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5388	rw_rlock(lock);
5389restart:
5390	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5391		pmap = PV_PMAP(pv);
5392		if (!PMAP_TRYLOCK(pmap)) {
5393			md_gen = m->md.pv_gen;
5394			rw_runlock(lock);
5395			PMAP_LOCK(pmap);
5396			rw_rlock(lock);
5397			if (md_gen != m->md.pv_gen) {
5398				PMAP_UNLOCK(pmap);
5399				goto restart;
5400			}
5401		}
5402		pte = pmap_pte(pmap, pv->pv_va);
5403		mask = 0;
5404		if (modified) {
5405			PG_M = pmap_modified_bit(pmap);
5406			PG_RW = pmap_rw_bit(pmap);
5407			mask |= PG_RW | PG_M;
5408		}
5409		if (accessed) {
5410			PG_A = pmap_accessed_bit(pmap);
5411			PG_V = pmap_valid_bit(pmap);
5412			mask |= PG_V | PG_A;
5413		}
5414		rv = (*pte & mask) == mask;
5415		PMAP_UNLOCK(pmap);
5416		if (rv)
5417			goto out;
5418	}
5419	if ((m->flags & PG_FICTITIOUS) == 0) {
5420		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5421		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5422			pmap = PV_PMAP(pv);
5423			if (!PMAP_TRYLOCK(pmap)) {
5424				md_gen = m->md.pv_gen;
5425				pvh_gen = pvh->pv_gen;
5426				rw_runlock(lock);
5427				PMAP_LOCK(pmap);
5428				rw_rlock(lock);
5429				if (md_gen != m->md.pv_gen ||
5430				    pvh_gen != pvh->pv_gen) {
5431					PMAP_UNLOCK(pmap);
5432					goto restart;
5433				}
5434			}
5435			pte = pmap_pde(pmap, pv->pv_va);
5436			mask = 0;
5437			if (modified) {
5438				PG_M = pmap_modified_bit(pmap);
5439				PG_RW = pmap_rw_bit(pmap);
5440				mask |= PG_RW | PG_M;
5441			}
5442			if (accessed) {
5443				PG_A = pmap_accessed_bit(pmap);
5444				PG_V = pmap_valid_bit(pmap);
5445				mask |= PG_V | PG_A;
5446			}
5447			rv = (*pte & mask) == mask;
5448			PMAP_UNLOCK(pmap);
5449			if (rv)
5450				goto out;
5451		}
5452	}
5453out:
5454	rw_runlock(lock);
5455	rw_runlock(&pvh_global_lock);
5456	return (rv);
5457}
5458
5459/*
5460 *	pmap_is_modified:
5461 *
5462 *	Return whether or not the specified physical page was modified
5463 *	in any physical maps.
5464 */
5465boolean_t
5466pmap_is_modified(vm_page_t m)
5467{
5468
5469	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5470	    ("pmap_is_modified: page %p is not managed", m));
5471
5472	/*
5473	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5474	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5475	 * is clear, no PTEs can have PG_M set.
5476	 */
5477	VM_OBJECT_ASSERT_WLOCKED(m->object);
5478	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5479		return (FALSE);
5480	return (pmap_page_test_mappings(m, FALSE, TRUE));
5481}
5482
5483/*
5484 *	pmap_is_prefaultable:
5485 *
5486 *	Return whether or not the specified virtual address is eligible
5487 *	for prefault.
5488 */
5489boolean_t
5490pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5491{
5492	pd_entry_t *pde;
5493	pt_entry_t *pte, PG_V;
5494	boolean_t rv;
5495
5496	PG_V = pmap_valid_bit(pmap);
5497	rv = FALSE;
5498	PMAP_LOCK(pmap);
5499	pde = pmap_pde(pmap, addr);
5500	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5501		pte = pmap_pde_to_pte(pde, addr);
5502		rv = (*pte & PG_V) == 0;
5503	}
5504	PMAP_UNLOCK(pmap);
5505	return (rv);
5506}
5507
5508/*
5509 *	pmap_is_referenced:
5510 *
5511 *	Return whether or not the specified physical page was referenced
5512 *	in any physical maps.
5513 */
5514boolean_t
5515pmap_is_referenced(vm_page_t m)
5516{
5517
5518	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5519	    ("pmap_is_referenced: page %p is not managed", m));
5520	return (pmap_page_test_mappings(m, TRUE, FALSE));
5521}
5522
5523/*
5524 * Clear the write and modified bits in each of the given page's mappings.
5525 */
5526void
5527pmap_remove_write(vm_page_t m)
5528{
5529	struct md_page *pvh;
5530	pmap_t pmap;
5531	struct rwlock *lock;
5532	pv_entry_t next_pv, pv;
5533	pd_entry_t *pde;
5534	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5535	vm_offset_t va;
5536	int pvh_gen, md_gen;
5537
5538	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5539	    ("pmap_remove_write: page %p is not managed", m));
5540
5541	/*
5542	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5543	 * set by another thread while the object is locked.  Thus,
5544	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5545	 */
5546	VM_OBJECT_ASSERT_WLOCKED(m->object);
5547	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5548		return;
5549	rw_rlock(&pvh_global_lock);
5550	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5551	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5552retry_pv_loop:
5553	rw_wlock(lock);
5554	if ((m->flags & PG_FICTITIOUS) != 0)
5555		goto small_mappings;
5556	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5557		pmap = PV_PMAP(pv);
5558		if (!PMAP_TRYLOCK(pmap)) {
5559			pvh_gen = pvh->pv_gen;
5560			rw_wunlock(lock);
5561			PMAP_LOCK(pmap);
5562			rw_wlock(lock);
5563			if (pvh_gen != pvh->pv_gen) {
5564				PMAP_UNLOCK(pmap);
5565				rw_wunlock(lock);
5566				goto retry_pv_loop;
5567			}
5568		}
5569		PG_RW = pmap_rw_bit(pmap);
5570		va = pv->pv_va;
5571		pde = pmap_pde(pmap, va);
5572		if ((*pde & PG_RW) != 0)
5573			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5574		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5575		    ("inconsistent pv lock %p %p for page %p",
5576		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5577		PMAP_UNLOCK(pmap);
5578	}
5579small_mappings:
5580	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5581		pmap = PV_PMAP(pv);
5582		if (!PMAP_TRYLOCK(pmap)) {
5583			pvh_gen = pvh->pv_gen;
5584			md_gen = m->md.pv_gen;
5585			rw_wunlock(lock);
5586			PMAP_LOCK(pmap);
5587			rw_wlock(lock);
5588			if (pvh_gen != pvh->pv_gen ||
5589			    md_gen != m->md.pv_gen) {
5590				PMAP_UNLOCK(pmap);
5591				rw_wunlock(lock);
5592				goto retry_pv_loop;
5593			}
5594		}
5595		PG_M = pmap_modified_bit(pmap);
5596		PG_RW = pmap_rw_bit(pmap);
5597		pde = pmap_pde(pmap, pv->pv_va);
5598		KASSERT((*pde & PG_PS) == 0,
5599		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5600		    m));
5601		pte = pmap_pde_to_pte(pde, pv->pv_va);
5602retry:
5603		oldpte = *pte;
5604		if (oldpte & PG_RW) {
5605			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5606			    ~(PG_RW | PG_M)))
5607				goto retry;
5608			if ((oldpte & PG_M) != 0)
5609				vm_page_dirty(m);
5610			pmap_invalidate_page(pmap, pv->pv_va);
5611		}
5612		PMAP_UNLOCK(pmap);
5613	}
5614	rw_wunlock(lock);
5615	vm_page_aflag_clear(m, PGA_WRITEABLE);
5616	rw_runlock(&pvh_global_lock);
5617}
5618
5619static __inline boolean_t
5620safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5621{
5622
5623	if (!pmap_emulate_ad_bits(pmap))
5624		return (TRUE);
5625
5626	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5627
5628	/*
5629	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
5630	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
5631	 * if the EPT_PG_WRITE bit is set.
5632	 */
5633	if ((pte & EPT_PG_WRITE) != 0)
5634		return (FALSE);
5635
5636	/*
5637	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
5638	 */
5639	if ((pte & EPT_PG_EXECUTE) == 0 ||
5640	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5641		return (TRUE);
5642	else
5643		return (FALSE);
5644}
5645
5646#define	PMAP_TS_REFERENCED_MAX	5
5647
5648/*
5649 *	pmap_ts_referenced:
5650 *
5651 *	Return a count of reference bits for a page, clearing those bits.
5652 *	It is not necessary for every reference bit to be cleared, but it
5653 *	is necessary that 0 only be returned when there are truly no
5654 *	reference bits set.
5655 *
5656 *	XXX: The exact number of bits to check and clear is a matter that
5657 *	should be tested and standardized at some point in the future for
5658 *	optimal aging of shared pages.
5659 */
5660int
5661pmap_ts_referenced(vm_page_t m)
5662{
5663	struct md_page *pvh;
5664	pv_entry_t pv, pvf;
5665	pmap_t pmap;
5666	struct rwlock *lock;
5667	pd_entry_t oldpde, *pde;
5668	pt_entry_t *pte, PG_A;
5669	vm_offset_t va;
5670	vm_paddr_t pa;
5671	int cleared, md_gen, not_cleared, pvh_gen;
5672	struct spglist free;
5673	boolean_t demoted;
5674
5675	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5676	    ("pmap_ts_referenced: page %p is not managed", m));
5677	SLIST_INIT(&free);
5678	cleared = 0;
5679	pa = VM_PAGE_TO_PHYS(m);
5680	lock = PHYS_TO_PV_LIST_LOCK(pa);
5681	pvh = pa_to_pvh(pa);
5682	rw_rlock(&pvh_global_lock);
5683	rw_wlock(lock);
5684retry:
5685	not_cleared = 0;
5686	if ((m->flags & PG_FICTITIOUS) != 0 ||
5687	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5688		goto small_mappings;
5689	pv = pvf;
5690	do {
5691		if (pvf == NULL)
5692			pvf = pv;
5693		pmap = PV_PMAP(pv);
5694		if (!PMAP_TRYLOCK(pmap)) {
5695			pvh_gen = pvh->pv_gen;
5696			rw_wunlock(lock);
5697			PMAP_LOCK(pmap);
5698			rw_wlock(lock);
5699			if (pvh_gen != pvh->pv_gen) {
5700				PMAP_UNLOCK(pmap);
5701				goto retry;
5702			}
5703		}
5704		PG_A = pmap_accessed_bit(pmap);
5705		va = pv->pv_va;
5706		pde = pmap_pde(pmap, pv->pv_va);
5707		oldpde = *pde;
5708		if ((*pde & PG_A) != 0) {
5709			/*
5710			 * Since this reference bit is shared by 512 4KB
5711			 * pages, it should not be cleared every time it is
5712			 * tested.  Apply a simple "hash" function on the
5713			 * physical page number, the virtual superpage number,
5714			 * and the pmap address to select one 4KB page out of
5715			 * the 512 on which testing the reference bit will
5716			 * result in clearing that reference bit.  This
5717			 * function is designed to avoid the selection of the
5718			 * same 4KB page for every 2MB page mapping.
5719			 *
5720			 * On demotion, a mapping that hasn't been referenced
5721			 * is simply destroyed.  To avoid the possibility of a
5722			 * subsequent page fault on a demoted wired mapping,
5723			 * always leave its reference bit set.  Moreover,
5724			 * since the superpage is wired, the current state of
5725			 * its reference bit won't affect page replacement.
5726			 */
5727			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5728			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5729			    (*pde & PG_W) == 0) {
5730				if (safe_to_clear_referenced(pmap, oldpde)) {
5731					atomic_clear_long(pde, PG_A);
5732					pmap_invalidate_page(pmap, pv->pv_va);
5733					demoted = FALSE;
5734				} else if (pmap_demote_pde_locked(pmap, pde,
5735				    pv->pv_va, &lock)) {
5736					/*
5737					 * Remove the mapping to a single page
5738					 * so that a subsequent access may
5739					 * repromote.  Since the underlying
5740					 * page table page is fully populated,
5741					 * this removal never frees a page
5742					 * table page.
5743					 */
5744					demoted = TRUE;
5745					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5746					    PG_PS_FRAME);
5747					pte = pmap_pde_to_pte(pde, va);
5748					pmap_remove_pte(pmap, pte, va, *pde,
5749					    NULL, &lock);
5750					pmap_invalidate_page(pmap, va);
5751				} else
5752					demoted = TRUE;
5753
5754				if (demoted) {
5755					/*
5756					 * The superpage mapping was removed
5757					 * entirely and therefore 'pv' is no
5758					 * longer valid.
5759					 */
5760					if (pvf == pv)
5761						pvf = NULL;
5762					pv = NULL;
5763				}
5764				cleared++;
5765				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5766				    ("inconsistent pv lock %p %p for page %p",
5767				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5768			} else
5769				not_cleared++;
5770		}
5771		PMAP_UNLOCK(pmap);
5772		/* Rotate the PV list if it has more than one entry. */
5773		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5774			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5775			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5776			pvh->pv_gen++;
5777		}
5778		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5779			goto out;
5780	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5781small_mappings:
5782	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5783		goto out;
5784	pv = pvf;
5785	do {
5786		if (pvf == NULL)
5787			pvf = pv;
5788		pmap = PV_PMAP(pv);
5789		if (!PMAP_TRYLOCK(pmap)) {
5790			pvh_gen = pvh->pv_gen;
5791			md_gen = m->md.pv_gen;
5792			rw_wunlock(lock);
5793			PMAP_LOCK(pmap);
5794			rw_wlock(lock);
5795			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5796				PMAP_UNLOCK(pmap);
5797				goto retry;
5798			}
5799		}
5800		PG_A = pmap_accessed_bit(pmap);
5801		pde = pmap_pde(pmap, pv->pv_va);
5802		KASSERT((*pde & PG_PS) == 0,
5803		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5804		    m));
5805		pte = pmap_pde_to_pte(pde, pv->pv_va);
5806		if ((*pte & PG_A) != 0) {
5807			if (safe_to_clear_referenced(pmap, *pte)) {
5808				atomic_clear_long(pte, PG_A);
5809				pmap_invalidate_page(pmap, pv->pv_va);
5810				cleared++;
5811			} else if ((*pte & PG_W) == 0) {
5812				/*
5813				 * Wired pages cannot be paged out so
5814				 * doing accessed bit emulation for
5815				 * them is wasted effort. We do the
5816				 * hard work for unwired pages only.
5817				 */
5818				pmap_remove_pte(pmap, pte, pv->pv_va,
5819				    *pde, &free, &lock);
5820				pmap_invalidate_page(pmap, pv->pv_va);
5821				cleared++;
5822				if (pvf == pv)
5823					pvf = NULL;
5824				pv = NULL;
5825				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5826				    ("inconsistent pv lock %p %p for page %p",
5827				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5828			} else
5829				not_cleared++;
5830		}
5831		PMAP_UNLOCK(pmap);
5832		/* Rotate the PV list if it has more than one entry. */
5833		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5834			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5835			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5836			m->md.pv_gen++;
5837		}
5838	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5839	    not_cleared < PMAP_TS_REFERENCED_MAX);
5840out:
5841	rw_wunlock(lock);
5842	rw_runlock(&pvh_global_lock);
5843	pmap_free_zero_pages(&free);
5844	return (cleared + not_cleared);
5845}
5846
5847/*
5848 *	Apply the given advice to the specified range of addresses within the
5849 *	given pmap.  Depending on the advice, clear the referenced and/or
5850 *	modified flags in each mapping and set the mapped page's dirty field.
5851 */
5852void
5853pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5854{
5855	struct rwlock *lock;
5856	pml4_entry_t *pml4e;
5857	pdp_entry_t *pdpe;
5858	pd_entry_t oldpde, *pde;
5859	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5860	vm_offset_t va_next;
5861	vm_page_t m;
5862	boolean_t anychanged, pv_lists_locked;
5863
5864	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5865		return;
5866
5867	/*
5868	 * A/D bit emulation requires an alternate code path when clearing
5869	 * the modified and accessed bits below. Since this function is
5870	 * advisory in nature we skip it entirely for pmaps that require
5871	 * A/D bit emulation.
5872	 */
5873	if (pmap_emulate_ad_bits(pmap))
5874		return;
5875
5876	PG_A = pmap_accessed_bit(pmap);
5877	PG_G = pmap_global_bit(pmap);
5878	PG_M = pmap_modified_bit(pmap);
5879	PG_V = pmap_valid_bit(pmap);
5880	PG_RW = pmap_rw_bit(pmap);
5881
5882	pv_lists_locked = FALSE;
5883resume:
5884	anychanged = FALSE;
5885	PMAP_LOCK(pmap);
5886	for (; sva < eva; sva = va_next) {
5887		pml4e = pmap_pml4e(pmap, sva);
5888		if ((*pml4e & PG_V) == 0) {
5889			va_next = (sva + NBPML4) & ~PML4MASK;
5890			if (va_next < sva)
5891				va_next = eva;
5892			continue;
5893		}
5894		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5895		if ((*pdpe & PG_V) == 0) {
5896			va_next = (sva + NBPDP) & ~PDPMASK;
5897			if (va_next < sva)
5898				va_next = eva;
5899			continue;
5900		}
5901		va_next = (sva + NBPDR) & ~PDRMASK;
5902		if (va_next < sva)
5903			va_next = eva;
5904		pde = pmap_pdpe_to_pde(pdpe, sva);
5905		oldpde = *pde;
5906		if ((oldpde & PG_V) == 0)
5907			continue;
5908		else if ((oldpde & PG_PS) != 0) {
5909			if ((oldpde & PG_MANAGED) == 0)
5910				continue;
5911			if (!pv_lists_locked) {
5912				pv_lists_locked = TRUE;
5913				if (!rw_try_rlock(&pvh_global_lock)) {
5914					if (anychanged)
5915						pmap_invalidate_all(pmap);
5916					PMAP_UNLOCK(pmap);
5917					rw_rlock(&pvh_global_lock);
5918					goto resume;
5919				}
5920			}
5921			lock = NULL;
5922			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
5923				if (lock != NULL)
5924					rw_wunlock(lock);
5925
5926				/*
5927				 * The large page mapping was destroyed.
5928				 */
5929				continue;
5930			}
5931
5932			/*
5933			 * Unless the page mappings are wired, remove the
5934			 * mapping to a single page so that a subsequent
5935			 * access may repromote.  Since the underlying page
5936			 * table page is fully populated, this removal never
5937			 * frees a page table page.
5938			 */
5939			if ((oldpde & PG_W) == 0) {
5940				pte = pmap_pde_to_pte(pde, sva);
5941				KASSERT((*pte & PG_V) != 0,
5942				    ("pmap_advise: invalid PTE"));
5943				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
5944				    &lock);
5945				anychanged = TRUE;
5946			}
5947			if (lock != NULL)
5948				rw_wunlock(lock);
5949		}
5950		if (va_next > eva)
5951			va_next = eva;
5952		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5953		    sva += PAGE_SIZE) {
5954			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
5955			    PG_V))
5956				continue;
5957			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5958				if (advice == MADV_DONTNEED) {
5959					/*
5960					 * Future calls to pmap_is_modified()
5961					 * can be avoided by making the page
5962					 * dirty now.
5963					 */
5964					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5965					vm_page_dirty(m);
5966				}
5967				atomic_clear_long(pte, PG_M | PG_A);
5968			} else if ((*pte & PG_A) != 0)
5969				atomic_clear_long(pte, PG_A);
5970			else
5971				continue;
5972			if ((*pte & PG_G) != 0)
5973				pmap_invalidate_page(pmap, sva);
5974			else
5975				anychanged = TRUE;
5976		}
5977	}
5978	if (anychanged)
5979		pmap_invalidate_all(pmap);
5980	if (pv_lists_locked)
5981		rw_runlock(&pvh_global_lock);
5982	PMAP_UNLOCK(pmap);
5983}
5984
5985/*
5986 *	Clear the modify bits on the specified physical page.
5987 */
5988void
5989pmap_clear_modify(vm_page_t m)
5990{
5991	struct md_page *pvh;
5992	pmap_t pmap;
5993	pv_entry_t next_pv, pv;
5994	pd_entry_t oldpde, *pde;
5995	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
5996	struct rwlock *lock;
5997	vm_offset_t va;
5998	int md_gen, pvh_gen;
5999
6000	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6001	    ("pmap_clear_modify: page %p is not managed", m));
6002	VM_OBJECT_ASSERT_WLOCKED(m->object);
6003	KASSERT(!vm_page_xbusied(m),
6004	    ("pmap_clear_modify: page %p is exclusive busied", m));
6005
6006	/*
6007	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6008	 * If the object containing the page is locked and the page is not
6009	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6010	 */
6011	if ((m->aflags & PGA_WRITEABLE) == 0)
6012		return;
6013	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6014	rw_rlock(&pvh_global_lock);
6015	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6016	rw_wlock(lock);
6017restart:
6018	if ((m->flags & PG_FICTITIOUS) != 0)
6019		goto small_mappings;
6020	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6021		pmap = PV_PMAP(pv);
6022		if (!PMAP_TRYLOCK(pmap)) {
6023			pvh_gen = pvh->pv_gen;
6024			rw_wunlock(lock);
6025			PMAP_LOCK(pmap);
6026			rw_wlock(lock);
6027			if (pvh_gen != pvh->pv_gen) {
6028				PMAP_UNLOCK(pmap);
6029				goto restart;
6030			}
6031		}
6032		PG_M = pmap_modified_bit(pmap);
6033		PG_V = pmap_valid_bit(pmap);
6034		PG_RW = pmap_rw_bit(pmap);
6035		va = pv->pv_va;
6036		pde = pmap_pde(pmap, va);
6037		oldpde = *pde;
6038		if ((oldpde & PG_RW) != 0) {
6039			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6040				if ((oldpde & PG_W) == 0) {
6041					/*
6042					 * Write protect the mapping to a
6043					 * single page so that a subsequent
6044					 * write access may repromote.
6045					 */
6046					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6047					    PG_PS_FRAME);
6048					pte = pmap_pde_to_pte(pde, va);
6049					oldpte = *pte;
6050					if ((oldpte & PG_V) != 0) {
6051						while (!atomic_cmpset_long(pte,
6052						    oldpte,
6053						    oldpte & ~(PG_M | PG_RW)))
6054							oldpte = *pte;
6055						vm_page_dirty(m);
6056						pmap_invalidate_page(pmap, va);
6057					}
6058				}
6059			}
6060		}
6061		PMAP_UNLOCK(pmap);
6062	}
6063small_mappings:
6064	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6065		pmap = PV_PMAP(pv);
6066		if (!PMAP_TRYLOCK(pmap)) {
6067			md_gen = m->md.pv_gen;
6068			pvh_gen = pvh->pv_gen;
6069			rw_wunlock(lock);
6070			PMAP_LOCK(pmap);
6071			rw_wlock(lock);
6072			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6073				PMAP_UNLOCK(pmap);
6074				goto restart;
6075			}
6076		}
6077		PG_M = pmap_modified_bit(pmap);
6078		PG_RW = pmap_rw_bit(pmap);
6079		pde = pmap_pde(pmap, pv->pv_va);
6080		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6081		    " a 2mpage in page %p's pv list", m));
6082		pte = pmap_pde_to_pte(pde, pv->pv_va);
6083		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6084			atomic_clear_long(pte, PG_M);
6085			pmap_invalidate_page(pmap, pv->pv_va);
6086		}
6087		PMAP_UNLOCK(pmap);
6088	}
6089	rw_wunlock(lock);
6090	rw_runlock(&pvh_global_lock);
6091}
6092
6093/*
6094 * Miscellaneous support routines follow
6095 */
6096
6097/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6098static __inline void
6099pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6100{
6101	u_int opte, npte;
6102
6103	/*
6104	 * The cache mode bits are all in the low 32-bits of the
6105	 * PTE, so we can just spin on updating the low 32-bits.
6106	 */
6107	do {
6108		opte = *(u_int *)pte;
6109		npte = opte & ~mask;
6110		npte |= cache_bits;
6111	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6112}
6113
6114/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6115static __inline void
6116pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6117{
6118	u_int opde, npde;
6119
6120	/*
6121	 * The cache mode bits are all in the low 32-bits of the
6122	 * PDE, so we can just spin on updating the low 32-bits.
6123	 */
6124	do {
6125		opde = *(u_int *)pde;
6126		npde = opde & ~mask;
6127		npde |= cache_bits;
6128	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6129}
6130
6131/*
6132 * Map a set of physical memory pages into the kernel virtual
6133 * address space. Return a pointer to where it is mapped. This
6134 * routine is intended to be used for mapping device memory,
6135 * NOT real memory.
6136 */
6137void *
6138pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6139{
6140	vm_offset_t va, offset;
6141	vm_size_t tmpsize;
6142
6143	/*
6144	 * If the specified range of physical addresses fits within the direct
6145	 * map window, use the direct map.
6146	 */
6147	if (pa < dmaplimit && pa + size < dmaplimit) {
6148		va = PHYS_TO_DMAP(pa);
6149		if (!pmap_change_attr(va, size, mode))
6150			return ((void *)va);
6151	}
6152	offset = pa & PAGE_MASK;
6153	size = round_page(offset + size);
6154	va = kva_alloc(size);
6155	if (!va)
6156		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6157	pa = trunc_page(pa);
6158	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6159		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6160	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6161	pmap_invalidate_cache_range(va, va + tmpsize);
6162	return ((void *)(va + offset));
6163}
6164
6165void *
6166pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6167{
6168
6169	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6170}
6171
6172void *
6173pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6174{
6175
6176	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6177}
6178
6179void
6180pmap_unmapdev(vm_offset_t va, vm_size_t size)
6181{
6182	vm_offset_t base, offset;
6183
6184	/* If we gave a direct map region in pmap_mapdev, do nothing */
6185	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6186		return;
6187	base = trunc_page(va);
6188	offset = va & PAGE_MASK;
6189	size = round_page(offset + size);
6190	kva_free(base, size);
6191}
6192
6193/*
6194 * Tries to demote a 1GB page mapping.
6195 */
6196static boolean_t
6197pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6198{
6199	pdp_entry_t newpdpe, oldpdpe;
6200	pd_entry_t *firstpde, newpde, *pde;
6201	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6202	vm_paddr_t mpdepa;
6203	vm_page_t mpde;
6204
6205	PG_A = pmap_accessed_bit(pmap);
6206	PG_M = pmap_modified_bit(pmap);
6207	PG_V = pmap_valid_bit(pmap);
6208	PG_RW = pmap_rw_bit(pmap);
6209
6210	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6211	oldpdpe = *pdpe;
6212	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6213	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6214	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6215	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6216		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6217		    " in pmap %p", va, pmap);
6218		return (FALSE);
6219	}
6220	mpdepa = VM_PAGE_TO_PHYS(mpde);
6221	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6222	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6223	KASSERT((oldpdpe & PG_A) != 0,
6224	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6225	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6226	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6227	newpde = oldpdpe;
6228
6229	/*
6230	 * Initialize the page directory page.
6231	 */
6232	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6233		*pde = newpde;
6234		newpde += NBPDR;
6235	}
6236
6237	/*
6238	 * Demote the mapping.
6239	 */
6240	*pdpe = newpdpe;
6241
6242	/*
6243	 * Invalidate a stale recursive mapping of the page directory page.
6244	 */
6245	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6246
6247	pmap_pdpe_demotions++;
6248	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6249	    " in pmap %p", va, pmap);
6250	return (TRUE);
6251}
6252
6253/*
6254 * Sets the memory attribute for the specified page.
6255 */
6256void
6257pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6258{
6259
6260	m->md.pat_mode = ma;
6261
6262	/*
6263	 * If "m" is a normal page, update its direct mapping.  This update
6264	 * can be relied upon to perform any cache operations that are
6265	 * required for data coherence.
6266	 */
6267	if ((m->flags & PG_FICTITIOUS) == 0 &&
6268	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6269	    m->md.pat_mode))
6270		panic("memory attribute change on the direct map failed");
6271}
6272
6273/*
6274 * Changes the specified virtual address range's memory type to that given by
6275 * the parameter "mode".  The specified virtual address range must be
6276 * completely contained within either the direct map or the kernel map.  If
6277 * the virtual address range is contained within the kernel map, then the
6278 * memory type for each of the corresponding ranges of the direct map is also
6279 * changed.  (The corresponding ranges of the direct map are those ranges that
6280 * map the same physical pages as the specified virtual address range.)  These
6281 * changes to the direct map are necessary because Intel describes the
6282 * behavior of their processors as "undefined" if two or more mappings to the
6283 * same physical page have different memory types.
6284 *
6285 * Returns zero if the change completed successfully, and either EINVAL or
6286 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6287 * of the virtual address range was not mapped, and ENOMEM is returned if
6288 * there was insufficient memory available to complete the change.  In the
6289 * latter case, the memory type may have been changed on some part of the
6290 * virtual address range or the direct map.
6291 */
6292int
6293pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6294{
6295	int error;
6296
6297	PMAP_LOCK(kernel_pmap);
6298	error = pmap_change_attr_locked(va, size, mode);
6299	PMAP_UNLOCK(kernel_pmap);
6300	return (error);
6301}
6302
6303static int
6304pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6305{
6306	vm_offset_t base, offset, tmpva;
6307	vm_paddr_t pa_start, pa_end;
6308	pdp_entry_t *pdpe;
6309	pd_entry_t *pde;
6310	pt_entry_t *pte;
6311	int cache_bits_pte, cache_bits_pde, error;
6312	boolean_t changed;
6313
6314	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6315	base = trunc_page(va);
6316	offset = va & PAGE_MASK;
6317	size = round_page(offset + size);
6318
6319	/*
6320	 * Only supported on kernel virtual addresses, including the direct
6321	 * map but excluding the recursive map.
6322	 */
6323	if (base < DMAP_MIN_ADDRESS)
6324		return (EINVAL);
6325
6326	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6327	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6328	changed = FALSE;
6329
6330	/*
6331	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6332	 * into 4KB pages if required.
6333	 */
6334	for (tmpva = base; tmpva < base + size; ) {
6335		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6336		if (*pdpe == 0)
6337			return (EINVAL);
6338		if (*pdpe & PG_PS) {
6339			/*
6340			 * If the current 1GB page already has the required
6341			 * memory type, then we need not demote this page. Just
6342			 * increment tmpva to the next 1GB page frame.
6343			 */
6344			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6345				tmpva = trunc_1gpage(tmpva) + NBPDP;
6346				continue;
6347			}
6348
6349			/*
6350			 * If the current offset aligns with a 1GB page frame
6351			 * and there is at least 1GB left within the range, then
6352			 * we need not break down this page into 2MB pages.
6353			 */
6354			if ((tmpva & PDPMASK) == 0 &&
6355			    tmpva + PDPMASK < base + size) {
6356				tmpva += NBPDP;
6357				continue;
6358			}
6359			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6360				return (ENOMEM);
6361		}
6362		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6363		if (*pde == 0)
6364			return (EINVAL);
6365		if (*pde & PG_PS) {
6366			/*
6367			 * If the current 2MB page already has the required
6368			 * memory type, then we need not demote this page. Just
6369			 * increment tmpva to the next 2MB page frame.
6370			 */
6371			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6372				tmpva = trunc_2mpage(tmpva) + NBPDR;
6373				continue;
6374			}
6375
6376			/*
6377			 * If the current offset aligns with a 2MB page frame
6378			 * and there is at least 2MB left within the range, then
6379			 * we need not break down this page into 4KB pages.
6380			 */
6381			if ((tmpva & PDRMASK) == 0 &&
6382			    tmpva + PDRMASK < base + size) {
6383				tmpva += NBPDR;
6384				continue;
6385			}
6386			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6387				return (ENOMEM);
6388		}
6389		pte = pmap_pde_to_pte(pde, tmpva);
6390		if (*pte == 0)
6391			return (EINVAL);
6392		tmpva += PAGE_SIZE;
6393	}
6394	error = 0;
6395
6396	/*
6397	 * Ok, all the pages exist, so run through them updating their
6398	 * cache mode if required.
6399	 */
6400	pa_start = pa_end = 0;
6401	for (tmpva = base; tmpva < base + size; ) {
6402		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6403		if (*pdpe & PG_PS) {
6404			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6405				pmap_pde_attr(pdpe, cache_bits_pde,
6406				    X86_PG_PDE_CACHE);
6407				changed = TRUE;
6408			}
6409			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6410				if (pa_start == pa_end) {
6411					/* Start physical address run. */
6412					pa_start = *pdpe & PG_PS_FRAME;
6413					pa_end = pa_start + NBPDP;
6414				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6415					pa_end += NBPDP;
6416				else {
6417					/* Run ended, update direct map. */
6418					error = pmap_change_attr_locked(
6419					    PHYS_TO_DMAP(pa_start),
6420					    pa_end - pa_start, mode);
6421					if (error != 0)
6422						break;
6423					/* Start physical address run. */
6424					pa_start = *pdpe & PG_PS_FRAME;
6425					pa_end = pa_start + NBPDP;
6426				}
6427			}
6428			tmpva = trunc_1gpage(tmpva) + NBPDP;
6429			continue;
6430		}
6431		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6432		if (*pde & PG_PS) {
6433			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6434				pmap_pde_attr(pde, cache_bits_pde,
6435				    X86_PG_PDE_CACHE);
6436				changed = TRUE;
6437			}
6438			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6439				if (pa_start == pa_end) {
6440					/* Start physical address run. */
6441					pa_start = *pde & PG_PS_FRAME;
6442					pa_end = pa_start + NBPDR;
6443				} else if (pa_end == (*pde & PG_PS_FRAME))
6444					pa_end += NBPDR;
6445				else {
6446					/* Run ended, update direct map. */
6447					error = pmap_change_attr_locked(
6448					    PHYS_TO_DMAP(pa_start),
6449					    pa_end - pa_start, mode);
6450					if (error != 0)
6451						break;
6452					/* Start physical address run. */
6453					pa_start = *pde & PG_PS_FRAME;
6454					pa_end = pa_start + NBPDR;
6455				}
6456			}
6457			tmpva = trunc_2mpage(tmpva) + NBPDR;
6458		} else {
6459			pte = pmap_pde_to_pte(pde, tmpva);
6460			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6461				pmap_pte_attr(pte, cache_bits_pte,
6462				    X86_PG_PTE_CACHE);
6463				changed = TRUE;
6464			}
6465			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6466				if (pa_start == pa_end) {
6467					/* Start physical address run. */
6468					pa_start = *pte & PG_FRAME;
6469					pa_end = pa_start + PAGE_SIZE;
6470				} else if (pa_end == (*pte & PG_FRAME))
6471					pa_end += PAGE_SIZE;
6472				else {
6473					/* Run ended, update direct map. */
6474					error = pmap_change_attr_locked(
6475					    PHYS_TO_DMAP(pa_start),
6476					    pa_end - pa_start, mode);
6477					if (error != 0)
6478						break;
6479					/* Start physical address run. */
6480					pa_start = *pte & PG_FRAME;
6481					pa_end = pa_start + PAGE_SIZE;
6482				}
6483			}
6484			tmpva += PAGE_SIZE;
6485		}
6486	}
6487	if (error == 0 && pa_start != pa_end)
6488		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6489		    pa_end - pa_start, mode);
6490
6491	/*
6492	 * Flush CPU caches if required to make sure any data isn't cached that
6493	 * shouldn't be, etc.
6494	 */
6495	if (changed) {
6496		pmap_invalidate_range(kernel_pmap, base, tmpva);
6497		pmap_invalidate_cache_range(base, tmpva);
6498	}
6499	return (error);
6500}
6501
6502/*
6503 * Demotes any mapping within the direct map region that covers more than the
6504 * specified range of physical addresses.  This range's size must be a power
6505 * of two and its starting address must be a multiple of its size.  Since the
6506 * demotion does not change any attributes of the mapping, a TLB invalidation
6507 * is not mandatory.  The caller may, however, request a TLB invalidation.
6508 */
6509void
6510pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6511{
6512	pdp_entry_t *pdpe;
6513	pd_entry_t *pde;
6514	vm_offset_t va;
6515	boolean_t changed;
6516
6517	if (len == 0)
6518		return;
6519	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6520	KASSERT((base & (len - 1)) == 0,
6521	    ("pmap_demote_DMAP: base is not a multiple of len"));
6522	if (len < NBPDP && base < dmaplimit) {
6523		va = PHYS_TO_DMAP(base);
6524		changed = FALSE;
6525		PMAP_LOCK(kernel_pmap);
6526		pdpe = pmap_pdpe(kernel_pmap, va);
6527		if ((*pdpe & X86_PG_V) == 0)
6528			panic("pmap_demote_DMAP: invalid PDPE");
6529		if ((*pdpe & PG_PS) != 0) {
6530			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6531				panic("pmap_demote_DMAP: PDPE failed");
6532			changed = TRUE;
6533		}
6534		if (len < NBPDR) {
6535			pde = pmap_pdpe_to_pde(pdpe, va);
6536			if ((*pde & X86_PG_V) == 0)
6537				panic("pmap_demote_DMAP: invalid PDE");
6538			if ((*pde & PG_PS) != 0) {
6539				if (!pmap_demote_pde(kernel_pmap, pde, va))
6540					panic("pmap_demote_DMAP: PDE failed");
6541				changed = TRUE;
6542			}
6543		}
6544		if (changed && invalidate)
6545			pmap_invalidate_page(kernel_pmap, va);
6546		PMAP_UNLOCK(kernel_pmap);
6547	}
6548}
6549
6550/*
6551 * perform the pmap work for mincore
6552 */
6553int
6554pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6555{
6556	pd_entry_t *pdep;
6557	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6558	vm_paddr_t pa;
6559	int val;
6560
6561	PG_A = pmap_accessed_bit(pmap);
6562	PG_M = pmap_modified_bit(pmap);
6563	PG_V = pmap_valid_bit(pmap);
6564	PG_RW = pmap_rw_bit(pmap);
6565
6566	PMAP_LOCK(pmap);
6567retry:
6568	pdep = pmap_pde(pmap, addr);
6569	if (pdep != NULL && (*pdep & PG_V)) {
6570		if (*pdep & PG_PS) {
6571			pte = *pdep;
6572			/* Compute the physical address of the 4KB page. */
6573			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6574			    PG_FRAME;
6575			val = MINCORE_SUPER;
6576		} else {
6577			pte = *pmap_pde_to_pte(pdep, addr);
6578			pa = pte & PG_FRAME;
6579			val = 0;
6580		}
6581	} else {
6582		pte = 0;
6583		pa = 0;
6584		val = 0;
6585	}
6586	if ((pte & PG_V) != 0) {
6587		val |= MINCORE_INCORE;
6588		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6589			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6590		if ((pte & PG_A) != 0)
6591			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6592	}
6593	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6594	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6595	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6596		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6597		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6598			goto retry;
6599	} else
6600		PA_UNLOCK_COND(*locked_pa);
6601	PMAP_UNLOCK(pmap);
6602	return (val);
6603}
6604
6605void
6606pmap_activate(struct thread *td)
6607{
6608	pmap_t	pmap, oldpmap;
6609	u_int	cpuid;
6610
6611	critical_enter();
6612	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6613	oldpmap = PCPU_GET(curpmap);
6614	cpuid = PCPU_GET(cpuid);
6615#ifdef SMP
6616	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6617	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6618	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6619#else
6620	CPU_CLR(cpuid, &oldpmap->pm_active);
6621	CPU_SET(cpuid, &pmap->pm_active);
6622	CPU_SET(cpuid, &pmap->pm_save);
6623#endif
6624	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6625	load_cr3(pmap->pm_cr3);
6626	PCPU_SET(curpmap, pmap);
6627	critical_exit();
6628}
6629
6630void
6631pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6632{
6633}
6634
6635/*
6636 *	Increase the starting virtual address of the given mapping if a
6637 *	different alignment might result in more superpage mappings.
6638 */
6639void
6640pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6641    vm_offset_t *addr, vm_size_t size)
6642{
6643	vm_offset_t superpage_offset;
6644
6645	if (size < NBPDR)
6646		return;
6647	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6648		offset += ptoa(object->pg_color);
6649	superpage_offset = offset & PDRMASK;
6650	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6651	    (*addr & PDRMASK) == superpage_offset)
6652		return;
6653	if ((*addr & PDRMASK) < superpage_offset)
6654		*addr = (*addr & ~PDRMASK) + superpage_offset;
6655	else
6656		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6657}
6658
6659#ifdef INVARIANTS
6660static unsigned long num_dirty_emulations;
6661SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6662	     &num_dirty_emulations, 0, NULL);
6663
6664static unsigned long num_accessed_emulations;
6665SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6666	     &num_accessed_emulations, 0, NULL);
6667
6668static unsigned long num_superpage_accessed_emulations;
6669SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6670	     &num_superpage_accessed_emulations, 0, NULL);
6671
6672static unsigned long ad_emulation_superpage_promotions;
6673SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6674	     &ad_emulation_superpage_promotions, 0, NULL);
6675#endif	/* INVARIANTS */
6676
6677int
6678pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6679{
6680	int rv;
6681	struct rwlock *lock;
6682	vm_page_t m, mpte;
6683	pd_entry_t *pde;
6684	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6685	boolean_t pv_lists_locked;
6686
6687	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6688	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6689
6690	if (!pmap_emulate_ad_bits(pmap))
6691		return (-1);
6692
6693	PG_A = pmap_accessed_bit(pmap);
6694	PG_M = pmap_modified_bit(pmap);
6695	PG_V = pmap_valid_bit(pmap);
6696	PG_RW = pmap_rw_bit(pmap);
6697
6698	rv = -1;
6699	lock = NULL;
6700	pv_lists_locked = FALSE;
6701retry:
6702	PMAP_LOCK(pmap);
6703
6704	pde = pmap_pde(pmap, va);
6705	if (pde == NULL || (*pde & PG_V) == 0)
6706		goto done;
6707
6708	if ((*pde & PG_PS) != 0) {
6709		if (ftype == VM_PROT_READ) {
6710#ifdef INVARIANTS
6711			atomic_add_long(&num_superpage_accessed_emulations, 1);
6712#endif
6713			*pde |= PG_A;
6714			rv = 0;
6715		}
6716		goto done;
6717	}
6718
6719	pte = pmap_pde_to_pte(pde, va);
6720	if ((*pte & PG_V) == 0)
6721		goto done;
6722
6723	if (ftype == VM_PROT_WRITE) {
6724		if ((*pte & PG_RW) == 0)
6725			goto done;
6726		*pte |= PG_M;
6727	}
6728	*pte |= PG_A;
6729
6730	/* try to promote the mapping */
6731	if (va < VM_MAXUSER_ADDRESS)
6732		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6733	else
6734		mpte = NULL;
6735
6736	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6737
6738	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6739	    pmap_ps_enabled(pmap) &&
6740	    (m->flags & PG_FICTITIOUS) == 0 &&
6741	    vm_reserv_level_iffullpop(m) == 0) {
6742		if (!pv_lists_locked) {
6743			pv_lists_locked = TRUE;
6744			if (!rw_try_rlock(&pvh_global_lock)) {
6745				PMAP_UNLOCK(pmap);
6746				rw_rlock(&pvh_global_lock);
6747				goto retry;
6748			}
6749		}
6750		pmap_promote_pde(pmap, pde, va, &lock);
6751#ifdef INVARIANTS
6752		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6753#endif
6754	}
6755#ifdef INVARIANTS
6756	if (ftype == VM_PROT_WRITE)
6757		atomic_add_long(&num_dirty_emulations, 1);
6758	else
6759		atomic_add_long(&num_accessed_emulations, 1);
6760#endif
6761	rv = 0;		/* success */
6762done:
6763	if (lock != NULL)
6764		rw_wunlock(lock);
6765	if (pv_lists_locked)
6766		rw_runlock(&pvh_global_lock);
6767	PMAP_UNLOCK(pmap);
6768	return (rv);
6769}
6770
6771void
6772pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6773{
6774	pml4_entry_t *pml4;
6775	pdp_entry_t *pdp;
6776	pd_entry_t *pde;
6777	pt_entry_t *pte, PG_V;
6778	int idx;
6779
6780	idx = 0;
6781	PG_V = pmap_valid_bit(pmap);
6782	PMAP_LOCK(pmap);
6783
6784	pml4 = pmap_pml4e(pmap, va);
6785	ptr[idx++] = *pml4;
6786	if ((*pml4 & PG_V) == 0)
6787		goto done;
6788
6789	pdp = pmap_pml4e_to_pdpe(pml4, va);
6790	ptr[idx++] = *pdp;
6791	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6792		goto done;
6793
6794	pde = pmap_pdpe_to_pde(pdp, va);
6795	ptr[idx++] = *pde;
6796	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6797		goto done;
6798
6799	pte = pmap_pde_to_pte(pde, va);
6800	ptr[idx++] = *pte;
6801
6802done:
6803	PMAP_UNLOCK(pmap);
6804	*num = idx;
6805}
6806
6807#include "opt_ddb.h"
6808#ifdef DDB
6809#include <ddb/ddb.h>
6810
6811DB_SHOW_COMMAND(pte, pmap_print_pte)
6812{
6813	pmap_t pmap;
6814	pml4_entry_t *pml4;
6815	pdp_entry_t *pdp;
6816	pd_entry_t *pde;
6817	pt_entry_t *pte, PG_V;
6818	vm_offset_t va;
6819
6820	if (have_addr) {
6821		va = (vm_offset_t)addr;
6822		pmap = PCPU_GET(curpmap); /* XXX */
6823	} else {
6824		db_printf("show pte addr\n");
6825		return;
6826	}
6827	PG_V = pmap_valid_bit(pmap);
6828	pml4 = pmap_pml4e(pmap, va);
6829	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6830	if ((*pml4 & PG_V) == 0) {
6831		db_printf("\n");
6832		return;
6833	}
6834	pdp = pmap_pml4e_to_pdpe(pml4, va);
6835	db_printf(" pdpe %#016lx", *pdp);
6836	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6837		db_printf("\n");
6838		return;
6839	}
6840	pde = pmap_pdpe_to_pde(pdp, va);
6841	db_printf(" pde %#016lx", *pde);
6842	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6843		db_printf("\n");
6844		return;
6845	}
6846	pte = pmap_pde_to_pte(pde, va);
6847	db_printf(" pte %#016lx\n", *pte);
6848}
6849
6850DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6851{
6852	vm_paddr_t a;
6853
6854	if (have_addr) {
6855		a = (vm_paddr_t)addr;
6856		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6857	} else {
6858		db_printf("show phys2dmap addr\n");
6859	}
6860}
6861#endif
6862