pmap.c revision 252646
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 252646 2013-07-03 23:21:25Z neel $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	Since the information managed by this module is
86 *	also stored by the logical address mapping module,
87 *	this module may throw away valid virtual-to-physical
88 *	mappings at almost any time.  However, invalidations
89 *	of virtual-to-physical mappings must be done as
90 *	requested.
91 *
92 *	In order to cope with hardware architectures which
93 *	make virtual-to-physical map invalidates expensive,
94 *	this module may delay invalidate or reduced protection
95 *	operations until such time as they are actually
96 *	necessary.  This module is given full information as
97 *	to which processors are currently using which maps,
98 *	and to when physical maps must be made correct.
99 */
100
101#include "opt_pmap.h"
102#include "opt_vm.h"
103
104#include <sys/param.h>
105#include <sys/bus.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/ktr.h>
109#include <sys/lock.h>
110#include <sys/malloc.h>
111#include <sys/mman.h>
112#include <sys/mutex.h>
113#include <sys/proc.h>
114#include <sys/rwlock.h>
115#include <sys/sx.h>
116#include <sys/vmmeter.h>
117#include <sys/sched.h>
118#include <sys/sysctl.h>
119#ifdef SMP
120#include <sys/smp.h>
121#else
122#include <sys/cpuset.h>
123#endif
124
125#include <vm/vm.h>
126#include <vm/vm_param.h>
127#include <vm/vm_kern.h>
128#include <vm/vm_page.h>
129#include <vm/vm_map.h>
130#include <vm/vm_object.h>
131#include <vm/vm_extern.h>
132#include <vm/vm_pageout.h>
133#include <vm/vm_pager.h>
134#include <vm/vm_radix.h>
135#include <vm/vm_reserv.h>
136#include <vm/uma.h>
137
138#include <machine/intr_machdep.h>
139#include <machine/apicvar.h>
140#include <machine/cpu.h>
141#include <machine/cputypes.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144#include <machine/specialreg.h>
145#ifdef SMP
146#include <machine/smp.h>
147#endif
148
149#if !defined(DIAGNOSTIC)
150#ifdef __GNUC_GNU_INLINE__
151#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
152#else
153#define PMAP_INLINE	extern inline
154#endif
155#else
156#define PMAP_INLINE
157#endif
158
159#ifdef PV_STATS
160#define PV_STAT(x)	do { x ; } while (0)
161#else
162#define PV_STAT(x)	do { } while (0)
163#endif
164
165#define	pa_index(pa)	((pa) >> PDRSHIFT)
166#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
167
168#define	NPV_LIST_LOCKS	MAXCPU
169
170#define	PHYS_TO_PV_LIST_LOCK(pa)	\
171			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
172
173#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
174	struct rwlock **_lockp = (lockp);		\
175	struct rwlock *_new_lock;			\
176							\
177	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
178	if (_new_lock != *_lockp) {			\
179		if (*_lockp != NULL)			\
180			rw_wunlock(*_lockp);		\
181		*_lockp = _new_lock;			\
182		rw_wlock(*_lockp);			\
183	}						\
184} while (0)
185
186#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
187			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
188
189#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
190	struct rwlock **_lockp = (lockp);		\
191							\
192	if (*_lockp != NULL) {				\
193		rw_wunlock(*_lockp);			\
194		*_lockp = NULL;				\
195	}						\
196} while (0)
197
198#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
199			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
200
201struct pmap kernel_pmap_store;
202
203vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
204vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
205
206int nkpt;
207SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
208    "Number of kernel page table pages allocated on bootup");
209
210static int ndmpdp;
211static vm_paddr_t dmaplimit;
212vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
213pt_entry_t pg_nx;
214
215static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
216
217static int pat_works = 1;
218SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
219    "Is page attribute table fully functional?");
220
221static int pg_ps_enabled = 1;
222SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
223    "Are large page mappings enabled?");
224
225#define	PAT_INDEX_SIZE	8
226static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
227
228static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
229static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
230u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
231u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
232
233static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
234static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
235
236static struct rwlock_padalign pvh_global_lock;
237
238/*
239 * Data for the pv entry allocation mechanism
240 */
241static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
242static struct mtx pv_chunks_mutex;
243static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
244static struct md_page *pv_table;
245
246/*
247 * All those kernel PT submaps that BSD is so fond of
248 */
249pt_entry_t *CMAP1 = 0;
250caddr_t CADDR1 = 0;
251
252/*
253 * Crashdump maps.
254 */
255static caddr_t crashdumpmap;
256
257static void	free_pv_chunk(struct pv_chunk *pc);
258static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
259static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
260static int	popcnt_pc_map_elem(uint64_t elem);
261static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
262static void	reserve_pv_entries(pmap_t pmap, int needed,
263		    struct rwlock **lockp);
264static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
265		    struct rwlock **lockp);
266static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
267		    struct rwlock **lockp);
268static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
269		    struct rwlock **lockp);
270static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
271static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
272		    vm_offset_t va);
273static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
274
275static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
276static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
277static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
278    vm_offset_t va, struct rwlock **lockp);
279static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
280    vm_offset_t va);
281static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
282    vm_prot_t prot, struct rwlock **lockp);
283static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
284    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
285static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
286static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
287static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
288static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
289static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
290static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
291static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
292static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
293    struct rwlock **lockp);
294static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
295    vm_prot_t prot);
296static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
297static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
298		vm_page_t *free, struct rwlock **lockp);
299static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
300		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
301		struct rwlock **lockp);
302static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
303static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
304    vm_page_t *free);
305static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
306    vm_page_t m, struct rwlock **lockp);
307static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
308    pd_entry_t newpde);
309static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
310
311static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
312		struct rwlock **lockp);
313static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
314		struct rwlock **lockp);
315static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
316		struct rwlock **lockp);
317
318static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
319                vm_page_t *free);
320static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
321static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
322
323/*
324 * Move the kernel virtual free pointer to the next
325 * 2MB.  This is used to help improve performance
326 * by using a large (2MB) page for much of the kernel
327 * (.text, .data, .bss)
328 */
329static vm_offset_t
330pmap_kmem_choose(vm_offset_t addr)
331{
332	vm_offset_t newaddr = addr;
333
334	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
335	return (newaddr);
336}
337
338/********************/
339/* Inline functions */
340/********************/
341
342/* Return a non-clipped PD index for a given VA */
343static __inline vm_pindex_t
344pmap_pde_pindex(vm_offset_t va)
345{
346	return (va >> PDRSHIFT);
347}
348
349
350/* Return various clipped indexes for a given VA */
351static __inline vm_pindex_t
352pmap_pte_index(vm_offset_t va)
353{
354
355	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
356}
357
358static __inline vm_pindex_t
359pmap_pde_index(vm_offset_t va)
360{
361
362	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
363}
364
365static __inline vm_pindex_t
366pmap_pdpe_index(vm_offset_t va)
367{
368
369	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
370}
371
372static __inline vm_pindex_t
373pmap_pml4e_index(vm_offset_t va)
374{
375
376	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
377}
378
379/* Return a pointer to the PML4 slot that corresponds to a VA */
380static __inline pml4_entry_t *
381pmap_pml4e(pmap_t pmap, vm_offset_t va)
382{
383
384	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
385}
386
387/* Return a pointer to the PDP slot that corresponds to a VA */
388static __inline pdp_entry_t *
389pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
390{
391	pdp_entry_t *pdpe;
392
393	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
394	return (&pdpe[pmap_pdpe_index(va)]);
395}
396
397/* Return a pointer to the PDP slot that corresponds to a VA */
398static __inline pdp_entry_t *
399pmap_pdpe(pmap_t pmap, vm_offset_t va)
400{
401	pml4_entry_t *pml4e;
402
403	pml4e = pmap_pml4e(pmap, va);
404	if ((*pml4e & PG_V) == 0)
405		return (NULL);
406	return (pmap_pml4e_to_pdpe(pml4e, va));
407}
408
409/* Return a pointer to the PD slot that corresponds to a VA */
410static __inline pd_entry_t *
411pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
412{
413	pd_entry_t *pde;
414
415	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
416	return (&pde[pmap_pde_index(va)]);
417}
418
419/* Return a pointer to the PD slot that corresponds to a VA */
420static __inline pd_entry_t *
421pmap_pde(pmap_t pmap, vm_offset_t va)
422{
423	pdp_entry_t *pdpe;
424
425	pdpe = pmap_pdpe(pmap, va);
426	if (pdpe == NULL || (*pdpe & PG_V) == 0)
427		return (NULL);
428	return (pmap_pdpe_to_pde(pdpe, va));
429}
430
431/* Return a pointer to the PT slot that corresponds to a VA */
432static __inline pt_entry_t *
433pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
434{
435	pt_entry_t *pte;
436
437	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
438	return (&pte[pmap_pte_index(va)]);
439}
440
441/* Return a pointer to the PT slot that corresponds to a VA */
442static __inline pt_entry_t *
443pmap_pte(pmap_t pmap, vm_offset_t va)
444{
445	pd_entry_t *pde;
446
447	pde = pmap_pde(pmap, va);
448	if (pde == NULL || (*pde & PG_V) == 0)
449		return (NULL);
450	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
451		return ((pt_entry_t *)pde);
452	return (pmap_pde_to_pte(pde, va));
453}
454
455static __inline void
456pmap_resident_count_inc(pmap_t pmap, int count)
457{
458
459	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
460	pmap->pm_stats.resident_count += count;
461}
462
463static __inline void
464pmap_resident_count_dec(pmap_t pmap, int count)
465{
466
467	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
468	pmap->pm_stats.resident_count -= count;
469}
470
471PMAP_INLINE pt_entry_t *
472vtopte(vm_offset_t va)
473{
474	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
475
476	return (PTmap + ((va >> PAGE_SHIFT) & mask));
477}
478
479static __inline pd_entry_t *
480vtopde(vm_offset_t va)
481{
482	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
483
484	return (PDmap + ((va >> PDRSHIFT) & mask));
485}
486
487static u_int64_t
488allocpages(vm_paddr_t *firstaddr, int n)
489{
490	u_int64_t ret;
491
492	ret = *firstaddr;
493	bzero((void *)ret, n * PAGE_SIZE);
494	*firstaddr += n * PAGE_SIZE;
495	return (ret);
496}
497
498CTASSERT(powerof2(NDMPML4E));
499
500/* number of kernel PDP slots */
501#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
502
503static void
504nkpt_init(vm_paddr_t addr)
505{
506	int pt_pages;
507
508#ifdef NKPT
509	pt_pages = NKPT;
510#else
511	pt_pages = howmany(addr, 1 << PDRSHIFT);
512	pt_pages += NKPDPE(pt_pages);
513
514	/*
515	 * Add some slop beyond the bare minimum required for bootstrapping
516	 * the kernel.
517	 *
518	 * This is quite important when allocating KVA for kernel modules.
519	 * The modules are required to be linked in the negative 2GB of
520	 * the address space.  If we run out of KVA in this region then
521	 * pmap_growkernel() will need to allocate page table pages to map
522	 * the entire 512GB of KVA space which is an unnecessary tax on
523	 * physical memory.
524	 */
525	pt_pages += 8;		/* 16MB additional slop for kernel modules */
526#endif
527	nkpt = pt_pages;
528}
529
530static void
531create_pagetables(vm_paddr_t *firstaddr)
532{
533	int i, j, ndm1g, nkpdpe;
534
535	/* Allocate page table pages for the direct map */
536	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
537	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
538		ndmpdp = 4;
539	DMPDPphys = allocpages(firstaddr, NDMPML4E);
540	ndm1g = 0;
541	if ((amd_feature & AMDID_PAGE1GB) != 0)
542		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
543	if (ndm1g < ndmpdp)
544		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
545	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
546
547	/* Allocate pages */
548	KPML4phys = allocpages(firstaddr, 1);
549	KPDPphys = allocpages(firstaddr, NKPML4E);
550
551	/*
552	 * Allocate the initial number of kernel page table pages required to
553	 * bootstrap.  We defer this until after all memory-size dependent
554	 * allocations are done (e.g. direct map), so that we don't have to
555	 * build in too much slop in our estimate.
556	 */
557	nkpt_init(*firstaddr);
558	nkpdpe = NKPDPE(nkpt);
559
560	KPTphys = allocpages(firstaddr, nkpt);
561	KPDphys = allocpages(firstaddr, nkpdpe);
562
563	/* Fill in the underlying page table pages */
564	/* Read-only from zero to physfree */
565	/* XXX not fully used, underneath 2M pages */
566	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
567		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
568		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
569	}
570
571	/* Now map the page tables at their location within PTmap */
572	for (i = 0; i < nkpt; i++) {
573		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
574		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
575	}
576
577	/* Map from zero to end of allocations under 2M pages */
578	/* This replaces some of the KPTphys entries above */
579	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
580		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
581		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
582	}
583
584	/* And connect up the PD to the PDP */
585	for (i = 0; i < nkpdpe; i++) {
586		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
587		    (i << PAGE_SHIFT);
588		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
589	}
590
591	/*
592	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
593	 * the end of physical memory is not aligned to a 1GB page boundary,
594	 * then the residual physical memory is mapped with 2MB pages.  Later,
595	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
596	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
597	 * that are partially used.
598	 */
599	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
600		((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
601		/* Preset PG_M and PG_A because demotion expects it. */
602		((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
603		    PG_M | PG_A;
604	}
605	for (i = 0; i < ndm1g; i++) {
606		((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
607		/* Preset PG_M and PG_A because demotion expects it. */
608		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
609		    PG_M | PG_A;
610	}
611	for (j = 0; i < ndmpdp; i++, j++) {
612		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
613		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
614	}
615
616	/* And recursively map PML4 to itself in order to get PTmap */
617	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
618	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
619
620	/* Connect the Direct Map slot(s) up to the PML4. */
621	for (i = 0; i < NDMPML4E; i++) {
622		((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
623		    (i << PAGE_SHIFT);
624		((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
625	}
626
627	/* Connect the KVA slot up to the PML4 */
628	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
629	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
630}
631
632/*
633 *	Bootstrap the system enough to run with virtual memory.
634 *
635 *	On amd64 this is called after mapping has already been enabled
636 *	and just syncs the pmap module with what has already been done.
637 *	[We can't call it easily with mapping off since the kernel is not
638 *	mapped with PA == VA, hence we would have to relocate every address
639 *	from the linked base (virtual) address "KERNBASE" to the actual
640 *	(physical) address starting relative to 0]
641 */
642void
643pmap_bootstrap(vm_paddr_t *firstaddr)
644{
645	vm_offset_t va;
646	pt_entry_t *pte, *unused;
647
648	/*
649	 * Create an initial set of page tables to run the kernel in.
650	 */
651	create_pagetables(firstaddr);
652
653	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
654	virtual_avail = pmap_kmem_choose(virtual_avail);
655
656	virtual_end = VM_MAX_KERNEL_ADDRESS;
657
658
659	/* XXX do %cr0 as well */
660	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
661	load_cr3(KPML4phys);
662	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
663		load_cr4(rcr4() | CR4_SMEP);
664
665	/*
666	 * Initialize the kernel pmap (which is statically allocated).
667	 */
668	PMAP_LOCK_INIT(kernel_pmap);
669	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
670	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
671	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
672
673 	/*
674	 * Initialize the global pv list lock.
675	 */
676	rw_init(&pvh_global_lock, "pmap pv global");
677
678	/*
679	 * Reserve some special page table entries/VA space for temporary
680	 * mapping of pages.
681	 */
682#define	SYSMAP(c, p, v, n)	\
683	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
684
685	va = virtual_avail;
686	pte = vtopte(va);
687
688	/*
689	 * CMAP1 is only used for the memory test.
690	 */
691	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
692
693	/*
694	 * Crashdump maps.
695	 */
696	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
697
698	virtual_avail = va;
699
700	/* Initialize the PAT MSR. */
701	pmap_init_pat();
702}
703
704/*
705 * Setup the PAT MSR.
706 */
707void
708pmap_init_pat(void)
709{
710	int pat_table[PAT_INDEX_SIZE];
711	uint64_t pat_msr;
712	u_long cr0, cr4;
713	int i;
714
715	/* Bail if this CPU doesn't implement PAT. */
716	if ((cpu_feature & CPUID_PAT) == 0)
717		panic("no PAT??");
718
719	/* Set default PAT index table. */
720	for (i = 0; i < PAT_INDEX_SIZE; i++)
721		pat_table[i] = -1;
722	pat_table[PAT_WRITE_BACK] = 0;
723	pat_table[PAT_WRITE_THROUGH] = 1;
724	pat_table[PAT_UNCACHEABLE] = 3;
725	pat_table[PAT_WRITE_COMBINING] = 3;
726	pat_table[PAT_WRITE_PROTECTED] = 3;
727	pat_table[PAT_UNCACHED] = 3;
728
729	/* Initialize default PAT entries. */
730	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
731	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
732	    PAT_VALUE(2, PAT_UNCACHED) |
733	    PAT_VALUE(3, PAT_UNCACHEABLE) |
734	    PAT_VALUE(4, PAT_WRITE_BACK) |
735	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
736	    PAT_VALUE(6, PAT_UNCACHED) |
737	    PAT_VALUE(7, PAT_UNCACHEABLE);
738
739	if (pat_works) {
740		/*
741		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
742		 * Program 5 and 6 as WP and WC.
743		 * Leave 4 and 7 as WB and UC.
744		 */
745		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
746		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
747		    PAT_VALUE(6, PAT_WRITE_COMBINING);
748		pat_table[PAT_UNCACHED] = 2;
749		pat_table[PAT_WRITE_PROTECTED] = 5;
750		pat_table[PAT_WRITE_COMBINING] = 6;
751	} else {
752		/*
753		 * Just replace PAT Index 2 with WC instead of UC-.
754		 */
755		pat_msr &= ~PAT_MASK(2);
756		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
757		pat_table[PAT_WRITE_COMBINING] = 2;
758	}
759
760	/* Disable PGE. */
761	cr4 = rcr4();
762	load_cr4(cr4 & ~CR4_PGE);
763
764	/* Disable caches (CD = 1, NW = 0). */
765	cr0 = rcr0();
766	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
767
768	/* Flushes caches and TLBs. */
769	wbinvd();
770	invltlb();
771
772	/* Update PAT and index table. */
773	wrmsr(MSR_PAT, pat_msr);
774	for (i = 0; i < PAT_INDEX_SIZE; i++)
775		pat_index[i] = pat_table[i];
776
777	/* Flush caches and TLBs again. */
778	wbinvd();
779	invltlb();
780
781	/* Restore caches and PGE. */
782	load_cr0(cr0);
783	load_cr4(cr4);
784}
785
786/*
787 *	Initialize a vm_page's machine-dependent fields.
788 */
789void
790pmap_page_init(vm_page_t m)
791{
792
793	TAILQ_INIT(&m->md.pv_list);
794	m->md.pat_mode = PAT_WRITE_BACK;
795}
796
797/*
798 *	Initialize the pmap module.
799 *	Called by vm_init, to initialize any structures that the pmap
800 *	system needs to map virtual memory.
801 */
802void
803pmap_init(void)
804{
805	vm_page_t mpte;
806	vm_size_t s;
807	int i, pv_npg;
808
809	/*
810	 * Initialize the vm page array entries for the kernel pmap's
811	 * page table pages.
812	 */
813	for (i = 0; i < nkpt; i++) {
814		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
815		KASSERT(mpte >= vm_page_array &&
816		    mpte < &vm_page_array[vm_page_array_size],
817		    ("pmap_init: page table page is out of range"));
818		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
819		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
820	}
821
822	/*
823	 * If the kernel is running in a virtual machine on an AMD Family 10h
824	 * processor, then it must assume that MCA is enabled by the virtual
825	 * machine monitor.
826	 */
827	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
828	    CPUID_TO_FAMILY(cpu_id) == 0x10)
829		workaround_erratum383 = 1;
830
831	/*
832	 * Are large page mappings enabled?
833	 */
834	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
835	if (pg_ps_enabled) {
836		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
837		    ("pmap_init: can't assign to pagesizes[1]"));
838		pagesizes[1] = NBPDR;
839	}
840
841	/*
842	 * Initialize the pv chunk list mutex.
843	 */
844	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
845
846	/*
847	 * Initialize the pool of pv list locks.
848	 */
849	for (i = 0; i < NPV_LIST_LOCKS; i++)
850		rw_init(&pv_list_locks[i], "pmap pv list");
851
852	/*
853	 * Calculate the size of the pv head table for superpages.
854	 */
855	for (i = 0; phys_avail[i + 1]; i += 2);
856	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
857
858	/*
859	 * Allocate memory for the pv head table for superpages.
860	 */
861	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
862	s = round_page(s);
863	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
864	for (i = 0; i < pv_npg; i++)
865		TAILQ_INIT(&pv_table[i].pv_list);
866}
867
868static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
869    "2MB page mapping counters");
870
871static u_long pmap_pde_demotions;
872SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
873    &pmap_pde_demotions, 0, "2MB page demotions");
874
875static u_long pmap_pde_mappings;
876SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
877    &pmap_pde_mappings, 0, "2MB page mappings");
878
879static u_long pmap_pde_p_failures;
880SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
881    &pmap_pde_p_failures, 0, "2MB page promotion failures");
882
883static u_long pmap_pde_promotions;
884SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
885    &pmap_pde_promotions, 0, "2MB page promotions");
886
887static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
888    "1GB page mapping counters");
889
890static u_long pmap_pdpe_demotions;
891SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
892    &pmap_pdpe_demotions, 0, "1GB page demotions");
893
894/***************************************************
895 * Low level helper routines.....
896 ***************************************************/
897
898/*
899 * Determine the appropriate bits to set in a PTE or PDE for a specified
900 * caching mode.
901 */
902static int
903pmap_cache_bits(int mode, boolean_t is_pde)
904{
905	int cache_bits, pat_flag, pat_idx;
906
907	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
908		panic("Unknown caching mode %d\n", mode);
909
910	/* The PAT bit is different for PTE's and PDE's. */
911	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
912
913	/* Map the caching mode to a PAT index. */
914	pat_idx = pat_index[mode];
915
916	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
917	cache_bits = 0;
918	if (pat_idx & 0x4)
919		cache_bits |= pat_flag;
920	if (pat_idx & 0x2)
921		cache_bits |= PG_NC_PCD;
922	if (pat_idx & 0x1)
923		cache_bits |= PG_NC_PWT;
924	return (cache_bits);
925}
926
927/*
928 * After changing the page size for the specified virtual address in the page
929 * table, flush the corresponding entries from the processor's TLB.  Only the
930 * calling processor's TLB is affected.
931 *
932 * The calling thread must be pinned to a processor.
933 */
934static void
935pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
936{
937	u_long cr4;
938
939	if ((newpde & PG_PS) == 0)
940		/* Demotion: flush a specific 2MB page mapping. */
941		invlpg(va);
942	else if ((newpde & PG_G) == 0)
943		/*
944		 * Promotion: flush every 4KB page mapping from the TLB
945		 * because there are too many to flush individually.
946		 */
947		invltlb();
948	else {
949		/*
950		 * Promotion: flush every 4KB page mapping from the TLB,
951		 * including any global (PG_G) mappings.
952		 */
953		cr4 = rcr4();
954		load_cr4(cr4 & ~CR4_PGE);
955		/*
956		 * Although preemption at this point could be detrimental to
957		 * performance, it would not lead to an error.  PG_G is simply
958		 * ignored if CR4.PGE is clear.  Moreover, in case this block
959		 * is re-entered, the load_cr4() either above or below will
960		 * modify CR4.PGE flushing the TLB.
961		 */
962		load_cr4(cr4 | CR4_PGE);
963	}
964}
965#ifdef SMP
966/*
967 * For SMP, these functions have to use the IPI mechanism for coherence.
968 *
969 * N.B.: Before calling any of the following TLB invalidation functions,
970 * the calling processor must ensure that all stores updating a non-
971 * kernel page table are globally performed.  Otherwise, another
972 * processor could cache an old, pre-update entry without being
973 * invalidated.  This can happen one of two ways: (1) The pmap becomes
974 * active on another processor after its pm_active field is checked by
975 * one of the following functions but before a store updating the page
976 * table is globally performed. (2) The pmap becomes active on another
977 * processor before its pm_active field is checked but due to
978 * speculative loads one of the following functions stills reads the
979 * pmap as inactive on the other processor.
980 *
981 * The kernel page table is exempt because its pm_active field is
982 * immutable.  The kernel page table is always active on every
983 * processor.
984 */
985void
986pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
987{
988	cpuset_t other_cpus;
989	u_int cpuid;
990
991	sched_pin();
992	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
993		invlpg(va);
994		smp_invlpg(va);
995	} else {
996		cpuid = PCPU_GET(cpuid);
997		other_cpus = all_cpus;
998		CPU_CLR(cpuid, &other_cpus);
999		if (CPU_ISSET(cpuid, &pmap->pm_active))
1000			invlpg(va);
1001		CPU_AND(&other_cpus, &pmap->pm_active);
1002		if (!CPU_EMPTY(&other_cpus))
1003			smp_masked_invlpg(other_cpus, va);
1004	}
1005	sched_unpin();
1006}
1007
1008void
1009pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1010{
1011	cpuset_t other_cpus;
1012	vm_offset_t addr;
1013	u_int cpuid;
1014
1015	sched_pin();
1016	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1017		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1018			invlpg(addr);
1019		smp_invlpg_range(sva, eva);
1020	} else {
1021		cpuid = PCPU_GET(cpuid);
1022		other_cpus = all_cpus;
1023		CPU_CLR(cpuid, &other_cpus);
1024		if (CPU_ISSET(cpuid, &pmap->pm_active))
1025			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1026				invlpg(addr);
1027		CPU_AND(&other_cpus, &pmap->pm_active);
1028		if (!CPU_EMPTY(&other_cpus))
1029			smp_masked_invlpg_range(other_cpus, sva, eva);
1030	}
1031	sched_unpin();
1032}
1033
1034void
1035pmap_invalidate_all(pmap_t pmap)
1036{
1037	cpuset_t other_cpus;
1038	u_int cpuid;
1039
1040	sched_pin();
1041	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1042		invltlb();
1043		smp_invltlb();
1044	} else {
1045		cpuid = PCPU_GET(cpuid);
1046		other_cpus = all_cpus;
1047		CPU_CLR(cpuid, &other_cpus);
1048		if (CPU_ISSET(cpuid, &pmap->pm_active))
1049			invltlb();
1050		CPU_AND(&other_cpus, &pmap->pm_active);
1051		if (!CPU_EMPTY(&other_cpus))
1052			smp_masked_invltlb(other_cpus);
1053	}
1054	sched_unpin();
1055}
1056
1057void
1058pmap_invalidate_cache(void)
1059{
1060
1061	sched_pin();
1062	wbinvd();
1063	smp_cache_flush();
1064	sched_unpin();
1065}
1066
1067struct pde_action {
1068	cpuset_t invalidate;	/* processors that invalidate their TLB */
1069	vm_offset_t va;
1070	pd_entry_t *pde;
1071	pd_entry_t newpde;
1072	u_int store;		/* processor that updates the PDE */
1073};
1074
1075static void
1076pmap_update_pde_action(void *arg)
1077{
1078	struct pde_action *act = arg;
1079
1080	if (act->store == PCPU_GET(cpuid))
1081		pde_store(act->pde, act->newpde);
1082}
1083
1084static void
1085pmap_update_pde_teardown(void *arg)
1086{
1087	struct pde_action *act = arg;
1088
1089	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1090		pmap_update_pde_invalidate(act->va, act->newpde);
1091}
1092
1093/*
1094 * Change the page size for the specified virtual address in a way that
1095 * prevents any possibility of the TLB ever having two entries that map the
1096 * same virtual address using different page sizes.  This is the recommended
1097 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1098 * machine check exception for a TLB state that is improperly diagnosed as a
1099 * hardware error.
1100 */
1101static void
1102pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1103{
1104	struct pde_action act;
1105	cpuset_t active, other_cpus;
1106	u_int cpuid;
1107
1108	sched_pin();
1109	cpuid = PCPU_GET(cpuid);
1110	other_cpus = all_cpus;
1111	CPU_CLR(cpuid, &other_cpus);
1112	if (pmap == kernel_pmap)
1113		active = all_cpus;
1114	else
1115		active = pmap->pm_active;
1116	if (CPU_OVERLAP(&active, &other_cpus)) {
1117		act.store = cpuid;
1118		act.invalidate = active;
1119		act.va = va;
1120		act.pde = pde;
1121		act.newpde = newpde;
1122		CPU_SET(cpuid, &active);
1123		smp_rendezvous_cpus(active,
1124		    smp_no_rendevous_barrier, pmap_update_pde_action,
1125		    pmap_update_pde_teardown, &act);
1126	} else {
1127		pde_store(pde, newpde);
1128		if (CPU_ISSET(cpuid, &active))
1129			pmap_update_pde_invalidate(va, newpde);
1130	}
1131	sched_unpin();
1132}
1133#else /* !SMP */
1134/*
1135 * Normal, non-SMP, invalidation functions.
1136 * We inline these within pmap.c for speed.
1137 */
1138PMAP_INLINE void
1139pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1140{
1141
1142	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1143		invlpg(va);
1144}
1145
1146PMAP_INLINE void
1147pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1148{
1149	vm_offset_t addr;
1150
1151	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1152		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1153			invlpg(addr);
1154}
1155
1156PMAP_INLINE void
1157pmap_invalidate_all(pmap_t pmap)
1158{
1159
1160	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1161		invltlb();
1162}
1163
1164PMAP_INLINE void
1165pmap_invalidate_cache(void)
1166{
1167
1168	wbinvd();
1169}
1170
1171static void
1172pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1173{
1174
1175	pde_store(pde, newpde);
1176	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1177		pmap_update_pde_invalidate(va, newpde);
1178}
1179#endif /* !SMP */
1180
1181#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1182
1183void
1184pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1185{
1186
1187	KASSERT((sva & PAGE_MASK) == 0,
1188	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1189	KASSERT((eva & PAGE_MASK) == 0,
1190	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1191
1192	if (cpu_feature & CPUID_SS)
1193		; /* If "Self Snoop" is supported, do nothing. */
1194	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1195	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1196
1197		/*
1198		 * XXX: Some CPUs fault, hang, or trash the local APIC
1199		 * registers if we use CLFLUSH on the local APIC
1200		 * range.  The local APIC is always uncached, so we
1201		 * don't need to flush for that range anyway.
1202		 */
1203		if (pmap_kextract(sva) == lapic_paddr)
1204			return;
1205
1206		/*
1207		 * Otherwise, do per-cache line flush.  Use the mfence
1208		 * instruction to insure that previous stores are
1209		 * included in the write-back.  The processor
1210		 * propagates flush to other processors in the cache
1211		 * coherence domain.
1212		 */
1213		mfence();
1214		for (; sva < eva; sva += cpu_clflush_line_size)
1215			clflush(sva);
1216		mfence();
1217	} else {
1218
1219		/*
1220		 * No targeted cache flush methods are supported by CPU,
1221		 * or the supplied range is bigger than 2MB.
1222		 * Globally invalidate cache.
1223		 */
1224		pmap_invalidate_cache();
1225	}
1226}
1227
1228/*
1229 * Remove the specified set of pages from the data and instruction caches.
1230 *
1231 * In contrast to pmap_invalidate_cache_range(), this function does not
1232 * rely on the CPU's self-snoop feature, because it is intended for use
1233 * when moving pages into a different cache domain.
1234 */
1235void
1236pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1237{
1238	vm_offset_t daddr, eva;
1239	int i;
1240
1241	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1242	    (cpu_feature & CPUID_CLFSH) == 0)
1243		pmap_invalidate_cache();
1244	else {
1245		mfence();
1246		for (i = 0; i < count; i++) {
1247			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1248			eva = daddr + PAGE_SIZE;
1249			for (; daddr < eva; daddr += cpu_clflush_line_size)
1250				clflush(daddr);
1251		}
1252		mfence();
1253	}
1254}
1255
1256/*
1257 * Are we current address space or kernel?
1258 */
1259static __inline int
1260pmap_is_current(pmap_t pmap)
1261{
1262	return (pmap == kernel_pmap ||
1263	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
1264}
1265
1266/*
1267 *	Routine:	pmap_extract
1268 *	Function:
1269 *		Extract the physical page address associated
1270 *		with the given map/virtual_address pair.
1271 */
1272vm_paddr_t
1273pmap_extract(pmap_t pmap, vm_offset_t va)
1274{
1275	pdp_entry_t *pdpe;
1276	pd_entry_t *pde;
1277	pt_entry_t *pte;
1278	vm_paddr_t pa;
1279
1280	pa = 0;
1281	PMAP_LOCK(pmap);
1282	pdpe = pmap_pdpe(pmap, va);
1283	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1284		if ((*pdpe & PG_PS) != 0)
1285			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1286		else {
1287			pde = pmap_pdpe_to_pde(pdpe, va);
1288			if ((*pde & PG_V) != 0) {
1289				if ((*pde & PG_PS) != 0) {
1290					pa = (*pde & PG_PS_FRAME) |
1291					    (va & PDRMASK);
1292				} else {
1293					pte = pmap_pde_to_pte(pde, va);
1294					pa = (*pte & PG_FRAME) |
1295					    (va & PAGE_MASK);
1296				}
1297			}
1298		}
1299	}
1300	PMAP_UNLOCK(pmap);
1301	return (pa);
1302}
1303
1304/*
1305 *	Routine:	pmap_extract_and_hold
1306 *	Function:
1307 *		Atomically extract and hold the physical page
1308 *		with the given pmap and virtual address pair
1309 *		if that mapping permits the given protection.
1310 */
1311vm_page_t
1312pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1313{
1314	pd_entry_t pde, *pdep;
1315	pt_entry_t pte;
1316	vm_paddr_t pa;
1317	vm_page_t m;
1318
1319	pa = 0;
1320	m = NULL;
1321	PMAP_LOCK(pmap);
1322retry:
1323	pdep = pmap_pde(pmap, va);
1324	if (pdep != NULL && (pde = *pdep)) {
1325		if (pde & PG_PS) {
1326			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1327				if (vm_page_pa_tryrelock(pmap, (pde &
1328				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1329					goto retry;
1330				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1331				    (va & PDRMASK));
1332				vm_page_hold(m);
1333			}
1334		} else {
1335			pte = *pmap_pde_to_pte(pdep, va);
1336			if ((pte & PG_V) &&
1337			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1338				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1339				    &pa))
1340					goto retry;
1341				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1342				vm_page_hold(m);
1343			}
1344		}
1345	}
1346	PA_UNLOCK_COND(pa);
1347	PMAP_UNLOCK(pmap);
1348	return (m);
1349}
1350
1351vm_paddr_t
1352pmap_kextract(vm_offset_t va)
1353{
1354	pd_entry_t pde;
1355	vm_paddr_t pa;
1356
1357	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1358		pa = DMAP_TO_PHYS(va);
1359	} else {
1360		pde = *vtopde(va);
1361		if (pde & PG_PS) {
1362			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1363		} else {
1364			/*
1365			 * Beware of a concurrent promotion that changes the
1366			 * PDE at this point!  For example, vtopte() must not
1367			 * be used to access the PTE because it would use the
1368			 * new PDE.  It is, however, safe to use the old PDE
1369			 * because the page table page is preserved by the
1370			 * promotion.
1371			 */
1372			pa = *pmap_pde_to_pte(&pde, va);
1373			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1374		}
1375	}
1376	return (pa);
1377}
1378
1379/***************************************************
1380 * Low level mapping routines.....
1381 ***************************************************/
1382
1383/*
1384 * Add a wired page to the kva.
1385 * Note: not SMP coherent.
1386 */
1387PMAP_INLINE void
1388pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1389{
1390	pt_entry_t *pte;
1391
1392	pte = vtopte(va);
1393	pte_store(pte, pa | PG_RW | PG_V | PG_G);
1394}
1395
1396static __inline void
1397pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1398{
1399	pt_entry_t *pte;
1400
1401	pte = vtopte(va);
1402	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
1403}
1404
1405/*
1406 * Remove a page from the kernel pagetables.
1407 * Note: not SMP coherent.
1408 */
1409PMAP_INLINE void
1410pmap_kremove(vm_offset_t va)
1411{
1412	pt_entry_t *pte;
1413
1414	pte = vtopte(va);
1415	pte_clear(pte);
1416}
1417
1418/*
1419 *	Used to map a range of physical addresses into kernel
1420 *	virtual address space.
1421 *
1422 *	The value passed in '*virt' is a suggested virtual address for
1423 *	the mapping. Architectures which can support a direct-mapped
1424 *	physical to virtual region can return the appropriate address
1425 *	within that region, leaving '*virt' unchanged. Other
1426 *	architectures should map the pages starting at '*virt' and
1427 *	update '*virt' with the first usable address after the mapped
1428 *	region.
1429 */
1430vm_offset_t
1431pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1432{
1433	return PHYS_TO_DMAP(start);
1434}
1435
1436
1437/*
1438 * Add a list of wired pages to the kva
1439 * this routine is only used for temporary
1440 * kernel mappings that do not need to have
1441 * page modification or references recorded.
1442 * Note that old mappings are simply written
1443 * over.  The page *must* be wired.
1444 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1445 */
1446void
1447pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1448{
1449	pt_entry_t *endpte, oldpte, pa, *pte;
1450	vm_page_t m;
1451
1452	oldpte = 0;
1453	pte = vtopte(sva);
1454	endpte = pte + count;
1455	while (pte < endpte) {
1456		m = *ma++;
1457		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1458		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1459			oldpte |= *pte;
1460			pte_store(pte, pa | PG_G | PG_RW | PG_V);
1461		}
1462		pte++;
1463	}
1464	if (__predict_false((oldpte & PG_V) != 0))
1465		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1466		    PAGE_SIZE);
1467}
1468
1469/*
1470 * This routine tears out page mappings from the
1471 * kernel -- it is meant only for temporary mappings.
1472 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1473 */
1474void
1475pmap_qremove(vm_offset_t sva, int count)
1476{
1477	vm_offset_t va;
1478
1479	va = sva;
1480	while (count-- > 0) {
1481		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
1482		pmap_kremove(va);
1483		va += PAGE_SIZE;
1484	}
1485	pmap_invalidate_range(kernel_pmap, sva, va);
1486}
1487
1488/***************************************************
1489 * Page table page management routines.....
1490 ***************************************************/
1491static __inline void
1492pmap_free_zero_pages(vm_page_t free)
1493{
1494	vm_page_t m;
1495
1496	while (free != NULL) {
1497		m = free;
1498		free = (void *)m->object;
1499		m->object = NULL;
1500		/* Preserve the page's PG_ZERO setting. */
1501		vm_page_free_toq(m);
1502	}
1503}
1504
1505/*
1506 * Schedule the specified unused page table page to be freed.  Specifically,
1507 * add the page to the specified list of pages that will be released to the
1508 * physical memory manager after the TLB has been updated.
1509 */
1510static __inline void
1511pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1512{
1513
1514	if (set_PG_ZERO)
1515		m->flags |= PG_ZERO;
1516	else
1517		m->flags &= ~PG_ZERO;
1518	m->object = (void *)*free;
1519	*free = m;
1520}
1521
1522/*
1523 * Inserts the specified page table page into the specified pmap's collection
1524 * of idle page table pages.  Each of a pmap's page table pages is responsible
1525 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1526 * ordered by this virtual address range.
1527 */
1528static __inline void
1529pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1530{
1531
1532	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1533	vm_radix_insert(&pmap->pm_root, mpte);
1534}
1535
1536/*
1537 * Looks for a page table page mapping the specified virtual address in the
1538 * specified pmap's collection of idle page table pages.  Returns NULL if there
1539 * is no page table page corresponding to the specified virtual address.
1540 */
1541static __inline vm_page_t
1542pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1543{
1544
1545	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1546	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
1547}
1548
1549/*
1550 * Removes the specified page table page from the specified pmap's collection
1551 * of idle page table pages.  The specified page table page must be a member of
1552 * the pmap's collection.
1553 */
1554static __inline void
1555pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1556{
1557
1558	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1559	vm_radix_remove(&pmap->pm_root, mpte->pindex);
1560}
1561
1562/*
1563 * Decrements a page table page's wire count, which is used to record the
1564 * number of valid page table entries within the page.  If the wire count
1565 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1566 * page table page was unmapped and FALSE otherwise.
1567 */
1568static inline boolean_t
1569pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1570{
1571
1572	--m->wire_count;
1573	if (m->wire_count == 0) {
1574		_pmap_unwire_ptp(pmap, va, m, free);
1575		return (TRUE);
1576	} else
1577		return (FALSE);
1578}
1579
1580static void
1581_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1582{
1583
1584	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1585	/*
1586	 * unmap the page table page
1587	 */
1588	if (m->pindex >= (NUPDE + NUPDPE)) {
1589		/* PDP page */
1590		pml4_entry_t *pml4;
1591		pml4 = pmap_pml4e(pmap, va);
1592		*pml4 = 0;
1593	} else if (m->pindex >= NUPDE) {
1594		/* PD page */
1595		pdp_entry_t *pdp;
1596		pdp = pmap_pdpe(pmap, va);
1597		*pdp = 0;
1598	} else {
1599		/* PTE page */
1600		pd_entry_t *pd;
1601		pd = pmap_pde(pmap, va);
1602		*pd = 0;
1603	}
1604	pmap_resident_count_dec(pmap, 1);
1605	if (m->pindex < NUPDE) {
1606		/* We just released a PT, unhold the matching PD */
1607		vm_page_t pdpg;
1608
1609		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1610		pmap_unwire_ptp(pmap, va, pdpg, free);
1611	}
1612	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1613		/* We just released a PD, unhold the matching PDP */
1614		vm_page_t pdppg;
1615
1616		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1617		pmap_unwire_ptp(pmap, va, pdppg, free);
1618	}
1619
1620	/*
1621	 * This is a release store so that the ordinary store unmapping
1622	 * the page table page is globally performed before TLB shoot-
1623	 * down is begun.
1624	 */
1625	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1626
1627	/*
1628	 * Put page on a list so that it is released after
1629	 * *ALL* TLB shootdown is done
1630	 */
1631	pmap_add_delayed_free_list(m, free, TRUE);
1632}
1633
1634/*
1635 * After removing a page table entry, this routine is used to
1636 * conditionally free the page, and manage the hold/wire counts.
1637 */
1638static int
1639pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1640{
1641	vm_page_t mpte;
1642
1643	if (va >= VM_MAXUSER_ADDRESS)
1644		return (0);
1645	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1646	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1647	return (pmap_unwire_ptp(pmap, va, mpte, free));
1648}
1649
1650void
1651pmap_pinit0(pmap_t pmap)
1652{
1653
1654	PMAP_LOCK_INIT(pmap);
1655	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
1656	pmap->pm_root.rt_root = 0;
1657	CPU_ZERO(&pmap->pm_active);
1658	PCPU_SET(curpmap, pmap);
1659	TAILQ_INIT(&pmap->pm_pvchunk);
1660	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1661}
1662
1663/*
1664 * Initialize a preallocated and zeroed pmap structure,
1665 * such as one in a vmspace structure.
1666 */
1667int
1668pmap_pinit(pmap_t pmap)
1669{
1670	vm_page_t pml4pg;
1671	int i;
1672
1673	PMAP_LOCK_INIT(pmap);
1674
1675	/*
1676	 * allocate the page directory page
1677	 */
1678	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1679	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1680		VM_WAIT;
1681
1682	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1683
1684	if ((pml4pg->flags & PG_ZERO) == 0)
1685		pagezero(pmap->pm_pml4);
1686
1687	/* Wire in kernel global address entries. */
1688	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1689	for (i = 0; i < NDMPML4E; i++) {
1690		pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
1691		    PG_RW | PG_V | PG_U;
1692	}
1693
1694	/* install self-referential address mapping entry(s) */
1695	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1696
1697	pmap->pm_root.rt_root = 0;
1698	CPU_ZERO(&pmap->pm_active);
1699	TAILQ_INIT(&pmap->pm_pvchunk);
1700	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1701
1702	return (1);
1703}
1704
1705/*
1706 * This routine is called if the desired page table page does not exist.
1707 *
1708 * If page table page allocation fails, this routine may sleep before
1709 * returning NULL.  It sleeps only if a lock pointer was given.
1710 *
1711 * Note: If a page allocation fails at page table level two or three,
1712 * one or two pages may be held during the wait, only to be released
1713 * afterwards.  This conservative approach is easily argued to avoid
1714 * race conditions.
1715 */
1716static vm_page_t
1717_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1718{
1719	vm_page_t m, pdppg, pdpg;
1720
1721	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1722
1723	/*
1724	 * Allocate a page table page.
1725	 */
1726	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1727	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1728		if (lockp != NULL) {
1729			RELEASE_PV_LIST_LOCK(lockp);
1730			PMAP_UNLOCK(pmap);
1731			rw_runlock(&pvh_global_lock);
1732			VM_WAIT;
1733			rw_rlock(&pvh_global_lock);
1734			PMAP_LOCK(pmap);
1735		}
1736
1737		/*
1738		 * Indicate the need to retry.  While waiting, the page table
1739		 * page may have been allocated.
1740		 */
1741		return (NULL);
1742	}
1743	if ((m->flags & PG_ZERO) == 0)
1744		pmap_zero_page(m);
1745
1746	/*
1747	 * Map the pagetable page into the process address space, if
1748	 * it isn't already there.
1749	 */
1750
1751	if (ptepindex >= (NUPDE + NUPDPE)) {
1752		pml4_entry_t *pml4;
1753		vm_pindex_t pml4index;
1754
1755		/* Wire up a new PDPE page */
1756		pml4index = ptepindex - (NUPDE + NUPDPE);
1757		pml4 = &pmap->pm_pml4[pml4index];
1758		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1759
1760	} else if (ptepindex >= NUPDE) {
1761		vm_pindex_t pml4index;
1762		vm_pindex_t pdpindex;
1763		pml4_entry_t *pml4;
1764		pdp_entry_t *pdp;
1765
1766		/* Wire up a new PDE page */
1767		pdpindex = ptepindex - NUPDE;
1768		pml4index = pdpindex >> NPML4EPGSHIFT;
1769
1770		pml4 = &pmap->pm_pml4[pml4index];
1771		if ((*pml4 & PG_V) == 0) {
1772			/* Have to allocate a new pdp, recurse */
1773			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1774			    lockp) == NULL) {
1775				--m->wire_count;
1776				atomic_subtract_int(&cnt.v_wire_count, 1);
1777				vm_page_free_zero(m);
1778				return (NULL);
1779			}
1780		} else {
1781			/* Add reference to pdp page */
1782			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1783			pdppg->wire_count++;
1784		}
1785		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1786
1787		/* Now find the pdp page */
1788		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1789		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1790
1791	} else {
1792		vm_pindex_t pml4index;
1793		vm_pindex_t pdpindex;
1794		pml4_entry_t *pml4;
1795		pdp_entry_t *pdp;
1796		pd_entry_t *pd;
1797
1798		/* Wire up a new PTE page */
1799		pdpindex = ptepindex >> NPDPEPGSHIFT;
1800		pml4index = pdpindex >> NPML4EPGSHIFT;
1801
1802		/* First, find the pdp and check that its valid. */
1803		pml4 = &pmap->pm_pml4[pml4index];
1804		if ((*pml4 & PG_V) == 0) {
1805			/* Have to allocate a new pd, recurse */
1806			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1807			    lockp) == NULL) {
1808				--m->wire_count;
1809				atomic_subtract_int(&cnt.v_wire_count, 1);
1810				vm_page_free_zero(m);
1811				return (NULL);
1812			}
1813			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1814			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1815		} else {
1816			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1817			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1818			if ((*pdp & PG_V) == 0) {
1819				/* Have to allocate a new pd, recurse */
1820				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1821				    lockp) == NULL) {
1822					--m->wire_count;
1823					atomic_subtract_int(&cnt.v_wire_count,
1824					    1);
1825					vm_page_free_zero(m);
1826					return (NULL);
1827				}
1828			} else {
1829				/* Add reference to the pd page */
1830				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1831				pdpg->wire_count++;
1832			}
1833		}
1834		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1835
1836		/* Now we know where the page directory page is */
1837		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1838		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1839	}
1840
1841	pmap_resident_count_inc(pmap, 1);
1842
1843	return (m);
1844}
1845
1846static vm_page_t
1847pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1848{
1849	vm_pindex_t pdpindex, ptepindex;
1850	pdp_entry_t *pdpe;
1851	vm_page_t pdpg;
1852
1853retry:
1854	pdpe = pmap_pdpe(pmap, va);
1855	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1856		/* Add a reference to the pd page. */
1857		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1858		pdpg->wire_count++;
1859	} else {
1860		/* Allocate a pd page. */
1861		ptepindex = pmap_pde_pindex(va);
1862		pdpindex = ptepindex >> NPDPEPGSHIFT;
1863		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
1864		if (pdpg == NULL && lockp != NULL)
1865			goto retry;
1866	}
1867	return (pdpg);
1868}
1869
1870static vm_page_t
1871pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1872{
1873	vm_pindex_t ptepindex;
1874	pd_entry_t *pd;
1875	vm_page_t m;
1876
1877	/*
1878	 * Calculate pagetable page index
1879	 */
1880	ptepindex = pmap_pde_pindex(va);
1881retry:
1882	/*
1883	 * Get the page directory entry
1884	 */
1885	pd = pmap_pde(pmap, va);
1886
1887	/*
1888	 * This supports switching from a 2MB page to a
1889	 * normal 4K page.
1890	 */
1891	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1892		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
1893			/*
1894			 * Invalidation of the 2MB page mapping may have caused
1895			 * the deallocation of the underlying PD page.
1896			 */
1897			pd = NULL;
1898		}
1899	}
1900
1901	/*
1902	 * If the page table page is mapped, we just increment the
1903	 * hold count, and activate it.
1904	 */
1905	if (pd != NULL && (*pd & PG_V) != 0) {
1906		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1907		m->wire_count++;
1908	} else {
1909		/*
1910		 * Here if the pte page isn't mapped, or if it has been
1911		 * deallocated.
1912		 */
1913		m = _pmap_allocpte(pmap, ptepindex, lockp);
1914		if (m == NULL && lockp != NULL)
1915			goto retry;
1916	}
1917	return (m);
1918}
1919
1920
1921/***************************************************
1922 * Pmap allocation/deallocation routines.
1923 ***************************************************/
1924
1925/*
1926 * Release any resources held by the given physical map.
1927 * Called when a pmap initialized by pmap_pinit is being released.
1928 * Should only be called if the map contains no valid mappings.
1929 */
1930void
1931pmap_release(pmap_t pmap)
1932{
1933	vm_page_t m;
1934	int i;
1935
1936	KASSERT(pmap->pm_stats.resident_count == 0,
1937	    ("pmap_release: pmap resident count %ld != 0",
1938	    pmap->pm_stats.resident_count));
1939	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1940	    ("pmap_release: pmap has reserved page table page(s)"));
1941
1942	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1943
1944	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1945	for (i = 0; i < NDMPML4E; i++)	/* Direct Map */
1946		pmap->pm_pml4[DMPML4I + i] = 0;
1947	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1948
1949	m->wire_count--;
1950	atomic_subtract_int(&cnt.v_wire_count, 1);
1951	vm_page_free_zero(m);
1952	PMAP_LOCK_DESTROY(pmap);
1953}
1954
1955static int
1956kvm_size(SYSCTL_HANDLER_ARGS)
1957{
1958	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1959
1960	return sysctl_handle_long(oidp, &ksize, 0, req);
1961}
1962SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1963    0, 0, kvm_size, "LU", "Size of KVM");
1964
1965static int
1966kvm_free(SYSCTL_HANDLER_ARGS)
1967{
1968	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1969
1970	return sysctl_handle_long(oidp, &kfree, 0, req);
1971}
1972SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1973    0, 0, kvm_free, "LU", "Amount of KVM free");
1974
1975/*
1976 * grow the number of kernel page table entries, if needed
1977 */
1978void
1979pmap_growkernel(vm_offset_t addr)
1980{
1981	vm_paddr_t paddr;
1982	vm_page_t nkpg;
1983	pd_entry_t *pde, newpdir;
1984	pdp_entry_t *pdpe;
1985
1986	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1987
1988	/*
1989	 * Return if "addr" is within the range of kernel page table pages
1990	 * that were preallocated during pmap bootstrap.  Moreover, leave
1991	 * "kernel_vm_end" and the kernel page table as they were.
1992	 *
1993	 * The correctness of this action is based on the following
1994	 * argument: vm_map_findspace() allocates contiguous ranges of the
1995	 * kernel virtual address space.  It calls this function if a range
1996	 * ends after "kernel_vm_end".  If the kernel is mapped between
1997	 * "kernel_vm_end" and "addr", then the range cannot begin at
1998	 * "kernel_vm_end".  In fact, its beginning address cannot be less
1999	 * than the kernel.  Thus, there is no immediate need to allocate
2000	 * any new kernel page table pages between "kernel_vm_end" and
2001	 * "KERNBASE".
2002	 */
2003	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2004		return;
2005
2006	addr = roundup2(addr, NBPDR);
2007	if (addr - 1 >= kernel_map->max_offset)
2008		addr = kernel_map->max_offset;
2009	while (kernel_vm_end < addr) {
2010		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2011		if ((*pdpe & PG_V) == 0) {
2012			/* We need a new PDP entry */
2013			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2014			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2015			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2016			if (nkpg == NULL)
2017				panic("pmap_growkernel: no memory to grow kernel");
2018			if ((nkpg->flags & PG_ZERO) == 0)
2019				pmap_zero_page(nkpg);
2020			paddr = VM_PAGE_TO_PHYS(nkpg);
2021			*pdpe = (pdp_entry_t)
2022				(paddr | PG_V | PG_RW | PG_A | PG_M);
2023			continue; /* try again */
2024		}
2025		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2026		if ((*pde & PG_V) != 0) {
2027			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2028			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2029				kernel_vm_end = kernel_map->max_offset;
2030				break;
2031			}
2032			continue;
2033		}
2034
2035		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2036		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2037		    VM_ALLOC_ZERO);
2038		if (nkpg == NULL)
2039			panic("pmap_growkernel: no memory to grow kernel");
2040		if ((nkpg->flags & PG_ZERO) == 0)
2041			pmap_zero_page(nkpg);
2042		paddr = VM_PAGE_TO_PHYS(nkpg);
2043		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
2044		pde_store(pde, newpdir);
2045
2046		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2047		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2048			kernel_vm_end = kernel_map->max_offset;
2049			break;
2050		}
2051	}
2052}
2053
2054
2055/***************************************************
2056 * page management routines.
2057 ***************************************************/
2058
2059CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2060CTASSERT(_NPCM == 3);
2061CTASSERT(_NPCPV == 168);
2062
2063static __inline struct pv_chunk *
2064pv_to_chunk(pv_entry_t pv)
2065{
2066
2067	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2068}
2069
2070#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2071
2072#define	PC_FREE0	0xfffffffffffffffful
2073#define	PC_FREE1	0xfffffffffffffffful
2074#define	PC_FREE2	0x000000fffffffffful
2075
2076static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2077
2078#ifdef PV_STATS
2079static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2080
2081SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2082	"Current number of pv entry chunks");
2083SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2084	"Current number of pv entry chunks allocated");
2085SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2086	"Current number of pv entry chunks frees");
2087SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2088	"Number of times tried to get a chunk page but failed.");
2089
2090static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2091static int pv_entry_spare;
2092
2093SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2094	"Current number of pv entry frees");
2095SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2096	"Current number of pv entry allocs");
2097SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2098	"Current number of pv entries");
2099SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2100	"Current number of spare pv entries");
2101#endif
2102
2103/*
2104 * We are in a serious low memory condition.  Resort to
2105 * drastic measures to free some pages so we can allocate
2106 * another pv entry chunk.
2107 *
2108 * Returns NULL if PV entries were reclaimed from the specified pmap.
2109 *
2110 * We do not, however, unmap 2mpages because subsequent accesses will
2111 * allocate per-page pv entries until repromotion occurs, thereby
2112 * exacerbating the shortage of free pv entries.
2113 */
2114static vm_page_t
2115reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2116{
2117	struct pch new_tail;
2118	struct pv_chunk *pc;
2119	struct md_page *pvh;
2120	pd_entry_t *pde;
2121	pmap_t pmap;
2122	pt_entry_t *pte, tpte;
2123	pv_entry_t pv;
2124	vm_offset_t va;
2125	vm_page_t free, m, m_pc;
2126	uint64_t inuse;
2127	int bit, field, freed;
2128
2129	rw_assert(&pvh_global_lock, RA_LOCKED);
2130	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2131	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2132	pmap = NULL;
2133	free = m_pc = NULL;
2134	TAILQ_INIT(&new_tail);
2135	mtx_lock(&pv_chunks_mutex);
2136	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
2137		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2138		mtx_unlock(&pv_chunks_mutex);
2139		if (pmap != pc->pc_pmap) {
2140			if (pmap != NULL) {
2141				pmap_invalidate_all(pmap);
2142				if (pmap != locked_pmap)
2143					PMAP_UNLOCK(pmap);
2144			}
2145			pmap = pc->pc_pmap;
2146			/* Avoid deadlock and lock recursion. */
2147			if (pmap > locked_pmap) {
2148				RELEASE_PV_LIST_LOCK(lockp);
2149				PMAP_LOCK(pmap);
2150			} else if (pmap != locked_pmap &&
2151			    !PMAP_TRYLOCK(pmap)) {
2152				pmap = NULL;
2153				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2154				mtx_lock(&pv_chunks_mutex);
2155				continue;
2156			}
2157		}
2158
2159		/*
2160		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2161		 */
2162		freed = 0;
2163		for (field = 0; field < _NPCM; field++) {
2164			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2165			    inuse != 0; inuse &= ~(1UL << bit)) {
2166				bit = bsfq(inuse);
2167				pv = &pc->pc_pventry[field * 64 + bit];
2168				va = pv->pv_va;
2169				pde = pmap_pde(pmap, va);
2170				if ((*pde & PG_PS) != 0)
2171					continue;
2172				pte = pmap_pde_to_pte(pde, va);
2173				if ((*pte & PG_W) != 0)
2174					continue;
2175				tpte = pte_load_clear(pte);
2176				if ((tpte & PG_G) != 0)
2177					pmap_invalidate_page(pmap, va);
2178				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2179				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2180					vm_page_dirty(m);
2181				if ((tpte & PG_A) != 0)
2182					vm_page_aflag_set(m, PGA_REFERENCED);
2183				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2184				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2185				if (TAILQ_EMPTY(&m->md.pv_list) &&
2186				    (m->flags & PG_FICTITIOUS) == 0) {
2187					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2188					if (TAILQ_EMPTY(&pvh->pv_list)) {
2189						vm_page_aflag_clear(m,
2190						    PGA_WRITEABLE);
2191					}
2192				}
2193				pc->pc_map[field] |= 1UL << bit;
2194				pmap_unuse_pt(pmap, va, *pde, &free);
2195				freed++;
2196			}
2197		}
2198		if (freed == 0) {
2199			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2200			mtx_lock(&pv_chunks_mutex);
2201			continue;
2202		}
2203		/* Every freed mapping is for a 4 KB page. */
2204		pmap_resident_count_dec(pmap, freed);
2205		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2206		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2207		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2208		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2209		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2210		    pc->pc_map[2] == PC_FREE2) {
2211			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2212			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2213			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2214			/* Entire chunk is free; return it. */
2215			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2216			dump_drop_page(m_pc->phys_addr);
2217			mtx_lock(&pv_chunks_mutex);
2218			break;
2219		}
2220		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2221		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2222		mtx_lock(&pv_chunks_mutex);
2223		/* One freed pv entry in locked_pmap is sufficient. */
2224		if (pmap == locked_pmap)
2225			break;
2226	}
2227	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2228	mtx_unlock(&pv_chunks_mutex);
2229	if (pmap != NULL) {
2230		pmap_invalidate_all(pmap);
2231		if (pmap != locked_pmap)
2232			PMAP_UNLOCK(pmap);
2233	}
2234	if (m_pc == NULL && free != NULL) {
2235		m_pc = free;
2236		free = (void *)m_pc->object;
2237		/* Recycle a freed page table page. */
2238		m_pc->wire_count = 1;
2239		atomic_add_int(&cnt.v_wire_count, 1);
2240	}
2241	pmap_free_zero_pages(free);
2242	return (m_pc);
2243}
2244
2245/*
2246 * free the pv_entry back to the free list
2247 */
2248static void
2249free_pv_entry(pmap_t pmap, pv_entry_t pv)
2250{
2251	struct pv_chunk *pc;
2252	int idx, field, bit;
2253
2254	rw_assert(&pvh_global_lock, RA_LOCKED);
2255	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2256	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2257	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2258	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2259	pc = pv_to_chunk(pv);
2260	idx = pv - &pc->pc_pventry[0];
2261	field = idx / 64;
2262	bit = idx % 64;
2263	pc->pc_map[field] |= 1ul << bit;
2264	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2265	    pc->pc_map[2] != PC_FREE2) {
2266		/* 98% of the time, pc is already at the head of the list. */
2267		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2268			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2269			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2270		}
2271		return;
2272	}
2273	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2274	free_pv_chunk(pc);
2275}
2276
2277static void
2278free_pv_chunk(struct pv_chunk *pc)
2279{
2280	vm_page_t m;
2281
2282	mtx_lock(&pv_chunks_mutex);
2283 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2284	mtx_unlock(&pv_chunks_mutex);
2285	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2286	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2287	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2288	/* entire chunk is free, return it */
2289	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2290	dump_drop_page(m->phys_addr);
2291	vm_page_unwire(m, 0);
2292	vm_page_free(m);
2293}
2294
2295/*
2296 * Returns a new PV entry, allocating a new PV chunk from the system when
2297 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2298 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2299 * returned.
2300 *
2301 * The given PV list lock may be released.
2302 */
2303static pv_entry_t
2304get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2305{
2306	int bit, field;
2307	pv_entry_t pv;
2308	struct pv_chunk *pc;
2309	vm_page_t m;
2310
2311	rw_assert(&pvh_global_lock, RA_LOCKED);
2312	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2313	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2314retry:
2315	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2316	if (pc != NULL) {
2317		for (field = 0; field < _NPCM; field++) {
2318			if (pc->pc_map[field]) {
2319				bit = bsfq(pc->pc_map[field]);
2320				break;
2321			}
2322		}
2323		if (field < _NPCM) {
2324			pv = &pc->pc_pventry[field * 64 + bit];
2325			pc->pc_map[field] &= ~(1ul << bit);
2326			/* If this was the last item, move it to tail */
2327			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2328			    pc->pc_map[2] == 0) {
2329				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2330				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2331				    pc_list);
2332			}
2333			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2334			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2335			return (pv);
2336		}
2337	}
2338	/* No free items, allocate another chunk */
2339	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2340	    VM_ALLOC_WIRED);
2341	if (m == NULL) {
2342		if (lockp == NULL) {
2343			PV_STAT(pc_chunk_tryfail++);
2344			return (NULL);
2345		}
2346		m = reclaim_pv_chunk(pmap, lockp);
2347		if (m == NULL)
2348			goto retry;
2349	}
2350	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2351	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2352	dump_add_page(m->phys_addr);
2353	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2354	pc->pc_pmap = pmap;
2355	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2356	pc->pc_map[1] = PC_FREE1;
2357	pc->pc_map[2] = PC_FREE2;
2358	mtx_lock(&pv_chunks_mutex);
2359	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2360	mtx_unlock(&pv_chunks_mutex);
2361	pv = &pc->pc_pventry[0];
2362	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2363	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2364	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2365	return (pv);
2366}
2367
2368/*
2369 * Returns the number of one bits within the given PV chunk map element.
2370 */
2371static int
2372popcnt_pc_map_elem(uint64_t elem)
2373{
2374	int count;
2375
2376	/*
2377	 * This simple method of counting the one bits performs well because
2378	 * the given element typically contains more zero bits than one bits.
2379	 */
2380	count = 0;
2381	for (; elem != 0; elem &= elem - 1)
2382		count++;
2383	return (count);
2384}
2385
2386/*
2387 * Ensure that the number of spare PV entries in the specified pmap meets or
2388 * exceeds the given count, "needed".
2389 *
2390 * The given PV list lock may be released.
2391 */
2392static void
2393reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2394{
2395	struct pch new_tail;
2396	struct pv_chunk *pc;
2397	int avail, free;
2398	vm_page_t m;
2399
2400	rw_assert(&pvh_global_lock, RA_LOCKED);
2401	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2402	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2403
2404	/*
2405	 * Newly allocated PV chunks must be stored in a private list until
2406	 * the required number of PV chunks have been allocated.  Otherwise,
2407	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2408	 * contrast, these chunks must be added to the pmap upon allocation.
2409	 */
2410	TAILQ_INIT(&new_tail);
2411retry:
2412	avail = 0;
2413	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2414		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
2415			free = popcnt_pc_map_elem(pc->pc_map[0]);
2416			free += popcnt_pc_map_elem(pc->pc_map[1]);
2417			free += popcnt_pc_map_elem(pc->pc_map[2]);
2418		} else {
2419			free = popcntq(pc->pc_map[0]);
2420			free += popcntq(pc->pc_map[1]);
2421			free += popcntq(pc->pc_map[2]);
2422		}
2423		if (free == 0)
2424			break;
2425		avail += free;
2426		if (avail >= needed)
2427			break;
2428	}
2429	for (; avail < needed; avail += _NPCPV) {
2430		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2431		    VM_ALLOC_WIRED);
2432		if (m == NULL) {
2433			m = reclaim_pv_chunk(pmap, lockp);
2434			if (m == NULL)
2435				goto retry;
2436		}
2437		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2438		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2439		dump_add_page(m->phys_addr);
2440		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2441		pc->pc_pmap = pmap;
2442		pc->pc_map[0] = PC_FREE0;
2443		pc->pc_map[1] = PC_FREE1;
2444		pc->pc_map[2] = PC_FREE2;
2445		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2446		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2447		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2448	}
2449	if (!TAILQ_EMPTY(&new_tail)) {
2450		mtx_lock(&pv_chunks_mutex);
2451		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2452		mtx_unlock(&pv_chunks_mutex);
2453	}
2454}
2455
2456/*
2457 * First find and then remove the pv entry for the specified pmap and virtual
2458 * address from the specified pv list.  Returns the pv entry if found and NULL
2459 * otherwise.  This operation can be performed on pv lists for either 4KB or
2460 * 2MB page mappings.
2461 */
2462static __inline pv_entry_t
2463pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2464{
2465	pv_entry_t pv;
2466
2467	rw_assert(&pvh_global_lock, RA_LOCKED);
2468	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2469		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2470			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2471			break;
2472		}
2473	}
2474	return (pv);
2475}
2476
2477/*
2478 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2479 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2480 * entries for each of the 4KB page mappings.
2481 */
2482static void
2483pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2484    struct rwlock **lockp)
2485{
2486	struct md_page *pvh;
2487	struct pv_chunk *pc;
2488	pv_entry_t pv;
2489	vm_offset_t va_last;
2490	vm_page_t m;
2491	int bit, field;
2492
2493	rw_assert(&pvh_global_lock, RA_LOCKED);
2494	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2495	KASSERT((pa & PDRMASK) == 0,
2496	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
2497	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2498
2499	/*
2500	 * Transfer the 2mpage's pv entry for this mapping to the first
2501	 * page's pv list.  Once this transfer begins, the pv list lock
2502	 * must not be released until the last pv entry is reinstantiated.
2503	 */
2504	pvh = pa_to_pvh(pa);
2505	va = trunc_2mpage(va);
2506	pv = pmap_pvh_remove(pvh, pmap, va);
2507	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2508	m = PHYS_TO_VM_PAGE(pa);
2509	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2510	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2511	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
2512	va_last = va + NBPDR - PAGE_SIZE;
2513	for (;;) {
2514		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2515		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2516		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
2517		for (field = 0; field < _NPCM; field++) {
2518			while (pc->pc_map[field]) {
2519				bit = bsfq(pc->pc_map[field]);
2520				pc->pc_map[field] &= ~(1ul << bit);
2521				pv = &pc->pc_pventry[field * 64 + bit];
2522				va += PAGE_SIZE;
2523				pv->pv_va = va;
2524				m++;
2525				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2526			    ("pmap_pv_demote_pde: page %p is not managed", m));
2527				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2528				if (va == va_last)
2529					goto out;
2530			}
2531		}
2532		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2533		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2534	}
2535out:
2536	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2537		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2538		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2539	}
2540	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
2541	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
2542}
2543
2544/*
2545 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2546 * replace the many pv entries for the 4KB page mappings by a single pv entry
2547 * for the 2MB page mapping.
2548 */
2549static void
2550pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2551    struct rwlock **lockp)
2552{
2553	struct md_page *pvh;
2554	pv_entry_t pv;
2555	vm_offset_t va_last;
2556	vm_page_t m;
2557
2558	rw_assert(&pvh_global_lock, RA_LOCKED);
2559	KASSERT((pa & PDRMASK) == 0,
2560	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
2561	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2562
2563	/*
2564	 * Transfer the first page's pv entry for this mapping to the 2mpage's
2565	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
2566	 * a transfer avoids the possibility that get_pv_entry() calls
2567	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
2568	 * mappings that is being promoted.
2569	 */
2570	m = PHYS_TO_VM_PAGE(pa);
2571	va = trunc_2mpage(va);
2572	pv = pmap_pvh_remove(&m->md, pmap, va);
2573	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2574	pvh = pa_to_pvh(pa);
2575	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2576	/* Free the remaining NPTEPG - 1 pv entries. */
2577	va_last = va + NBPDR - PAGE_SIZE;
2578	do {
2579		m++;
2580		va += PAGE_SIZE;
2581		pmap_pvh_free(&m->md, pmap, va);
2582	} while (va < va_last);
2583}
2584
2585/*
2586 * First find and then destroy the pv entry for the specified pmap and virtual
2587 * address.  This operation can be performed on pv lists for either 4KB or 2MB
2588 * page mappings.
2589 */
2590static void
2591pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2592{
2593	pv_entry_t pv;
2594
2595	pv = pmap_pvh_remove(pvh, pmap, va);
2596	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2597	free_pv_entry(pmap, pv);
2598}
2599
2600/*
2601 * Conditionally create the PV entry for a 4KB page mapping if the required
2602 * memory can be allocated without resorting to reclamation.
2603 */
2604static boolean_t
2605pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2606    struct rwlock **lockp)
2607{
2608	pv_entry_t pv;
2609
2610	rw_assert(&pvh_global_lock, RA_LOCKED);
2611	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2612	/* Pass NULL instead of the lock pointer to disable reclamation. */
2613	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2614		pv->pv_va = va;
2615		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2616		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2617		return (TRUE);
2618	} else
2619		return (FALSE);
2620}
2621
2622/*
2623 * Conditionally create the PV entry for a 2MB page mapping if the required
2624 * memory can be allocated without resorting to reclamation.
2625 */
2626static boolean_t
2627pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2628    struct rwlock **lockp)
2629{
2630	struct md_page *pvh;
2631	pv_entry_t pv;
2632
2633	rw_assert(&pvh_global_lock, RA_LOCKED);
2634	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2635	/* Pass NULL instead of the lock pointer to disable reclamation. */
2636	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2637		pv->pv_va = va;
2638		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2639		pvh = pa_to_pvh(pa);
2640		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2641		return (TRUE);
2642	} else
2643		return (FALSE);
2644}
2645
2646/*
2647 * Fills a page table page with mappings to consecutive physical pages.
2648 */
2649static void
2650pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2651{
2652	pt_entry_t *pte;
2653
2654	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2655		*pte = newpte;
2656		newpte += PAGE_SIZE;
2657	}
2658}
2659
2660/*
2661 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
2662 * mapping is invalidated.
2663 */
2664static boolean_t
2665pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2666{
2667	struct rwlock *lock;
2668	boolean_t rv;
2669
2670	lock = NULL;
2671	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
2672	if (lock != NULL)
2673		rw_wunlock(lock);
2674	return (rv);
2675}
2676
2677static boolean_t
2678pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
2679    struct rwlock **lockp)
2680{
2681	pd_entry_t newpde, oldpde;
2682	pt_entry_t *firstpte, newpte;
2683	vm_paddr_t mptepa;
2684	vm_page_t free, mpte;
2685
2686	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2687	oldpde = *pde;
2688	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2689	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2690	mpte = pmap_lookup_pt_page(pmap, va);
2691	if (mpte != NULL)
2692		pmap_remove_pt_page(pmap, mpte);
2693	else {
2694		KASSERT((oldpde & PG_W) == 0,
2695		    ("pmap_demote_pde: page table page for a wired mapping"
2696		    " is missing"));
2697
2698		/*
2699		 * Invalidate the 2MB page mapping and return "failure" if the
2700		 * mapping was never accessed or the allocation of the new
2701		 * page table page fails.  If the 2MB page mapping belongs to
2702		 * the direct map region of the kernel's address space, then
2703		 * the page allocation request specifies the highest possible
2704		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
2705		 * normal.  Page table pages are preallocated for every other
2706		 * part of the kernel address space, so the direct map region
2707		 * is the only part of the kernel address space that must be
2708		 * handled here.
2709		 */
2710		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2711		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
2712		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
2713		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2714			free = NULL;
2715			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
2716			    lockp);
2717			pmap_invalidate_page(pmap, trunc_2mpage(va));
2718			pmap_free_zero_pages(free);
2719			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2720			    " in pmap %p", va, pmap);
2721			return (FALSE);
2722		}
2723		if (va < VM_MAXUSER_ADDRESS)
2724			pmap_resident_count_inc(pmap, 1);
2725	}
2726	mptepa = VM_PAGE_TO_PHYS(mpte);
2727	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2728	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2729	KASSERT((oldpde & PG_A) != 0,
2730	    ("pmap_demote_pde: oldpde is missing PG_A"));
2731	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2732	    ("pmap_demote_pde: oldpde is missing PG_M"));
2733	newpte = oldpde & ~PG_PS;
2734	if ((newpte & PG_PDE_PAT) != 0)
2735		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2736
2737	/*
2738	 * If the page table page is new, initialize it.
2739	 */
2740	if (mpte->wire_count == 1) {
2741		mpte->wire_count = NPTEPG;
2742		pmap_fill_ptp(firstpte, newpte);
2743	}
2744	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2745	    ("pmap_demote_pde: firstpte and newpte map different physical"
2746	    " addresses"));
2747
2748	/*
2749	 * If the mapping has changed attributes, update the page table
2750	 * entries.
2751	 */
2752	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2753		pmap_fill_ptp(firstpte, newpte);
2754
2755	/*
2756	 * The spare PV entries must be reserved prior to demoting the
2757	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
2758	 * of the PDE and the PV lists will be inconsistent, which can result
2759	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
2760	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
2761	 * PV entry for the 2MB page mapping that is being demoted.
2762	 */
2763	if ((oldpde & PG_MANAGED) != 0)
2764		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
2765
2766	/*
2767	 * Demote the mapping.  This pmap is locked.  The old PDE has
2768	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2769	 * set.  Thus, there is no danger of a race with another
2770	 * processor changing the setting of PG_A and/or PG_M between
2771	 * the read above and the store below.
2772	 */
2773	if (workaround_erratum383)
2774		pmap_update_pde(pmap, va, pde, newpde);
2775	else
2776		pde_store(pde, newpde);
2777
2778	/*
2779	 * Invalidate a stale recursive mapping of the page table page.
2780	 */
2781	if (va >= VM_MAXUSER_ADDRESS)
2782		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2783
2784	/*
2785	 * Demote the PV entry.
2786	 */
2787	if ((oldpde & PG_MANAGED) != 0)
2788		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
2789
2790	atomic_add_long(&pmap_pde_demotions, 1);
2791	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2792	    " in pmap %p", va, pmap);
2793	return (TRUE);
2794}
2795
2796/*
2797 * pmap_remove_pde: do the things to unmap a superpage in a process
2798 */
2799static int
2800pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2801    vm_page_t *free, struct rwlock **lockp)
2802{
2803	struct md_page *pvh;
2804	pd_entry_t oldpde;
2805	vm_offset_t eva, va;
2806	vm_page_t m, mpte;
2807
2808	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2809	KASSERT((sva & PDRMASK) == 0,
2810	    ("pmap_remove_pde: sva is not 2mpage aligned"));
2811	oldpde = pte_load_clear(pdq);
2812	if (oldpde & PG_W)
2813		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2814
2815	/*
2816	 * Machines that don't support invlpg, also don't support
2817	 * PG_G.
2818	 */
2819	if (oldpde & PG_G)
2820		pmap_invalidate_page(kernel_pmap, sva);
2821	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
2822	if (oldpde & PG_MANAGED) {
2823		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
2824		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2825		pmap_pvh_free(pvh, pmap, sva);
2826		eva = sva + NBPDR;
2827		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2828		    va < eva; va += PAGE_SIZE, m++) {
2829			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2830				vm_page_dirty(m);
2831			if (oldpde & PG_A)
2832				vm_page_aflag_set(m, PGA_REFERENCED);
2833			if (TAILQ_EMPTY(&m->md.pv_list) &&
2834			    TAILQ_EMPTY(&pvh->pv_list))
2835				vm_page_aflag_clear(m, PGA_WRITEABLE);
2836		}
2837	}
2838	if (pmap == kernel_pmap) {
2839		if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
2840			panic("pmap_remove_pde: failed demotion");
2841	} else {
2842		mpte = pmap_lookup_pt_page(pmap, sva);
2843		if (mpte != NULL) {
2844			pmap_remove_pt_page(pmap, mpte);
2845			pmap_resident_count_dec(pmap, 1);
2846			KASSERT(mpte->wire_count == NPTEPG,
2847			    ("pmap_remove_pde: pte page wire count error"));
2848			mpte->wire_count = 0;
2849			pmap_add_delayed_free_list(mpte, free, FALSE);
2850			atomic_subtract_int(&cnt.v_wire_count, 1);
2851		}
2852	}
2853	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2854}
2855
2856/*
2857 * pmap_remove_pte: do the things to unmap a page in a process
2858 */
2859static int
2860pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2861    pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
2862{
2863	struct md_page *pvh;
2864	pt_entry_t oldpte;
2865	vm_page_t m;
2866
2867	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2868	oldpte = pte_load_clear(ptq);
2869	if (oldpte & PG_W)
2870		pmap->pm_stats.wired_count -= 1;
2871	pmap_resident_count_dec(pmap, 1);
2872	if (oldpte & PG_MANAGED) {
2873		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2874		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2875			vm_page_dirty(m);
2876		if (oldpte & PG_A)
2877			vm_page_aflag_set(m, PGA_REFERENCED);
2878		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2879		pmap_pvh_free(&m->md, pmap, va);
2880		if (TAILQ_EMPTY(&m->md.pv_list) &&
2881		    (m->flags & PG_FICTITIOUS) == 0) {
2882			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2883			if (TAILQ_EMPTY(&pvh->pv_list))
2884				vm_page_aflag_clear(m, PGA_WRITEABLE);
2885		}
2886	}
2887	return (pmap_unuse_pt(pmap, va, ptepde, free));
2888}
2889
2890/*
2891 * Remove a single page from a process address space
2892 */
2893static void
2894pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2895{
2896	struct rwlock *lock;
2897	pt_entry_t *pte;
2898
2899	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2900	if ((*pde & PG_V) == 0)
2901		return;
2902	pte = pmap_pde_to_pte(pde, va);
2903	if ((*pte & PG_V) == 0)
2904		return;
2905	lock = NULL;
2906	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
2907	if (lock != NULL)
2908		rw_wunlock(lock);
2909	pmap_invalidate_page(pmap, va);
2910}
2911
2912/*
2913 *	Remove the given range of addresses from the specified map.
2914 *
2915 *	It is assumed that the start and end are properly
2916 *	rounded to the page size.
2917 */
2918void
2919pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2920{
2921	struct rwlock *lock;
2922	vm_offset_t va, va_next;
2923	pml4_entry_t *pml4e;
2924	pdp_entry_t *pdpe;
2925	pd_entry_t ptpaddr, *pde;
2926	pt_entry_t *pte;
2927	vm_page_t free = NULL;
2928	int anyvalid;
2929
2930	/*
2931	 * Perform an unsynchronized read.  This is, however, safe.
2932	 */
2933	if (pmap->pm_stats.resident_count == 0)
2934		return;
2935
2936	anyvalid = 0;
2937
2938	rw_rlock(&pvh_global_lock);
2939	PMAP_LOCK(pmap);
2940
2941	/*
2942	 * special handling of removing one page.  a very
2943	 * common operation and easy to short circuit some
2944	 * code.
2945	 */
2946	if (sva + PAGE_SIZE == eva) {
2947		pde = pmap_pde(pmap, sva);
2948		if (pde && (*pde & PG_PS) == 0) {
2949			pmap_remove_page(pmap, sva, pde, &free);
2950			goto out;
2951		}
2952	}
2953
2954	lock = NULL;
2955	for (; sva < eva; sva = va_next) {
2956
2957		if (pmap->pm_stats.resident_count == 0)
2958			break;
2959
2960		pml4e = pmap_pml4e(pmap, sva);
2961		if ((*pml4e & PG_V) == 0) {
2962			va_next = (sva + NBPML4) & ~PML4MASK;
2963			if (va_next < sva)
2964				va_next = eva;
2965			continue;
2966		}
2967
2968		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2969		if ((*pdpe & PG_V) == 0) {
2970			va_next = (sva + NBPDP) & ~PDPMASK;
2971			if (va_next < sva)
2972				va_next = eva;
2973			continue;
2974		}
2975
2976		/*
2977		 * Calculate index for next page table.
2978		 */
2979		va_next = (sva + NBPDR) & ~PDRMASK;
2980		if (va_next < sva)
2981			va_next = eva;
2982
2983		pde = pmap_pdpe_to_pde(pdpe, sva);
2984		ptpaddr = *pde;
2985
2986		/*
2987		 * Weed out invalid mappings.
2988		 */
2989		if (ptpaddr == 0)
2990			continue;
2991
2992		/*
2993		 * Check for large page.
2994		 */
2995		if ((ptpaddr & PG_PS) != 0) {
2996			/*
2997			 * Are we removing the entire large page?  If not,
2998			 * demote the mapping and fall through.
2999			 */
3000			if (sva + NBPDR == va_next && eva >= va_next) {
3001				/*
3002				 * The TLB entry for a PG_G mapping is
3003				 * invalidated by pmap_remove_pde().
3004				 */
3005				if ((ptpaddr & PG_G) == 0)
3006					anyvalid = 1;
3007				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3008				continue;
3009			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3010			    &lock)) {
3011				/* The large page mapping was destroyed. */
3012				continue;
3013			} else
3014				ptpaddr = *pde;
3015		}
3016
3017		/*
3018		 * Limit our scan to either the end of the va represented
3019		 * by the current page table page, or to the end of the
3020		 * range being removed.
3021		 */
3022		if (va_next > eva)
3023			va_next = eva;
3024
3025		va = va_next;
3026		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3027		    sva += PAGE_SIZE) {
3028			if (*pte == 0) {
3029				if (va != va_next) {
3030					pmap_invalidate_range(pmap, va, sva);
3031					va = va_next;
3032				}
3033				continue;
3034			}
3035			if ((*pte & PG_G) == 0)
3036				anyvalid = 1;
3037			else if (va == va_next)
3038				va = sva;
3039			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3040			    &lock)) {
3041				sva += PAGE_SIZE;
3042				break;
3043			}
3044		}
3045		if (va != va_next)
3046			pmap_invalidate_range(pmap, va, sva);
3047	}
3048	if (lock != NULL)
3049		rw_wunlock(lock);
3050out:
3051	if (anyvalid)
3052		pmap_invalidate_all(pmap);
3053	rw_runlock(&pvh_global_lock);
3054	PMAP_UNLOCK(pmap);
3055	pmap_free_zero_pages(free);
3056}
3057
3058/*
3059 *	Routine:	pmap_remove_all
3060 *	Function:
3061 *		Removes this physical page from
3062 *		all physical maps in which it resides.
3063 *		Reflects back modify bits to the pager.
3064 *
3065 *	Notes:
3066 *		Original versions of this routine were very
3067 *		inefficient because they iteratively called
3068 *		pmap_remove (slow...)
3069 */
3070
3071void
3072pmap_remove_all(vm_page_t m)
3073{
3074	struct md_page *pvh;
3075	pv_entry_t pv;
3076	pmap_t pmap;
3077	pt_entry_t *pte, tpte;
3078	pd_entry_t *pde;
3079	vm_offset_t va;
3080	vm_page_t free;
3081
3082	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3083	    ("pmap_remove_all: page %p is not managed", m));
3084	free = NULL;
3085	rw_wlock(&pvh_global_lock);
3086	if ((m->flags & PG_FICTITIOUS) != 0)
3087		goto small_mappings;
3088	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3089	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3090		pmap = PV_PMAP(pv);
3091		PMAP_LOCK(pmap);
3092		va = pv->pv_va;
3093		pde = pmap_pde(pmap, va);
3094		(void)pmap_demote_pde(pmap, pde, va);
3095		PMAP_UNLOCK(pmap);
3096	}
3097small_mappings:
3098	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3099		pmap = PV_PMAP(pv);
3100		PMAP_LOCK(pmap);
3101		pmap_resident_count_dec(pmap, 1);
3102		pde = pmap_pde(pmap, pv->pv_va);
3103		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3104		    " a 2mpage in page %p's pv list", m));
3105		pte = pmap_pde_to_pte(pde, pv->pv_va);
3106		tpte = pte_load_clear(pte);
3107		if (tpte & PG_W)
3108			pmap->pm_stats.wired_count--;
3109		if (tpte & PG_A)
3110			vm_page_aflag_set(m, PGA_REFERENCED);
3111
3112		/*
3113		 * Update the vm_page_t clean and reference bits.
3114		 */
3115		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3116			vm_page_dirty(m);
3117		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3118		pmap_invalidate_page(pmap, pv->pv_va);
3119		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3120		free_pv_entry(pmap, pv);
3121		PMAP_UNLOCK(pmap);
3122	}
3123	vm_page_aflag_clear(m, PGA_WRITEABLE);
3124	rw_wunlock(&pvh_global_lock);
3125	pmap_free_zero_pages(free);
3126}
3127
3128/*
3129 * pmap_protect_pde: do the things to protect a 2mpage in a process
3130 */
3131static boolean_t
3132pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3133{
3134	pd_entry_t newpde, oldpde;
3135	vm_offset_t eva, va;
3136	vm_page_t m;
3137	boolean_t anychanged;
3138
3139	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3140	KASSERT((sva & PDRMASK) == 0,
3141	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3142	anychanged = FALSE;
3143retry:
3144	oldpde = newpde = *pde;
3145	if (oldpde & PG_MANAGED) {
3146		eva = sva + NBPDR;
3147		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3148		    va < eva; va += PAGE_SIZE, m++)
3149			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3150				vm_page_dirty(m);
3151	}
3152	if ((prot & VM_PROT_WRITE) == 0)
3153		newpde &= ~(PG_RW | PG_M);
3154	if ((prot & VM_PROT_EXECUTE) == 0)
3155		newpde |= pg_nx;
3156	if (newpde != oldpde) {
3157		if (!atomic_cmpset_long(pde, oldpde, newpde))
3158			goto retry;
3159		if (oldpde & PG_G)
3160			pmap_invalidate_page(pmap, sva);
3161		else
3162			anychanged = TRUE;
3163	}
3164	return (anychanged);
3165}
3166
3167/*
3168 *	Set the physical protection on the
3169 *	specified range of this map as requested.
3170 */
3171void
3172pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3173{
3174	vm_offset_t va_next;
3175	pml4_entry_t *pml4e;
3176	pdp_entry_t *pdpe;
3177	pd_entry_t ptpaddr, *pde;
3178	pt_entry_t *pte;
3179	boolean_t anychanged, pv_lists_locked;
3180
3181	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3182		pmap_remove(pmap, sva, eva);
3183		return;
3184	}
3185
3186	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3187	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3188		return;
3189
3190	pv_lists_locked = FALSE;
3191resume:
3192	anychanged = FALSE;
3193
3194	PMAP_LOCK(pmap);
3195	for (; sva < eva; sva = va_next) {
3196
3197		pml4e = pmap_pml4e(pmap, sva);
3198		if ((*pml4e & PG_V) == 0) {
3199			va_next = (sva + NBPML4) & ~PML4MASK;
3200			if (va_next < sva)
3201				va_next = eva;
3202			continue;
3203		}
3204
3205		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3206		if ((*pdpe & PG_V) == 0) {
3207			va_next = (sva + NBPDP) & ~PDPMASK;
3208			if (va_next < sva)
3209				va_next = eva;
3210			continue;
3211		}
3212
3213		va_next = (sva + NBPDR) & ~PDRMASK;
3214		if (va_next < sva)
3215			va_next = eva;
3216
3217		pde = pmap_pdpe_to_pde(pdpe, sva);
3218		ptpaddr = *pde;
3219
3220		/*
3221		 * Weed out invalid mappings.
3222		 */
3223		if (ptpaddr == 0)
3224			continue;
3225
3226		/*
3227		 * Check for large page.
3228		 */
3229		if ((ptpaddr & PG_PS) != 0) {
3230			/*
3231			 * Are we protecting the entire large page?  If not,
3232			 * demote the mapping and fall through.
3233			 */
3234			if (sva + NBPDR == va_next && eva >= va_next) {
3235				/*
3236				 * The TLB entry for a PG_G mapping is
3237				 * invalidated by pmap_protect_pde().
3238				 */
3239				if (pmap_protect_pde(pmap, pde, sva, prot))
3240					anychanged = TRUE;
3241				continue;
3242			} else {
3243				if (!pv_lists_locked) {
3244					pv_lists_locked = TRUE;
3245					if (!rw_try_rlock(&pvh_global_lock)) {
3246						if (anychanged)
3247							pmap_invalidate_all(
3248							    pmap);
3249						PMAP_UNLOCK(pmap);
3250						rw_rlock(&pvh_global_lock);
3251						goto resume;
3252					}
3253				}
3254				if (!pmap_demote_pde(pmap, pde, sva)) {
3255					/*
3256					 * The large page mapping was
3257					 * destroyed.
3258					 */
3259					continue;
3260				}
3261			}
3262		}
3263
3264		if (va_next > eva)
3265			va_next = eva;
3266
3267		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3268		    sva += PAGE_SIZE) {
3269			pt_entry_t obits, pbits;
3270			vm_page_t m;
3271
3272retry:
3273			obits = pbits = *pte;
3274			if ((pbits & PG_V) == 0)
3275				continue;
3276
3277			if ((prot & VM_PROT_WRITE) == 0) {
3278				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3279				    (PG_MANAGED | PG_M | PG_RW)) {
3280					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3281					vm_page_dirty(m);
3282				}
3283				pbits &= ~(PG_RW | PG_M);
3284			}
3285			if ((prot & VM_PROT_EXECUTE) == 0)
3286				pbits |= pg_nx;
3287
3288			if (pbits != obits) {
3289				if (!atomic_cmpset_long(pte, obits, pbits))
3290					goto retry;
3291				if (obits & PG_G)
3292					pmap_invalidate_page(pmap, sva);
3293				else
3294					anychanged = TRUE;
3295			}
3296		}
3297	}
3298	if (anychanged)
3299		pmap_invalidate_all(pmap);
3300	if (pv_lists_locked)
3301		rw_runlock(&pvh_global_lock);
3302	PMAP_UNLOCK(pmap);
3303}
3304
3305/*
3306 * Tries to promote the 512, contiguous 4KB page mappings that are within a
3307 * single page table page (PTP) to a single 2MB page mapping.  For promotion
3308 * to occur, two conditions must be met: (1) the 4KB page mappings must map
3309 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3310 * identical characteristics.
3311 */
3312static void
3313pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3314    struct rwlock **lockp)
3315{
3316	pd_entry_t newpde;
3317	pt_entry_t *firstpte, oldpte, pa, *pte;
3318	vm_offset_t oldpteva;
3319	vm_page_t mpte;
3320
3321	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3322
3323	/*
3324	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3325	 * either invalid, unused, or does not map the first 4KB physical page
3326	 * within a 2MB page.
3327	 */
3328	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
3329setpde:
3330	newpde = *firstpte;
3331	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3332		atomic_add_long(&pmap_pde_p_failures, 1);
3333		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3334		    " in pmap %p", va, pmap);
3335		return;
3336	}
3337	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3338		/*
3339		 * When PG_M is already clear, PG_RW can be cleared without
3340		 * a TLB invalidation.
3341		 */
3342		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
3343			goto setpde;
3344		newpde &= ~PG_RW;
3345	}
3346
3347	/*
3348	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3349	 * PTE maps an unexpected 4KB physical page or does not have identical
3350	 * characteristics to the first PTE.
3351	 */
3352	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3353	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3354setpte:
3355		oldpte = *pte;
3356		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3357			atomic_add_long(&pmap_pde_p_failures, 1);
3358			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3359			    " in pmap %p", va, pmap);
3360			return;
3361		}
3362		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3363			/*
3364			 * When PG_M is already clear, PG_RW can be cleared
3365			 * without a TLB invalidation.
3366			 */
3367			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
3368				goto setpte;
3369			oldpte &= ~PG_RW;
3370			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3371			    (va & ~PDRMASK);
3372			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
3373			    " in pmap %p", oldpteva, pmap);
3374		}
3375		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3376			atomic_add_long(&pmap_pde_p_failures, 1);
3377			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
3378			    " in pmap %p", va, pmap);
3379			return;
3380		}
3381		pa -= PAGE_SIZE;
3382	}
3383
3384	/*
3385	 * Save the page table page in its current state until the PDE
3386	 * mapping the superpage is demoted by pmap_demote_pde() or
3387	 * destroyed by pmap_remove_pde().
3388	 */
3389	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3390	KASSERT(mpte >= vm_page_array &&
3391	    mpte < &vm_page_array[vm_page_array_size],
3392	    ("pmap_promote_pde: page table page is out of range"));
3393	KASSERT(mpte->pindex == pmap_pde_pindex(va),
3394	    ("pmap_promote_pde: page table page's pindex is wrong"));
3395	pmap_insert_pt_page(pmap, mpte);
3396
3397	/*
3398	 * Promote the pv entries.
3399	 */
3400	if ((newpde & PG_MANAGED) != 0)
3401		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
3402
3403	/*
3404	 * Propagate the PAT index to its proper position.
3405	 */
3406	if ((newpde & PG_PTE_PAT) != 0)
3407		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3408
3409	/*
3410	 * Map the superpage.
3411	 */
3412	if (workaround_erratum383)
3413		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3414	else
3415		pde_store(pde, PG_PS | newpde);
3416
3417	atomic_add_long(&pmap_pde_promotions, 1);
3418	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
3419	    " in pmap %p", va, pmap);
3420}
3421
3422/*
3423 *	Insert the given physical page (p) at
3424 *	the specified virtual address (v) in the
3425 *	target physical map with the protection requested.
3426 *
3427 *	If specified, the page will be wired down, meaning
3428 *	that the related pte can not be reclaimed.
3429 *
3430 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3431 *	or lose information.  That is, this routine must actually
3432 *	insert this page into the given map NOW.
3433 */
3434void
3435pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3436    vm_prot_t prot, boolean_t wired)
3437{
3438	struct rwlock *lock;
3439	pd_entry_t *pde;
3440	pt_entry_t *pte;
3441	pt_entry_t newpte, origpte;
3442	pv_entry_t pv;
3443	vm_paddr_t opa, pa;
3444	vm_page_t mpte, om;
3445
3446	va = trunc_page(va);
3447	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3448	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3449	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
3450	    va));
3451	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
3452	    va >= kmi.clean_eva,
3453	    ("pmap_enter: managed mapping within the clean submap"));
3454	if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == 0)
3455		VM_OBJECT_ASSERT_WLOCKED(m->object);
3456	pa = VM_PAGE_TO_PHYS(m);
3457	newpte = (pt_entry_t)(pa | PG_A | PG_V);
3458	if ((access & VM_PROT_WRITE) != 0)
3459		newpte |= PG_M;
3460	if ((prot & VM_PROT_WRITE) != 0)
3461		newpte |= PG_RW;
3462	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
3463	    ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
3464	if ((prot & VM_PROT_EXECUTE) == 0)
3465		newpte |= pg_nx;
3466	if (wired)
3467		newpte |= PG_W;
3468	if (va < VM_MAXUSER_ADDRESS)
3469		newpte |= PG_U;
3470	if (pmap == kernel_pmap)
3471		newpte |= PG_G;
3472	newpte |= pmap_cache_bits(m->md.pat_mode, 0);
3473
3474	mpte = NULL;
3475
3476	lock = NULL;
3477	rw_rlock(&pvh_global_lock);
3478	PMAP_LOCK(pmap);
3479
3480	/*
3481	 * In the case that a page table page is not
3482	 * resident, we are creating it here.
3483	 */
3484retry:
3485	pde = pmap_pde(pmap, va);
3486	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
3487	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
3488		pte = pmap_pde_to_pte(pde, va);
3489		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
3490			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3491			mpte->wire_count++;
3492		}
3493	} else if (va < VM_MAXUSER_ADDRESS) {
3494		/*
3495		 * Here if the pte page isn't mapped, or if it has been
3496		 * deallocated.
3497		 */
3498		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
3499		goto retry;
3500	} else
3501		panic("pmap_enter: invalid page directory va=%#lx", va);
3502
3503	origpte = *pte;
3504
3505	/*
3506	 * Is the specified virtual address already mapped?
3507	 */
3508	if ((origpte & PG_V) != 0) {
3509		/*
3510		 * Wiring change, just update stats. We don't worry about
3511		 * wiring PT pages as they remain resident as long as there
3512		 * are valid mappings in them. Hence, if a user page is wired,
3513		 * the PT page will be also.
3514		 */
3515		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
3516			pmap->pm_stats.wired_count++;
3517		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
3518			pmap->pm_stats.wired_count--;
3519
3520		/*
3521		 * Remove the extra PT page reference.
3522		 */
3523		if (mpte != NULL) {
3524			mpte->wire_count--;
3525			KASSERT(mpte->wire_count > 0,
3526			    ("pmap_enter: missing reference to page table page,"
3527			     " va: 0x%lx", va));
3528		}
3529
3530		/*
3531		 * Has the physical page changed?
3532		 */
3533		opa = origpte & PG_FRAME;
3534		if (opa == pa) {
3535			/*
3536			 * No, might be a protection or wiring change.
3537			 */
3538			if ((origpte & PG_MANAGED) != 0) {
3539				newpte |= PG_MANAGED;
3540				if ((newpte & PG_RW) != 0)
3541					vm_page_aflag_set(m, PGA_WRITEABLE);
3542			}
3543			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
3544				goto unchanged;
3545			goto validate;
3546		}
3547	} else {
3548		/*
3549		 * Increment the counters.
3550		 */
3551		if ((newpte & PG_W) != 0)
3552			pmap->pm_stats.wired_count++;
3553		pmap_resident_count_inc(pmap, 1);
3554	}
3555
3556	/*
3557	 * Enter on the PV list if part of our managed memory.
3558	 */
3559	if ((m->oflags & VPO_UNMANAGED) == 0) {
3560		newpte |= PG_MANAGED;
3561		pv = get_pv_entry(pmap, &lock);
3562		pv->pv_va = va;
3563		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3564		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3565		if ((newpte & PG_RW) != 0)
3566			vm_page_aflag_set(m, PGA_WRITEABLE);
3567	}
3568
3569	/*
3570	 * Update the PTE.
3571	 */
3572	if ((origpte & PG_V) != 0) {
3573validate:
3574		origpte = pte_load_store(pte, newpte);
3575		opa = origpte & PG_FRAME;
3576		if (opa != pa) {
3577			if ((origpte & PG_MANAGED) != 0) {
3578				om = PHYS_TO_VM_PAGE(opa);
3579				if ((origpte & (PG_M | PG_RW)) == (PG_M |
3580				    PG_RW))
3581					vm_page_dirty(om);
3582				if ((origpte & PG_A) != 0)
3583					vm_page_aflag_set(om, PGA_REFERENCED);
3584				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3585				pmap_pvh_free(&om->md, pmap, va);
3586				if ((om->aflags & PGA_WRITEABLE) != 0 &&
3587				    TAILQ_EMPTY(&om->md.pv_list) &&
3588				    ((om->flags & PG_FICTITIOUS) != 0 ||
3589				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3590					vm_page_aflag_clear(om, PGA_WRITEABLE);
3591			}
3592		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
3593		    PG_RW)) == (PG_M | PG_RW)) {
3594			if ((origpte & PG_MANAGED) != 0)
3595				vm_page_dirty(m);
3596
3597			/*
3598			 * Although the PTE may still have PG_RW set, TLB
3599			 * invalidation may nonetheless be required because
3600			 * the PTE no longer has PG_M set.
3601			 */
3602		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
3603			/*
3604			 * This PTE change does not require TLB invalidation.
3605			 */
3606			goto unchanged;
3607		}
3608		if ((origpte & PG_A) != 0)
3609			pmap_invalidate_page(pmap, va);
3610	} else
3611		pte_store(pte, newpte);
3612
3613unchanged:
3614
3615	/*
3616	 * If both the page table page and the reservation are fully
3617	 * populated, then attempt promotion.
3618	 */
3619	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3620	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3621	    vm_reserv_level_iffullpop(m) == 0)
3622		pmap_promote_pde(pmap, pde, va, &lock);
3623
3624	if (lock != NULL)
3625		rw_wunlock(lock);
3626	rw_runlock(&pvh_global_lock);
3627	PMAP_UNLOCK(pmap);
3628}
3629
3630/*
3631 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
3632 * otherwise.  Fails if (1) a page table page cannot be allocated without
3633 * blocking, (2) a mapping already exists at the specified virtual address, or
3634 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3635 */
3636static boolean_t
3637pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3638    struct rwlock **lockp)
3639{
3640	pd_entry_t *pde, newpde;
3641	vm_page_t free, mpde;
3642
3643	rw_assert(&pvh_global_lock, RA_LOCKED);
3644	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3645	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
3646		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3647		    " in pmap %p", va, pmap);
3648		return (FALSE);
3649	}
3650	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
3651	pde = &pde[pmap_pde_index(va)];
3652	if ((*pde & PG_V) != 0) {
3653		KASSERT(mpde->wire_count > 1,
3654		    ("pmap_enter_pde: mpde's wire count is too low"));
3655		mpde->wire_count--;
3656		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3657		    " in pmap %p", va, pmap);
3658		return (FALSE);
3659	}
3660	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3661	    PG_PS | PG_V;
3662	if ((m->oflags & VPO_UNMANAGED) == 0) {
3663		newpde |= PG_MANAGED;
3664
3665		/*
3666		 * Abort this mapping if its PV entry could not be created.
3667		 */
3668		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
3669		    lockp)) {
3670			free = NULL;
3671			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
3672				pmap_invalidate_page(pmap, va);
3673				pmap_free_zero_pages(free);
3674			}
3675			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3676			    " in pmap %p", va, pmap);
3677			return (FALSE);
3678		}
3679	}
3680	if ((prot & VM_PROT_EXECUTE) == 0)
3681		newpde |= pg_nx;
3682	if (va < VM_MAXUSER_ADDRESS)
3683		newpde |= PG_U;
3684
3685	/*
3686	 * Increment counters.
3687	 */
3688	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
3689
3690	/*
3691	 * Map the superpage.
3692	 */
3693	pde_store(pde, newpde);
3694
3695	atomic_add_long(&pmap_pde_mappings, 1);
3696	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3697	    " in pmap %p", va, pmap);
3698	return (TRUE);
3699}
3700
3701/*
3702 * Maps a sequence of resident pages belonging to the same object.
3703 * The sequence begins with the given page m_start.  This page is
3704 * mapped at the given virtual address start.  Each subsequent page is
3705 * mapped at a virtual address that is offset from start by the same
3706 * amount as the page is offset from m_start within the object.  The
3707 * last page in the sequence is the page with the largest offset from
3708 * m_start that can be mapped at a virtual address less than the given
3709 * virtual address end.  Not every virtual page between start and end
3710 * is mapped; only those for which a resident page exists with the
3711 * corresponding offset from m_start are mapped.
3712 */
3713void
3714pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3715    vm_page_t m_start, vm_prot_t prot)
3716{
3717	struct rwlock *lock;
3718	vm_offset_t va;
3719	vm_page_t m, mpte;
3720	vm_pindex_t diff, psize;
3721
3722	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3723
3724	psize = atop(end - start);
3725	mpte = NULL;
3726	m = m_start;
3727	lock = NULL;
3728	rw_rlock(&pvh_global_lock);
3729	PMAP_LOCK(pmap);
3730	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3731		va = start + ptoa(diff);
3732		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3733		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3734		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3735		    pmap_enter_pde(pmap, va, m, prot, &lock))
3736			m = &m[NBPDR / PAGE_SIZE - 1];
3737		else
3738			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3739			    mpte, &lock);
3740		m = TAILQ_NEXT(m, listq);
3741	}
3742	if (lock != NULL)
3743		rw_wunlock(lock);
3744	rw_runlock(&pvh_global_lock);
3745	PMAP_UNLOCK(pmap);
3746}
3747
3748/*
3749 * this code makes some *MAJOR* assumptions:
3750 * 1. Current pmap & pmap exists.
3751 * 2. Not wired.
3752 * 3. Read access.
3753 * 4. No page table pages.
3754 * but is *MUCH* faster than pmap_enter...
3755 */
3756
3757void
3758pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3759{
3760	struct rwlock *lock;
3761
3762	lock = NULL;
3763	rw_rlock(&pvh_global_lock);
3764	PMAP_LOCK(pmap);
3765	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3766	if (lock != NULL)
3767		rw_wunlock(lock);
3768	rw_runlock(&pvh_global_lock);
3769	PMAP_UNLOCK(pmap);
3770}
3771
3772static vm_page_t
3773pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3774    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3775{
3776	vm_page_t free;
3777	pt_entry_t *pte;
3778	vm_paddr_t pa;
3779
3780	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3781	    (m->oflags & VPO_UNMANAGED) != 0,
3782	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3783	rw_assert(&pvh_global_lock, RA_LOCKED);
3784	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3785
3786	/*
3787	 * In the case that a page table page is not
3788	 * resident, we are creating it here.
3789	 */
3790	if (va < VM_MAXUSER_ADDRESS) {
3791		vm_pindex_t ptepindex;
3792		pd_entry_t *ptepa;
3793
3794		/*
3795		 * Calculate pagetable page index
3796		 */
3797		ptepindex = pmap_pde_pindex(va);
3798		if (mpte && (mpte->pindex == ptepindex)) {
3799			mpte->wire_count++;
3800		} else {
3801			/*
3802			 * Get the page directory entry
3803			 */
3804			ptepa = pmap_pde(pmap, va);
3805
3806			/*
3807			 * If the page table page is mapped, we just increment
3808			 * the hold count, and activate it.  Otherwise, we
3809			 * attempt to allocate a page table page.  If this
3810			 * attempt fails, we don't retry.  Instead, we give up.
3811			 */
3812			if (ptepa && (*ptepa & PG_V) != 0) {
3813				if (*ptepa & PG_PS)
3814					return (NULL);
3815				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3816				mpte->wire_count++;
3817			} else {
3818				/*
3819				 * Pass NULL instead of the PV list lock
3820				 * pointer, because we don't intend to sleep.
3821				 */
3822				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3823				if (mpte == NULL)
3824					return (mpte);
3825			}
3826		}
3827		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3828		pte = &pte[pmap_pte_index(va)];
3829	} else {
3830		mpte = NULL;
3831		pte = vtopte(va);
3832	}
3833	if (*pte) {
3834		if (mpte != NULL) {
3835			mpte->wire_count--;
3836			mpte = NULL;
3837		}
3838		return (mpte);
3839	}
3840
3841	/*
3842	 * Enter on the PV list if part of our managed memory.
3843	 */
3844	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3845	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3846		if (mpte != NULL) {
3847			free = NULL;
3848			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3849				pmap_invalidate_page(pmap, va);
3850				pmap_free_zero_pages(free);
3851			}
3852			mpte = NULL;
3853		}
3854		return (mpte);
3855	}
3856
3857	/*
3858	 * Increment counters
3859	 */
3860	pmap_resident_count_inc(pmap, 1);
3861
3862	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3863	if ((prot & VM_PROT_EXECUTE) == 0)
3864		pa |= pg_nx;
3865
3866	/*
3867	 * Now validate mapping with RO protection
3868	 */
3869	if ((m->oflags & VPO_UNMANAGED) != 0)
3870		pte_store(pte, pa | PG_V | PG_U);
3871	else
3872		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3873	return (mpte);
3874}
3875
3876/*
3877 * Make a temporary mapping for a physical address.  This is only intended
3878 * to be used for panic dumps.
3879 */
3880void *
3881pmap_kenter_temporary(vm_paddr_t pa, int i)
3882{
3883	vm_offset_t va;
3884
3885	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3886	pmap_kenter(va, pa);
3887	invlpg(va);
3888	return ((void *)crashdumpmap);
3889}
3890
3891/*
3892 * This code maps large physical mmap regions into the
3893 * processor address space.  Note that some shortcuts
3894 * are taken, but the code works.
3895 */
3896void
3897pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3898    vm_pindex_t pindex, vm_size_t size)
3899{
3900	pd_entry_t *pde;
3901	vm_paddr_t pa, ptepa;
3902	vm_page_t p, pdpg;
3903	int pat_mode;
3904
3905	VM_OBJECT_ASSERT_WLOCKED(object);
3906	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3907	    ("pmap_object_init_pt: non-device object"));
3908	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3909		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3910			return;
3911		p = vm_page_lookup(object, pindex);
3912		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3913		    ("pmap_object_init_pt: invalid page %p", p));
3914		pat_mode = p->md.pat_mode;
3915
3916		/*
3917		 * Abort the mapping if the first page is not physically
3918		 * aligned to a 2MB page boundary.
3919		 */
3920		ptepa = VM_PAGE_TO_PHYS(p);
3921		if (ptepa & (NBPDR - 1))
3922			return;
3923
3924		/*
3925		 * Skip the first page.  Abort the mapping if the rest of
3926		 * the pages are not physically contiguous or have differing
3927		 * memory attributes.
3928		 */
3929		p = TAILQ_NEXT(p, listq);
3930		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3931		    pa += PAGE_SIZE) {
3932			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3933			    ("pmap_object_init_pt: invalid page %p", p));
3934			if (pa != VM_PAGE_TO_PHYS(p) ||
3935			    pat_mode != p->md.pat_mode)
3936				return;
3937			p = TAILQ_NEXT(p, listq);
3938		}
3939
3940		/*
3941		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
3942		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
3943		 * will not affect the termination of this loop.
3944		 */
3945		PMAP_LOCK(pmap);
3946		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3947		    size; pa += NBPDR) {
3948			pdpg = pmap_allocpde(pmap, addr, NULL);
3949			if (pdpg == NULL) {
3950				/*
3951				 * The creation of mappings below is only an
3952				 * optimization.  If a page directory page
3953				 * cannot be allocated without blocking,
3954				 * continue on to the next mapping rather than
3955				 * blocking.
3956				 */
3957				addr += NBPDR;
3958				continue;
3959			}
3960			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3961			pde = &pde[pmap_pde_index(addr)];
3962			if ((*pde & PG_V) == 0) {
3963				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3964				    PG_U | PG_RW | PG_V);
3965				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
3966				atomic_add_long(&pmap_pde_mappings, 1);
3967			} else {
3968				/* Continue on if the PDE is already valid. */
3969				pdpg->wire_count--;
3970				KASSERT(pdpg->wire_count > 0,
3971				    ("pmap_object_init_pt: missing reference "
3972				    "to page directory page, va: 0x%lx", addr));
3973			}
3974			addr += NBPDR;
3975		}
3976		PMAP_UNLOCK(pmap);
3977	}
3978}
3979
3980/*
3981 *	Routine:	pmap_change_wiring
3982 *	Function:	Change the wiring attribute for a map/virtual-address
3983 *			pair.
3984 *	In/out conditions:
3985 *			The mapping must already exist in the pmap.
3986 */
3987void
3988pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3989{
3990	pd_entry_t *pde;
3991	pt_entry_t *pte;
3992	boolean_t pv_lists_locked;
3993
3994	pv_lists_locked = FALSE;
3995
3996	/*
3997	 * Wiring is not a hardware characteristic so there is no need to
3998	 * invalidate TLB.
3999	 */
4000retry:
4001	PMAP_LOCK(pmap);
4002	pde = pmap_pde(pmap, va);
4003	if ((*pde & PG_PS) != 0) {
4004		if (!wired != ((*pde & PG_W) == 0)) {
4005			if (!pv_lists_locked) {
4006				pv_lists_locked = TRUE;
4007				if (!rw_try_rlock(&pvh_global_lock)) {
4008					PMAP_UNLOCK(pmap);
4009					rw_rlock(&pvh_global_lock);
4010					goto retry;
4011				}
4012			}
4013			if (!pmap_demote_pde(pmap, pde, va))
4014				panic("pmap_change_wiring: demotion failed");
4015		} else
4016			goto out;
4017	}
4018	pte = pmap_pde_to_pte(pde, va);
4019	if (wired && (*pte & PG_W) == 0) {
4020		pmap->pm_stats.wired_count++;
4021		atomic_set_long(pte, PG_W);
4022	} else if (!wired && (*pte & PG_W) != 0) {
4023		pmap->pm_stats.wired_count--;
4024		atomic_clear_long(pte, PG_W);
4025	}
4026out:
4027	if (pv_lists_locked)
4028		rw_runlock(&pvh_global_lock);
4029	PMAP_UNLOCK(pmap);
4030}
4031
4032/*
4033 *	Copy the range specified by src_addr/len
4034 *	from the source map to the range dst_addr/len
4035 *	in the destination map.
4036 *
4037 *	This routine is only advisory and need not do anything.
4038 */
4039
4040void
4041pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4042    vm_offset_t src_addr)
4043{
4044	struct rwlock *lock;
4045	vm_page_t   free;
4046	vm_offset_t addr;
4047	vm_offset_t end_addr = src_addr + len;
4048	vm_offset_t va_next;
4049
4050	if (dst_addr != src_addr)
4051		return;
4052
4053	lock = NULL;
4054	rw_rlock(&pvh_global_lock);
4055	if (dst_pmap < src_pmap) {
4056		PMAP_LOCK(dst_pmap);
4057		PMAP_LOCK(src_pmap);
4058	} else {
4059		PMAP_LOCK(src_pmap);
4060		PMAP_LOCK(dst_pmap);
4061	}
4062	for (addr = src_addr; addr < end_addr; addr = va_next) {
4063		pt_entry_t *src_pte, *dst_pte;
4064		vm_page_t dstmpde, dstmpte, srcmpte;
4065		pml4_entry_t *pml4e;
4066		pdp_entry_t *pdpe;
4067		pd_entry_t srcptepaddr, *pde;
4068
4069		KASSERT(addr < UPT_MIN_ADDRESS,
4070		    ("pmap_copy: invalid to pmap_copy page tables"));
4071
4072		pml4e = pmap_pml4e(src_pmap, addr);
4073		if ((*pml4e & PG_V) == 0) {
4074			va_next = (addr + NBPML4) & ~PML4MASK;
4075			if (va_next < addr)
4076				va_next = end_addr;
4077			continue;
4078		}
4079
4080		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4081		if ((*pdpe & PG_V) == 0) {
4082			va_next = (addr + NBPDP) & ~PDPMASK;
4083			if (va_next < addr)
4084				va_next = end_addr;
4085			continue;
4086		}
4087
4088		va_next = (addr + NBPDR) & ~PDRMASK;
4089		if (va_next < addr)
4090			va_next = end_addr;
4091
4092		pde = pmap_pdpe_to_pde(pdpe, addr);
4093		srcptepaddr = *pde;
4094		if (srcptepaddr == 0)
4095			continue;
4096
4097		if (srcptepaddr & PG_PS) {
4098			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4099			if (dstmpde == NULL)
4100				break;
4101			pde = (pd_entry_t *)
4102			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4103			pde = &pde[pmap_pde_index(addr)];
4104			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4105			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4106			    PG_PS_FRAME, &lock))) {
4107				*pde = srcptepaddr & ~PG_W;
4108				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4109			} else
4110				dstmpde->wire_count--;
4111			continue;
4112		}
4113
4114		srcptepaddr &= PG_FRAME;
4115		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4116		KASSERT(srcmpte->wire_count > 0,
4117		    ("pmap_copy: source page table page is unused"));
4118
4119		if (va_next > end_addr)
4120			va_next = end_addr;
4121
4122		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4123		src_pte = &src_pte[pmap_pte_index(addr)];
4124		dstmpte = NULL;
4125		while (addr < va_next) {
4126			pt_entry_t ptetemp;
4127			ptetemp = *src_pte;
4128			/*
4129			 * we only virtual copy managed pages
4130			 */
4131			if ((ptetemp & PG_MANAGED) != 0) {
4132				if (dstmpte != NULL &&
4133				    dstmpte->pindex == pmap_pde_pindex(addr))
4134					dstmpte->wire_count++;
4135				else if ((dstmpte = pmap_allocpte(dst_pmap,
4136				    addr, NULL)) == NULL)
4137					goto out;
4138				dst_pte = (pt_entry_t *)
4139				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4140				dst_pte = &dst_pte[pmap_pte_index(addr)];
4141				if (*dst_pte == 0 &&
4142				    pmap_try_insert_pv_entry(dst_pmap, addr,
4143				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4144				    &lock)) {
4145					/*
4146					 * Clear the wired, modified, and
4147					 * accessed (referenced) bits
4148					 * during the copy.
4149					 */
4150					*dst_pte = ptetemp & ~(PG_W | PG_M |
4151					    PG_A);
4152					pmap_resident_count_inc(dst_pmap, 1);
4153	 			} else {
4154					free = NULL;
4155					if (pmap_unwire_ptp(dst_pmap, addr,
4156					    dstmpte, &free)) {
4157					    	pmap_invalidate_page(dst_pmap,
4158					 	    addr);
4159				    	    	pmap_free_zero_pages(free);
4160					}
4161					goto out;
4162				}
4163				if (dstmpte->wire_count >= srcmpte->wire_count)
4164					break;
4165			}
4166			addr += PAGE_SIZE;
4167			src_pte++;
4168		}
4169	}
4170out:
4171	if (lock != NULL)
4172		rw_wunlock(lock);
4173	rw_runlock(&pvh_global_lock);
4174	PMAP_UNLOCK(src_pmap);
4175	PMAP_UNLOCK(dst_pmap);
4176}
4177
4178/*
4179 *	pmap_zero_page zeros the specified hardware page by mapping
4180 *	the page into KVM and using bzero to clear its contents.
4181 */
4182void
4183pmap_zero_page(vm_page_t m)
4184{
4185	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4186
4187	pagezero((void *)va);
4188}
4189
4190/*
4191 *	pmap_zero_page_area zeros the specified hardware page by mapping
4192 *	the page into KVM and using bzero to clear its contents.
4193 *
4194 *	off and size may not cover an area beyond a single hardware page.
4195 */
4196void
4197pmap_zero_page_area(vm_page_t m, int off, int size)
4198{
4199	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4200
4201	if (off == 0 && size == PAGE_SIZE)
4202		pagezero((void *)va);
4203	else
4204		bzero((char *)va + off, size);
4205}
4206
4207/*
4208 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4209 *	the page into KVM and using bzero to clear its contents.  This
4210 *	is intended to be called from the vm_pagezero process only and
4211 *	outside of Giant.
4212 */
4213void
4214pmap_zero_page_idle(vm_page_t m)
4215{
4216	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4217
4218	pagezero((void *)va);
4219}
4220
4221/*
4222 *	pmap_copy_page copies the specified (machine independent)
4223 *	page by mapping the page into virtual memory and using
4224 *	bcopy to copy the page, one machine dependent page at a
4225 *	time.
4226 */
4227void
4228pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4229{
4230	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4231	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4232
4233	pagecopy((void *)src, (void *)dst);
4234}
4235
4236int unmapped_buf_allowed = 1;
4237
4238void
4239pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4240    vm_offset_t b_offset, int xfersize)
4241{
4242	void *a_cp, *b_cp;
4243	vm_offset_t a_pg_offset, b_pg_offset;
4244	int cnt;
4245
4246	while (xfersize > 0) {
4247		a_pg_offset = a_offset & PAGE_MASK;
4248		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4249		a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
4250		    phys_addr) + a_pg_offset;
4251		b_pg_offset = b_offset & PAGE_MASK;
4252		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4253		b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
4254		    phys_addr) + b_pg_offset;
4255		bcopy(a_cp, b_cp, cnt);
4256		a_offset += cnt;
4257		b_offset += cnt;
4258		xfersize -= cnt;
4259	}
4260}
4261
4262/*
4263 * Returns true if the pmap's pv is one of the first
4264 * 16 pvs linked to from this page.  This count may
4265 * be changed upwards or downwards in the future; it
4266 * is only necessary that true be returned for a small
4267 * subset of pmaps for proper page aging.
4268 */
4269boolean_t
4270pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4271{
4272	struct md_page *pvh;
4273	struct rwlock *lock;
4274	pv_entry_t pv;
4275	int loops = 0;
4276	boolean_t rv;
4277
4278	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4279	    ("pmap_page_exists_quick: page %p is not managed", m));
4280	rv = FALSE;
4281	rw_rlock(&pvh_global_lock);
4282	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4283	rw_rlock(lock);
4284	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4285		if (PV_PMAP(pv) == pmap) {
4286			rv = TRUE;
4287			break;
4288		}
4289		loops++;
4290		if (loops >= 16)
4291			break;
4292	}
4293	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4294		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4295		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4296			if (PV_PMAP(pv) == pmap) {
4297				rv = TRUE;
4298				break;
4299			}
4300			loops++;
4301			if (loops >= 16)
4302				break;
4303		}
4304	}
4305	rw_runlock(lock);
4306	rw_runlock(&pvh_global_lock);
4307	return (rv);
4308}
4309
4310/*
4311 *	pmap_page_wired_mappings:
4312 *
4313 *	Return the number of managed mappings to the given physical page
4314 *	that are wired.
4315 */
4316int
4317pmap_page_wired_mappings(vm_page_t m)
4318{
4319	int count;
4320
4321	count = 0;
4322	if ((m->oflags & VPO_UNMANAGED) != 0)
4323		return (count);
4324	rw_wlock(&pvh_global_lock);
4325	count = pmap_pvh_wired_mappings(&m->md, count);
4326	if ((m->flags & PG_FICTITIOUS) == 0) {
4327	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4328	        count);
4329	}
4330	rw_wunlock(&pvh_global_lock);
4331	return (count);
4332}
4333
4334/*
4335 *	pmap_pvh_wired_mappings:
4336 *
4337 *	Return the updated number "count" of managed mappings that are wired.
4338 */
4339static int
4340pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4341{
4342	pmap_t pmap;
4343	pt_entry_t *pte;
4344	pv_entry_t pv;
4345
4346	rw_assert(&pvh_global_lock, RA_WLOCKED);
4347	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4348		pmap = PV_PMAP(pv);
4349		PMAP_LOCK(pmap);
4350		pte = pmap_pte(pmap, pv->pv_va);
4351		if ((*pte & PG_W) != 0)
4352			count++;
4353		PMAP_UNLOCK(pmap);
4354	}
4355	return (count);
4356}
4357
4358/*
4359 * Returns TRUE if the given page is mapped individually or as part of
4360 * a 2mpage.  Otherwise, returns FALSE.
4361 */
4362boolean_t
4363pmap_page_is_mapped(vm_page_t m)
4364{
4365	struct rwlock *lock;
4366	boolean_t rv;
4367
4368	if ((m->oflags & VPO_UNMANAGED) != 0)
4369		return (FALSE);
4370	rw_rlock(&pvh_global_lock);
4371	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4372	rw_rlock(lock);
4373	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4374	    ((m->flags & PG_FICTITIOUS) == 0 &&
4375	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4376	rw_runlock(lock);
4377	rw_runlock(&pvh_global_lock);
4378	return (rv);
4379}
4380
4381/*
4382 * Remove all pages from specified address space
4383 * this aids process exit speeds.  Also, this code
4384 * is special cased for current process only, but
4385 * can have the more generic (and slightly slower)
4386 * mode enabled.  This is much faster than pmap_remove
4387 * in the case of running down an entire address space.
4388 */
4389void
4390pmap_remove_pages(pmap_t pmap)
4391{
4392	pd_entry_t ptepde;
4393	pt_entry_t *pte, tpte;
4394	vm_page_t free = NULL;
4395	vm_page_t m, mpte, mt;
4396	pv_entry_t pv;
4397	struct md_page *pvh;
4398	struct pv_chunk *pc, *npc;
4399	struct rwlock *lock;
4400	int64_t bit;
4401	uint64_t inuse, bitmask;
4402	int allfree, field, freed, idx;
4403	vm_paddr_t pa;
4404
4405	if (pmap != PCPU_GET(curpmap)) {
4406		printf("warning: pmap_remove_pages called with non-current pmap\n");
4407		return;
4408	}
4409	lock = NULL;
4410	rw_rlock(&pvh_global_lock);
4411	PMAP_LOCK(pmap);
4412	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4413		allfree = 1;
4414		freed = 0;
4415		for (field = 0; field < _NPCM; field++) {
4416			inuse = ~pc->pc_map[field] & pc_freemask[field];
4417			while (inuse != 0) {
4418				bit = bsfq(inuse);
4419				bitmask = 1UL << bit;
4420				idx = field * 64 + bit;
4421				pv = &pc->pc_pventry[idx];
4422				inuse &= ~bitmask;
4423
4424				pte = pmap_pdpe(pmap, pv->pv_va);
4425				ptepde = *pte;
4426				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
4427				tpte = *pte;
4428				if ((tpte & (PG_PS | PG_V)) == PG_V) {
4429					ptepde = tpte;
4430					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
4431					    PG_FRAME);
4432					pte = &pte[pmap_pte_index(pv->pv_va)];
4433					tpte = *pte;
4434				}
4435				if ((tpte & PG_V) == 0) {
4436					panic("bad pte va %lx pte %lx",
4437					    pv->pv_va, tpte);
4438				}
4439
4440/*
4441 * We cannot remove wired pages from a process' mapping at this time
4442 */
4443				if (tpte & PG_W) {
4444					allfree = 0;
4445					continue;
4446				}
4447
4448				if (tpte & PG_PS)
4449					pa = tpte & PG_PS_FRAME;
4450				else
4451					pa = tpte & PG_FRAME;
4452
4453				m = PHYS_TO_VM_PAGE(pa);
4454				KASSERT(m->phys_addr == pa,
4455				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4456				    m, (uintmax_t)m->phys_addr,
4457				    (uintmax_t)tpte));
4458
4459				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4460				    m < &vm_page_array[vm_page_array_size],
4461				    ("pmap_remove_pages: bad tpte %#jx",
4462				    (uintmax_t)tpte));
4463
4464				pte_clear(pte);
4465
4466				/*
4467				 * Update the vm_page_t clean/reference bits.
4468				 */
4469				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4470					if ((tpte & PG_PS) != 0) {
4471						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4472							vm_page_dirty(mt);
4473					} else
4474						vm_page_dirty(m);
4475				}
4476
4477				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4478
4479				/* Mark free */
4480				pc->pc_map[field] |= bitmask;
4481				if ((tpte & PG_PS) != 0) {
4482					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
4483					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4484					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4485					if (TAILQ_EMPTY(&pvh->pv_list)) {
4486						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4487							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
4488							    TAILQ_EMPTY(&mt->md.pv_list))
4489								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4490					}
4491					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4492					if (mpte != NULL) {
4493						pmap_remove_pt_page(pmap, mpte);
4494						pmap_resident_count_dec(pmap, 1);
4495						KASSERT(mpte->wire_count == NPTEPG,
4496						    ("pmap_remove_pages: pte page wire count error"));
4497						mpte->wire_count = 0;
4498						pmap_add_delayed_free_list(mpte, &free, FALSE);
4499						atomic_subtract_int(&cnt.v_wire_count, 1);
4500					}
4501				} else {
4502					pmap_resident_count_dec(pmap, 1);
4503					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4504					if ((m->aflags & PGA_WRITEABLE) != 0 &&
4505					    TAILQ_EMPTY(&m->md.pv_list) &&
4506					    (m->flags & PG_FICTITIOUS) == 0) {
4507						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4508						if (TAILQ_EMPTY(&pvh->pv_list))
4509							vm_page_aflag_clear(m, PGA_WRITEABLE);
4510					}
4511				}
4512				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4513				freed++;
4514			}
4515		}
4516		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4517		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4518		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4519		if (allfree) {
4520			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4521			free_pv_chunk(pc);
4522		}
4523	}
4524	if (lock != NULL)
4525		rw_wunlock(lock);
4526	pmap_invalidate_all(pmap);
4527	rw_runlock(&pvh_global_lock);
4528	PMAP_UNLOCK(pmap);
4529	pmap_free_zero_pages(free);
4530}
4531
4532/*
4533 *	pmap_is_modified:
4534 *
4535 *	Return whether or not the specified physical page was modified
4536 *	in any physical maps.
4537 */
4538boolean_t
4539pmap_is_modified(vm_page_t m)
4540{
4541	boolean_t rv;
4542
4543	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4544	    ("pmap_is_modified: page %p is not managed", m));
4545
4546	/*
4547	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4548	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4549	 * is clear, no PTEs can have PG_M set.
4550	 */
4551	VM_OBJECT_ASSERT_WLOCKED(m->object);
4552	if ((m->oflags & VPO_BUSY) == 0 &&
4553	    (m->aflags & PGA_WRITEABLE) == 0)
4554		return (FALSE);
4555	rw_wlock(&pvh_global_lock);
4556	rv = pmap_is_modified_pvh(&m->md) ||
4557	    ((m->flags & PG_FICTITIOUS) == 0 &&
4558	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4559	rw_wunlock(&pvh_global_lock);
4560	return (rv);
4561}
4562
4563/*
4564 * Returns TRUE if any of the given mappings were used to modify
4565 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4566 * mappings are supported.
4567 */
4568static boolean_t
4569pmap_is_modified_pvh(struct md_page *pvh)
4570{
4571	pv_entry_t pv;
4572	pt_entry_t *pte;
4573	pmap_t pmap;
4574	boolean_t rv;
4575
4576	rw_assert(&pvh_global_lock, RA_WLOCKED);
4577	rv = FALSE;
4578	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4579		pmap = PV_PMAP(pv);
4580		PMAP_LOCK(pmap);
4581		pte = pmap_pte(pmap, pv->pv_va);
4582		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4583		PMAP_UNLOCK(pmap);
4584		if (rv)
4585			break;
4586	}
4587	return (rv);
4588}
4589
4590/*
4591 *	pmap_is_prefaultable:
4592 *
4593 *	Return whether or not the specified virtual address is elgible
4594 *	for prefault.
4595 */
4596boolean_t
4597pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4598{
4599	pd_entry_t *pde;
4600	pt_entry_t *pte;
4601	boolean_t rv;
4602
4603	rv = FALSE;
4604	PMAP_LOCK(pmap);
4605	pde = pmap_pde(pmap, addr);
4606	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
4607		pte = pmap_pde_to_pte(pde, addr);
4608		rv = (*pte & PG_V) == 0;
4609	}
4610	PMAP_UNLOCK(pmap);
4611	return (rv);
4612}
4613
4614/*
4615 *	pmap_is_referenced:
4616 *
4617 *	Return whether or not the specified physical page was referenced
4618 *	in any physical maps.
4619 */
4620boolean_t
4621pmap_is_referenced(vm_page_t m)
4622{
4623	boolean_t rv;
4624
4625	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4626	    ("pmap_is_referenced: page %p is not managed", m));
4627	rw_wlock(&pvh_global_lock);
4628	rv = pmap_is_referenced_pvh(&m->md) ||
4629	    ((m->flags & PG_FICTITIOUS) == 0 &&
4630	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4631	rw_wunlock(&pvh_global_lock);
4632	return (rv);
4633}
4634
4635/*
4636 * Returns TRUE if any of the given mappings were referenced and FALSE
4637 * otherwise.  Both page and 2mpage mappings are supported.
4638 */
4639static boolean_t
4640pmap_is_referenced_pvh(struct md_page *pvh)
4641{
4642	pv_entry_t pv;
4643	pt_entry_t *pte;
4644	pmap_t pmap;
4645	boolean_t rv;
4646
4647	rw_assert(&pvh_global_lock, RA_WLOCKED);
4648	rv = FALSE;
4649	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4650		pmap = PV_PMAP(pv);
4651		PMAP_LOCK(pmap);
4652		pte = pmap_pte(pmap, pv->pv_va);
4653		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4654		PMAP_UNLOCK(pmap);
4655		if (rv)
4656			break;
4657	}
4658	return (rv);
4659}
4660
4661/*
4662 * Clear the write and modified bits in each of the given page's mappings.
4663 */
4664void
4665pmap_remove_write(vm_page_t m)
4666{
4667	struct md_page *pvh;
4668	pmap_t pmap;
4669	pv_entry_t next_pv, pv;
4670	pd_entry_t *pde;
4671	pt_entry_t oldpte, *pte;
4672	vm_offset_t va;
4673
4674	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4675	    ("pmap_remove_write: page %p is not managed", m));
4676
4677	/*
4678	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4679	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4680	 * is clear, no page table entries need updating.
4681	 */
4682	VM_OBJECT_ASSERT_WLOCKED(m->object);
4683	if ((m->oflags & VPO_BUSY) == 0 &&
4684	    (m->aflags & PGA_WRITEABLE) == 0)
4685		return;
4686	rw_wlock(&pvh_global_lock);
4687	if ((m->flags & PG_FICTITIOUS) != 0)
4688		goto small_mappings;
4689	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4690	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4691		pmap = PV_PMAP(pv);
4692		PMAP_LOCK(pmap);
4693		va = pv->pv_va;
4694		pde = pmap_pde(pmap, va);
4695		if ((*pde & PG_RW) != 0)
4696			(void)pmap_demote_pde(pmap, pde, va);
4697		PMAP_UNLOCK(pmap);
4698	}
4699small_mappings:
4700	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4701		pmap = PV_PMAP(pv);
4702		PMAP_LOCK(pmap);
4703		pde = pmap_pde(pmap, pv->pv_va);
4704		KASSERT((*pde & PG_PS) == 0,
4705		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
4706		    m));
4707		pte = pmap_pde_to_pte(pde, pv->pv_va);
4708retry:
4709		oldpte = *pte;
4710		if (oldpte & PG_RW) {
4711			if (!atomic_cmpset_long(pte, oldpte, oldpte &
4712			    ~(PG_RW | PG_M)))
4713				goto retry;
4714			if ((oldpte & PG_M) != 0)
4715				vm_page_dirty(m);
4716			pmap_invalidate_page(pmap, pv->pv_va);
4717		}
4718		PMAP_UNLOCK(pmap);
4719	}
4720	vm_page_aflag_clear(m, PGA_WRITEABLE);
4721	rw_wunlock(&pvh_global_lock);
4722}
4723
4724/*
4725 *	pmap_ts_referenced:
4726 *
4727 *	Return a count of reference bits for a page, clearing those bits.
4728 *	It is not necessary for every reference bit to be cleared, but it
4729 *	is necessary that 0 only be returned when there are truly no
4730 *	reference bits set.
4731 *
4732 *	XXX: The exact number of bits to check and clear is a matter that
4733 *	should be tested and standardized at some point in the future for
4734 *	optimal aging of shared pages.
4735 */
4736int
4737pmap_ts_referenced(vm_page_t m)
4738{
4739	struct md_page *pvh;
4740	pv_entry_t pv, pvf, pvn;
4741	pmap_t pmap;
4742	pd_entry_t oldpde, *pde;
4743	pt_entry_t *pte;
4744	vm_offset_t va;
4745	int rtval = 0;
4746
4747	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4748	    ("pmap_ts_referenced: page %p is not managed", m));
4749	rw_wlock(&pvh_global_lock);
4750	if ((m->flags & PG_FICTITIOUS) != 0)
4751		goto small_mappings;
4752	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4753	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) {
4754		pmap = PV_PMAP(pv);
4755		PMAP_LOCK(pmap);
4756		va = pv->pv_va;
4757		pde = pmap_pde(pmap, va);
4758		oldpde = *pde;
4759		if ((oldpde & PG_A) != 0) {
4760			if (pmap_demote_pde(pmap, pde, va)) {
4761				if ((oldpde & PG_W) == 0) {
4762					/*
4763					 * Remove the mapping to a single page
4764					 * so that a subsequent access may
4765					 * repromote.  Since the underlying
4766					 * page table page is fully populated,
4767					 * this removal never frees a page
4768					 * table page.
4769					 */
4770					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4771					    PG_PS_FRAME);
4772					pmap_remove_page(pmap, va, pde, NULL);
4773					rtval++;
4774					if (rtval > 4) {
4775						PMAP_UNLOCK(pmap);
4776						goto out;
4777					}
4778				}
4779			}
4780		}
4781		PMAP_UNLOCK(pmap);
4782	}
4783small_mappings:
4784	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4785		pvf = pv;
4786		do {
4787			pvn = TAILQ_NEXT(pv, pv_next);
4788			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4789			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4790			pmap = PV_PMAP(pv);
4791			PMAP_LOCK(pmap);
4792			pde = pmap_pde(pmap, pv->pv_va);
4793			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4794			    " found a 2mpage in page %p's pv list", m));
4795			pte = pmap_pde_to_pte(pde, pv->pv_va);
4796			if ((*pte & PG_A) != 0) {
4797				atomic_clear_long(pte, PG_A);
4798				pmap_invalidate_page(pmap, pv->pv_va);
4799				rtval++;
4800				if (rtval > 4)
4801					pvn = NULL;
4802			}
4803			PMAP_UNLOCK(pmap);
4804		} while ((pv = pvn) != NULL && pv != pvf);
4805	}
4806out:
4807	rw_wunlock(&pvh_global_lock);
4808	return (rtval);
4809}
4810
4811/*
4812 *	Clear the modify bits on the specified physical page.
4813 */
4814void
4815pmap_clear_modify(vm_page_t m)
4816{
4817	struct md_page *pvh;
4818	pmap_t pmap;
4819	pv_entry_t next_pv, pv;
4820	pd_entry_t oldpde, *pde;
4821	pt_entry_t oldpte, *pte;
4822	vm_offset_t va;
4823
4824	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4825	    ("pmap_clear_modify: page %p is not managed", m));
4826	VM_OBJECT_ASSERT_WLOCKED(m->object);
4827	KASSERT((m->oflags & VPO_BUSY) == 0,
4828	    ("pmap_clear_modify: page %p is busy", m));
4829
4830	/*
4831	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4832	 * If the object containing the page is locked and the page is not
4833	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4834	 */
4835	if ((m->aflags & PGA_WRITEABLE) == 0)
4836		return;
4837	rw_wlock(&pvh_global_lock);
4838	if ((m->flags & PG_FICTITIOUS) != 0)
4839		goto small_mappings;
4840	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4841	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4842		pmap = PV_PMAP(pv);
4843		PMAP_LOCK(pmap);
4844		va = pv->pv_va;
4845		pde = pmap_pde(pmap, va);
4846		oldpde = *pde;
4847		if ((oldpde & PG_RW) != 0) {
4848			if (pmap_demote_pde(pmap, pde, va)) {
4849				if ((oldpde & PG_W) == 0) {
4850					/*
4851					 * Write protect the mapping to a
4852					 * single page so that a subsequent
4853					 * write access may repromote.
4854					 */
4855					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4856					    PG_PS_FRAME);
4857					pte = pmap_pde_to_pte(pde, va);
4858					oldpte = *pte;
4859					if ((oldpte & PG_V) != 0) {
4860						while (!atomic_cmpset_long(pte,
4861						    oldpte,
4862						    oldpte & ~(PG_M | PG_RW)))
4863							oldpte = *pte;
4864						vm_page_dirty(m);
4865						pmap_invalidate_page(pmap, va);
4866					}
4867				}
4868			}
4869		}
4870		PMAP_UNLOCK(pmap);
4871	}
4872small_mappings:
4873	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4874		pmap = PV_PMAP(pv);
4875		PMAP_LOCK(pmap);
4876		pde = pmap_pde(pmap, pv->pv_va);
4877		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4878		    " a 2mpage in page %p's pv list", m));
4879		pte = pmap_pde_to_pte(pde, pv->pv_va);
4880		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4881			atomic_clear_long(pte, PG_M);
4882			pmap_invalidate_page(pmap, pv->pv_va);
4883		}
4884		PMAP_UNLOCK(pmap);
4885	}
4886	rw_wunlock(&pvh_global_lock);
4887}
4888
4889/*
4890 *	pmap_clear_reference:
4891 *
4892 *	Clear the reference bit on the specified physical page.
4893 */
4894void
4895pmap_clear_reference(vm_page_t m)
4896{
4897	struct md_page *pvh;
4898	pmap_t pmap;
4899	pv_entry_t next_pv, pv;
4900	pd_entry_t oldpde, *pde;
4901	pt_entry_t *pte;
4902	vm_offset_t va;
4903
4904	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4905	    ("pmap_clear_reference: page %p is not managed", m));
4906	rw_wlock(&pvh_global_lock);
4907	if ((m->flags & PG_FICTITIOUS) != 0)
4908		goto small_mappings;
4909	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4910	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4911		pmap = PV_PMAP(pv);
4912		PMAP_LOCK(pmap);
4913		va = pv->pv_va;
4914		pde = pmap_pde(pmap, va);
4915		oldpde = *pde;
4916		if ((oldpde & PG_A) != 0) {
4917			if (pmap_demote_pde(pmap, pde, va)) {
4918				/*
4919				 * Remove the mapping to a single page so
4920				 * that a subsequent access may repromote.
4921				 * Since the underlying page table page is
4922				 * fully populated, this removal never frees
4923				 * a page table page.
4924				 */
4925				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4926				    PG_PS_FRAME);
4927				pmap_remove_page(pmap, va, pde, NULL);
4928			}
4929		}
4930		PMAP_UNLOCK(pmap);
4931	}
4932small_mappings:
4933	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4934		pmap = PV_PMAP(pv);
4935		PMAP_LOCK(pmap);
4936		pde = pmap_pde(pmap, pv->pv_va);
4937		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4938		    " a 2mpage in page %p's pv list", m));
4939		pte = pmap_pde_to_pte(pde, pv->pv_va);
4940		if (*pte & PG_A) {
4941			atomic_clear_long(pte, PG_A);
4942			pmap_invalidate_page(pmap, pv->pv_va);
4943		}
4944		PMAP_UNLOCK(pmap);
4945	}
4946	rw_wunlock(&pvh_global_lock);
4947}
4948
4949/*
4950 * Miscellaneous support routines follow
4951 */
4952
4953/* Adjust the cache mode for a 4KB page mapped via a PTE. */
4954static __inline void
4955pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4956{
4957	u_int opte, npte;
4958
4959	/*
4960	 * The cache mode bits are all in the low 32-bits of the
4961	 * PTE, so we can just spin on updating the low 32-bits.
4962	 */
4963	do {
4964		opte = *(u_int *)pte;
4965		npte = opte & ~PG_PTE_CACHE;
4966		npte |= cache_bits;
4967	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4968}
4969
4970/* Adjust the cache mode for a 2MB page mapped via a PDE. */
4971static __inline void
4972pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4973{
4974	u_int opde, npde;
4975
4976	/*
4977	 * The cache mode bits are all in the low 32-bits of the
4978	 * PDE, so we can just spin on updating the low 32-bits.
4979	 */
4980	do {
4981		opde = *(u_int *)pde;
4982		npde = opde & ~PG_PDE_CACHE;
4983		npde |= cache_bits;
4984	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4985}
4986
4987/*
4988 * Map a set of physical memory pages into the kernel virtual
4989 * address space. Return a pointer to where it is mapped. This
4990 * routine is intended to be used for mapping device memory,
4991 * NOT real memory.
4992 */
4993void *
4994pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4995{
4996	vm_offset_t va, offset;
4997	vm_size_t tmpsize;
4998
4999	/*
5000	 * If the specified range of physical addresses fits within the direct
5001	 * map window, use the direct map.
5002	 */
5003	if (pa < dmaplimit && pa + size < dmaplimit) {
5004		va = PHYS_TO_DMAP(pa);
5005		if (!pmap_change_attr(va, size, mode))
5006			return ((void *)va);
5007	}
5008	offset = pa & PAGE_MASK;
5009	size = round_page(offset + size);
5010	va = kmem_alloc_nofault(kernel_map, size);
5011	if (!va)
5012		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5013	pa = trunc_page(pa);
5014	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5015		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5016	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5017	pmap_invalidate_cache_range(va, va + tmpsize);
5018	return ((void *)(va + offset));
5019}
5020
5021void *
5022pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5023{
5024
5025	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5026}
5027
5028void *
5029pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5030{
5031
5032	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5033}
5034
5035void
5036pmap_unmapdev(vm_offset_t va, vm_size_t size)
5037{
5038	vm_offset_t base, offset;
5039
5040	/* If we gave a direct map region in pmap_mapdev, do nothing */
5041	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
5042		return;
5043	base = trunc_page(va);
5044	offset = va & PAGE_MASK;
5045	size = round_page(offset + size);
5046	kmem_free(kernel_map, base, size);
5047}
5048
5049/*
5050 * Tries to demote a 1GB page mapping.
5051 */
5052static boolean_t
5053pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
5054{
5055	pdp_entry_t newpdpe, oldpdpe;
5056	pd_entry_t *firstpde, newpde, *pde;
5057	vm_paddr_t mpdepa;
5058	vm_page_t mpde;
5059
5060	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5061	oldpdpe = *pdpe;
5062	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
5063	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
5064	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
5065	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
5066		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
5067		    " in pmap %p", va, pmap);
5068		return (FALSE);
5069	}
5070	mpdepa = VM_PAGE_TO_PHYS(mpde);
5071	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
5072	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
5073	KASSERT((oldpdpe & PG_A) != 0,
5074	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
5075	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
5076	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
5077	newpde = oldpdpe;
5078
5079	/*
5080	 * Initialize the page directory page.
5081	 */
5082	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
5083		*pde = newpde;
5084		newpde += NBPDR;
5085	}
5086
5087	/*
5088	 * Demote the mapping.
5089	 */
5090	*pdpe = newpdpe;
5091
5092	/*
5093	 * Invalidate a stale recursive mapping of the page directory page.
5094	 */
5095	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
5096
5097	pmap_pdpe_demotions++;
5098	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
5099	    " in pmap %p", va, pmap);
5100	return (TRUE);
5101}
5102
5103/*
5104 * Sets the memory attribute for the specified page.
5105 */
5106void
5107pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5108{
5109
5110	m->md.pat_mode = ma;
5111
5112	/*
5113	 * If "m" is a normal page, update its direct mapping.  This update
5114	 * can be relied upon to perform any cache operations that are
5115	 * required for data coherence.
5116	 */
5117	if ((m->flags & PG_FICTITIOUS) == 0 &&
5118	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
5119	    m->md.pat_mode))
5120		panic("memory attribute change on the direct map failed");
5121}
5122
5123/*
5124 * Changes the specified virtual address range's memory type to that given by
5125 * the parameter "mode".  The specified virtual address range must be
5126 * completely contained within either the direct map or the kernel map.  If
5127 * the virtual address range is contained within the kernel map, then the
5128 * memory type for each of the corresponding ranges of the direct map is also
5129 * changed.  (The corresponding ranges of the direct map are those ranges that
5130 * map the same physical pages as the specified virtual address range.)  These
5131 * changes to the direct map are necessary because Intel describes the
5132 * behavior of their processors as "undefined" if two or more mappings to the
5133 * same physical page have different memory types.
5134 *
5135 * Returns zero if the change completed successfully, and either EINVAL or
5136 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5137 * of the virtual address range was not mapped, and ENOMEM is returned if
5138 * there was insufficient memory available to complete the change.  In the
5139 * latter case, the memory type may have been changed on some part of the
5140 * virtual address range or the direct map.
5141 */
5142int
5143pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5144{
5145	int error;
5146
5147	PMAP_LOCK(kernel_pmap);
5148	error = pmap_change_attr_locked(va, size, mode);
5149	PMAP_UNLOCK(kernel_pmap);
5150	return (error);
5151}
5152
5153static int
5154pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
5155{
5156	vm_offset_t base, offset, tmpva;
5157	vm_paddr_t pa_start, pa_end;
5158	pdp_entry_t *pdpe;
5159	pd_entry_t *pde;
5160	pt_entry_t *pte;
5161	int cache_bits_pte, cache_bits_pde, error;
5162	boolean_t changed;
5163
5164	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
5165	base = trunc_page(va);
5166	offset = va & PAGE_MASK;
5167	size = round_page(offset + size);
5168
5169	/*
5170	 * Only supported on kernel virtual addresses, including the direct
5171	 * map but excluding the recursive map.
5172	 */
5173	if (base < DMAP_MIN_ADDRESS)
5174		return (EINVAL);
5175
5176	cache_bits_pde = pmap_cache_bits(mode, 1);
5177	cache_bits_pte = pmap_cache_bits(mode, 0);
5178	changed = FALSE;
5179
5180	/*
5181	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
5182	 * into 4KB pages if required.
5183	 */
5184	for (tmpva = base; tmpva < base + size; ) {
5185		pdpe = pmap_pdpe(kernel_pmap, tmpva);
5186		if (*pdpe == 0)
5187			return (EINVAL);
5188		if (*pdpe & PG_PS) {
5189			/*
5190			 * If the current 1GB page already has the required
5191			 * memory type, then we need not demote this page. Just
5192			 * increment tmpva to the next 1GB page frame.
5193			 */
5194			if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
5195				tmpva = trunc_1gpage(tmpva) + NBPDP;
5196				continue;
5197			}
5198
5199			/*
5200			 * If the current offset aligns with a 1GB page frame
5201			 * and there is at least 1GB left within the range, then
5202			 * we need not break down this page into 2MB pages.
5203			 */
5204			if ((tmpva & PDPMASK) == 0 &&
5205			    tmpva + PDPMASK < base + size) {
5206				tmpva += NBPDP;
5207				continue;
5208			}
5209			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
5210				return (ENOMEM);
5211		}
5212		pde = pmap_pdpe_to_pde(pdpe, tmpva);
5213		if (*pde == 0)
5214			return (EINVAL);
5215		if (*pde & PG_PS) {
5216			/*
5217			 * If the current 2MB page already has the required
5218			 * memory type, then we need not demote this page. Just
5219			 * increment tmpva to the next 2MB page frame.
5220			 */
5221			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5222				tmpva = trunc_2mpage(tmpva) + NBPDR;
5223				continue;
5224			}
5225
5226			/*
5227			 * If the current offset aligns with a 2MB page frame
5228			 * and there is at least 2MB left within the range, then
5229			 * we need not break down this page into 4KB pages.
5230			 */
5231			if ((tmpva & PDRMASK) == 0 &&
5232			    tmpva + PDRMASK < base + size) {
5233				tmpva += NBPDR;
5234				continue;
5235			}
5236			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
5237				return (ENOMEM);
5238		}
5239		pte = pmap_pde_to_pte(pde, tmpva);
5240		if (*pte == 0)
5241			return (EINVAL);
5242		tmpva += PAGE_SIZE;
5243	}
5244	error = 0;
5245
5246	/*
5247	 * Ok, all the pages exist, so run through them updating their
5248	 * cache mode if required.
5249	 */
5250	pa_start = pa_end = 0;
5251	for (tmpva = base; tmpva < base + size; ) {
5252		pdpe = pmap_pdpe(kernel_pmap, tmpva);
5253		if (*pdpe & PG_PS) {
5254			if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
5255				pmap_pde_attr(pdpe, cache_bits_pde);
5256				changed = TRUE;
5257			}
5258			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
5259				if (pa_start == pa_end) {
5260					/* Start physical address run. */
5261					pa_start = *pdpe & PG_PS_FRAME;
5262					pa_end = pa_start + NBPDP;
5263				} else if (pa_end == (*pdpe & PG_PS_FRAME))
5264					pa_end += NBPDP;
5265				else {
5266					/* Run ended, update direct map. */
5267					error = pmap_change_attr_locked(
5268					    PHYS_TO_DMAP(pa_start),
5269					    pa_end - pa_start, mode);
5270					if (error != 0)
5271						break;
5272					/* Start physical address run. */
5273					pa_start = *pdpe & PG_PS_FRAME;
5274					pa_end = pa_start + NBPDP;
5275				}
5276			}
5277			tmpva = trunc_1gpage(tmpva) + NBPDP;
5278			continue;
5279		}
5280		pde = pmap_pdpe_to_pde(pdpe, tmpva);
5281		if (*pde & PG_PS) {
5282			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5283				pmap_pde_attr(pde, cache_bits_pde);
5284				changed = TRUE;
5285			}
5286			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
5287				if (pa_start == pa_end) {
5288					/* Start physical address run. */
5289					pa_start = *pde & PG_PS_FRAME;
5290					pa_end = pa_start + NBPDR;
5291				} else if (pa_end == (*pde & PG_PS_FRAME))
5292					pa_end += NBPDR;
5293				else {
5294					/* Run ended, update direct map. */
5295					error = pmap_change_attr_locked(
5296					    PHYS_TO_DMAP(pa_start),
5297					    pa_end - pa_start, mode);
5298					if (error != 0)
5299						break;
5300					/* Start physical address run. */
5301					pa_start = *pde & PG_PS_FRAME;
5302					pa_end = pa_start + NBPDR;
5303				}
5304			}
5305			tmpva = trunc_2mpage(tmpva) + NBPDR;
5306		} else {
5307			pte = pmap_pde_to_pte(pde, tmpva);
5308			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5309				pmap_pte_attr(pte, cache_bits_pte);
5310				changed = TRUE;
5311			}
5312			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
5313				if (pa_start == pa_end) {
5314					/* Start physical address run. */
5315					pa_start = *pte & PG_FRAME;
5316					pa_end = pa_start + PAGE_SIZE;
5317				} else if (pa_end == (*pte & PG_FRAME))
5318					pa_end += PAGE_SIZE;
5319				else {
5320					/* Run ended, update direct map. */
5321					error = pmap_change_attr_locked(
5322					    PHYS_TO_DMAP(pa_start),
5323					    pa_end - pa_start, mode);
5324					if (error != 0)
5325						break;
5326					/* Start physical address run. */
5327					pa_start = *pte & PG_FRAME;
5328					pa_end = pa_start + PAGE_SIZE;
5329				}
5330			}
5331			tmpva += PAGE_SIZE;
5332		}
5333	}
5334	if (error == 0 && pa_start != pa_end)
5335		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
5336		    pa_end - pa_start, mode);
5337
5338	/*
5339	 * Flush CPU caches if required to make sure any data isn't cached that
5340	 * shouldn't be, etc.
5341	 */
5342	if (changed) {
5343		pmap_invalidate_range(kernel_pmap, base, tmpva);
5344		pmap_invalidate_cache_range(base, tmpva);
5345	}
5346	return (error);
5347}
5348
5349/*
5350 * Demotes any mapping within the direct map region that covers more than the
5351 * specified range of physical addresses.  This range's size must be a power
5352 * of two and its starting address must be a multiple of its size.  Since the
5353 * demotion does not change any attributes of the mapping, a TLB invalidation
5354 * is not mandatory.  The caller may, however, request a TLB invalidation.
5355 */
5356void
5357pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
5358{
5359	pdp_entry_t *pdpe;
5360	pd_entry_t *pde;
5361	vm_offset_t va;
5362	boolean_t changed;
5363
5364	if (len == 0)
5365		return;
5366	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
5367	KASSERT((base & (len - 1)) == 0,
5368	    ("pmap_demote_DMAP: base is not a multiple of len"));
5369	if (len < NBPDP && base < dmaplimit) {
5370		va = PHYS_TO_DMAP(base);
5371		changed = FALSE;
5372		PMAP_LOCK(kernel_pmap);
5373		pdpe = pmap_pdpe(kernel_pmap, va);
5374		if ((*pdpe & PG_V) == 0)
5375			panic("pmap_demote_DMAP: invalid PDPE");
5376		if ((*pdpe & PG_PS) != 0) {
5377			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
5378				panic("pmap_demote_DMAP: PDPE failed");
5379			changed = TRUE;
5380		}
5381		if (len < NBPDR) {
5382			pde = pmap_pdpe_to_pde(pdpe, va);
5383			if ((*pde & PG_V) == 0)
5384				panic("pmap_demote_DMAP: invalid PDE");
5385			if ((*pde & PG_PS) != 0) {
5386				if (!pmap_demote_pde(kernel_pmap, pde, va))
5387					panic("pmap_demote_DMAP: PDE failed");
5388				changed = TRUE;
5389			}
5390		}
5391		if (changed && invalidate)
5392			pmap_invalidate_page(kernel_pmap, va);
5393		PMAP_UNLOCK(kernel_pmap);
5394	}
5395}
5396
5397/*
5398 * perform the pmap work for mincore
5399 */
5400int
5401pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5402{
5403	pd_entry_t *pdep;
5404	pt_entry_t pte;
5405	vm_paddr_t pa;
5406	int val;
5407
5408	PMAP_LOCK(pmap);
5409retry:
5410	pdep = pmap_pde(pmap, addr);
5411	if (pdep != NULL && (*pdep & PG_V)) {
5412		if (*pdep & PG_PS) {
5413			pte = *pdep;
5414			/* Compute the physical address of the 4KB page. */
5415			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5416			    PG_FRAME;
5417			val = MINCORE_SUPER;
5418		} else {
5419			pte = *pmap_pde_to_pte(pdep, addr);
5420			pa = pte & PG_FRAME;
5421			val = 0;
5422		}
5423	} else {
5424		pte = 0;
5425		pa = 0;
5426		val = 0;
5427	}
5428	if ((pte & PG_V) != 0) {
5429		val |= MINCORE_INCORE;
5430		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5431			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5432		if ((pte & PG_A) != 0)
5433			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5434	}
5435	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5436	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5437	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5438		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5439		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5440			goto retry;
5441	} else
5442		PA_UNLOCK_COND(*locked_pa);
5443	PMAP_UNLOCK(pmap);
5444	return (val);
5445}
5446
5447void
5448pmap_activate(struct thread *td)
5449{
5450	pmap_t	pmap, oldpmap;
5451	u_int	cpuid;
5452	u_int64_t  cr3;
5453
5454	critical_enter();
5455	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5456	oldpmap = PCPU_GET(curpmap);
5457	cpuid = PCPU_GET(cpuid);
5458#ifdef SMP
5459	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5460	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5461#else
5462	CPU_CLR(cpuid, &oldpmap->pm_active);
5463	CPU_SET(cpuid, &pmap->pm_active);
5464#endif
5465	cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
5466	td->td_pcb->pcb_cr3 = cr3;
5467	load_cr3(cr3);
5468	PCPU_SET(curpmap, pmap);
5469	critical_exit();
5470}
5471
5472void
5473pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5474{
5475}
5476
5477/*
5478 *	Increase the starting virtual address of the given mapping if a
5479 *	different alignment might result in more superpage mappings.
5480 */
5481void
5482pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5483    vm_offset_t *addr, vm_size_t size)
5484{
5485	vm_offset_t superpage_offset;
5486
5487	if (size < NBPDR)
5488		return;
5489	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5490		offset += ptoa(object->pg_color);
5491	superpage_offset = offset & PDRMASK;
5492	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5493	    (*addr & PDRMASK) == superpage_offset)
5494		return;
5495	if ((*addr & PDRMASK) < superpage_offset)
5496		*addr = (*addr & ~PDRMASK) + superpage_offset;
5497	else
5498		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5499}
5500
5501#include "opt_ddb.h"
5502#ifdef DDB
5503#include <ddb/ddb.h>
5504
5505DB_SHOW_COMMAND(pte, pmap_print_pte)
5506{
5507	pmap_t pmap;
5508	pml4_entry_t *pml4;
5509	pdp_entry_t *pdp;
5510	pd_entry_t *pde;
5511	pt_entry_t *pte;
5512	vm_offset_t va;
5513
5514	if (have_addr) {
5515		va = (vm_offset_t)addr;
5516		pmap = PCPU_GET(curpmap); /* XXX */
5517	} else {
5518		db_printf("show pte addr\n");
5519		return;
5520	}
5521	pml4 = pmap_pml4e(pmap, va);
5522	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
5523	if ((*pml4 & PG_V) == 0) {
5524		db_printf("\n");
5525		return;
5526	}
5527	pdp = pmap_pml4e_to_pdpe(pml4, va);
5528	db_printf(" pdpe %#016lx", *pdp);
5529	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
5530		db_printf("\n");
5531		return;
5532	}
5533	pde = pmap_pdpe_to_pde(pdp, va);
5534	db_printf(" pde %#016lx", *pde);
5535	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
5536		db_printf("\n");
5537		return;
5538	}
5539	pte = pmap_pde_to_pte(pde, va);
5540	db_printf(" pte %#016lx\n", *pte);
5541}
5542
5543DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
5544{
5545	vm_paddr_t a;
5546
5547	if (have_addr) {
5548		a = (vm_paddr_t)addr;
5549		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
5550	} else {
5551		db_printf("show phys2dmap addr\n");
5552	}
5553}
5554#endif
5555