1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 151910 2005-10-31 21:25:33Z alc $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	In addition to hardware address maps, this
86 *	module is called upon to provide software-use-only
87 *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or protection-reduction
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include "opt_msgbuf.h"
109
110#include <sys/param.h>
111#include <sys/systm.h>
112#include <sys/kernel.h>
113#include <sys/lock.h>
114#include <sys/malloc.h>
115#include <sys/mman.h>
116#include <sys/msgbuf.h>
117#include <sys/mutex.h>
118#include <sys/proc.h>
119#include <sys/sx.h>
120#include <sys/vmmeter.h>
121#include <sys/sched.h>
122#include <sys/sysctl.h>
123#ifdef SMP
124#include <sys/smp.h>
125#endif
126
127#include <vm/vm.h>
128#include <vm/vm_param.h>
129#include <vm/vm_kern.h>
130#include <vm/vm_page.h>
131#include <vm/vm_map.h>
132#include <vm/vm_object.h>
133#include <vm/vm_extern.h>
134#include <vm/vm_pageout.h>
135#include <vm/vm_pager.h>
136#include <vm/uma.h>
137
138#include <machine/cpu.h>
139#include <machine/cputypes.h>
140#include <machine/md_var.h>
141#include <machine/pcb.h>
142#include <machine/specialreg.h>
143#ifdef SMP
144#include <machine/smp.h>
145#endif
146
147#ifndef PMAP_SHPGPERPROC
148#define PMAP_SHPGPERPROC 200
149#endif
150
151#if defined(DIAGNOSTIC)
152#define PMAP_DIAGNOSTIC
153#endif
154
155#define MINPV 2048
156
157#if !defined(PMAP_DIAGNOSTIC)
158#define PMAP_INLINE __inline
159#else
160#define PMAP_INLINE
161#endif
162
163struct pmap kernel_pmap_store;
164
165vm_paddr_t avail_start;		/* PA of first available physical page */
166vm_paddr_t avail_end;		/* PA of last available physical page */
167vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
168vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
169
170static int nkpt;
171static int ndmpdp;
172static vm_paddr_t dmaplimit;
173vm_offset_t kernel_vm_end;
174pt_entry_t pg_nx;
175
176static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
177static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
178static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
179u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
180
181static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
182static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
183
184/*
185 * Data for the pv entry allocation mechanism
186 */
187static uma_zone_t pvzone;
188static struct vm_object pvzone_obj;
189static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
190int pmap_pagedaemon_waken;
191
192/*
193 * All those kernel PT submaps that BSD is so fond of
194 */
195pt_entry_t *CMAP1 = 0;
196caddr_t CADDR1 = 0;
197struct msgbuf *msgbufp = 0;
198
199/*
200 * Crashdump maps.
201 */
202static caddr_t crashdumpmap;
203
204static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
205static pv_entry_t get_pv_entry(void);
206static pv_entry_t pv_entry_reclaim(pmap_t locked_pmap);
207static void	pmap_clear_ptes(vm_page_t m, long bit);
208
209static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
210		vm_offset_t sva, pd_entry_t ptepde);
211static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde);
212static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
213		vm_offset_t va);
214static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
215
216static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
217static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
218
219static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
220static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m);
221static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t);
222static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
223
224CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
225CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
226
227/*
228 * Move the kernel virtual free pointer to the next
229 * 2MB.  This is used to help improve performance
230 * by using a large (2MB) page for much of the kernel
231 * (.text, .data, .bss)
232 */
233static vm_offset_t
234pmap_kmem_choose(vm_offset_t addr)
235{
236	vm_offset_t newaddr = addr;
237
238	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
239	return newaddr;
240}
241
242/********************/
243/* Inline functions */
244/********************/
245
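/*
 * These helpers walk the four-level amd64 page table hierarchy
 * (PML4 -> PDP -> PD -> PT).  The *_index() functions extract a
 * level's slot number from a virtual address; the pointer-returning
 * functions follow the physical address stored in each entry through
 * the direct map (PHYS_TO_DMAP) down to the next level.
 */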
246/* Return a non-clipped PD index for a given VA */
247static __inline vm_pindex_t
248pmap_pde_pindex(vm_offset_t va)
249{
250	return va >> PDRSHIFT;
251}
252
253
254/* Return various clipped indexes for a given VA */
255static __inline vm_pindex_t
256pmap_pte_index(vm_offset_t va)
257{
258
259	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
260}
261
262static __inline vm_pindex_t
263pmap_pde_index(vm_offset_t va)
264{
265
266	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
267}
268
269static __inline vm_pindex_t
270pmap_pdpe_index(vm_offset_t va)
271{
272
273	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
274}
275
276static __inline vm_pindex_t
277pmap_pml4e_index(vm_offset_t va)
278{
279
280	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
281}
282
283/* Return a pointer to the PML4 slot that corresponds to a VA */
284static __inline pml4_entry_t *
285pmap_pml4e(pmap_t pmap, vm_offset_t va)
286{
287
288	if (!pmap)
289		return NULL;
290	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
291}
292
293/* Return a pointer to the PDP slot that corresponds to a VA */
294static __inline pdp_entry_t *
295pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
296{
297	pdp_entry_t *pdpe;
298
299	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
300	return (&pdpe[pmap_pdpe_index(va)]);
301}
302
303/* Return a pointer to the PDP slot that corresponds to a VA */
304static __inline pdp_entry_t *
305pmap_pdpe(pmap_t pmap, vm_offset_t va)
306{
307	pml4_entry_t *pml4e;
308
309	pml4e = pmap_pml4e(pmap, va);
310	if (pml4e == NULL || (*pml4e & PG_V) == 0)
311		return NULL;
312	return (pmap_pml4e_to_pdpe(pml4e, va));
313}
314
315/* Return a pointer to the PD slot that corresponds to a VA */
316static __inline pd_entry_t *
317pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
318{
319	pd_entry_t *pde;
320
321	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
322	return (&pde[pmap_pde_index(va)]);
323}
324
325/* Return a pointer to the PD slot that corresponds to a VA */
326static __inline pd_entry_t *
327pmap_pde(pmap_t pmap, vm_offset_t va)
328{
329	pdp_entry_t *pdpe;
330
331	pdpe = pmap_pdpe(pmap, va);
332	if (pdpe == NULL || (*pdpe & PG_V) == 0)
333		 return NULL;
334	return (pmap_pdpe_to_pde(pdpe, va));
335}
336
337/* Return a pointer to the PT slot that corresponds to a VA */
338static __inline pt_entry_t *
339pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
340{
341	pt_entry_t *pte;
342
343	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
344	return (&pte[pmap_pte_index(va)]);
345}
346
347/* Return a pointer to the PT slot that corresponds to a VA */
348static __inline pt_entry_t *
349pmap_pte(pmap_t pmap, vm_offset_t va)
350{
351	pd_entry_t *pde;
352
353	pde = pmap_pde(pmap, va);
354	if (pde == NULL || (*pde & PG_V) == 0)
355		return NULL;
356	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
357		return ((pt_entry_t *)pde);
358	return (pmap_pde_to_pte(pde, va));
359}
360
361
362static __inline pt_entry_t *
363pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
364{
365	pd_entry_t *pde;
366
367	pde = pmap_pde(pmap, va);
368	if (pde == NULL || (*pde & PG_V) == 0)
369		return NULL;
370	*ptepde = *pde;
371	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
372		return ((pt_entry_t *)pde);
373	return (pmap_pde_to_pte(pde, va));
374}
375
376
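/*
 * vtopte() and vtopde() instead go through the recursive mapping:
 * PTmap and PDmap are the virtual windows created by the
 * self-referential PML4 slot, so shifting the VA and masking it to
 * the appropriate number of index bits yields the address of the
 * PTE or PDE for that VA in the current address space.
 */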
377PMAP_INLINE pt_entry_t *
378vtopte(vm_offset_t va)
379{
380	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
381
382	return (PTmap + ((va >> PAGE_SHIFT) & mask));
383}
384
385static __inline pd_entry_t *
386vtopde(vm_offset_t va)
387{
388	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
389
390	return (PDmap + ((va >> PDRSHIFT) & mask));
391}
392
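/*
 * Boot-time allocator: hand out 'n' zeroed, physically contiguous
 * pages starting at avail_start and advance avail_start past them.
 * The physical address is used directly as a pointer here, which
 * relies on low physical memory still being mapped 1:1 by the
 * boot-time page tables when this runs.
 */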
393static u_int64_t
394allocpages(int n)
395{
396	u_int64_t ret;
397
398	ret = avail_start;
399	bzero((void *)ret, n * PAGE_SIZE);
400	avail_start += n * PAGE_SIZE;
401	return (ret);
402}
403
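/*
 * Build the initial kernel page tables: level 1 (KPT), level 2 (KPD),
 * level 3 (KPDP) and level 4 (KPML4) pages for the kernel itself, a
 * direct map of at least 4GB of physical memory using 2MB pages, and
 * the recursive PML4 slot that backs PTmap/PDmap.
 */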
404static void
405create_pagetables(void)
406{
407	int i;
408
409	/* Allocate pages */
410	KPTphys = allocpages(NKPT);
411	KPML4phys = allocpages(1);
412	KPDPphys = allocpages(NKPML4E);
413	KPDphys = allocpages(NKPDPE);
414
415	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
416	if (ndmpdp < 4)		/* Minimum 4GB of direct map */
417		ndmpdp = 4;
418	DMPDPphys = allocpages(NDMPML4E);
419	DMPDphys = allocpages(ndmpdp);
420	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
421
422	/* Fill in the underlying page table pages */
423	/* Read-only from zero to physfree */
424	/* XXX not fully used, underneath 2M pages */
425	for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
426		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
427		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
428	}
429
430	/* Now map the page tables at their location within PTmap */
431	for (i = 0; i < NKPT; i++) {
432		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
433		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
434	}
435
436	/* Map from zero to end of allocations under 2M pages */
437	/* This replaces some of the KPTphys entries above */
438	for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
439		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
440		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
441	}
442
443	/* And connect up the PD to the PDP */
444	for (i = 0; i < NKPDPE; i++) {
445		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
446		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
447	}
448
449
450	/* Now set up the direct map space using 2MB pages */
451	for (i = 0; i < NPDEPG * ndmpdp; i++) {
452		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
453		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
454	}
455
456	/* And the direct map space's PDP */
457	for (i = 0; i < ndmpdp; i++) {
458		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
459		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
460	}
461
462	/* And recursively map PML4 to itself in order to get PTmap */
463	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
464	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
465
466	/* Connect the Direct Map slot up to the PML4 */
467	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
468	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
469
470	/* Connect the KVA slot up to the PML4 */
471	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
472	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
473}
474
475/*
476 *	Bootstrap the system enough to run with virtual memory.
477 *
478 *	On amd64 this is called after mapping has already been enabled
479 *	and just syncs the pmap module with what has already been done.
480 *	[We can't call it easily with mapping off since the kernel is not
481 *	mapped with PA == VA, hence we would have to relocate every address
482 *	from the linked base (virtual) address "KERNBASE" to the actual
483 *	(physical) address starting relative to 0]
484 */
485void
486pmap_bootstrap(firstaddr)
487	vm_paddr_t *firstaddr;
488{
489	vm_offset_t va;
490	pt_entry_t *pte, *unused;
491
492	avail_start = *firstaddr;
493
494	/*
495	 * Create an initial set of page tables to run the kernel in.
496	 */
497	create_pagetables();
498	*firstaddr = avail_start;
499
500	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
501	virtual_avail = pmap_kmem_choose(virtual_avail);
502
503	virtual_end = VM_MAX_KERNEL_ADDRESS;
504
505
506	/* XXX do %cr0 as well */
507	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
508	load_cr3(KPML4phys);
509
510	/*
511	 * Initialize the kernel pmap (which is statically allocated).
512	 */
513	PMAP_LOCK_INIT(kernel_pmap);
514	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
515	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
516	TAILQ_INIT(&kernel_pmap->pm_pvlist);
517	nkpt = NKPT;
518
519	/*
520	 * Reserve some special page table entries/VA space for temporary
521	 * mapping of pages.
522	 */
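	/*
	 * SYSMAP(c, p, v, n) carves 'n' pages of KVA starting at 'va',
	 * stores the starting address in 'v' (cast to type 'c'), and
	 * records the address of the first of the corresponding PTEs
	 * in 'p'.
	 */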
523#define	SYSMAP(c, p, v, n)	\
524	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
525
526	va = virtual_avail;
527	pte = vtopte(va);
528
529	/*
530	 * CMAP1 is only used for the memory test.
531	 */
532	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
533
534	/*
535	 * Crashdump maps.
536	 */
537	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
538
539	/*
540	 * msgbufp is used to map the system message buffer.
541	 */
542	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
543
544	virtual_avail = va;
545
546	*CMAP1 = 0;
547
548	invltlb();
549}
550
551/*
552 *	Initialize a vm_page's machine-dependent fields.
553 */
554void
555pmap_page_init(vm_page_t m)
556{
557
558	TAILQ_INIT(&m->md.pv_list);
559	m->md.pv_list_count = 0;
560}
561
562/*
563 *	Initialize the pmap module.
564 *	Called by vm_init, to initialize any structures that the pmap
565 *	system needs to map virtual memory.
566 */
567void
568pmap_init(void)
569{
570
571	/*
572	 * init the pv free list
573	 */
574	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
575	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
576	uma_prealloc(pvzone, MINPV);
577}
578
579/*
580 * Initialize the address space (zone) for the pv_entries.  Set a
581 * high water mark so that the system can recover from excessive
582 * numbers of pv entries.
583 */
584void
585pmap_init2()
586{
587	int shpgperproc = PMAP_SHPGPERPROC;
588
589	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
590	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
591	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
592	pv_entry_high_water = 9 * (pv_entry_max / 10);
593	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
594}
595
596
597/***************************************************
598 * Low level helper routines.....
599 ***************************************************/
600
601
602/*
603 * this routine defines the region(s) of memory that should
604 * not be tested for the modified bit.
605 */
606static PMAP_INLINE int
607pmap_track_modified(vm_offset_t va)
608{
609	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
610		return 1;
611	else
612		return 0;
613}
614
615#ifdef SMP
616/*
617 * For SMP, these functions have to use the IPI mechanism for coherence.
618 */
619void
620pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
621{
622	u_int cpumask;
623	u_int other_cpus;
624
625	if (smp_started) {
626		if (!(read_rflags() & PSL_I))
627			panic("%s: interrupts disabled", __func__);
628		mtx_lock_spin(&smp_ipi_mtx);
629	} else
630		critical_enter();
631	/*
632	 * We need to disable interrupt preemption but MUST NOT have
633	 * interrupts disabled here.
634	 * XXX we may need to hold schedlock to get a coherent pm_active
635	 * XXX critical sections disable interrupts again
636	 */
637	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
638		invlpg(va);
639		smp_invlpg(va);
640	} else {
641		cpumask = PCPU_GET(cpumask);
642		other_cpus = PCPU_GET(other_cpus);
643		if (pmap->pm_active & cpumask)
644			invlpg(va);
645		if (pmap->pm_active & other_cpus)
646			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
647	}
648	if (smp_started)
649		mtx_unlock_spin(&smp_ipi_mtx);
650	else
651		critical_exit();
652}
653
654void
655pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
656{
657	u_int cpumask;
658	u_int other_cpus;
659	vm_offset_t addr;
660
661	if (smp_started) {
662		if (!(read_rflags() & PSL_I))
663			panic("%s: interrupts disabled", __func__);
664		mtx_lock_spin(&smp_ipi_mtx);
665	} else
666		critical_enter();
667	/*
668	 * We need to disable interrupt preemption but MUST NOT have
669	 * interrupts disabled here.
670	 * XXX we may need to hold schedlock to get a coherent pm_active
671	 * XXX critical sections disable interrupts again
672	 */
673	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
674		for (addr = sva; addr < eva; addr += PAGE_SIZE)
675			invlpg(addr);
676		smp_invlpg_range(sva, eva);
677	} else {
678		cpumask = PCPU_GET(cpumask);
679		other_cpus = PCPU_GET(other_cpus);
680		if (pmap->pm_active & cpumask)
681			for (addr = sva; addr < eva; addr += PAGE_SIZE)
682				invlpg(addr);
683		if (pmap->pm_active & other_cpus)
684			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
685			    sva, eva);
686	}
687	if (smp_started)
688		mtx_unlock_spin(&smp_ipi_mtx);
689	else
690		critical_exit();
691}
692
693void
694pmap_invalidate_all(pmap_t pmap)
695{
696	u_int cpumask;
697	u_int other_cpus;
698
699	if (smp_started) {
700		if (!(read_rflags() & PSL_I))
701			panic("%s: interrupts disabled", __func__);
702		mtx_lock_spin(&smp_ipi_mtx);
703	} else
704		critical_enter();
705	/*
706	 * We need to disable interrupt preemption but MUST NOT have
707	 * interrupts disabled here.
708	 * XXX we may need to hold schedlock to get a coherent pm_active
709	 * XXX critical sections disable interrupts again
710	 */
711	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
712		invltlb();
713		smp_invltlb();
714	} else {
715		cpumask = PCPU_GET(cpumask);
716		other_cpus = PCPU_GET(other_cpus);
717		if (pmap->pm_active & cpumask)
718			invltlb();
719		if (pmap->pm_active & other_cpus)
720			smp_masked_invltlb(pmap->pm_active & other_cpus);
721	}
722	if (smp_started)
723		mtx_unlock_spin(&smp_ipi_mtx);
724	else
725		critical_exit();
726}
727#else /* !SMP */
728/*
729 * Normal, non-SMP, invalidation functions.
730 * We inline these within pmap.c for speed.
731 */
732PMAP_INLINE void
733pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
734{
735
736	if (pmap == kernel_pmap || pmap->pm_active)
737		invlpg(va);
738}
739
740PMAP_INLINE void
741pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
742{
743	vm_offset_t addr;
744
745	if (pmap == kernel_pmap || pmap->pm_active)
746		for (addr = sva; addr < eva; addr += PAGE_SIZE)
747			invlpg(addr);
748}
749
750PMAP_INLINE void
751pmap_invalidate_all(pmap_t pmap)
752{
753
754	if (pmap == kernel_pmap || pmap->pm_active)
755		invltlb();
756}
757#endif /* !SMP */
758
759/*
760 * Are we current address space or kernel?
761 */
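/*
 * The recursive slot of the currently loaded PML4 (PML4pml4e[0])
 * holds the physical address of that PML4, so comparing it against
 * this pmap's own recursive entry tells us whether this pmap is the
 * one installed in %cr3.
 */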
762static __inline int
763pmap_is_current(pmap_t pmap)
764{
765	return (pmap == kernel_pmap ||
766	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
767}
768
769/*
770 *	Routine:	pmap_extract
771 *	Function:
772 *		Extract the physical page address associated
773 *		with the given map/virtual_address pair.
774 */
775vm_paddr_t
776pmap_extract(pmap_t pmap, vm_offset_t va)
777{
778	vm_paddr_t rtval;
779	pt_entry_t *pte;
780	pd_entry_t pde, *pdep;
781
782	rtval = 0;
783	PMAP_LOCK(pmap);
784	pdep = pmap_pde(pmap, va);
785	if (pdep != NULL) {
786		pde = *pdep;
787		if (pde) {
788			if ((pde & PG_PS) != 0) {
789				KASSERT((pde & PG_FRAME & PDRMASK) == 0,
790				    ("pmap_extract: bad pde"));
791				rtval = (pde & PG_FRAME) | (va & PDRMASK);
792				PMAP_UNLOCK(pmap);
793				return rtval;
794			}
795			pte = pmap_pde_to_pte(pdep, va);
796			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
797		}
798	}
799	PMAP_UNLOCK(pmap);
800	return (rtval);
801}
802
803/*
804 *	Routine:	pmap_extract_and_hold
805 *	Function:
806 *		Atomically extract and hold the physical page
807 *		with the given pmap and virtual address pair
808 *		if that mapping permits the given protection.
809 */
810vm_page_t
811pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
812{
813	pd_entry_t pde, *pdep;
814	pt_entry_t pte;
815	vm_page_t m;
816
817	m = NULL;
818	vm_page_lock_queues();
819	PMAP_LOCK(pmap);
820	pdep = pmap_pde(pmap, va);
821	if (pdep != NULL && (pde = *pdep)) {
822		if (pde & PG_PS) {
823			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
824				KASSERT((pde & PG_FRAME & PDRMASK) == 0,
825				    ("pmap_extract_and_hold: bad pde"));
826				m = PHYS_TO_VM_PAGE((pde & PG_FRAME) |
827				    (va & PDRMASK));
828				vm_page_hold(m);
829			}
830		} else {
831			pte = *pmap_pde_to_pte(pdep, va);
832			if ((pte & PG_V) &&
833			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
834				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
835				vm_page_hold(m);
836			}
837		}
838	}
839	vm_page_unlock_queues();
840	PMAP_UNLOCK(pmap);
841	return (m);
842}
843
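/*
 *	Routine:	pmap_kextract
 *	Function:
 *		Extract the physical address associated with the given
 *		kernel virtual address.  Direct map addresses are converted
 *		arithmetically; other kernel VAs are looked up in the kernel
 *		page tables, handling both 2MB and 4KB mappings.
 */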
844vm_paddr_t
845pmap_kextract(vm_offset_t va)
846{
847	pd_entry_t *pde;
848	vm_paddr_t pa;
849
850	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
851		pa = DMAP_TO_PHYS(va);
852	} else {
853		pde = vtopde(va);
854		if (*pde & PG_PS) {
855			pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
856		} else {
857			pa = *vtopte(va);
858			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
859		}
860	}
861	return pa;
862}
863
864/***************************************************
865 * Low level mapping routines.....
866 ***************************************************/
867
868/*
869 * Add a wired page to the kva.
870 * Note: not SMP coherent.
871 */
872PMAP_INLINE void
873pmap_kenter(vm_offset_t va, vm_paddr_t pa)
874{
875	pt_entry_t *pte;
876
877	pte = vtopte(va);
878	pte_store(pte, pa | PG_RW | PG_V | PG_G);
879}
880
881/*
882 * Remove a page from the kernel pagetables.
883 * Note: not SMP coherent.
884 */
885PMAP_INLINE void
886pmap_kremove(vm_offset_t va)
887{
888	pt_entry_t *pte;
889
890	pte = vtopte(va);
891	pte_clear(pte);
892}
893
894/*
895 *	Used to map a range of physical addresses into kernel
896 *	virtual address space.
897 *
898 *	The value passed in '*virt' is a suggested virtual address for
899 *	the mapping. Architectures which can support a direct-mapped
900 *	physical to virtual region can return the appropriate address
901 *	within that region, leaving '*virt' unchanged. Other
902 *	architectures should map the pages starting at '*virt' and
903 *	update '*virt' with the first usable address after the mapped
904 *	region.
905 */
906vm_offset_t
907pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
908{
909	return PHYS_TO_DMAP(start);
910}
911
912
913/*
914 * Add a list of wired pages to the kva.
915 * This routine is only used for temporary
916 * kernel mappings that do not need to have
917 * page modification or references recorded.
918 * Note that old mappings are simply written
919 * over.  The page *must* be wired.
920 * Note: SMP coherent.  Uses a ranged shootdown IPI.
921 */
922void
923pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
924{
925	vm_offset_t va;
926
927	va = sva;
928	while (count-- > 0) {
929		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
930		va += PAGE_SIZE;
931		m++;
932	}
933	pmap_invalidate_range(kernel_pmap, sva, va);
934}
935
936/*
937 * This routine tears out page mappings from the
938 * kernel -- it is meant only for temporary mappings.
939 * Note: SMP coherent.  Uses a ranged shootdown IPI.
940 */
941void
942pmap_qremove(vm_offset_t sva, int count)
943{
944	vm_offset_t va;
945
946	va = sva;
947	while (count-- > 0) {
948		pmap_kremove(va);
949		va += PAGE_SIZE;
950	}
951	pmap_invalidate_range(kernel_pmap, sva, va);
952}
953
954/***************************************************
955 * Page table page management routines.....
956 ***************************************************/
957
958/*
959 * This routine drops the wire count on a page table page; when the
960 * count reaches zero, the page table page is unmapped and freed.
961 */
962static PMAP_INLINE int
963pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
964{
965
966	--m->wire_count;
967	if (m->wire_count == 0)
968		return _pmap_unwire_pte_hold(pmap, va, m);
969	else
970		return 0;
971}
972
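/*
 * Called once a page table page's wire count has dropped to zero:
 * clear the entry in the parent-level page that maps it, drop the
 * wire count of that parent page in turn, invalidate the page's
 * recursive-map address, and free it.
 */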
973static int
974_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
975{
976	vm_offset_t pteva;
977
978	/*
979	 * unmap the page table page
980	 */
981	if (m->pindex >= (NUPDE + NUPDPE)) {
982		/* PDP page */
983		pml4_entry_t *pml4;
984		pml4 = pmap_pml4e(pmap, va);
985		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
986		*pml4 = 0;
987	} else if (m->pindex >= NUPDE) {
988		/* PD page */
989		pdp_entry_t *pdp;
990		pdp = pmap_pdpe(pmap, va);
991		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
992		*pdp = 0;
993	} else {
994		/* PTE page */
995		pd_entry_t *pd;
996		pd = pmap_pde(pmap, va);
997		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
998		*pd = 0;
999	}
1000	--pmap->pm_stats.resident_count;
1001	if (m->pindex < NUPDE) {
1002		/* We just released a PT, unhold the matching PD */
1003		vm_page_t pdpg;
1004
1005		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1006		pmap_unwire_pte_hold(pmap, va, pdpg);
1007	}
1008	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1009		/* We just released a PD, unhold the matching PDP */
1010		vm_page_t pdppg;
1011
1012		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1013		pmap_unwire_pte_hold(pmap, va, pdppg);
1014	}
1015
1016	/*
1017	 * Do an invltlb to make the invalidated mapping
1018	 * take effect immediately.
1019	 */
1020	pmap_invalidate_page(pmap, pteva);
1021
1022	vm_page_free_zero(m);
1023	atomic_subtract_int(&cnt.v_wire_count, 1);
1024	return 1;
1025}
1026
1027/*
1028 * After removing a page table entry, this routine is used to
1029 * conditionally free the page, and manage the hold/wire counts.
1030 */
1031static int
1032pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde)
1033{
1034	vm_page_t mpte;
1035
1036	if (va >= VM_MAXUSER_ADDRESS)
1037		return 0;
1038	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1039	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1040	return pmap_unwire_pte_hold(pmap, va, mpte);
1041}
1042
1043void
1044pmap_pinit0(pmap)
1045	struct pmap *pmap;
1046{
1047
1048	PMAP_LOCK_INIT(pmap);
1049	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1050	pmap->pm_active = 0;
1051	TAILQ_INIT(&pmap->pm_pvlist);
1052	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1053}
1054
1055/*
1056 * Initialize a preallocated and zeroed pmap structure,
1057 * such as one in a vmspace structure.
1058 */
1059void
1060pmap_pinit(pmap)
1061	register struct pmap *pmap;
1062{
1063	vm_page_t pml4pg;
1064	static vm_pindex_t color;
1065
1066	PMAP_LOCK_INIT(pmap);
1067
1068	/*
1069	 * allocate the page directory page
1070	 */
1071	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1072	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1073		VM_WAIT;
1074
1075	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1076
1077	if ((pml4pg->flags & PG_ZERO) == 0)
1078		pagezero(pmap->pm_pml4);
1079
1080	/* Wire in kernel global address entries. */
1081	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1082	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1083
1084	/* install self-referential address mapping entry(s) */
1085	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1086
1087	pmap->pm_active = 0;
1088	TAILQ_INIT(&pmap->pm_pvlist);
1089	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1090}
1091
1092/*
1093 * this routine is called if the page table page is not
1094 * mapped correctly.
1095 *
1096 * Note: If a page allocation fails at page table level two or three,
1097 * one or two pages may be held during the wait, only to be released
1098 * afterwards.  This conservative approach is easily argued to avoid
1099 * race conditions.
1100 */
1101static vm_page_t
1102_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1103{
1104	vm_page_t m, pdppg, pdpg;
1105
1106	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1107	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1108	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1109
1110	/*
1111	 * Allocate a page table page.
1112	 */
1113	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1114	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1115		if (flags & M_WAITOK) {
1116			PMAP_UNLOCK(pmap);
1117			vm_page_unlock_queues();
1118			VM_WAIT;
1119			vm_page_lock_queues();
1120			PMAP_LOCK(pmap);
1121		}
1122
1123		/*
1124		 * Indicate the need to retry.  While waiting, the page table
1125		 * page may have been allocated.
1126		 */
1127		return (NULL);
1128	}
1129	if ((m->flags & PG_ZERO) == 0)
1130		pmap_zero_page(m);
1131
1132	/*
1133	 * Map the pagetable page into the process address space, if
1134	 * it isn't already there.
1135	 */
1136
1137	pmap->pm_stats.resident_count++;
1138
1139	if (ptepindex >= (NUPDE + NUPDPE)) {
1140		pml4_entry_t *pml4;
1141		vm_pindex_t pml4index;
1142
1143		/* Wire up a new PDPE page */
1144		pml4index = ptepindex - (NUPDE + NUPDPE);
1145		pml4 = &pmap->pm_pml4[pml4index];
1146		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1147
1148	} else if (ptepindex >= NUPDE) {
1149		vm_pindex_t pml4index;
1150		vm_pindex_t pdpindex;
1151		pml4_entry_t *pml4;
1152		pdp_entry_t *pdp;
1153
1154		/* Wire up a new PDE page */
1155		pdpindex = ptepindex - NUPDE;
1156		pml4index = pdpindex >> NPML4EPGSHIFT;
1157
1158		pml4 = &pmap->pm_pml4[pml4index];
1159		if ((*pml4 & PG_V) == 0) {
1160			/* Have to allocate a new pdp, recurse */
1161			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1162			    flags) == NULL) {
1163				--m->wire_count;
1164				vm_page_free(m);
1165				return (NULL);
1166			}
1167		} else {
1168			/* Add reference to pdp page */
1169			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1170			pdppg->wire_count++;
1171		}
1172		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1173
1174		/* Now find the pdp page */
1175		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1176		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1177
1178	} else {
1179		vm_pindex_t pml4index;
1180		vm_pindex_t pdpindex;
1181		pml4_entry_t *pml4;
1182		pdp_entry_t *pdp;
1183		pd_entry_t *pd;
1184
1185		/* Wire up a new PTE page */
1186		pdpindex = ptepindex >> NPDPEPGSHIFT;
1187		pml4index = pdpindex >> NPML4EPGSHIFT;
1188
1189		/* First, find the pdp and check that it's valid. */
1190		pml4 = &pmap->pm_pml4[pml4index];
1191		if ((*pml4 & PG_V) == 0) {
1192			/* Have to allocate a new pd, recurse */
1193			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1194			    flags) == NULL) {
1195				--m->wire_count;
1196				vm_page_free(m);
1197				return (NULL);
1198			}
1199			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1200			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1201		} else {
1202			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1203			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1204			if ((*pdp & PG_V) == 0) {
1205				/* Have to allocate a new pd, recurse */
1206				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1207				    flags) == NULL) {
1208					--m->wire_count;
1209					vm_page_free(m);
1210					return (NULL);
1211				}
1212			} else {
1213				/* Add reference to the pd page */
1214				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1215				pdpg->wire_count++;
1216			}
1217		}
1218		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1219
1220		/* Now we know where the page directory page is */
1221		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1222		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1223	}
1224
1225	return m;
1226}
1227
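/*
 * Return the page directory (level 2) page that maps 'va', adding a
 * wire count reference if it already exists or allocating it (and
 * any missing upper-level pages) if it does not.
 */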
1228static vm_page_t
1229pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1230{
1231	vm_pindex_t pdpindex, ptepindex;
1232	pdp_entry_t *pdpe;
1233	vm_page_t pdpg;
1234
1235	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1236	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1237	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1238retry:
1239	pdpe = pmap_pdpe(pmap, va);
1240	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1241		/* Add a reference to the pd page. */
1242		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1243		pdpg->wire_count++;
1244	} else {
1245		/* Allocate a pd page. */
1246		ptepindex = pmap_pde_pindex(va);
1247		pdpindex = ptepindex >> NPDPEPGSHIFT;
1248		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1249		if (pdpg == NULL && (flags & M_WAITOK))
1250			goto retry;
1251	}
1252	return (pdpg);
1253}
1254
1255static vm_page_t
1256pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1257{
1258	vm_pindex_t ptepindex;
1259	pd_entry_t *pd;
1260	vm_page_t m;
1261
1262	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1263	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1264	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1265
1266	/*
1267	 * Calculate pagetable page index
1268	 */
1269	ptepindex = pmap_pde_pindex(va);
1270retry:
1271	/*
1272	 * Get the page directory entry
1273	 */
1274	pd = pmap_pde(pmap, va);
1275
1276	/*
1277	 * This supports switching from a 2MB page to a
1278	 * normal 4K page.
1279	 */
1280	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1281		*pd = 0;
1282		pd = 0;
1283		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1284		pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va));
1285		pmap_invalidate_all(kernel_pmap);
1286	}
1287
1288	/*
1289	 * If the page table page is mapped, we just increment the
1290	 * hold count, and activate it.
1291	 */
1292	if (pd != 0 && (*pd & PG_V) != 0) {
1293		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1294		m->wire_count++;
1295	} else {
1296		/*
1297		 * Here if the pte page isn't mapped, or if it has been
1298		 * deallocated.
1299		 */
1300		m = _pmap_allocpte(pmap, ptepindex, flags);
1301		if (m == NULL && (flags & M_WAITOK))
1302			goto retry;
1303	}
1304	return (m);
1305}
1306
1307
1308/***************************************************
1309 * Pmap allocation/deallocation routines.
1310 ***************************************************/
1311
1312/*
1313 * Release any resources held by the given physical map.
1314 * Called when a pmap initialized by pmap_pinit is being released.
1315 * Should only be called if the map contains no valid mappings.
1316 */
1317void
1318pmap_release(pmap_t pmap)
1319{
1320	vm_page_t m;
1321
1322	KASSERT(pmap->pm_stats.resident_count == 0,
1323	    ("pmap_release: pmap resident count %ld != 0",
1324	    pmap->pm_stats.resident_count));
1325
1326	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1327
1328	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1329	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1330	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1331
1332	vm_page_lock_queues();
1333	m->wire_count--;
1334	atomic_subtract_int(&cnt.v_wire_count, 1);
1335	vm_page_free_zero(m);
1336	vm_page_unlock_queues();
1337	PMAP_LOCK_DESTROY(pmap);
1338}
1339
1340static int
1341kvm_size(SYSCTL_HANDLER_ARGS)
1342{
1343	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1344
1345	return sysctl_handle_long(oidp, &ksize, 0, req);
1346}
1347SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1348    0, 0, kvm_size, "IU", "Size of KVM");
1349
1350static int
1351kvm_free(SYSCTL_HANDLER_ARGS)
1352{
1353	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1354
1355	return sysctl_handle_long(oidp, &kfree, 0, req);
1356}
1357SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1358    0, 0, kvm_free, "IU", "Amount of KVM free");
1359
1360/*
1361 * grow the number of kernel page table entries, if needed
1362 */
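/*
 * kernel_vm_end is advanced one page table page (2MB of KVA) at a
 * time: each step allocates a new page table page and installs it in
 * the kernel page directory.  If no page directory exists yet for the
 * address, a page directory page is allocated and hooked into the
 * PDP first.
 */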
1363void
1364pmap_growkernel(vm_offset_t addr)
1365{
1366	vm_paddr_t paddr;
1367	vm_page_t nkpg;
1368	pd_entry_t *pde, newpdir;
1369	pdp_entry_t newpdp;
1370
1371	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1372	if (kernel_vm_end == 0) {
1373		kernel_vm_end = KERNBASE;
1374		nkpt = 0;
1375		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1376			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1377			nkpt++;
1378		}
1379	}
1380	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1381	while (kernel_vm_end < addr) {
1382		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1383		if (pde == NULL) {
1384			/* We need a new PDP entry */
1385			nkpg = vm_page_alloc(NULL, nkpt,
1386			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1387			if (!nkpg)
1388				panic("pmap_growkernel: no memory to grow kernel");
1389			pmap_zero_page(nkpg);
1390			paddr = VM_PAGE_TO_PHYS(nkpg);
1391			newpdp = (pdp_entry_t)
1392				(paddr | PG_V | PG_RW | PG_A | PG_M);
1393			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1394			continue; /* try again */
1395		}
1396		if ((*pde & PG_V) != 0) {
1397			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1398			continue;
1399		}
1400
1401		/*
1402		 * This index is bogus, but out of the way
1403		 */
1404		nkpg = vm_page_alloc(NULL, nkpt,
1405		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1406		if (!nkpg)
1407			panic("pmap_growkernel: no memory to grow kernel");
1408
1409		nkpt++;
1410
1411		pmap_zero_page(nkpg);
1412		paddr = VM_PAGE_TO_PHYS(nkpg);
1413		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1414		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1415
1416		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1417	}
1418}
1419
1420
1421/***************************************************
1422 * page management routines.
1423 ***************************************************/
1424
1425/*
1426 * free the pv_entry back to the free list
1427 */
1428static PMAP_INLINE void
1429free_pv_entry(pv_entry_t pv)
1430{
1431	pv_entry_count--;
1432	uma_zfree(pvzone, pv);
1433}
1434
1435/*
1436 * get a new pv_entry, allocating a block from the system
1437 * when needed.
1438 * the memory allocation is performed bypassing the malloc code
1439 * because of the possibility of allocations at interrupt time.
1440 */
1441static pv_entry_t
1442get_pv_entry(void)
1443{
1444	pv_entry_count++;
1445	if (pv_entry_high_water &&
1446		(pv_entry_count > pv_entry_high_water) &&
1447		(pmap_pagedaemon_waken == 0)) {
1448		pmap_pagedaemon_waken = 1;
1449		wakeup (&vm_pages_needed);
1450	}
1451	return uma_zalloc(pvzone, M_NOWAIT);
1452}
1453
1454/*
1455 * Reclaim a pv entry by removing a mapping to an inactive page.
1456 */
1457static pv_entry_t
1458pv_entry_reclaim(pmap_t locked_pmap)
1459{
1460	pd_entry_t ptepde;
1461	pmap_t pmap;
1462	pt_entry_t *pte, tpte;
1463	pv_entry_t pv;
1464	vm_offset_t va;
1465	vm_page_t m;
1466
1467	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1468	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1469	TAILQ_FOREACH(m, &vm_page_queues[PQ_INACTIVE].pl, pageq) {
1470		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1471			continue;
1472		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1473			va = pv->pv_va;
1474			pmap = pv->pv_pmap;
1475			if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1476				continue;
1477			pmap->pm_stats.resident_count--;
1478			pte = pmap_pte_pde(pmap, va, &ptepde);
1479			tpte = pte_load_clear(pte);
1480			KASSERT((tpte & PG_W) == 0,
1481			    ("pv_entry_reclaim: wired pte %#lx", tpte));
1482			if (tpte & PG_A)
1483				vm_page_flag_set(m, PG_REFERENCED);
1484			if (tpte & PG_M) {
1485				KASSERT((tpte & PG_RW),
1486	("pv_entry_reclaim: modified page not writable: va: %#lx, pte: %#lx",
1487				    va, tpte));
1488				if (pmap_track_modified(va))
1489					vm_page_dirty(m);
1490			}
1491			pmap_invalidate_page(pmap, va);
1492			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1493			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1494			if (TAILQ_EMPTY(&m->md.pv_list))
1495				vm_page_flag_clear(m, PG_WRITEABLE);
1496			m->md.pv_list_count--;
1497			pmap_unuse_pt(pmap, va, ptepde);
1498			if (pmap != locked_pmap)
1499				PMAP_UNLOCK(pmap);
1500			return (pv);
1501		}
1502	}
1503	panic("pv_entry_reclaim: increase vm.pmap.shpgperproc");
1504}
1505
1506static void
1507pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1508{
1509	pv_entry_t pv;
1510
1511	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1512	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1513	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1514		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1515			if (pmap == pv->pv_pmap && va == pv->pv_va)
1516				break;
1517		}
1518	} else {
1519		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1520			if (va == pv->pv_va)
1521				break;
1522		}
1523	}
1524	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1525	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1526	m->md.pv_list_count--;
1527	if (TAILQ_EMPTY(&m->md.pv_list))
1528		vm_page_flag_clear(m, PG_WRITEABLE);
1529	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1530	free_pv_entry(pv);
1531}
1532
1533/*
1534 * Create a pv entry for page at pa for
1535 * (pmap, va).
1536 */
1537static void
1538pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1539{
1540	pv_entry_t pv;
1541
1542	pv = get_pv_entry();
1543	if (pv == NULL) {
1544		pv_entry_count--;
1545		pv = pv_entry_reclaim(pmap);
1546	}
1547	pv->pv_va = va;
1548	pv->pv_pmap = pmap;
1549
1550	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1551	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1552	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1553	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1554	m->md.pv_list_count++;
1555}
1556
1557/*
1558 * pmap_remove_pte: do the things to unmap a page in a process
1559 */
1560static int
1561pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde)
1562{
1563	pt_entry_t oldpte;
1564	vm_page_t m;
1565
1566	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1567	oldpte = pte_load_clear(ptq);
1568	if (oldpte & PG_W)
1569		pmap->pm_stats.wired_count -= 1;
1570	/*
1571	 * Machines that don't support invlpg also don't support
1572	 * PG_G.
1573	 */
1574	if (oldpte & PG_G)
1575		pmap_invalidate_page(kernel_pmap, va);
1576	pmap->pm_stats.resident_count -= 1;
1577	if (oldpte & PG_MANAGED) {
1578		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
1579		if (oldpte & PG_M) {
1580			KASSERT((oldpte & PG_RW),
1581	("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx",
1582			    va, oldpte));
1583			if (pmap_track_modified(va))
1584				vm_page_dirty(m);
1585		}
1586		if (oldpte & PG_A)
1587			vm_page_flag_set(m, PG_REFERENCED);
1588		pmap_remove_entry(pmap, m, va);
1589	}
1590	return (pmap_unuse_pt(pmap, va, ptepde));
1591}
1592
1593/*
1594 * Remove a single page from a process address space
1595 */
1596static void
1597pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde)
1598{
1599	pt_entry_t *pte;
1600
1601	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1602	if ((*pde & PG_V) == 0)
1603		return;
1604	pte = pmap_pde_to_pte(pde, va);
1605	if ((*pte & PG_V) == 0)
1606		return;
1607	pmap_remove_pte(pmap, pte, va, *pde);
1608	pmap_invalidate_page(pmap, va);
1609}
1610
1611/*
1612 *	Remove the given range of addresses from the specified map.
1613 *
1614 *	It is assumed that the start and end are properly
1615 *	rounded to the page size.
1616 */
1617void
1618pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1619{
1620	vm_offset_t va_next;
1621	pml4_entry_t *pml4e;
1622	pdp_entry_t *pdpe;
1623	pd_entry_t ptpaddr, *pde;
1624	pt_entry_t *pte;
1625	int anyvalid;
1626
1627	/*
1628	 * Perform an unsynchronized read.  This is, however, safe.
1629	 */
1630	if (pmap->pm_stats.resident_count == 0)
1631		return;
1632
1633	anyvalid = 0;
1634
1635	vm_page_lock_queues();
1636	PMAP_LOCK(pmap);
1637
1638	/*
1639	 * Special handling for removing a single page: a very
1640	 * common operation for which we can short circuit some
1641	 * code.
1642	 */
1643	if (sva + PAGE_SIZE == eva) {
1644		pde = pmap_pde(pmap, sva);
1645		if (pde && (*pde & PG_PS) == 0) {
1646			pmap_remove_page(pmap, sva, pde);
1647			goto out;
1648		}
1649	}
1650
1651	for (; sva < eva; sva = va_next) {
1652
1653		if (pmap->pm_stats.resident_count == 0)
1654			break;
1655
1656		pml4e = pmap_pml4e(pmap, sva);
1657		if ((*pml4e & PG_V) == 0) {
1658			va_next = (sva + NBPML4) & ~PML4MASK;
1659			continue;
1660		}
1661
1662		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
1663		if ((*pdpe & PG_V) == 0) {
1664			va_next = (sva + NBPDP) & ~PDPMASK;
1665			continue;
1666		}
1667
1668		/*
1669		 * Calculate index for next page table.
1670		 */
1671		va_next = (sva + NBPDR) & ~PDRMASK;
1672
1673		pde = pmap_pdpe_to_pde(pdpe, sva);
1674		ptpaddr = *pde;
1675
1676		/*
1677		 * Weed out invalid mappings.
1678		 */
1679		if (ptpaddr == 0)
1680			continue;
1681
1682		/*
1683		 * Check for large page.
1684		 */
1685		if ((ptpaddr & PG_PS) != 0) {
1686			*pde = 0;
1687			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1688			pmap_unuse_pt(pmap, sva, *pdpe);
1689			anyvalid = 1;
1690			continue;
1691		}
1692
1693		/*
1694		 * Limit our scan to either the end of the va represented
1695		 * by the current page table page, or to the end of the
1696		 * range being removed.
1697		 */
1698		if (va_next > eva)
1699			va_next = eva;
1700
1701		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1702		    sva += PAGE_SIZE) {
1703			if (*pte == 0)
1704				continue;
1705			anyvalid = 1;
1706			if (pmap_remove_pte(pmap, pte, sva, ptpaddr))
1707				break;
1708		}
1709	}
1710out:
1711	vm_page_unlock_queues();
1712	if (anyvalid)
1713		pmap_invalidate_all(pmap);
1714	PMAP_UNLOCK(pmap);
1715}
1716
1717/*
1718 *	Routine:	pmap_remove_all
1719 *	Function:
1720 *		Removes this physical page from
1721 *		all physical maps in which it resides.
1722 *		Reflects back modify bits to the pager.
1723 *
1724 *	Notes:
1725 *		Original versions of this routine were very
1726 *		inefficient because they iteratively called
1727 *		pmap_remove (slow...)
1728 */
1729
1730void
1731pmap_remove_all(vm_page_t m)
1732{
1733	register pv_entry_t pv;
1734	pt_entry_t *pte, tpte;
1735	pd_entry_t ptepde;
1736
1737#if defined(PMAP_DIAGNOSTIC)
1738	/*
1739	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1740	 */
1741	if (m->flags & PG_FICTITIOUS) {
1742		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx",
1743		    VM_PAGE_TO_PHYS(m));
1744	}
1745#endif
1746	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1747	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1748		PMAP_LOCK(pv->pv_pmap);
1749		pv->pv_pmap->pm_stats.resident_count--;
1750		pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde);
1751		tpte = pte_load_clear(pte);
1752		if (tpte & PG_W)
1753			pv->pv_pmap->pm_stats.wired_count--;
1754		if (tpte & PG_A)
1755			vm_page_flag_set(m, PG_REFERENCED);
1756
1757		/*
1758		 * Update the vm_page_t clean and reference bits.
1759		 */
1760		if (tpte & PG_M) {
1761			KASSERT((tpte & PG_RW),
1762	("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx",
1763			    pv->pv_va, tpte));
1764			if (pmap_track_modified(pv->pv_va))
1765				vm_page_dirty(m);
1766		}
1767		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1768		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1769		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1770		m->md.pv_list_count--;
1771		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde);
1772		PMAP_UNLOCK(pv->pv_pmap);
1773		free_pv_entry(pv);
1774	}
1775	vm_page_flag_clear(m, PG_WRITEABLE);
1776}
1777
1778/*
1779 *	Set the physical protection on the
1780 *	specified range of this map as requested.
1781 */
1782void
1783pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1784{
1785	vm_offset_t va_next;
1786	pml4_entry_t *pml4e;
1787	pdp_entry_t *pdpe;
1788	pd_entry_t ptpaddr, *pde;
1789	pt_entry_t *pte;
1790	int anychanged;
1791
1792	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1793		pmap_remove(pmap, sva, eva);
1794		return;
1795	}
1796
1797	if (prot & VM_PROT_WRITE)
1798		return;
1799
1800	anychanged = 0;
1801
1802	vm_page_lock_queues();
1803	PMAP_LOCK(pmap);
1804	for (; sva < eva; sva = va_next) {
1805
1806		pml4e = pmap_pml4e(pmap, sva);
1807		if ((*pml4e & PG_V) == 0) {
1808			va_next = (sva + NBPML4) & ~PML4MASK;
1809			continue;
1810		}
1811
1812		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
1813		if ((*pdpe & PG_V) == 0) {
1814			va_next = (sva + NBPDP) & ~PDPMASK;
1815			continue;
1816		}
1817
1818		va_next = (sva + NBPDR) & ~PDRMASK;
1819
1820		pde = pmap_pdpe_to_pde(pdpe, sva);
1821		ptpaddr = *pde;
1822
1823		/*
1824		 * Weed out invalid mappings.
1825		 */
1826		if (ptpaddr == 0)
1827			continue;
1828
1829		/*
1830		 * Check for large page.
1831		 */
1832		if ((ptpaddr & PG_PS) != 0) {
1833			*pde &= ~(PG_M|PG_RW);
1834			anychanged = 1;
1835			continue;
1836		}
1837
1838		if (va_next > eva)
1839			va_next = eva;
1840
1841		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1842		    sva += PAGE_SIZE) {
1843			pt_entry_t obits, pbits;
1844			vm_page_t m;
1845
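			/*
			 * The PTE is updated with an atomic compare-and-set
			 * so that accessed/modified bits set concurrently by
			 * the MMU (or another CPU) are not lost; if the PTE
			 * changed underneath us, start over.
			 */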
1846retry:
1847			obits = pbits = *pte;
1848			if (pbits & PG_MANAGED) {
1849				m = NULL;
1850				if (pbits & PG_A) {
1851					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
1852					vm_page_flag_set(m, PG_REFERENCED);
1853					pbits &= ~PG_A;
1854				}
1855				if ((pbits & PG_M) != 0 &&
1856				    pmap_track_modified(sva)) {
1857					if (m == NULL)
1858						m = PHYS_TO_VM_PAGE(pbits &
1859						    PG_FRAME);
1860					vm_page_dirty(m);
1861				}
1862			}
1863
1864			pbits &= ~(PG_RW | PG_M);
1865
1866			if (pbits != obits) {
1867				if (!atomic_cmpset_long(pte, obits, pbits))
1868					goto retry;
1869				if (obits & PG_G)
1870					pmap_invalidate_page(pmap, sva);
1871				else
1872					anychanged = 1;
1873			}
1874		}
1875	}
1876	vm_page_unlock_queues();
1877	if (anychanged)
1878		pmap_invalidate_all(pmap);
1879	PMAP_UNLOCK(pmap);
1880}
1881
1882/*
1883 *	Insert the given physical page (p) at
1884 *	the specified virtual address (v) in the
1885 *	target physical map with the protection requested.
1886 *
1887 *	If specified, the page will be wired down, meaning
1888 *	that the related pte can not be reclaimed.
1889 *
1890 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1891 *	or lose information.  That is, this routine must actually
1892 *	insert this page into the given map NOW.
1893 */
1894void
1895pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1896	   boolean_t wired)
1897{
1898	vm_paddr_t pa;
1899	register pt_entry_t *pte;
1900	vm_paddr_t opa;
1901	pt_entry_t origpte, newpte;
1902	vm_page_t mpte, om;
1903	boolean_t invlva;
1904
1905	va = trunc_page(va);
1906#ifdef PMAP_DIAGNOSTIC
1907	if (va > VM_MAX_KERNEL_ADDRESS)
1908		panic("pmap_enter: toobig");
1909	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1910		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
1911#endif
1912
1913	mpte = NULL;
1914
1915	vm_page_lock_queues();
1916	PMAP_LOCK(pmap);
1917
1918	/*
1919	 * In the case that a page table page is not
1920	 * resident, we are creating it here.
1921	 */
1922	if (va < VM_MAXUSER_ADDRESS) {
1923		mpte = pmap_allocpte(pmap, va, M_WAITOK);
1924	}
1925#if 0 && defined(PMAP_DIAGNOSTIC)
1926	else {
1927		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1928		origpte = *pdeaddr;
1929		if ((origpte & PG_V) == 0) {
1930			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
1931				origpte, va);
1932		}
1933	}
1934#endif
1935
1936	pte = pmap_pte(pmap, va);
1937
1938	/*
1939	 * A page table page must be mapped by now; panic if the lookup failed.
1940	 */
1941	if (pte == NULL)
1942		panic("pmap_enter: invalid page directory va=%#lx\n", va);
1943
1944	pa = VM_PAGE_TO_PHYS(m);
1945	om = NULL;
1946	origpte = *pte;
1947	opa = origpte & PG_FRAME;
1948
1949	if (origpte & PG_PS)
1950		panic("pmap_enter: attempted pmap_enter on 2MB page");
1951
1952	/*
1953	 * Mapping has not changed, must be protection or wiring change.
1954	 */
1955	if (origpte && (opa == pa)) {
1956		/*
1957		 * Wiring change, just update stats. We don't worry about
1958		 * wiring PT pages as they remain resident as long as there
1959		 * are valid mappings in them. Hence, if a user page is wired,
1960		 * the PT page will be also.
1961		 */
1962		if (wired && ((origpte & PG_W) == 0))
1963			pmap->pm_stats.wired_count++;
1964		else if (!wired && (origpte & PG_W))
1965			pmap->pm_stats.wired_count--;
1966
1967		/*
1968		 * Remove extra pte reference
1969		 */
1970		if (mpte)
1971			mpte->wire_count--;
1972
1973		/*
1974		 * We might be turning off write access to the page,
1975		 * so we go ahead and sense modify status.
1976		 */
1977		if (origpte & PG_MANAGED) {
1978			om = m;
1979			pa |= PG_MANAGED;
1980		}
1981		goto validate;
1982	}
1983	/*
1984	 * Mapping has changed, invalidate old range and fall through to
1985	 * handle validating new mapping.
1986	 */
1987	if (opa) {
1988		if (origpte & PG_W)
1989			pmap->pm_stats.wired_count--;
1990		if (origpte & PG_MANAGED) {
1991			om = PHYS_TO_VM_PAGE(opa);
1992			pmap_remove_entry(pmap, om, va);
1993		}
1994		if (mpte != NULL) {
1995			mpte->wire_count--;
1996			KASSERT(mpte->wire_count > 0,
1997			    ("pmap_enter: missing reference to page table page,"
1998			     " va: 0x%lx", va));
1999		}
2000	} else
2001		pmap->pm_stats.resident_count++;
2002
2003	/*
2004	 * Enter on the PV list if part of our managed memory.
2005	 */
2006	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2007		pmap_insert_entry(pmap, va, m);
2008		pa |= PG_MANAGED;
2009	}
2010
2011	/*
2012	 * Increment counters
2013	 */
2014	if (wired)
2015		pmap->pm_stats.wired_count++;
2016
2017validate:
2018	/*
2019	 * Now validate mapping with desired protection/wiring.
2020	 */
2021	newpte = (pt_entry_t)(pa | PG_V);
2022	if ((prot & VM_PROT_WRITE) != 0)
2023		newpte |= PG_RW;
2024	if ((prot & VM_PROT_EXECUTE) == 0)
2025		newpte |= pg_nx;
2026	if (wired)
2027		newpte |= PG_W;
2028	if (va < VM_MAXUSER_ADDRESS)
2029		newpte |= PG_U;
2030	if (pmap == kernel_pmap)
2031		newpte |= PG_G;
2032
2033	/*
2034	 * if the mapping or permission bits are different, we need
2035	 * to update the pte.
2036	 */
2037	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2038		if (origpte & PG_V) {
2039			invlva = FALSE;
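			/*
			 * The old mapping was valid, so atomically swap in
			 * the new PTE and reconcile any PG_A and PG_M bits
			 * that the MMU may have set in the old entry.
			 */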
2040			origpte = pte_load_store(pte, newpte | PG_A);
2041			if (origpte & PG_A) {
2042				if (origpte & PG_MANAGED)
2043					vm_page_flag_set(om, PG_REFERENCED);
2044				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
2045				    PG_NX) == 0 && (newpte & PG_NX)))
2046					invlva = TRUE;
2047			}
2048			if (origpte & PG_M) {
2049				KASSERT((origpte & PG_RW),
2050	("pmap_enter: modified page not writable: va: %#lx, pte: %#lx",
2051				    va, origpte));
2052				if ((origpte & PG_MANAGED) &&
2053				    pmap_track_modified(va))
2054					vm_page_dirty(om);
2055				if ((newpte & PG_RW) == 0)
2056					invlva = TRUE;
2057			}
2058			if (invlva)
2059				pmap_invalidate_page(pmap, va);
2060		} else
2061			pte_store(pte, newpte | PG_A);
2062	}
2063	vm_page_unlock_queues();
2064	PMAP_UNLOCK(pmap);
2065}
2066
2067/*
2068 * This code makes some *MAJOR* assumptions:
2069 * 1. The pmap is the current pmap and it exists.
2070 * 2. Not wired.
2071 * 3. Read access.
2072 * 4. No page table pages.
2073 * but is *MUCH* faster than pmap_enter...
2074 */
2075
2076vm_page_t
2077pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2078    vm_page_t mpte)
2079{
2080	pt_entry_t *pte;
2081	vm_paddr_t pa;
2082
2083	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2084	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2085	PMAP_LOCK(pmap);
2086
2087	/*
2088	 * In the case that a page table page is not
2089	 * resident, we are creating it here.
2090	 */
2091	if (va < VM_MAXUSER_ADDRESS) {
2092		vm_pindex_t ptepindex;
2093		pd_entry_t *ptepa;
2094
2095		/*
2096		 * Calculate pagetable page index
2097		 */
2098		ptepindex = pmap_pde_pindex(va);
2099		if (mpte && (mpte->pindex == ptepindex)) {
2100			mpte->wire_count++;
2101		} else {
2102	retry:
2103			/*
2104			 * Get the page directory entry
2105			 */
2106			ptepa = pmap_pde(pmap, va);
2107
2108			/*
2109			 * If the page table page is mapped, we just
2110			 * increment its wire count and reuse it.
2111			 */
2112			if (ptepa && (*ptepa & PG_V) != 0) {
2113				if (*ptepa & PG_PS)
2114					panic("pmap_enter_quick: unexpected mapping into 2MB page");
2115				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
2116				mpte->wire_count++;
2117			} else {
2118				mpte = _pmap_allocpte(pmap, ptepindex,
2119				    M_NOWAIT);
2120				if (mpte == NULL) {
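					/*
					 * Allocation failed: drop the locks,
					 * busy the page so it is not touched
					 * while the object lock is released,
					 * wait for free pages, and retry.
					 */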
2121					PMAP_UNLOCK(pmap);
2122					vm_page_busy(m);
2123					vm_page_unlock_queues();
2124					VM_OBJECT_UNLOCK(m->object);
2125					VM_WAIT;
2126					VM_OBJECT_LOCK(m->object);
2127					vm_page_lock_queues();
2128					vm_page_wakeup(m);
2129					PMAP_LOCK(pmap);
2130					goto retry;
2131				}
2132			}
2133		}
2134	} else {
2135		mpte = NULL;
2136	}
2137
2138	/*
2139	 * This call to vtopte makes the assumption that we are
2140	 * entering the page into the current pmap.  In order to support
2141	 * quick entry into any pmap, one would likely use pmap_pte.
2142	 * But that isn't as quick as vtopte.
2143	 */
2144	pte = vtopte(va);
2145	if (*pte) {
2146		if (mpte != NULL) {
2147			pmap_unwire_pte_hold(pmap, va, mpte);
2148			mpte = NULL;
2149		}
2150		goto out;
2151	}
2152
2153	/*
2154	 * Enter on the PV list if part of our managed memory.  The page
2155	 * queues lock, held by our caller, serializes access to the pv
2156	 * lists.
2157	 */
2158	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2159		pmap_insert_entry(pmap, va, m);
2160
2161	/*
2162	 * Increment counters
2163	 */
2164	pmap->pm_stats.resident_count++;
2165
2166	pa = VM_PAGE_TO_PHYS(m);
2167	if ((prot & VM_PROT_EXECUTE) == 0)
2168		pa |= pg_nx;
2169
2170	/*
2171	 * Now validate mapping with RO protection
2172	 */
2173	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2174		pte_store(pte, pa | PG_V | PG_U);
2175	else
2176		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2177out:
2178	PMAP_UNLOCK(pmap);
2179	return mpte;
2180}
2181
2182/*
2183 * Make a temporary mapping for a physical address.  This is only intended
2184 * to be used for panic dumps.
2185 */
2186void *
2187pmap_kenter_temporary(vm_paddr_t pa, int i)
2188{
2189	vm_offset_t va;
2190
2191	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2192	pmap_kenter(va, pa);
2193	invlpg(va);
2194	return ((void *)crashdumpmap);
2195}
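
/*
 * Illustrative (hypothetical) use: a dump routine might map each page of
 * a physical region in turn,
 *
 *	va = pmap_kenter_temporary(pa + i * PAGE_SIZE, i);
 *
 * and write out the returned crashdumpmap window before moving on.  The
 * mapping is only intended to remain valid for the duration of the dump.
 */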
2196
2197/*
2198 * This code maps large physical mmap regions into the
2199 * processor address space.  Note that some shortcuts
2200 * are taken, but the code works.
2201 */
2202void
2203pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2204		    vm_object_t object, vm_pindex_t pindex,
2205		    vm_size_t size)
2206{
2207	vm_offset_t va;
2208	vm_page_t p, pdpg;
2209
2210	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2211	KASSERT(object->type == OBJT_DEVICE,
2212	    ("pmap_object_init_pt: non-device object"));
2213	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2214		vm_page_t m[1];
2215		pd_entry_t ptepa, *pde;
2216
2217		PMAP_LOCK(pmap);
2218		pde = pmap_pde(pmap, addr);
2219		if (pde != 0 && (*pde & PG_V) != 0)
2220			goto out;
2221		PMAP_UNLOCK(pmap);
2222retry:
2223		p = vm_page_lookup(object, pindex);
2224		if (p != NULL) {
2225			vm_page_lock_queues();
2226			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2227				goto retry;
2228		} else {
2229			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2230			if (p == NULL)
2231				return;
2232			m[0] = p;
2233
2234			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2235				vm_page_lock_queues();
2236				vm_page_free(p);
2237				vm_page_unlock_queues();
2238				return;
2239			}
2240
2241			p = vm_page_lookup(object, pindex);
2242			vm_page_lock_queues();
2243			vm_page_wakeup(p);
2244		}
2245		vm_page_unlock_queues();
2246
2247		ptepa = VM_PAGE_TO_PHYS(p);
2248		if (ptepa & (NBPDR - 1))
2249			return;
2250
2251		p->valid = VM_PAGE_BITS_ALL;
2252
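		/*
		 * Map the region with 2MB page directory entries (PG_PS),
		 * one per NBPDR-sized chunk, allocating page directory
		 * pages as needed.
		 */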
2253		PMAP_LOCK(pmap);
2254		for (va = addr; va < addr + size; va += NBPDR) {
2255			while ((pdpg =
2256			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
2257				PMAP_UNLOCK(pmap);
2258				vm_page_lock_queues();
2259				vm_page_busy(p);
2260				vm_page_unlock_queues();
2261				VM_OBJECT_UNLOCK(object);
2262				VM_WAIT;
2263				VM_OBJECT_LOCK(object);
2264				vm_page_lock_queues();
2265				vm_page_wakeup(p);
2266				vm_page_unlock_queues();
2267				PMAP_LOCK(pmap);
2268			}
2269			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
2270			pde = &pde[pmap_pde_index(va)];
2271			if ((*pde & PG_V) == 0) {
2272				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
2273				    PG_U | PG_RW | PG_V);
2274				pmap->pm_stats.resident_count +=
2275				    NBPDR / PAGE_SIZE;
2276			} else {
2277				pdpg->wire_count--;
2278				KASSERT(pdpg->wire_count > 0,
2279				    ("pmap_object_init_pt: missing reference "
2280				     "to page directory page, va: 0x%lx", va));
2281			}
2282			ptepa += NBPDR;
2283		}
2284		pmap_invalidate_all(pmap);
2285out:
2286		PMAP_UNLOCK(pmap);
2287	}
2288}
2289
2290/*
2291 *	Routine:	pmap_change_wiring
2292 *	Function:	Change the wiring attribute for a map/virtual-address
2293 *			pair.
2294 *	In/out conditions:
2295 *			The mapping must already exist in the pmap.
2296 */
2297void
2298pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2302{
2303	pt_entry_t *pte;
2304
2305	/*
2306	 * Wiring is not a hardware characteristic so there is no need to
2307	 * invalidate TLB.
2308	 */
2309	PMAP_LOCK(pmap);
2310	pte = pmap_pte(pmap, va);
2311	if (wired && (*pte & PG_W) == 0) {
2312		pmap->pm_stats.wired_count++;
2313		atomic_set_long(pte, PG_W);
2314	} else if (!wired && (*pte & PG_W) != 0) {
2315		pmap->pm_stats.wired_count--;
2316		atomic_clear_long(pte, PG_W);
2317	}
2318	PMAP_UNLOCK(pmap);
2319}
2320
2323/*
2324 *	Copy the range specified by src_addr/len
2325 *	from the source map to the range dst_addr/len
2326 *	in the destination map.
2327 *
2328 *	This routine is only advisory and need not do anything.
2329 */
2330
2331void
2332pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2333	  vm_offset_t src_addr)
2334{
2335	vm_offset_t addr;
2336	vm_offset_t end_addr = src_addr + len;
2337	vm_offset_t va_next;
2338	vm_page_t m;
2339
2340	if (dst_addr != src_addr)
2341		return;
2342
2343	if (!pmap_is_current(src_pmap))
2344		return;
2345
2346	vm_page_lock_queues();
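	/*
	 * Lock the two pmaps in a consistent (address) order to avoid
	 * deadlocking against a concurrent copy in the other direction.
	 */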
2347	if (dst_pmap < src_pmap) {
2348		PMAP_LOCK(dst_pmap);
2349		PMAP_LOCK(src_pmap);
2350	} else {
2351		PMAP_LOCK(src_pmap);
2352		PMAP_LOCK(dst_pmap);
2353	}
2354	for (addr = src_addr; addr < end_addr; addr = va_next) {
2355		pt_entry_t *src_pte, *dst_pte;
2356		vm_page_t dstmpde, dstmpte, srcmpte;
2357		pml4_entry_t *pml4e;
2358		pdp_entry_t *pdpe;
2359		pd_entry_t srcptepaddr, *pde;
2360
2361		if (addr >= UPT_MIN_ADDRESS)
2362			panic("pmap_copy: invalid to pmap_copy page tables");
2363
2364		/*
2365		 * Don't let optional prefaulting of pages make us go
2366		 * way below the low water mark of free pages or way
2367		 * above high water mark of used pv entries.
2368		 */
2369		if (cnt.v_free_count < cnt.v_free_reserved ||
2370		    pv_entry_count > pv_entry_high_water)
2371			break;
2372
2373		pml4e = pmap_pml4e(src_pmap, addr);
2374		if ((*pml4e & PG_V) == 0) {
2375			va_next = (addr + NBPML4) & ~PML4MASK;
2376			continue;
2377		}
2378
2379		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
2380		if ((*pdpe & PG_V) == 0) {
2381			va_next = (addr + NBPDP) & ~PDPMASK;
2382			continue;
2383		}
2384
2385		va_next = (addr + NBPDR) & ~PDRMASK;
2386
2387		pde = pmap_pdpe_to_pde(pdpe, addr);
2388		srcptepaddr = *pde;
2389		if (srcptepaddr == 0)
2390			continue;
2391
2392		if (srcptepaddr & PG_PS) {
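			/*
			 * The source mapping is a 2MB page; share it by
			 * copying the page directory entry itself into the
			 * destination pmap.
			 */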
2393			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
2394			if (dstmpde == NULL)
2395				break;
2396			pde = (pd_entry_t *)
2397			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
2398			pde = &pde[pmap_pde_index(addr)];
2399			if (*pde == 0) {
2400				*pde = srcptepaddr;
2401				dst_pmap->pm_stats.resident_count +=
2402				    NBPDR / PAGE_SIZE;
2403			} else
2404				pmap_unwire_pte_hold(dst_pmap, addr, dstmpde);
2405			continue;
2406		}
2407
2408		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
2409		if (srcmpte->wire_count == 0)
2410			panic("pmap_copy: source page table page is unused");
2411
2412		if (va_next > end_addr)
2413			va_next = end_addr;
2414
2415		src_pte = vtopte(addr);
2416		while (addr < va_next) {
2417			pt_entry_t ptetemp;
2418			ptetemp = *src_pte;
2419			/*
2420			 * We only copy mappings of managed pages.
2421			 */
2422			if ((ptetemp & PG_MANAGED) != 0) {
2423				/*
2424				 * We have to check after allocpte for the
2425				 * pte still being around...  allocpte can
2426				 * block.
2427				 */
2428				dstmpte = pmap_allocpte(dst_pmap, addr,
2429				    M_NOWAIT);
2430				if (dstmpte == NULL)
2431					break;
2432				dst_pte = (pt_entry_t *)
2433				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2434				dst_pte = &dst_pte[pmap_pte_index(addr)];
2435				if (*dst_pte == 0) {
2436					/*
2437					 * Clear the modified and
2438					 * accessed (referenced) bits
2439					 * during the copy.
2440					 */
2441					m = PHYS_TO_VM_PAGE(ptetemp & PG_FRAME);
2442					*dst_pte = ptetemp & ~(PG_M | PG_A);
2443					dst_pmap->pm_stats.resident_count++;
2444					pmap_insert_entry(dst_pmap, addr, m);
2445				} else
2446					pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2447				if (dstmpte->wire_count >= srcmpte->wire_count)
2448					break;
2449			}
2450			addr += PAGE_SIZE;
2451			src_pte++;
2452		}
2453	}
2454	vm_page_unlock_queues();
2455	PMAP_UNLOCK(src_pmap);
2456	PMAP_UNLOCK(dst_pmap);
2457}
2458
2459/*
2460 *	pmap_zero_page zeros the specified hardware page by addressing it
2461 *	through the direct map and clearing its contents with pagezero().
2462 */
2463void
2464pmap_zero_page(vm_page_t m)
2465{
2466	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2467
2468	pagezero((void *)va);
2469}
2470
2471/*
2472 *	pmap_zero_page_area zeros the requested range within the specified
2473 *	hardware page by addressing it through the direct map.
2474 *
2475 *	off and size may not cover an area beyond a single hardware page.
2476 */
2477void
2478pmap_zero_page_area(vm_page_t m, int off, int size)
2479{
2480	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2481
2482	if (off == 0 && size == PAGE_SIZE)
2483		pagezero((void *)va);
2484	else
2485		bzero((char *)va + off, size);
2486}
2487
2488/*
2489 *	pmap_zero_page_idle zeros the specified hardware page by addressing
2490 *	it through the direct map and clearing it with pagezero().  This
2491 *	is intended to be called from the vm_pagezero process only and
2492 *	outside of Giant.
2493 */
2494void
2495pmap_zero_page_idle(vm_page_t m)
2496{
2497	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2498
2499	pagezero((void *)va);
2500}
2501
2502/*
2503 *	pmap_copy_page copies the specified (machine independent) page
2504 *	by addressing both pages through the direct map and copying the
2505 *	contents with pagecopy().
2507 */
2508void
2509pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2510{
2511	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2512	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2513
2514	pagecopy((void *)src, (void *)dst);
2515}
2516
2517/*
2518 * Returns true if the pmap's pv is one of the first
2519 * 16 pvs linked to from this page.  This count may
2520 * be changed upwards or downwards in the future; it
2521 * is only necessary that true be returned for a small
2522 * subset of pmaps for proper page aging.
2523 */
2524boolean_t
2525pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2528{
2529	pv_entry_t pv;
2530	int loops = 0;
2531
2532	if (m->flags & PG_FICTITIOUS)
2533		return FALSE;
2534
2535	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2536	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2537		if (pv->pv_pmap == pmap) {
2538			return TRUE;
2539		}
2540		loops++;
2541		if (loops >= 16)
2542			break;
2543	}
2544	return (FALSE);
2545}
2546
2547#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2548/*
2549 * Remove all pages from the specified address space; this aids process
2550 * exit speeds.  Also, this code is special-cased for the current process
2551 * only, but can have the more generic (and slightly slower) mode enabled.
2552 * This is much faster than pmap_remove in the case of running down an
2553 * entire address space.
2555 */
2556void
2557pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2560{
2561	pt_entry_t *pte, tpte;
2562	vm_page_t m;
2563	pv_entry_t pv, npv;
2564
2565#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2566	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2567		printf("warning: pmap_remove_pages called with non-current pmap\n");
2568		return;
2569	}
2570#endif
2571	vm_page_lock_queues();
2572	PMAP_LOCK(pmap);
2573	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2574
2575		if (pv->pv_va >= eva || pv->pv_va < sva) {
2576			npv = TAILQ_NEXT(pv, pv_plist);
2577			continue;
2578		}
2579
2580#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2581		pte = vtopte(pv->pv_va);
2582#else
2583		pte = pmap_pte(pmap, pv->pv_va);
2584#endif
2585		tpte = *pte;
2586
2587		if (tpte == 0) {
2588			printf("TPTE at %p  IS ZERO @ VA %08lx\n",
2589							pte, pv->pv_va);
2590			panic("bad pte");
2591		}
2592
2593		/*
2594		 * We cannot remove wired pages from a process' mapping
2595		 * at this time.
2596		 */
2596		if (tpte & PG_W) {
2597			npv = TAILQ_NEXT(pv, pv_plist);
2598			continue;
2599		}
2600
2601		m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2602		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2603		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2604		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2605
2606		KASSERT(m < &vm_page_array[vm_page_array_size],
2607			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2608
2609		pmap->pm_stats.resident_count--;
2610
2611		pte_clear(pte);
2612
2613		/*
2614		 * Update the vm_page_t clean and reference bits.
2615		 */
2616		if (tpte & PG_M) {
2617			vm_page_dirty(m);
2618		}
2619
2620		npv = TAILQ_NEXT(pv, pv_plist);
2621		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2622
2623		m->md.pv_list_count--;
2624		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2625		if (TAILQ_EMPTY(&m->md.pv_list))
2626			vm_page_flag_clear(m, PG_WRITEABLE);
2627
2628		pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va));
2629		free_pv_entry(pv);
2630	}
2631	pmap_invalidate_all(pmap);
2632	PMAP_UNLOCK(pmap);
2633	vm_page_unlock_queues();
2634}
2635
2636/*
2637 *	pmap_is_modified:
2638 *
2639 *	Return whether or not the specified physical page was modified
2640 *	in any physical maps.
2641 */
2642boolean_t
2643pmap_is_modified(vm_page_t m)
2644{
2645	pv_entry_t pv;
2646	pt_entry_t *pte;
2647	boolean_t rv;
2648
2649	rv = FALSE;
2650	if (m->flags & PG_FICTITIOUS)
2651		return (rv);
2652
2653	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2654	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2655		/*
2656		 * Modifications are not tracked for certain addresses
2657		 * (e.g., within the kernel's clean submap), so skip
2658		 * those mappings here.
2659		 */
2660		if (!pmap_track_modified(pv->pv_va))
2661			continue;
2662		PMAP_LOCK(pv->pv_pmap);
2663		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2664		rv = (*pte & PG_M) != 0;
2665		PMAP_UNLOCK(pv->pv_pmap);
2666		if (rv)
2667			break;
2668	}
2669	return (rv);
2670}
2671
2672/*
2673 *	pmap_is_prefaultable:
2674 *
2675 *	Return whether or not the specified virtual address is eligible
2676 *	for prefault.
2677 */
2678boolean_t
2679pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2680{
2681	pd_entry_t *pde;
2682	pt_entry_t *pte;
2683	boolean_t rv;
2684
2685	rv = FALSE;
2686	PMAP_LOCK(pmap);
2687	pde = pmap_pde(pmap, addr);
2688	if (pde != NULL && (*pde & PG_V)) {
2689		pte = vtopte(addr);
2690		rv = (*pte & PG_V) == 0;
2691	}
2692	PMAP_UNLOCK(pmap);
2693	return (rv);
2694}
2695
2696/*
2697 *	Clear the given bit in each of the given page's ptes.
2698 */
2699static __inline void
2700pmap_clear_ptes(vm_page_t m, long bit)
2701{
2702	register pv_entry_t pv;
2703	pt_entry_t pbits, *pte;
2704
2705	if ((m->flags & PG_FICTITIOUS) ||
2706	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2707		return;
2708
2709	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2710	/*
2711	 * Loop over all current mappings, setting/clearing as appropriate.
2712	 * If setting RO, do we need to clear the VAC?
2713	 */
2714	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2715		/*
2716		 * don't write protect pager mappings
2717		 */
2718		if (bit == PG_RW) {
2719			if (!pmap_track_modified(pv->pv_va))
2720				continue;
2721		}
2722
2723		PMAP_LOCK(pv->pv_pmap);
2724		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
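		/*
		 * When clearing PG_RW, PG_M is cleared in the same
		 * compare-and-set so that a concurrent hardware update of
		 * the dirty bit is not lost; retry if the PTE changed.
		 */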
2725retry:
2726		pbits = *pte;
2727		if (pbits & bit) {
2728			if (bit == PG_RW) {
2729				if (!atomic_cmpset_long(pte, pbits,
2730				    pbits & ~(PG_RW | PG_M)))
2731					goto retry;
2732				if (pbits & PG_M) {
2733					vm_page_dirty(m);
2734				}
2735			} else {
2736				atomic_clear_long(pte, bit);
2737			}
2738			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2739		}
2740		PMAP_UNLOCK(pv->pv_pmap);
2741	}
2742	if (bit == PG_RW)
2743		vm_page_flag_clear(m, PG_WRITEABLE);
2744}
2745
2746/*
2747 *	pmap_page_protect:
2748 *
2749 *	Lower the permission for all mappings to a given page.
2750 */
2751void
2752pmap_page_protect(vm_page_t m, vm_prot_t prot)
2753{
2754	if ((prot & VM_PROT_WRITE) == 0) {
2755		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2756			pmap_clear_ptes(m, PG_RW);
2757		} else {
2758			pmap_remove_all(m);
2759		}
2760	}
2761}
2762
2763/*
2764 *	pmap_ts_referenced:
2765 *
2766 *	Return a count of reference bits for a page, clearing those bits.
2767 *	It is not necessary for every reference bit to be cleared, but it
2768 *	is necessary that 0 only be returned when there are truly no
2769 *	reference bits set.
2770 *
2771 *	XXX: The exact number of bits to check and clear is a matter that
2772 *	should be tested and standardized at some point in the future for
2773 *	optimal aging of shared pages.
2774 */
2775int
2776pmap_ts_referenced(vm_page_t m)
2777{
2778	register pv_entry_t pv, pvf, pvn;
2779	pt_entry_t *pte;
2780	pt_entry_t v;
2781	int rtval = 0;
2782
2783	if (m->flags & PG_FICTITIOUS)
2784		return (rtval);
2785
2786	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2787	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2788
2789		pvf = pv;
2790
2791		do {
2792			pvn = TAILQ_NEXT(pv, pv_list);
2793
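			/*
			 * Rotate this pv to the tail of the list so that
			 * successive calls start with different mappings.
			 */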
2794			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2795
2796			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2797
2798			if (!pmap_track_modified(pv->pv_va))
2799				continue;
2800
2801			PMAP_LOCK(pv->pv_pmap);
2802			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2803
2804			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2805				atomic_clear_long(pte, PG_A);
2806				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2807
2808				rtval++;
2809				if (rtval > 4) {
2810					PMAP_UNLOCK(pv->pv_pmap);
2811					break;
2812				}
2813			}
2814			PMAP_UNLOCK(pv->pv_pmap);
2815		} while ((pv = pvn) != NULL && pv != pvf);
2816	}
2817
2818	return (rtval);
2819}
2820
2821/*
2822 *	Clear the modify bits on the specified physical page.
2823 */
2824void
2825pmap_clear_modify(vm_page_t m)
2826{
2827	pmap_clear_ptes(m, PG_M);
2828}
2829
2830/*
2831 *	pmap_clear_reference:
2832 *
2833 *	Clear the reference bit on the specified physical page.
2834 */
2835void
2836pmap_clear_reference(vm_page_t m)
2837{
2838	pmap_clear_ptes(m, PG_A);
2839}
2840
2841/*
2842 * Miscellaneous support routines follow
2843 */
2844
2845/*
2846 * Map a set of physical memory pages into the kernel virtual
2847 * address space. Return a pointer to where it is mapped. This
2848 * routine is intended to be used for mapping device memory,
2849 * NOT real memory.
2850 */
2851void *
2852pmap_mapdev(vm_paddr_t pa, vm_size_t size)
2855{
2856	vm_offset_t va, tmpva, offset;
2857
2858	/* If this fits within the direct map window, use it */
2859	if (pa < dmaplimit && (pa + size) < dmaplimit)
2860		return ((void *)PHYS_TO_DMAP(pa));
2861	offset = pa & PAGE_MASK;
2862	size = roundup(offset + size, PAGE_SIZE);
2863	va = kmem_alloc_nofault(kernel_map, size);
2864	if (!va)
2865		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2866	pa = trunc_page(pa);
2867	for (tmpva = va; size > 0; ) {
2868		pmap_kenter(tmpva, pa);
2869		size -= PAGE_SIZE;
2870		tmpva += PAGE_SIZE;
2871		pa += PAGE_SIZE;
2872	}
2873	pmap_invalidate_range(kernel_pmap, va, tmpva);
2874	return ((void *)(va + offset));
2875}
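
/*
 * Illustrative (hypothetical) pairing: a caller mapping device registers
 * would typically do
 *
 *	regs = pmap_mapdev(pa, size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, size);
 *
 * noting that addresses covered by the direct map are returned without
 * consuming kernel virtual address space.
 */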
2876
2877void
2878pmap_unmapdev(vm_offset_t va, vm_size_t size)
2881{
2882	vm_offset_t base, offset, tmpva;
2883
2884	/* If we gave a direct map region in pmap_mapdev, do nothing */
2885	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2886		return;
2887	base = trunc_page(va);
2888	offset = va & PAGE_MASK;
2889	size = roundup(offset + size, PAGE_SIZE);
2890	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2891		pmap_kremove(tmpva);
2892	pmap_invalidate_range(kernel_pmap, va, tmpva);
2893	kmem_free(kernel_map, base, size);
2894}
2895
2896/*
2897 * Perform the pmap work for mincore().
2898 */
2899int
2900pmap_mincore(pmap_t pmap, vm_offset_t addr)
2903{
2904	pt_entry_t *ptep, pte;
2905	vm_page_t m;
2906	int val = 0;
2907
2908	PMAP_LOCK(pmap);
2909	ptep = pmap_pte(pmap, addr);
2910	pte = (ptep != NULL) ? *ptep : 0;
2911	PMAP_UNLOCK(pmap);
2912
2913	if (pte != 0) {
2914		vm_paddr_t pa;
2915
2916		val = MINCORE_INCORE;
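		/*
		 * Unmanaged mappings (e.g., device memory) have no vm_page
		 * to consult, so only residency can be reported.
		 */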
2917		if ((pte & PG_MANAGED) == 0)
2918			return val;
2919
2920		pa = pte & PG_FRAME;
2921
2922		m = PHYS_TO_VM_PAGE(pa);
2923
2924		/*
2925		 * Modified by us
2926		 */
2927		if (pte & PG_M)
2928			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2929		else {
2930			/*
2931			 * Modified by someone else
2932			 */
2933			vm_page_lock_queues();
2934			if (m->dirty || pmap_is_modified(m))
2935				val |= MINCORE_MODIFIED_OTHER;
2936			vm_page_unlock_queues();
2937		}
2938		/*
2939		 * Referenced by us
2940		 */
2941		if (pte & PG_A)
2942			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2943		else {
2944			/*
2945			 * Referenced by someone else
2946			 */
2947			vm_page_lock_queues();
2948			if ((m->flags & PG_REFERENCED) ||
2949			    pmap_ts_referenced(m)) {
2950				val |= MINCORE_REFERENCED_OTHER;
2951				vm_page_flag_set(m, PG_REFERENCED);
2952			}
2953			vm_page_unlock_queues();
2954		}
2955	}
2956	return val;
2957}
2958
2959void
2960pmap_activate(struct thread *td)
2961{
2962	struct proc *p = td->td_proc;
2963	pmap_t	pmap, oldpmap;
2964	u_int64_t  cr3;
2965
2966	critical_enter();
2967	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2968	oldpmap = PCPU_GET(curpmap);
2969#ifdef SMP
2970	if (oldpmap)	/* XXX FIXME */
2971		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2972	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2973#else
2974	if (oldpmap)	/* XXX FIXME */
2975		oldpmap->pm_active &= ~PCPU_GET(cpumask);
2976	pmap->pm_active |= PCPU_GET(cpumask);
2977#endif
2978	cr3 = vtophys(pmap->pm_pml4);
2979	/*
2980	 * XXXKSE this is wrong.
2981	 * pmap_activate is for the current thread on the current cpu.
2981	 */
2982	if (p->p_flag & P_SA) {
2983		/* Make sure all other cr3 entries are updated. */
2984		/* what if they are running?  XXXKSE (maybe abort them) */
2985		FOREACH_THREAD_IN_PROC(p, td) {
2986			td->td_pcb->pcb_cr3 = cr3;
2987		}
2988	} else {
2989		td->td_pcb->pcb_cr3 = cr3;
2990	}
2991	load_cr3(cr3);
2992	critical_exit();
2993}
2994
2995vm_offset_t
2996pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2997{
2998
2999	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3000		return addr;
3001	}
3002
3003	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3004	return addr;
3005}
3006