1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 155234 2006-02-03 00:16:36Z peter $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	In addition to hardware address maps, this
86 *	module is called upon to provide software-use-only
87 *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or protection-reduction
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and as to when physical maps must be made correct.
106 */
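
/*
 *	As an example of the delayed-invalidation strategy described above,
 *	pmap_remove() and pmap_protect() below merely record in a local flag
 *	("anyvalid"/"anychanged") that mappings were removed or downgraded
 *	and typically issue a single pmap_invalidate_all() after their main
 *	loop, rather than one TLB shootdown per page.
 */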
107
108#include "opt_msgbuf.h"
109
110#include <sys/param.h>
111#include <sys/systm.h>
112#include <sys/kernel.h>
113#include <sys/lock.h>
114#include <sys/malloc.h>
115#include <sys/mman.h>
116#include <sys/msgbuf.h>
117#include <sys/mutex.h>
118#include <sys/proc.h>
119#include <sys/sx.h>
120#include <sys/vmmeter.h>
121#include <sys/sched.h>
122#include <sys/sysctl.h>
123#ifdef SMP
124#include <sys/smp.h>
125#endif
126
127#include <vm/vm.h>
128#include <vm/vm_param.h>
129#include <vm/vm_kern.h>
130#include <vm/vm_page.h>
131#include <vm/vm_map.h>
132#include <vm/vm_object.h>
133#include <vm/vm_extern.h>
134#include <vm/vm_pageout.h>
135#include <vm/vm_pager.h>
136#include <vm/uma.h>
137
138#include <machine/cpu.h>
139#include <machine/cputypes.h>
140#include <machine/md_var.h>
141#include <machine/pcb.h>
142#include <machine/specialreg.h>
143#ifdef SMP
144#include <machine/smp.h>
145#endif
146
147#ifndef PMAP_SHPGPERPROC
148#define PMAP_SHPGPERPROC 200
149#endif
150
151#if defined(DIAGNOSTIC)
152#define PMAP_DIAGNOSTIC
153#endif
154
155#if !defined(PMAP_DIAGNOSTIC)
156#define PMAP_INLINE __inline
157#else
158#define PMAP_INLINE
159#endif
160
161struct pmap kernel_pmap_store;
162
163vm_paddr_t avail_start;		/* PA of first available physical page */
164vm_paddr_t avail_end;		/* PA of last available physical page */
165vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
166vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
167
168static int nkpt;
169static int ndmpdp;
170static vm_paddr_t dmaplimit;
171vm_offset_t kernel_vm_end;
172pt_entry_t pg_nx;
173
174static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
175static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
176static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
177u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
178
179static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
180static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
181
182/*
183 * Data for the pv entry allocation mechanism
184 */
185static uma_zone_t pvzone;
186static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
187static int shpgperproc = PMAP_SHPGPERPROC;
188
189/*
190 * All those kernel PT submaps that BSD is so fond of
191 */
192pt_entry_t *CMAP1 = 0;
193caddr_t CADDR1 = 0;
194struct msgbuf *msgbufp = 0;
195
196/*
197 * Crashdump maps.
198 */
199static caddr_t crashdumpmap;
200
201static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
202static pv_entry_t get_pv_entry(pmap_t locked_pmap);
203static void	pmap_clear_ptes(vm_page_t m, long bit);
204
205static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
206		vm_offset_t sva, pd_entry_t ptepde);
207static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde);
208static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
209		vm_offset_t va);
210static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
211
212static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
213static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
214
215static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
216static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m);
217static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t);
218static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
219
220CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
221CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
222
223/*
224 * Move the kernel virtual free pointer to the next
225 * 2MB.  This is used to help improve performance
226 * by using a large (2MB) page for much of the kernel
227 * (.text, .data, .bss)
228 */
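/*
 * For example (illustrative, with the amd64 NBPDR of 2MB): an addr of
 * 0xffffffff80211000 is rounded up to 0xffffffff80400000.
 */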
229static vm_offset_t
230pmap_kmem_choose(vm_offset_t addr)
231{
232	vm_offset_t newaddr = addr;
233
234	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
235	return newaddr;
236}
237
238/********************/
239/* Inline functions */
240/********************/
241
242/* Return a non-clipped PD index for a given VA */
243static __inline vm_pindex_t
244pmap_pde_pindex(vm_offset_t va)
245{
246	return va >> PDRSHIFT;
247}
248
249
250/* Return various clipped indexes for a given VA */
251static __inline vm_pindex_t
252pmap_pte_index(vm_offset_t va)
253{
254
255	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
256}
257
258static __inline vm_pindex_t
259pmap_pde_index(vm_offset_t va)
260{
261
262	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
263}
264
265static __inline vm_pindex_t
266pmap_pdpe_index(vm_offset_t va)
267{
268
269	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
270}
271
272static __inline vm_pindex_t
273pmap_pml4e_index(vm_offset_t va)
274{
275
276	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
277}
278
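/*
 * Worked example (illustrative): for va = 0x0000008080604005 the clipped
 * indexes above are pmap_pml4e_index() == 1, pmap_pdpe_index() == 2,
 * pmap_pde_index() == 3 and pmap_pte_index() == 4; the remaining low
 * 12 bits (0x005) are the byte offset within the 4KB page.
 */
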
279/* Return a pointer to the PML4 slot that corresponds to a VA */
280static __inline pml4_entry_t *
281pmap_pml4e(pmap_t pmap, vm_offset_t va)
282{
283
284	if (!pmap)
285		return NULL;
286	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
287}
288
289/* Return a pointer to the PDP slot that corresponds to a VA */
290static __inline pdp_entry_t *
291pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
292{
293	pdp_entry_t *pdpe;
294
295	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
296	return (&pdpe[pmap_pdpe_index(va)]);
297}
298
299/* Return a pointer to the PDP slot that corresponds to a VA */
300static __inline pdp_entry_t *
301pmap_pdpe(pmap_t pmap, vm_offset_t va)
302{
303	pml4_entry_t *pml4e;
304
305	pml4e = pmap_pml4e(pmap, va);
306	if (pml4e == NULL || (*pml4e & PG_V) == 0)
307		return NULL;
308	return (pmap_pml4e_to_pdpe(pml4e, va));
309}
310
311/* Return a pointer to the PD slot that corresponds to a VA */
312static __inline pd_entry_t *
313pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
314{
315	pd_entry_t *pde;
316
317	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
318	return (&pde[pmap_pde_index(va)]);
319}
320
321/* Return a pointer to the PD slot that corresponds to a VA */
322static __inline pd_entry_t *
323pmap_pde(pmap_t pmap, vm_offset_t va)
324{
325	pdp_entry_t *pdpe;
326
327	pdpe = pmap_pdpe(pmap, va);
328	if (pdpe == NULL || (*pdpe & PG_V) == 0)
329		 return NULL;
330	return (pmap_pdpe_to_pde(pdpe, va));
331}
332
333/* Return a pointer to the PT slot that corresponds to a VA */
334static __inline pt_entry_t *
335pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
336{
337	pt_entry_t *pte;
338
339	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
340	return (&pte[pmap_pte_index(va)]);
341}
342
343/* Return a pointer to the PT slot that corresponds to a VA */
344static __inline pt_entry_t *
345pmap_pte(pmap_t pmap, vm_offset_t va)
346{
347	pd_entry_t *pde;
348
349	pde = pmap_pde(pmap, va);
350	if (pde == NULL || (*pde & PG_V) == 0)
351		return NULL;
352	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
353		return ((pt_entry_t *)pde);
354	return (pmap_pde_to_pte(pde, va));
355}
356
357
358static __inline pt_entry_t *
359pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
360{
361	pd_entry_t *pde;
362
363	pde = pmap_pde(pmap, va);
364	if (pde == NULL || (*pde & PG_V) == 0)
365		return NULL;
366	*ptepde = *pde;
367	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
368		return ((pt_entry_t *)pde);
369	return (pmap_pde_to_pte(pde, va));
370}
371
372
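/*
 * vtopte() and vtopde() walk the page tables through the recursive PML4
 * slot (PML4PML4I) set up in create_pagetables(), i.e. via PTmap/PDmap,
 * rather than through the direct map, and therefore only describe the
 * currently active address space.
 */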
373PMAP_INLINE pt_entry_t *
374vtopte(vm_offset_t va)
375{
376	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
377
378	return (PTmap + ((va >> PAGE_SHIFT) & mask));
379}
380
381static __inline pd_entry_t *
382vtopde(vm_offset_t va)
383{
384	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
385
386	return (PDmap + ((va >> PDRSHIFT) & mask));
387}
388
389static u_int64_t
390allocpages(int n)
391{
392	u_int64_t ret;
393
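	/*
	 * Note (assumption about boot state): this is called from
	 * pmap_bootstrap() before the switch to KPML4phys, so the bzero()
	 * of a raw physical address below relies on low physical memory
	 * still being mapped 1:1 by the boot-time page tables.
	 */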
394	ret = avail_start;
395	bzero((void *)ret, n * PAGE_SIZE);
396	avail_start += n * PAGE_SIZE;
397	return (ret);
398}
399
400static void
401create_pagetables(void)
402{
403	int i;
404
405	/* Allocate pages */
406	KPTphys = allocpages(NKPT);
407	KPML4phys = allocpages(1);
408	KPDPphys = allocpages(NKPML4E);
409	KPDphys = allocpages(NKPDPE);
410
411	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
412	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
413		ndmpdp = 4;
414	DMPDPphys = allocpages(NDMPML4E);
415	DMPDphys = allocpages(ndmpdp);
416	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
417
418	/* Fill in the underlying page table pages */
419	/* Read-only from zero to physfree */
420	/* XXX not fully used, underneath 2M pages */
421	for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
422		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
423		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
424	}
425
426	/* Now map the page tables at their location within PTmap */
427	for (i = 0; i < NKPT; i++) {
428		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
429		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
430	}
431
432	/* Map from zero to end of allocations under 2M pages */
433	/* This replaces some of the KPTphys entries above */
434	for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
435		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
436		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
437	}
438
439	/* And connect up the PD to the PDP */
440	for (i = 0; i < NKPDPE; i++) {
441		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
442		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
443	}
444
445
446	/* Now set up the direct map space using 2MB pages */
447	for (i = 0; i < NPDEPG * ndmpdp; i++) {
448		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
449		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
450	}
451
452	/* And the direct map space's PDP */
453	for (i = 0; i < ndmpdp; i++) {
454		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
455		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
456	}
457
458	/* And recursively map PML4 to itself in order to get PTmap */
459	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
460	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
461
462	/* Connect the Direct Map slot up to the PML4 */
463	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
464	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
465
466	/* Connect the KVA slot up to the PML4 */
467	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
468	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
469}
470
471/*
472 *	Bootstrap the system enough to run with virtual memory.
473 *
474 *	On amd64 this is called after mapping has already been enabled
475 *	and just syncs the pmap module with what has already been done.
476 *	[We can't call it easily with mapping off since the kernel is not
477 *	mapped with PA == VA, hence we would have to relocate every address
478 *	from the linked base (virtual) address "KERNBASE" to the actual
479 *	(physical) address starting relative to 0]
480 */
481void
482pmap_bootstrap(firstaddr)
483	vm_paddr_t *firstaddr;
484{
485	vm_offset_t va;
486	pt_entry_t *pte, *unused;
487
488	avail_start = *firstaddr;
489
490	/*
491	 * Create an initial set of page tables to run the kernel in.
492	 */
493	create_pagetables();
494	*firstaddr = avail_start;
495
496	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
497	virtual_avail = pmap_kmem_choose(virtual_avail);
498
499	virtual_end = VM_MAX_KERNEL_ADDRESS;
500
501
502	/* XXX do %cr0 as well */
503	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
504	load_cr3(KPML4phys);
505
506	/*
507	 * Initialize the kernel pmap (which is statically allocated).
508	 */
509	PMAP_LOCK_INIT(kernel_pmap);
510	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
511	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
512	TAILQ_INIT(&kernel_pmap->pm_pvlist);
513	nkpt = NKPT;
514
515	/*
516	 * Reserve some special page table entries/VA space for temporary
517	 * mapping of pages.
518	 */
519#define	SYSMAP(c, p, v, n)	\
520	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
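	/*
	 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to:
	 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
	 */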
521
522	va = virtual_avail;
523	pte = vtopte(va);
524
525	/*
526	 * CMAP1 is only used for the memory test.
527	 */
528	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
529
530	/*
531	 * Crashdump maps.
532	 */
533	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
534
535	/*
536	 * msgbufp is used to map the system message buffer.
537	 */
538	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
539
540	virtual_avail = va;
541
542	*CMAP1 = 0;
543
544	invltlb();
545}
546
547/*
548 *	Initialize a vm_page's machine-dependent fields.
549 */
550void
551pmap_page_init(vm_page_t m)
552{
553
554	TAILQ_INIT(&m->md.pv_list);
555	m->md.pv_list_count = 0;
556}
557
558/*
559 *	Initialize the pmap module.
560 *	Called by vm_init, to initialize any structures that the pmap
561 *	system needs to map virtual memory.
562 */
563void
564pmap_init(void)
565{
566
567	/*
568	 * Initialize the address space (zone) for the pv entries.  Set a
569	 * high water mark so that the system can recover from excessive
570	 * numbers of pv entries.
571	 */
572	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
573	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
574	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
575	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
576	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
577	pv_entry_high_water = 9 * (pv_entry_max / 10);
578}
579
580SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
581static int
582pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
583{
584	int error;
585
586	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
587	if (error == 0 && req->newptr) {
588		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
589		pv_entry_high_water = 9 * (pv_entry_max / 10);
590	}
591	return (error);
592}
593SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
594    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
595
596static int
597pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
598{
599	int error;
600
601	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
602	if (error == 0 && req->newptr) {
603		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
604		pv_entry_high_water = 9 * (pv_entry_max / 10);
605	}
606	return (error);
607}
608SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
609    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
610
611
612/***************************************************
613 * Low level helper routines.....
614 ***************************************************/
615
616
617/*
618 * Return non-zero if modifications at the given address should be
619 * tracked, i.e. if it lies outside the clean submap.
620 */
621static PMAP_INLINE int
622pmap_track_modified(vm_offset_t va)
623{
624	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
625		return 1;
626	else
627		return 0;
628}
629
630#ifdef SMP
631/*
632 * For SMP, these functions have to use the IPI mechanism for coherence.
633 */
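/*
 * Each variant below follows the same pattern: if the pmap is the kernel
 * pmap or is active on every CPU, perform the flush locally and broadcast
 * the corresponding IPI to all other CPUs; otherwise flush locally only if
 * this CPU is in pm_active and send a masked IPI to just the other CPUs
 * that have the pmap active.
 */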
634void
635pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
636{
637	u_int cpumask;
638	u_int other_cpus;
639
640	if (smp_started) {
641		if (!(read_rflags() & PSL_I))
642			panic("%s: interrupts disabled", __func__);
643		mtx_lock_spin(&smp_ipi_mtx);
644	} else
645		critical_enter();
646	/*
647	 * We need to disable interrupt preemption but MUST NOT have
648	 * interrupts disabled here.
649	 * XXX we may need to hold schedlock to get a coherent pm_active
650	 * XXX critical sections disable interrupts again
651	 */
652	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
653		invlpg(va);
654		smp_invlpg(va);
655	} else {
656		cpumask = PCPU_GET(cpumask);
657		other_cpus = PCPU_GET(other_cpus);
658		if (pmap->pm_active & cpumask)
659			invlpg(va);
660		if (pmap->pm_active & other_cpus)
661			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
662	}
663	if (smp_started)
664		mtx_unlock_spin(&smp_ipi_mtx);
665	else
666		critical_exit();
667}
668
669void
670pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
671{
672	u_int cpumask;
673	u_int other_cpus;
674	vm_offset_t addr;
675
676	if (smp_started) {
677		if (!(read_rflags() & PSL_I))
678			panic("%s: interrupts disabled", __func__);
679		mtx_lock_spin(&smp_ipi_mtx);
680	} else
681		critical_enter();
682	/*
683	 * We need to disable interrupt preemption but MUST NOT have
684	 * interrupts disabled here.
685	 * XXX we may need to hold schedlock to get a coherent pm_active
686	 * XXX critical sections disable interrupts again
687	 */
688	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
689		for (addr = sva; addr < eva; addr += PAGE_SIZE)
690			invlpg(addr);
691		smp_invlpg_range(sva, eva);
692	} else {
693		cpumask = PCPU_GET(cpumask);
694		other_cpus = PCPU_GET(other_cpus);
695		if (pmap->pm_active & cpumask)
696			for (addr = sva; addr < eva; addr += PAGE_SIZE)
697				invlpg(addr);
698		if (pmap->pm_active & other_cpus)
699			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
700			    sva, eva);
701	}
702	if (smp_started)
703		mtx_unlock_spin(&smp_ipi_mtx);
704	else
705		critical_exit();
706}
707
708void
709pmap_invalidate_all(pmap_t pmap)
710{
711	u_int cpumask;
712	u_int other_cpus;
713
714	if (smp_started) {
715		if (!(read_rflags() & PSL_I))
716			panic("%s: interrupts disabled", __func__);
717		mtx_lock_spin(&smp_ipi_mtx);
718	} else
719		critical_enter();
720	/*
721	 * We need to disable interrupt preemption but MUST NOT have
722	 * interrupts disabled here.
723	 * XXX we may need to hold schedlock to get a coherent pm_active
724	 * XXX critical sections disable interrupts again
725	 */
726	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
727		invltlb();
728		smp_invltlb();
729	} else {
730		cpumask = PCPU_GET(cpumask);
731		other_cpus = PCPU_GET(other_cpus);
732		if (pmap->pm_active & cpumask)
733			invltlb();
734		if (pmap->pm_active & other_cpus)
735			smp_masked_invltlb(pmap->pm_active & other_cpus);
736	}
737	if (smp_started)
738		mtx_unlock_spin(&smp_ipi_mtx);
739	else
740		critical_exit();
741}
742#else /* !SMP */
743/*
744 * Normal, non-SMP, invalidation functions.
745 * We inline these within pmap.c for speed.
746 */
747PMAP_INLINE void
748pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
749{
750
751	if (pmap == kernel_pmap || pmap->pm_active)
752		invlpg(va);
753}
754
755PMAP_INLINE void
756pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
757{
758	vm_offset_t addr;
759
760	if (pmap == kernel_pmap || pmap->pm_active)
761		for (addr = sva; addr < eva; addr += PAGE_SIZE)
762			invlpg(addr);
763}
764
765PMAP_INLINE void
766pmap_invalidate_all(pmap_t pmap)
767{
768
769	if (pmap == kernel_pmap || pmap->pm_active)
770		invltlb();
771}
772#endif /* !SMP */
773
774/*
775 * Are we current address space or kernel?
776 */
777static __inline int
778pmap_is_current(pmap_t pmap)
779{
780	return (pmap == kernel_pmap ||
781	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
782}
783
784/*
785 *	Routine:	pmap_extract
786 *	Function:
787 *		Extract the physical page address associated
788 *		with the given map/virtual_address pair.
789 */
790vm_paddr_t
791pmap_extract(pmap_t pmap, vm_offset_t va)
792{
793	vm_paddr_t rtval;
794	pt_entry_t *pte;
795	pd_entry_t pde, *pdep;
796
797	rtval = 0;
798	PMAP_LOCK(pmap);
799	pdep = pmap_pde(pmap, va);
800	if (pdep != NULL) {
801		pde = *pdep;
802		if (pde) {
803			if ((pde & PG_PS) != 0) {
804				KASSERT((pde & PG_FRAME & PDRMASK) == 0,
805				    ("pmap_extract: bad pde"));
806				rtval = (pde & PG_FRAME) | (va & PDRMASK);
807				PMAP_UNLOCK(pmap);
808				return rtval;
809			}
810			pte = pmap_pde_to_pte(pdep, va);
811			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
812		}
813	}
814	PMAP_UNLOCK(pmap);
815	return (rtval);
816}
817
818/*
819 *	Routine:	pmap_extract_and_hold
820 *	Function:
821 *		Atomically extract and hold the physical page
822 *		with the given pmap and virtual address pair
823 *		if that mapping permits the given protection.
824 */
825vm_page_t
826pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
827{
828	pd_entry_t pde, *pdep;
829	pt_entry_t pte;
830	vm_page_t m;
831
832	m = NULL;
833	vm_page_lock_queues();
834	PMAP_LOCK(pmap);
835	pdep = pmap_pde(pmap, va);
836	if (pdep != NULL && (pde = *pdep)) {
837		if (pde & PG_PS) {
838			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
839				KASSERT((pde & PG_FRAME & PDRMASK) == 0,
840				    ("pmap_extract_and_hold: bad pde"));
841				m = PHYS_TO_VM_PAGE((pde & PG_FRAME) |
842				    (va & PDRMASK));
843				vm_page_hold(m);
844			}
845		} else {
846			pte = *pmap_pde_to_pte(pdep, va);
847			if ((pte & PG_V) &&
848			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
849				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
850				vm_page_hold(m);
851			}
852		}
853	}
854	vm_page_unlock_queues();
855	PMAP_UNLOCK(pmap);
856	return (m);
857}
858
859vm_paddr_t
860pmap_kextract(vm_offset_t va)
861{
862	pd_entry_t *pde;
863	vm_paddr_t pa;
864
865	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
866		pa = DMAP_TO_PHYS(va);
867	} else {
868		pde = vtopde(va);
869		if (*pde & PG_PS) {
870			pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
871		} else {
872			pa = *vtopte(va);
873			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
874		}
875	}
876	return pa;
877}
878
879/***************************************************
880 * Low level mapping routines.....
881 ***************************************************/
882
883/*
884 * Add a wired page to the kva.
885 * Note: not SMP coherent.
886 */
887PMAP_INLINE void
888pmap_kenter(vm_offset_t va, vm_paddr_t pa)
889{
890	pt_entry_t *pte;
891
892	pte = vtopte(va);
893	pte_store(pte, pa | PG_RW | PG_V | PG_G);
894}
895
896/*
897 * Remove a page from the kernel pagetables.
898 * Note: not SMP coherent.
899 */
900PMAP_INLINE void
901pmap_kremove(vm_offset_t va)
902{
903	pt_entry_t *pte;
904
905	pte = vtopte(va);
906	pte_clear(pte);
907}
908
909/*
910 *	Used to map a range of physical addresses into kernel
911 *	virtual address space.
912 *
913 *	The value passed in '*virt' is a suggested virtual address for
914 *	the mapping. Architectures which can support a direct-mapped
915 *	physical to virtual region can return the appropriate address
916 *	within that region, leaving '*virt' unchanged. Other
917 *	architectures should map the pages starting at '*virt' and
918 *	update '*virt' with the first usable address after the mapped
919 *	region.
920 */
921vm_offset_t
922pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
923{
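	/*
	 * amd64 has a complete direct map, so just hand back the DMAP
	 * address of 'start'; *virt is deliberately left unchanged.
	 */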
924	return PHYS_TO_DMAP(start);
925}
926
927
928/*
929 * Add a list of wired pages to the kva.
930 * This routine is only used for temporary
931 * kernel mappings that do not need to have
932 * page modification or references recorded.
933 * Note that old mappings are simply written
934 * over.  The page *must* be wired.
935 * Note: SMP coherent.  Uses a ranged shootdown IPI.
936 */
937void
938pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
939{
940	vm_offset_t va;
941
942	va = sva;
943	while (count-- > 0) {
944		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
945		va += PAGE_SIZE;
946		m++;
947	}
948	pmap_invalidate_range(kernel_pmap, sva, va);
949}
950
951/*
952 * This routine tears out page mappings from the
953 * kernel -- it is meant only for temporary mappings.
954 * Note: SMP coherent.  Uses a ranged shootdown IPI.
955 */
956void
957pmap_qremove(vm_offset_t sva, int count)
958{
959	vm_offset_t va;
960
961	va = sva;
962	while (count-- > 0) {
963		pmap_kremove(va);
964		va += PAGE_SIZE;
965	}
966	pmap_invalidate_range(kernel_pmap, sva, va);
967}
968
969/***************************************************
970 * Page table page management routines.....
971 ***************************************************/
972
973/*
974 * This routine drops the wire count on a page table page, and when
975 * that count reaches zero it unmaps and frees the page.
976 */
977static PMAP_INLINE int
978pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
979{
980
981	--m->wire_count;
982	if (m->wire_count == 0)
983		return _pmap_unwire_pte_hold(pmap, va, m);
984	else
985		return 0;
986}
987
988static int
989_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
990{
991	vm_offset_t pteva;
992
993	/*
994	 * unmap the page table page
995	 */
996	if (m->pindex >= (NUPDE + NUPDPE)) {
997		/* PDP page */
998		pml4_entry_t *pml4;
999		pml4 = pmap_pml4e(pmap, va);
1000		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
1001		*pml4 = 0;
1002	} else if (m->pindex >= NUPDE) {
1003		/* PD page */
1004		pdp_entry_t *pdp;
1005		pdp = pmap_pdpe(pmap, va);
1006		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
1007		*pdp = 0;
1008	} else {
1009		/* PTE page */
1010		pd_entry_t *pd;
1011		pd = pmap_pde(pmap, va);
1012		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
1013		*pd = 0;
1014	}
1015	--pmap->pm_stats.resident_count;
1016	if (m->pindex < NUPDE) {
1017		/* We just released a PT, unhold the matching PD */
1018		vm_page_t pdpg;
1019
1020		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1021		pmap_unwire_pte_hold(pmap, va, pdpg);
1022	}
1023	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1024		/* We just released a PD, unhold the matching PDP */
1025		vm_page_t pdppg;
1026
1027		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1028		pmap_unwire_pte_hold(pmap, va, pdppg);
1029	}
1030
1031	/*
1032	 * Do an invltlb to make the invalidated mapping
1033	 * take effect immediately.
1034	 */
1035	pmap_invalidate_page(pmap, pteva);
1036
1037	vm_page_free_zero(m);
1038	atomic_subtract_int(&cnt.v_wire_count, 1);
1039	return 1;
1040}
1041
1042/*
1043 * After removing a page table entry, this routine is used to
1044 * conditionally free the page, and manage the hold/wire counts.
1045 */
1046static int
1047pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde)
1048{
1049	vm_page_t mpte;
1050
1051	if (va >= VM_MAXUSER_ADDRESS)
1052		return 0;
1053	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1054	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1055	return pmap_unwire_pte_hold(pmap, va, mpte);
1056}
1057
1058void
1059pmap_pinit0(pmap)
1060	struct pmap *pmap;
1061{
1062
1063	PMAP_LOCK_INIT(pmap);
1064	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1065	pmap->pm_active = 0;
1066	TAILQ_INIT(&pmap->pm_pvlist);
1067	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1068}
1069
1070/*
1071 * Initialize a preallocated and zeroed pmap structure,
1072 * such as one in a vmspace structure.
1073 */
1074void
1075pmap_pinit(pmap)
1076	register struct pmap *pmap;
1077{
1078	vm_page_t pml4pg;
1079	static vm_pindex_t color;
1080
1081	PMAP_LOCK_INIT(pmap);
1082
1083	/*
1084	 * allocate the page directory page
1085	 */
1086	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1087	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1088		VM_WAIT;
1089
1090	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1091
1092	if ((pml4pg->flags & PG_ZERO) == 0)
1093		pagezero(pmap->pm_pml4);
1094
1095	/* Wire in kernel global address entries. */
1096	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1097	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1098
1099	/* install self-referential address mapping entry(s) */
1100	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1101
1102	pmap->pm_active = 0;
1103	TAILQ_INIT(&pmap->pm_pvlist);
1104	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1105}
1106
1107/*
1108 * This routine is called when the page table page needed for a
1109 * mapping is not present.
1110 *
1111 * Note: If a page allocation fails at page table level two or three,
1112 * one or two pages may be held during the wait, only to be released
1113 * afterwards.  This conservative approach is easily argued to avoid
1114 * race conditions.
1115 */
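/*
 * The ptepindex argument encodes which level of page table page is wanted:
 * indexes below NUPDE name page table pages, [NUPDE, NUPDE + NUPDPE) name
 * page directory pages, and NUPDE + NUPDPE and above name page directory
 * pointer pages.  The three cases below wire the freshly allocated page
 * into the level above it, allocating that level first if necessary.
 */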
1116static vm_page_t
1117_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1118{
1119	vm_page_t m, pdppg, pdpg;
1120
1121	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1122	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1123	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1124
1125	/*
1126	 * Allocate a page table page.
1127	 */
1128	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1129	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1130		if (flags & M_WAITOK) {
1131			PMAP_UNLOCK(pmap);
1132			vm_page_unlock_queues();
1133			VM_WAIT;
1134			vm_page_lock_queues();
1135			PMAP_LOCK(pmap);
1136		}
1137
1138		/*
1139		 * Indicate the need to retry.  While waiting, the page table
1140		 * page may have been allocated.
1141		 */
1142		return (NULL);
1143	}
1144	if ((m->flags & PG_ZERO) == 0)
1145		pmap_zero_page(m);
1146
1147	/*
1148	 * Map the pagetable page into the process address space, if
1149	 * it isn't already there.
1150	 */
1151
1152	pmap->pm_stats.resident_count++;
1153
1154	if (ptepindex >= (NUPDE + NUPDPE)) {
1155		pml4_entry_t *pml4;
1156		vm_pindex_t pml4index;
1157
1158		/* Wire up a new PDPE page */
1159		pml4index = ptepindex - (NUPDE + NUPDPE);
1160		pml4 = &pmap->pm_pml4[pml4index];
1161		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1162
1163	} else if (ptepindex >= NUPDE) {
1164		vm_pindex_t pml4index;
1165		vm_pindex_t pdpindex;
1166		pml4_entry_t *pml4;
1167		pdp_entry_t *pdp;
1168
1169		/* Wire up a new PDE page */
1170		pdpindex = ptepindex - NUPDE;
1171		pml4index = pdpindex >> NPML4EPGSHIFT;
1172
1173		pml4 = &pmap->pm_pml4[pml4index];
1174		if ((*pml4 & PG_V) == 0) {
1175			/* Have to allocate a new pdp, recurse */
1176			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1177			    flags) == NULL) {
1178				--m->wire_count;
1179				vm_page_free(m);
1180				return (NULL);
1181			}
1182		} else {
1183			/* Add reference to pdp page */
1184			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1185			pdppg->wire_count++;
1186		}
1187		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1188
1189		/* Now find the pdp page */
1190		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1191		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1192
1193	} else {
1194		vm_pindex_t pml4index;
1195		vm_pindex_t pdpindex;
1196		pml4_entry_t *pml4;
1197		pdp_entry_t *pdp;
1198		pd_entry_t *pd;
1199
1200		/* Wire up a new PTE page */
1201		pdpindex = ptepindex >> NPDPEPGSHIFT;
1202		pml4index = pdpindex >> NPML4EPGSHIFT;
1203
1204		/* First, find the pdp and check that it's valid. */
1205		pml4 = &pmap->pm_pml4[pml4index];
1206		if ((*pml4 & PG_V) == 0) {
1207			/* Have to allocate a new pd, recurse */
1208			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1209			    flags) == NULL) {
1210				--m->wire_count;
1211				vm_page_free(m);
1212				return (NULL);
1213			}
1214			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1215			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1216		} else {
1217			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1218			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1219			if ((*pdp & PG_V) == 0) {
1220				/* Have to allocate a new pd, recurse */
1221				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1222				    flags) == NULL) {
1223					--m->wire_count;
1224					vm_page_free(m);
1225					return (NULL);
1226				}
1227			} else {
1228				/* Add reference to the pd page */
1229				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1230				pdpg->wire_count++;
1231			}
1232		}
1233		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1234
1235		/* Now we know where the page directory page is */
1236		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1237		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1238	}
1239
1240	return m;
1241}
1242
1243static vm_page_t
1244pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1245{
1246	vm_pindex_t pdpindex, ptepindex;
1247	pdp_entry_t *pdpe;
1248	vm_page_t pdpg;
1249
1250	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1251	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1252	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1253retry:
1254	pdpe = pmap_pdpe(pmap, va);
1255	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1256		/* Add a reference to the pd page. */
1257		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1258		pdpg->wire_count++;
1259	} else {
1260		/* Allocate a pd page. */
1261		ptepindex = pmap_pde_pindex(va);
1262		pdpindex = ptepindex >> NPDPEPGSHIFT;
1263		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1264		if (pdpg == NULL && (flags & M_WAITOK))
1265			goto retry;
1266	}
1267	return (pdpg);
1268}
1269
1270static vm_page_t
1271pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1272{
1273	vm_pindex_t ptepindex;
1274	pd_entry_t *pd;
1275	vm_page_t m;
1276
1277	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1278	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1279	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1280
1281	/*
1282	 * Calculate pagetable page index
1283	 */
1284	ptepindex = pmap_pde_pindex(va);
1285retry:
1286	/*
1287	 * Get the page directory entry
1288	 */
1289	pd = pmap_pde(pmap, va);
1290
1291	/*
1292	 * This supports switching from a 2MB page to a
1293	 * normal 4K page.
1294	 */
1295	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1296		*pd = 0;
1297		pd = 0;
1298		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1299		pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va));
1300		pmap_invalidate_all(kernel_pmap);
1301	}
1302
1303	/*
1304	 * If the page table page is mapped, we just increment the
1305	 * hold count, and activate it.
1306	 */
1307	if (pd != 0 && (*pd & PG_V) != 0) {
1308		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1309		m->wire_count++;
1310	} else {
1311		/*
1312		 * Here if the pte page isn't mapped, or if it has been
1313		 * deallocated.
1314		 */
1315		m = _pmap_allocpte(pmap, ptepindex, flags);
1316		if (m == NULL && (flags & M_WAITOK))
1317			goto retry;
1318	}
1319	return (m);
1320}
1321
1322
1323/***************************************************
1324 * Pmap allocation/deallocation routines.
1325 ***************************************************/
1326
1327/*
1328 * Release any resources held by the given physical map.
1329 * Called when a pmap initialized by pmap_pinit is being released.
1330 * Should only be called if the map contains no valid mappings.
1331 */
1332void
1333pmap_release(pmap_t pmap)
1334{
1335	vm_page_t m;
1336
1337	KASSERT(pmap->pm_stats.resident_count == 0,
1338	    ("pmap_release: pmap resident count %ld != 0",
1339	    pmap->pm_stats.resident_count));
1340
1341	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1342
1343	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1344	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1345	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1346
1347	vm_page_lock_queues();
1348	m->wire_count--;
1349	atomic_subtract_int(&cnt.v_wire_count, 1);
1350	vm_page_free_zero(m);
1351	vm_page_unlock_queues();
1352	PMAP_LOCK_DESTROY(pmap);
1353}
1354
1355static int
1356kvm_size(SYSCTL_HANDLER_ARGS)
1357{
1358	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1359
1360	return sysctl_handle_long(oidp, &ksize, 0, req);
1361}
1362SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1363    0, 0, kvm_size, "IU", "Size of KVM");
1364
1365static int
1366kvm_free(SYSCTL_HANDLER_ARGS)
1367{
1368	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1369
1370	return sysctl_handle_long(oidp, &kfree, 0, req);
1371}
1372SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1373    0, 0, kvm_free, "IU", "Amount of KVM free");
1374
1375/*
1376 * grow the number of kernel page table entries, if needed
1377 */
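/*
 * The loop below advances kernel_vm_end in 2MB (NPTEPG * PAGE_SIZE) steps.
 * For each step whose page directory entry is empty a new page table page
 * is allocated and installed; if an entire page directory page is missing
 * (pmap_pde() returns NULL), that PD page is allocated and hooked into the
 * PDP first, and the step is retried.
 */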
1378void
1379pmap_growkernel(vm_offset_t addr)
1380{
1381	vm_paddr_t paddr;
1382	vm_page_t nkpg;
1383	pd_entry_t *pde, newpdir;
1384	pdp_entry_t newpdp;
1385
1386	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1387	if (kernel_vm_end == 0) {
1388		kernel_vm_end = KERNBASE;
1389		nkpt = 0;
1390		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1391			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1392			nkpt++;
1393		}
1394	}
1395	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1396	while (kernel_vm_end < addr) {
1397		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1398		if (pde == NULL) {
1399			/* We need a new PDP entry */
1400			nkpg = vm_page_alloc(NULL, nkpt,
1401			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1402			if (!nkpg)
1403				panic("pmap_growkernel: no memory to grow kernel");
1404			pmap_zero_page(nkpg);
1405			paddr = VM_PAGE_TO_PHYS(nkpg);
1406			newpdp = (pdp_entry_t)
1407				(paddr | PG_V | PG_RW | PG_A | PG_M);
1408			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1409			continue; /* try again */
1410		}
1411		if ((*pde & PG_V) != 0) {
1412			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1413			continue;
1414		}
1415
1416		/*
1417		 * This index is bogus, but out of the way
1418		 */
1419		nkpg = vm_page_alloc(NULL, nkpt,
1420		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1421		if (!nkpg)
1422			panic("pmap_growkernel: no memory to grow kernel");
1423
1424		nkpt++;
1425
1426		pmap_zero_page(nkpg);
1427		paddr = VM_PAGE_TO_PHYS(nkpg);
1428		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1429		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1430
1431		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1432	}
1433}
1434
1435
1436/***************************************************
1437 * page management routines.
1438 ***************************************************/
1439
1440/*
1441 * free the pv_entry back to the free list
1442 */
1443static PMAP_INLINE void
1444free_pv_entry(pv_entry_t pv)
1445{
1446	pv_entry_count--;
1447	uma_zfree(pvzone, pv);
1448}
1449
1450/*
1451 * get a new pv_entry, allocating a block from the system
1452 * when needed.
1453 */
1454static pv_entry_t
1455get_pv_entry(pmap_t locked_pmap)
1456{
1457	static const struct timeval printinterval = { 60, 0 };
1458	static struct timeval lastprint;
1459	struct vpgqueues *vpq;
1460	pd_entry_t ptepde;
1461	pmap_t pmap;
1462	pt_entry_t *pte, tpte;
1463	pv_entry_t allocated_pv, next_pv, pv;
1464	vm_offset_t va;
1465	vm_page_t m;
1466
1467	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1468	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1469	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
1470	if (allocated_pv != NULL) {
1471		pv_entry_count++;
1472		if (pv_entry_count > pv_entry_high_water)
1473			pagedaemon_wakeup();
1474		else
1475			return (allocated_pv);
1476	}
1477
1478	/*
1479	 * Reclaim pv entries: At first, destroy mappings to inactive
1480	 * pages.  After that, if a pv entry is still needed, destroy
1481	 * mappings to active pages.
1482	 */
1483	if (ratecheck(&lastprint, &printinterval))
1484		printf("Approaching the limit on PV entries, consider "
1485		    "increasing sysctl vm.pmap.shpgperproc or "
1486		    "vm.pmap.pv_entry_max\n");
1487	vpq = &vm_page_queues[PQ_INACTIVE];
1488retry:
1489	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1490		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1491			continue;
1492		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1493			va = pv->pv_va;
1494			pmap = pv->pv_pmap;
1495			/* Avoid deadlock and lock recursion. */
1496			if (pmap > locked_pmap)
1497				PMAP_LOCK(pmap);
1498			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1499				continue;
1500			pmap->pm_stats.resident_count--;
1501			pte = pmap_pte_pde(pmap, va, &ptepde);
1502			tpte = pte_load_clear(pte);
1503			KASSERT((tpte & PG_W) == 0,
1504			    ("get_pv_entry: wired pte %#lx", tpte));
1505			if (tpte & PG_A)
1506				vm_page_flag_set(m, PG_REFERENCED);
1507			if (tpte & PG_M) {
1508				KASSERT((tpte & PG_RW),
1509	("get_pv_entry: modified page not writable: va: %#lx, pte: %#lx",
1510				    va, tpte));
1511				if (pmap_track_modified(va))
1512					vm_page_dirty(m);
1513			}
1514			pmap_invalidate_page(pmap, va);
1515			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1516			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1517			if (TAILQ_EMPTY(&m->md.pv_list))
1518				vm_page_flag_clear(m, PG_WRITEABLE);
1519			m->md.pv_list_count--;
1520			pmap_unuse_pt(pmap, va, ptepde);
1521			if (pmap != locked_pmap)
1522				PMAP_UNLOCK(pmap);
1523			if (allocated_pv == NULL)
1524				allocated_pv = pv;
1525			else
1526				free_pv_entry(pv);
1527		}
1528	}
1529	if (allocated_pv == NULL) {
1530		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
1531			vpq = &vm_page_queues[PQ_ACTIVE];
1532			goto retry;
1533		}
1534		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
1535	}
1536	return (allocated_pv);
1537}
1538
1539static void
1540pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1541{
1542	pv_entry_t pv;
1543
1544	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1545	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1546	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1547		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1548			if (pmap == pv->pv_pmap && va == pv->pv_va)
1549				break;
1550		}
1551	} else {
1552		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1553			if (va == pv->pv_va)
1554				break;
1555		}
1556	}
1557	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1558	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1559	m->md.pv_list_count--;
1560	if (TAILQ_EMPTY(&m->md.pv_list))
1561		vm_page_flag_clear(m, PG_WRITEABLE);
1562	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1563	free_pv_entry(pv);
1564}
1565
1566/*
1567 * Create a pv entry for the given page at the
1568 * (pmap, va) mapping.
1569 */
1570static void
1571pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1572{
1573	pv_entry_t pv;
1574
1575	pv = get_pv_entry(pmap);
1576	pv->pv_va = va;
1577	pv->pv_pmap = pmap;
1578
1579	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1580	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1581	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1582	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1583	m->md.pv_list_count++;
1584}
1585
1586/*
1587 * pmap_remove_pte: tear down a single page mapping within a pmap
1588 */
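/*
 * Specifically: atomically clear the PTE, adjust the wired and resident
 * counts, propagate the accessed and modified bits back to the vm_page for
 * managed mappings, remove the pv entry, and finally drop a reference on
 * the page table page via pmap_unuse_pt().
 */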
1589static int
1590pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde)
1591{
1592	pt_entry_t oldpte;
1593	vm_page_t m;
1594
1595	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1596	oldpte = pte_load_clear(ptq);
1597	if (oldpte & PG_W)
1598		pmap->pm_stats.wired_count -= 1;
1599	/*
1600	 * Machines that don't support invlpg also don't support
1601	 * PG_G.
1602	 */
1603	if (oldpte & PG_G)
1604		pmap_invalidate_page(kernel_pmap, va);
1605	pmap->pm_stats.resident_count -= 1;
1606	if (oldpte & PG_MANAGED) {
1607		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
1608		if (oldpte & PG_M) {
1609			KASSERT((oldpte & PG_RW),
1610	("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx",
1611			    va, oldpte));
1612			if (pmap_track_modified(va))
1613				vm_page_dirty(m);
1614		}
1615		if (oldpte & PG_A)
1616			vm_page_flag_set(m, PG_REFERENCED);
1617		pmap_remove_entry(pmap, m, va);
1618	}
1619	return (pmap_unuse_pt(pmap, va, ptepde));
1620}
1621
1622/*
1623 * Remove a single page from a process address space
1624 */
1625static void
1626pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde)
1627{
1628	pt_entry_t *pte;
1629
1630	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1631	if ((*pde & PG_V) == 0)
1632		return;
1633	pte = pmap_pde_to_pte(pde, va);
1634	if ((*pte & PG_V) == 0)
1635		return;
1636	pmap_remove_pte(pmap, pte, va, *pde);
1637	pmap_invalidate_page(pmap, va);
1638}
1639
1640/*
1641 *	Remove the given range of addresses from the specified map.
1642 *
1643 *	It is assumed that the start and end are properly
1644 *	rounded to the page size.
1645 */
1646void
1647pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1648{
1649	vm_offset_t va_next;
1650	pml4_entry_t *pml4e;
1651	pdp_entry_t *pdpe;
1652	pd_entry_t ptpaddr, *pde;
1653	pt_entry_t *pte;
1654	int anyvalid;
1655
1656	/*
1657	 * Perform an unsynchronized read.  This is, however, safe.
1658	 */
1659	if (pmap->pm_stats.resident_count == 0)
1660		return;
1661
1662	anyvalid = 0;
1663
1664	vm_page_lock_queues();
1665	PMAP_LOCK(pmap);
1666
1667	/*
1668	 * Special handling for removing a single page: a very
1669	 * common operation for which some code can easily be
1670	 * short-circuited.
1671	 */
1672	if (sva + PAGE_SIZE == eva) {
1673		pde = pmap_pde(pmap, sva);
1674		if (pde && (*pde & PG_PS) == 0) {
1675			pmap_remove_page(pmap, sva, pde);
1676			goto out;
1677		}
1678	}
1679
1680	for (; sva < eva; sva = va_next) {
1681
1682		if (pmap->pm_stats.resident_count == 0)
1683			break;
1684
1685		pml4e = pmap_pml4e(pmap, sva);
1686		if ((*pml4e & PG_V) == 0) {
1687			va_next = (sva + NBPML4) & ~PML4MASK;
1688			continue;
1689		}
1690
1691		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
1692		if ((*pdpe & PG_V) == 0) {
1693			va_next = (sva + NBPDP) & ~PDPMASK;
1694			continue;
1695		}
1696
1697		/*
1698		 * Calculate index for next page table.
1699		 */
1700		va_next = (sva + NBPDR) & ~PDRMASK;
1701
1702		pde = pmap_pdpe_to_pde(pdpe, sva);
1703		ptpaddr = *pde;
1704
1705		/*
1706		 * Weed out invalid mappings.
1707		 */
1708		if (ptpaddr == 0)
1709			continue;
1710
1711		/*
1712		 * Check for large page.
1713		 */
1714		if ((ptpaddr & PG_PS) != 0) {
1715			*pde = 0;
1716			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1717			pmap_unuse_pt(pmap, sva, *pdpe);
1718			anyvalid = 1;
1719			continue;
1720		}
1721
1722		/*
1723		 * Limit our scan to either the end of the va represented
1724		 * by the current page table page, or to the end of the
1725		 * range being removed.
1726		 */
1727		if (va_next > eva)
1728			va_next = eva;
1729
1730		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1731		    sva += PAGE_SIZE) {
1732			if (*pte == 0)
1733				continue;
1734			anyvalid = 1;
1735			if (pmap_remove_pte(pmap, pte, sva, ptpaddr))
1736				break;
1737		}
1738	}
1739out:
1740	vm_page_unlock_queues();
1741	if (anyvalid)
1742		pmap_invalidate_all(pmap);
1743	PMAP_UNLOCK(pmap);
1744}
1745
1746/*
1747 *	Routine:	pmap_remove_all
1748 *	Function:
1749 *		Removes this physical page from
1750 *		all physical maps in which it resides.
1751 *		Reflects back modify bits to the pager.
1752 *
1753 *	Notes:
1754 *		Original versions of this routine were very
1755 *		inefficient because they iteratively called
1756 *		pmap_remove (slow...)
1757 */
1758
1759void
1760pmap_remove_all(vm_page_t m)
1761{
1762	register pv_entry_t pv;
1763	pt_entry_t *pte, tpte;
1764	pd_entry_t ptepde;
1765
1766#if defined(PMAP_DIAGNOSTIC)
1767	/*
1768	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1769	 */
1770	if (m->flags & PG_FICTITIOUS) {
1771		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx",
1772		    VM_PAGE_TO_PHYS(m));
1773	}
1774#endif
1775	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1776	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1777		PMAP_LOCK(pv->pv_pmap);
1778		pv->pv_pmap->pm_stats.resident_count--;
1779		pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde);
1780		tpte = pte_load_clear(pte);
1781		if (tpte & PG_W)
1782			pv->pv_pmap->pm_stats.wired_count--;
1783		if (tpte & PG_A)
1784			vm_page_flag_set(m, PG_REFERENCED);
1785
1786		/*
1787		 * Update the vm_page_t clean and reference bits.
1788		 */
1789		if (tpte & PG_M) {
1790			KASSERT((tpte & PG_RW),
1791	("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx",
1792			    pv->pv_va, tpte));
1793			if (pmap_track_modified(pv->pv_va))
1794				vm_page_dirty(m);
1795		}
1796		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1797		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1798		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1799		m->md.pv_list_count--;
1800		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde);
1801		PMAP_UNLOCK(pv->pv_pmap);
1802		free_pv_entry(pv);
1803	}
1804	vm_page_flag_clear(m, PG_WRITEABLE);
1805}
1806
1807/*
1808 *	Set the physical protection on the
1809 *	specified range of this map as requested.
1810 */
1811void
1812pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1813{
1814	vm_offset_t va_next;
1815	pml4_entry_t *pml4e;
1816	pdp_entry_t *pdpe;
1817	pd_entry_t ptpaddr, *pde;
1818	pt_entry_t *pte;
1819	int anychanged;
1820
1821	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1822		pmap_remove(pmap, sva, eva);
1823		return;
1824	}
1825
1826	if (prot & VM_PROT_WRITE)
1827		return;
1828
1829	anychanged = 0;
1830
1831	vm_page_lock_queues();
1832	PMAP_LOCK(pmap);
1833	for (; sva < eva; sva = va_next) {
1834
1835		pml4e = pmap_pml4e(pmap, sva);
1836		if ((*pml4e & PG_V) == 0) {
1837			va_next = (sva + NBPML4) & ~PML4MASK;
1838			continue;
1839		}
1840
1841		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
1842		if ((*pdpe & PG_V) == 0) {
1843			va_next = (sva + NBPDP) & ~PDPMASK;
1844			continue;
1845		}
1846
1847		va_next = (sva + NBPDR) & ~PDRMASK;
1848
1849		pde = pmap_pdpe_to_pde(pdpe, sva);
1850		ptpaddr = *pde;
1851
1852		/*
1853		 * Weed out invalid mappings.
1854		 */
1855		if (ptpaddr == 0)
1856			continue;
1857
1858		/*
1859		 * Check for large page.
1860		 */
1861		if ((ptpaddr & PG_PS) != 0) {
1862			*pde &= ~(PG_M|PG_RW);
1863			anychanged = 1;
1864			continue;
1865		}
1866
1867		if (va_next > eva)
1868			va_next = eva;
1869
1870		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1871		    sva += PAGE_SIZE) {
1872			pt_entry_t obits, pbits;
1873			vm_page_t m;
1874
1875retry:
1876			obits = pbits = *pte;
1877			if (pbits & PG_MANAGED) {
1878				m = NULL;
1879				if (pbits & PG_A) {
1880					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
1881					vm_page_flag_set(m, PG_REFERENCED);
1882					pbits &= ~PG_A;
1883				}
1884				if ((pbits & PG_M) != 0 &&
1885				    pmap_track_modified(sva)) {
1886					if (m == NULL)
1887						m = PHYS_TO_VM_PAGE(pbits &
1888						    PG_FRAME);
1889					vm_page_dirty(m);
1890				}
1891			}
1892
1893			pbits &= ~(PG_RW | PG_M);
1894
1895			if (pbits != obits) {
1896				if (!atomic_cmpset_long(pte, obits, pbits))
1897					goto retry;
1898				if (obits & PG_G)
1899					pmap_invalidate_page(pmap, sva);
1900				else
1901					anychanged = 1;
1902			}
1903		}
1904	}
1905	vm_page_unlock_queues();
1906	if (anychanged)
1907		pmap_invalidate_all(pmap);
1908	PMAP_UNLOCK(pmap);
1909}
1910
1911/*
1912 *	Insert the given physical page (p) at
1913 *	the specified virtual address (v) in the
1914 *	target physical map with the protection requested.
1915 *
1916 *	If specified, the page will be wired down, meaning
1917 *	that the related pte can not be reclaimed.
1918 *
1919 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1920 *	or lose information.  That is, this routine must actually
1921 *	insert this page into the given map NOW.
1922 */
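/*
 * Implementation outline: the page table page is materialized first (via
 * pmap_allocpte() for user addresses), then three cases are handled: the
 * same physical page is already mapped at va (only protection or wiring
 * changes), a different page is mapped there (the old mapping is torn
 * down), or the slot is empty.  The new PTE is then built and stored at
 * the "validate:" label.
 */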
1923void
1924pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1925	   boolean_t wired)
1926{
1927	vm_paddr_t pa;
1928	register pt_entry_t *pte;
1929	vm_paddr_t opa;
1930	pt_entry_t origpte, newpte;
1931	vm_page_t mpte, om;
1932	boolean_t invlva;
1933
1934	va = trunc_page(va);
1935#ifdef PMAP_DIAGNOSTIC
1936	if (va > VM_MAX_KERNEL_ADDRESS)
1937		panic("pmap_enter: toobig");
1938	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1939		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
1940#endif
1941
1942	mpte = NULL;
1943
1944	vm_page_lock_queues();
1945	PMAP_LOCK(pmap);
1946
1947	/*
1948	 * In the case that a page table page is not
1949	 * resident, we are creating it here.
1950	 */
1951	if (va < VM_MAXUSER_ADDRESS) {
1952		mpte = pmap_allocpte(pmap, va, M_WAITOK);
1953	}
1954#if 0 && defined(PMAP_DIAGNOSTIC)
1955	else {
1956		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1957		origpte = *pdeaddr;
1958		if ((origpte & PG_V) == 0) {
1959			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
1960				origpte, va);
1961		}
1962	}
1963#endif
1964
1965	pte = pmap_pte(pmap, va);
1966
1967	/*
1968	 * Page Directory table entry not valid; we need a new PT page.
1969	 */
1970	if (pte == NULL)
1971		panic("pmap_enter: invalid page directory va=%#lx\n", va);
1972
1973	pa = VM_PAGE_TO_PHYS(m);
1974	om = NULL;
1975	origpte = *pte;
1976	opa = origpte & PG_FRAME;
1977
1978	if (origpte & PG_PS)
1979		panic("pmap_enter: attempted pmap_enter on 2MB page");
1980
1981	/*
1982	 * Mapping has not changed, must be protection or wiring change.
1983	 */
1984	if (origpte && (opa == pa)) {
1985		/*
1986		 * Wiring change, just update stats. We don't worry about
1987		 * wiring PT pages as they remain resident as long as there
1988		 * are valid mappings in them. Hence, if a user page is wired,
1989		 * the PT page will be also.
1990		 */
1991		if (wired && ((origpte & PG_W) == 0))
1992			pmap->pm_stats.wired_count++;
1993		else if (!wired && (origpte & PG_W))
1994			pmap->pm_stats.wired_count--;
1995
1996		/*
1997		 * Remove extra pte reference
1998		 */
1999		if (mpte)
2000			mpte->wire_count--;
2001
2002		/*
2003		 * We might be turning off write access to the page,
2004		 * so we go ahead and sense modify status.
2005		 */
2006		if (origpte & PG_MANAGED) {
2007			om = m;
2008			pa |= PG_MANAGED;
2009		}
2010		goto validate;
2011	}
2012	/*
2013	 * Mapping has changed, invalidate old range and fall through to
2014	 * handle validating new mapping.
2015	 */
2016	if (opa) {
2017		if (origpte & PG_W)
2018			pmap->pm_stats.wired_count--;
2019		if (origpte & PG_MANAGED) {
2020			om = PHYS_TO_VM_PAGE(opa);
2021			pmap_remove_entry(pmap, om, va);
2022		}
2023		if (mpte != NULL) {
2024			mpte->wire_count--;
2025			KASSERT(mpte->wire_count > 0,
2026			    ("pmap_enter: missing reference to page table page,"
2027			     " va: 0x%lx", va));
2028		}
2029	} else
2030		pmap->pm_stats.resident_count++;
2031
2032	/*
2033	 * Enter on the PV list if part of our managed memory.
2034	 */
2035	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2036		pmap_insert_entry(pmap, va, m);
2037		pa |= PG_MANAGED;
2038	}
2039
2040	/*
2041	 * Increment counters
2042	 */
2043	if (wired)
2044		pmap->pm_stats.wired_count++;
2045
2046validate:
2047	/*
2048	 * Now validate mapping with desired protection/wiring.
2049	 */
2050	newpte = (pt_entry_t)(pa | PG_V);
2051	if ((prot & VM_PROT_WRITE) != 0)
2052		newpte |= PG_RW;
2053	if ((prot & VM_PROT_EXECUTE) == 0)
2054		newpte |= pg_nx;
2055	if (wired)
2056		newpte |= PG_W;
2057	if (va < VM_MAXUSER_ADDRESS)
2058		newpte |= PG_U;
2059	if (pmap == kernel_pmap)
2060		newpte |= PG_G;
2061
2062	/*
2063	 * if the mapping or permission bits are different, we need
2064	 * to update the pte.
2065	 */
2066	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2067		if (origpte & PG_V) {
2068			invlva = FALSE;
2069			origpte = pte_load_store(pte, newpte | PG_A);
2070			if (origpte & PG_A) {
2071				if (origpte & PG_MANAGED)
2072					vm_page_flag_set(om, PG_REFERENCED);
2073				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
2074				    PG_NX) == 0 && (newpte & PG_NX)))
2075					invlva = TRUE;
2076			}
2077			if (origpte & PG_M) {
2078				KASSERT((origpte & PG_RW),
2079	("pmap_enter: modified page not writable: va: %#lx, pte: %#lx",
2080				    va, origpte));
2081				if ((origpte & PG_MANAGED) &&
2082				    pmap_track_modified(va))
2083					vm_page_dirty(om);
2084				if ((newpte & PG_RW) == 0)
2085					invlva = TRUE;
2086			}
2087			if (invlva)
2088				pmap_invalidate_page(pmap, va);
2089		} else
2090			pte_store(pte, newpte | PG_A);
2091	}
2092	vm_page_unlock_queues();
2093	PMAP_UNLOCK(pmap);
2094}
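
/*
 * Illustrative sketch (not part of this file's build) of the "validate:"
 * step in pmap_enter() above: the new PTE is composed from the physical
 * address, the requested protection, and the wiring state.  The XPG_ and
 * XPROT_ macros and the is_user/is_kernel_pmap parameters are invented
 * stand-ins for the kernel's PG_ bits, VM_PROT_ flags, and the
 * VM_MAXUSER_ADDRESS/kernel_pmap tests.
 */
#if 0
#include <stdint.h>

#define XPG_V		0x001ULL
#define XPG_RW		0x002ULL
#define XPG_U		0x004ULL
#define XPG_G		0x100ULL
#define XPG_W		0x200ULL
#define XPG_NX		(1ULL << 63)

#define XPROT_WRITE	0x2
#define XPROT_EXECUTE	0x4

uint64_t
compose_pte(uint64_t pa, int prot, int wired, int is_user, int is_kernel_pmap)
{
	uint64_t newpte = pa | XPG_V;

	if (prot & XPROT_WRITE)
		newpte |= XPG_RW;
	if ((prot & XPROT_EXECUTE) == 0)
		newpte |= XPG_NX;	/* no-execute unless execute was requested */
	if (wired)
		newpte |= XPG_W;
	if (is_user)
		newpte |= XPG_U;	/* user-accessible mapping */
	if (is_kernel_pmap)
		newpte |= XPG_G;	/* global mapping, survives CR3 reloads */
	return (newpte);
}
#endif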
2095
2096/*
2097 * This code makes some *MAJOR* assumptions:
2098 * 1. The current pmap and the target pmap exist.
2099 * 2. Not wired.
2100 * 3. Read access.
2101 * 4. No page table pages.
2102 * Under these constraints it is *MUCH* faster than pmap_enter...
2103 */
2104
2105vm_page_t
2106pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2107    vm_page_t mpte)
2108{
2109	pt_entry_t *pte;
2110	vm_paddr_t pa;
2111
2112	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2113	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2114	PMAP_LOCK(pmap);
2115
2116	/*
2117	 * In the case that a page table page is not
2118	 * resident, we are creating it here.
2119	 */
2120	if (va < VM_MAXUSER_ADDRESS) {
2121		vm_pindex_t ptepindex;
2122		pd_entry_t *ptepa;
2123
2124		/*
2125		 * Calculate pagetable page index
2126		 */
2127		ptepindex = pmap_pde_pindex(va);
2128		if (mpte && (mpte->pindex == ptepindex)) {
2129			mpte->wire_count++;
2130		} else {
2131	retry:
2132			/*
2133			 * Get the page directory entry
2134			 */
2135			ptepa = pmap_pde(pmap, va);
2136
2137			/*
2138			 * If the page table page is mapped, we just increment
2139			 * the hold count, and activate it.
2140			 */
2141			if (ptepa && (*ptepa & PG_V) != 0) {
2142				if (*ptepa & PG_PS)
2143					panic("pmap_enter_quick: unexpected mapping into 2MB page");
2144				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
2145				mpte->wire_count++;
2146			} else {
2147				mpte = _pmap_allocpte(pmap, ptepindex,
2148				    M_NOWAIT);
2149				if (mpte == NULL) {
2150					PMAP_UNLOCK(pmap);
2151					vm_page_busy(m);
2152					vm_page_unlock_queues();
2153					VM_OBJECT_UNLOCK(m->object);
2154					VM_WAIT;
2155					VM_OBJECT_LOCK(m->object);
2156					vm_page_lock_queues();
2157					vm_page_wakeup(m);
2158					PMAP_LOCK(pmap);
2159					goto retry;
2160				}
2161			}
2162		}
2163	} else {
2164		mpte = NULL;
2165	}
2166
2167	/*
2168	 * This call to vtopte makes the assumption that we are
2169	 * entering the page into the current pmap.  In order to support
2170	 * quick entry into any pmap, one would likely use pmap_pte.
2171	 * But that isn't as quick as vtopte.
2172	 */
2173	pte = vtopte(va);
2174	if (*pte) {
2175		if (mpte != NULL) {
2176			pmap_unwire_pte_hold(pmap, va, mpte);
2177			mpte = NULL;
2178		}
2179		goto out;
2180	}
2181
2182	/*
2183	 * Enter on the PV list if part of our managed memory. Note that we
2184	 * raise IPL while manipulating pv_table since pmap_enter can be
2185	 * called at interrupt time.
2186	 */
2187	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2188		pmap_insert_entry(pmap, va, m);
2189
2190	/*
2191	 * Increment counters
2192	 */
2193	pmap->pm_stats.resident_count++;
2194
2195	pa = VM_PAGE_TO_PHYS(m);
2196	if ((prot & VM_PROT_EXECUTE) == 0)
2197		pa |= pg_nx;
2198
2199	/*
2200	 * Now validate mapping with RO protection
2201	 */
2202	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2203		pte_store(pte, pa | PG_V | PG_U);
2204	else
2205		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2206out:
2207	PMAP_UNLOCK(pmap);
2208	return mpte;
2209}
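
/*
 * Illustrative sketch (not part of this file's build) of the retry
 * pattern in pmap_enter_quick() above: a non-blocking allocation is
 * attempted while a lock is held; on failure the lock is dropped, the
 * thread waits for resources, and the operation restarts from the top.
 * "resource_alloc_nowait" and "resource_wait" are hypothetical
 * placeholders for _pmap_allocpte(..., M_NOWAIT) and VM_WAIT, and a
 * pthread mutex stands in for the pmap and page queue locks.
 */
#if 0
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
resource_alloc_nowait(void)
{
	return (malloc(64));		/* stands in for a page allocation */
}

static void
resource_wait(void)
{
	usleep(1000);			/* stands in for VM_WAIT */
}

void *
alloc_with_retry(void)
{
	void *res;

	pthread_mutex_lock(&map_lock);
retry:
	res = resource_alloc_nowait();
	if (res == NULL) {
		/* Never sleep with the lock held; drop it and start over. */
		pthread_mutex_unlock(&map_lock);
		resource_wait();
		pthread_mutex_lock(&map_lock);
		goto retry;
	}
	pthread_mutex_unlock(&map_lock);
	return (res);
}
#endif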
2210
2211/*
2212 * Make a temporary mapping for a physical address.  This is only intended
2213 * to be used for panic dumps.
2214 */
2215void *
2216pmap_kenter_temporary(vm_paddr_t pa, int i)
2217{
2218	vm_offset_t va;
2219
2220	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2221	pmap_kenter(va, pa);
2222	invlpg(va);
2223	return ((void *)crashdumpmap);
2224}
2225
2226/*
2227 * This code maps large physical mmap regions into the
2228 * processor address space.  Note that some shortcuts
2229 * are taken, but the code works.
2230 */
2231void
2232pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2233		    vm_object_t object, vm_pindex_t pindex,
2234		    vm_size_t size)
2235{
2236	vm_offset_t va;
2237	vm_page_t p, pdpg;
2238
2239	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2240	KASSERT(object->type == OBJT_DEVICE,
2241	    ("pmap_object_init_pt: non-device object"));
2242	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2243		vm_page_t m[1];
2244		pd_entry_t ptepa, *pde;
2245
2246		PMAP_LOCK(pmap);
2247		pde = pmap_pde(pmap, addr);
2248		if (pde != 0 && (*pde & PG_V) != 0)
2249			goto out;
2250		PMAP_UNLOCK(pmap);
2251retry:
2252		p = vm_page_lookup(object, pindex);
2253		if (p != NULL) {
2254			vm_page_lock_queues();
2255			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2256				goto retry;
2257		} else {
2258			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2259			if (p == NULL)
2260				return;
2261			m[0] = p;
2262
2263			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2264				vm_page_lock_queues();
2265				vm_page_free(p);
2266				vm_page_unlock_queues();
2267				return;
2268			}
2269
2270			p = vm_page_lookup(object, pindex);
2271			vm_page_lock_queues();
2272			vm_page_wakeup(p);
2273		}
2274		vm_page_unlock_queues();
2275
2276		ptepa = VM_PAGE_TO_PHYS(p);
2277		if (ptepa & (NBPDR - 1))
2278			return;
2279
2280		p->valid = VM_PAGE_BITS_ALL;
2281
2282		PMAP_LOCK(pmap);
2283		for (va = addr; va < addr + size; va += NBPDR) {
2284			while ((pdpg =
2285			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
2286				PMAP_UNLOCK(pmap);
2287				vm_page_lock_queues();
2288				vm_page_busy(p);
2289				vm_page_unlock_queues();
2290				VM_OBJECT_UNLOCK(object);
2291				VM_WAIT;
2292				VM_OBJECT_LOCK(object);
2293				vm_page_lock_queues();
2294				vm_page_wakeup(p);
2295				vm_page_unlock_queues();
2296				PMAP_LOCK(pmap);
2297			}
2298			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
2299			pde = &pde[pmap_pde_index(va)];
2300			if ((*pde & PG_V) == 0) {
2301				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
2302				    PG_U | PG_RW | PG_V);
2303				pmap->pm_stats.resident_count +=
2304				    NBPDR / PAGE_SIZE;
2305			} else {
2306				pdpg->wire_count--;
2307				KASSERT(pdpg->wire_count > 0,
2308				    ("pmap_object_init_pt: missing reference "
2309				     "to page directory page, va: 0x%lx", va));
2310			}
2311			ptepa += NBPDR;
2312		}
2313		pmap_invalidate_all(pmap);
2314out:
2315		PMAP_UNLOCK(pmap);
2316	}
2317}
2318
2319/*
2320 *	Routine:	pmap_change_wiring
2321 *	Function:	Change the wiring attribute for a map/virtual-address
2322 *			pair.
2323 *	In/out conditions:
2324 *			The mapping must already exist in the pmap.
2325 */
2326void
2327pmap_change_wiring(pmap, va, wired)
2328	register pmap_t pmap;
2329	vm_offset_t va;
2330	boolean_t wired;
2331{
2332	register pt_entry_t *pte;
2333
2334	/*
2335	 * Wiring is not a hardware characteristic so there is no need to
2336	 * invalidate TLB.
2337	 */
2338	PMAP_LOCK(pmap);
2339	pte = pmap_pte(pmap, va);
2340	if (wired && (*pte & PG_W) == 0) {
2341		pmap->pm_stats.wired_count++;
2342		atomic_set_long(pte, PG_W);
2343	} else if (!wired && (*pte & PG_W) != 0) {
2344		pmap->pm_stats.wired_count--;
2345		atomic_clear_long(pte, PG_W);
2346	}
2347	PMAP_UNLOCK(pmap);
2348}
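
/*
 * Illustrative sketch (not part of this file's build) of the wiring
 * update in pmap_change_wiring() above: the wired bit is set or cleared
 * atomically, and the wired-page counter is adjusted only when the bit
 * actually changes state.  C11 fetch-or/fetch-and stand in for
 * atomic_set_long()/atomic_clear_long(), and XPG_W is a made-up
 * stand-in for PG_W.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

#define XPG_W	0x200ULL

void
change_wiring(_Atomic uint64_t *pte, long *wired_count, int wired)
{
	uint64_t old = atomic_load(pte);

	if (wired && (old & XPG_W) == 0) {
		(*wired_count)++;
		atomic_fetch_or(pte, XPG_W);
	} else if (!wired && (old & XPG_W) != 0) {
		(*wired_count)--;
		atomic_fetch_and(pte, ~XPG_W);
	}
}
#endif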
2349
2350
2351
2352/*
2353 *	Copy the range specified by src_addr/len
2354 *	from the source map to the range dst_addr/len
2355 *	in the destination map.
2356 *
2357 *	This routine is only advisory and need not do anything.
2358 */
2359
2360void
2361pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2362	  vm_offset_t src_addr)
2363{
2364	vm_offset_t addr;
2365	vm_offset_t end_addr = src_addr + len;
2366	vm_offset_t va_next;
2367	vm_page_t m;
2368
2369	if (dst_addr != src_addr)
2370		return;
2371
2372	if (!pmap_is_current(src_pmap))
2373		return;
2374
2375	vm_page_lock_queues();
2376	if (dst_pmap < src_pmap) {
2377		PMAP_LOCK(dst_pmap);
2378		PMAP_LOCK(src_pmap);
2379	} else {
2380		PMAP_LOCK(src_pmap);
2381		PMAP_LOCK(dst_pmap);
2382	}
2383	for (addr = src_addr; addr < end_addr; addr = va_next) {
2384		pt_entry_t *src_pte, *dst_pte;
2385		vm_page_t dstmpde, dstmpte, srcmpte;
2386		pml4_entry_t *pml4e;
2387		pdp_entry_t *pdpe;
2388		pd_entry_t srcptepaddr, *pde;
2389
2390		if (addr >= UPT_MIN_ADDRESS)
2391			panic("pmap_copy: invalid to pmap_copy page tables");
2392
2393		/*
2394		 * Don't let optional prefaulting of pages make us go
2395		 * way below the low water mark of free pages or way
2396		 * above the high water mark of used pv entries.
2397		 */
2398		if (cnt.v_free_count < cnt.v_free_reserved ||
2399		    pv_entry_count > pv_entry_high_water)
2400			break;
2401
2402		pml4e = pmap_pml4e(src_pmap, addr);
2403		if ((*pml4e & PG_V) == 0) {
2404			va_next = (addr + NBPML4) & ~PML4MASK;
2405			continue;
2406		}
2407
2408		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
2409		if ((*pdpe & PG_V) == 0) {
2410			va_next = (addr + NBPDP) & ~PDPMASK;
2411			continue;
2412		}
2413
2414		va_next = (addr + NBPDR) & ~PDRMASK;
2415
2416		pde = pmap_pdpe_to_pde(pdpe, addr);
2417		srcptepaddr = *pde;
2418		if (srcptepaddr == 0)
2419			continue;
2420
2421		if (srcptepaddr & PG_PS) {
2422			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
2423			if (dstmpde == NULL)
2424				break;
2425			pde = (pd_entry_t *)
2426			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
2427			pde = &pde[pmap_pde_index(addr)];
2428			if (*pde == 0) {
2429				*pde = srcptepaddr;
2430				dst_pmap->pm_stats.resident_count +=
2431				    NBPDR / PAGE_SIZE;
2432			} else
2433				pmap_unwire_pte_hold(dst_pmap, addr, dstmpde);
2434			continue;
2435		}
2436
2437		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
2438		if (srcmpte->wire_count == 0)
2439			panic("pmap_copy: source page table page is unused");
2440
2441		if (va_next > end_addr)
2442			va_next = end_addr;
2443
2444		src_pte = vtopte(addr);
2445		while (addr < va_next) {
2446			pt_entry_t ptetemp;
2447			ptetemp = *src_pte;
2448			/*
2449			 * We only virtual-copy managed pages.
2450			 */
2451			if ((ptetemp & PG_MANAGED) != 0) {
2452				/*
2453				 * We have to check after allocpte for the
2454				 * pte still being around...  allocpte can
2455				 * block.
2456				 */
2457				dstmpte = pmap_allocpte(dst_pmap, addr,
2458				    M_NOWAIT);
2459				if (dstmpte == NULL)
2460					break;
2461				dst_pte = (pt_entry_t *)
2462				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2463				dst_pte = &dst_pte[pmap_pte_index(addr)];
2464				if (*dst_pte == 0) {
2465					/*
2466					 * Clear the modified and
2467					 * accessed (referenced) bits
2468					 * during the copy.
2469					 */
2470					m = PHYS_TO_VM_PAGE(ptetemp & PG_FRAME);
2471					*dst_pte = ptetemp & ~(PG_M | PG_A);
2472					dst_pmap->pm_stats.resident_count++;
2473					pmap_insert_entry(dst_pmap, addr, m);
2474	 			} else
2475					pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2476				if (dstmpte->wire_count >= srcmpte->wire_count)
2477					break;
2478			}
2479			addr += PAGE_SIZE;
2480			src_pte++;
2481		}
2482	}
2483	vm_page_unlock_queues();
2484	PMAP_UNLOCK(src_pmap);
2485	PMAP_UNLOCK(dst_pmap);
2486}
2487
2488/*
2489 *	pmap_zero_page zeros the specified hardware page through the
2490 *	direct map, using pagezero() to clear its contents.
2491 */
2492void
2493pmap_zero_page(vm_page_t m)
2494{
2495	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2496
2497	pagezero((void *)va);
2498}
2499
2500/*
2501 *	pmap_zero_page_area zeros a range within the specified hardware
2502 *	page through the direct map, clearing only the requested bytes.
2503 *
2504 *	off and size may not cover an area beyond a single hardware page.
2505 */
2506void
2507pmap_zero_page_area(vm_page_t m, int off, int size)
2508{
2509	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2510
2511	if (off == 0 && size == PAGE_SIZE)
2512		pagezero((void *)va);
2513	else
2514		bzero((char *)va + off, size);
2515}
2516
2517/*
2518 *	pmap_zero_page_idle zeros the specified hardware page through the
2519 *	direct map, using pagezero() to clear its contents.  This
2520 *	is intended to be called from the vm_pagezero process only and
2521 *	outside of Giant.
2522 */
2523void
2524pmap_zero_page_idle(vm_page_t m)
2525{
2526	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2527
2528	pagezero((void *)va);
2529}
2530
2531/*
2532 *	pmap_copy_page copies the specified (machine independent)
2533 *	page through the direct map, using pagecopy() to copy one
2534 *	machine dependent page at a time.  No temporary kernel
2535 *	mapping is required.
2536 */
2537void
2538pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2539{
2540	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2541	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2542
2543	pagecopy((void *)src, (void *)dst);
2544}
2545
2546/*
2547 * Returns true if the pmap's pv is one of the first
2548 * 16 pvs linked to from this page.  This count may
2549 * be changed upwards or downwards in the future; it
2550 * is only necessary that true be returned for a small
2551 * subset of pmaps for proper page aging.
2552 */
2553boolean_t
2554pmap_page_exists_quick(pmap, m)
2555	pmap_t pmap;
2556	vm_page_t m;
2557{
2558	pv_entry_t pv;
2559	int loops = 0;
2560
2561	if (m->flags & PG_FICTITIOUS)
2562		return FALSE;
2563
2564	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2565	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2566		if (pv->pv_pmap == pmap) {
2567			return TRUE;
2568		}
2569		loops++;
2570		if (loops >= 16)
2571			break;
2572	}
2573	return (FALSE);
2574}
2575
2576#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2577/*
2578 * Remove all pages from the specified address space;
2579 * this aids process exit speeds.  Also, this code
2580 * is special cased for current process only, but
2581 * can have the more generic (and slightly slower)
2582 * mode enabled.  This is much faster than pmap_remove
2583 * in the case of running down an entire address space.
2584 */
2585void
2586pmap_remove_pages(pmap, sva, eva)
2587	pmap_t pmap;
2588	vm_offset_t sva, eva;
2589{
2590	pt_entry_t *pte, tpte;
2591	vm_page_t m;
2592	pv_entry_t pv, npv;
2593
2594#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2595	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2596		printf("warning: pmap_remove_pages called with non-current pmap\n");
2597		return;
2598	}
2599#endif
2600	vm_page_lock_queues();
2601	PMAP_LOCK(pmap);
2602	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2603
2604		if (pv->pv_va >= eva || pv->pv_va < sva) {
2605			npv = TAILQ_NEXT(pv, pv_plist);
2606			continue;
2607		}
2608
2609#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2610		pte = vtopte(pv->pv_va);
2611#else
2612		pte = pmap_pte(pmap, pv->pv_va);
2613#endif
2614		tpte = *pte;
2615
2616		if (tpte == 0) {
2617			printf("TPTE at %p  IS ZERO @ VA %08lx\n",
2618							pte, pv->pv_va);
2619			panic("bad pte");
2620		}
2621
2622		/*
2623		 * We cannot remove wired pages from a process' mapping at this time.
2624		 */
2625		if (tpte & PG_W) {
2626			npv = TAILQ_NEXT(pv, pv_plist);
2627			continue;
2628		}
2629
2630		m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2631		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2632		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2633		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2634
2635		KASSERT(m < &vm_page_array[vm_page_array_size],
2636			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2637
2638		pmap->pm_stats.resident_count--;
2639
2640		pte_clear(pte);
2641
2642		/*
2643		 * Update the vm_page_t clean and reference bits.
2644		 */
2645		if (tpte & PG_M) {
2646			vm_page_dirty(m);
2647		}
2648
2649		npv = TAILQ_NEXT(pv, pv_plist);
2650		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2651
2652		m->md.pv_list_count--;
2653		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2654		if (TAILQ_EMPTY(&m->md.pv_list))
2655			vm_page_flag_clear(m, PG_WRITEABLE);
2656
2657		pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va));
2658		free_pv_entry(pv);
2659	}
2660	pmap_invalidate_all(pmap);
2661	PMAP_UNLOCK(pmap);
2662	vm_page_unlock_queues();
2663}
2664
2665/*
2666 *	pmap_is_modified:
2667 *
2668 *	Return whether or not the specified physical page was modified
2669 *	in any physical maps.
2670 */
2671boolean_t
2672pmap_is_modified(vm_page_t m)
2673{
2674	pv_entry_t pv;
2675	pt_entry_t *pte;
2676	boolean_t rv;
2677
2678	rv = FALSE;
2679	if (m->flags & PG_FICTITIOUS)
2680		return (rv);
2681
2682	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2683	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2684		/*
2685		 * Modifications are not tracked for mappings within the
2686		 * clean submap, so treat those ptes as never modified
2687		 * and skip them.
2688		 */
2689		if (!pmap_track_modified(pv->pv_va))
2690			continue;
2691		PMAP_LOCK(pv->pv_pmap);
2692		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2693		rv = (*pte & PG_M) != 0;
2694		PMAP_UNLOCK(pv->pv_pmap);
2695		if (rv)
2696			break;
2697	}
2698	return (rv);
2699}
2700
2701/*
2702 *	pmap_is_prefaultable:
2703 *
2704 *	Return whether or not the specified virtual address is eligible
2705 *	for prefault.
2706 */
2707boolean_t
2708pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2709{
2710	pd_entry_t *pde;
2711	pt_entry_t *pte;
2712	boolean_t rv;
2713
2714	rv = FALSE;
2715	PMAP_LOCK(pmap);
2716	pde = pmap_pde(pmap, addr);
2717	if (pde != NULL && (*pde & PG_V)) {
2718		pte = vtopte(addr);
2719		rv = (*pte & PG_V) == 0;
2720	}
2721	PMAP_UNLOCK(pmap);
2722	return (rv);
2723}
2724
2725/*
2726 *	Clear the given bit in each of the given page's ptes.
2727 */
2728static __inline void
2729pmap_clear_ptes(vm_page_t m, long bit)
2730{
2731	register pv_entry_t pv;
2732	pt_entry_t pbits, *pte;
2733
2734	if ((m->flags & PG_FICTITIOUS) ||
2735	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2736		return;
2737
2738	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2739	/*
2740	 * Loop over all current mappings, setting/clearing as appropriate.
2741	 * (If setting RO, do we need to clear the VAC?)
2742	 */
2743	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2744		/*
2745		 * don't write protect pager mappings
2746		 */
2747		if (bit == PG_RW) {
2748			if (!pmap_track_modified(pv->pv_va))
2749				continue;
2750		}
2751
2752		PMAP_LOCK(pv->pv_pmap);
2753		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2754retry:
2755		pbits = *pte;
2756		if (pbits & bit) {
2757			if (bit == PG_RW) {
2758				if (!atomic_cmpset_long(pte, pbits,
2759				    pbits & ~(PG_RW | PG_M)))
2760					goto retry;
2761				if (pbits & PG_M) {
2762					vm_page_dirty(m);
2763				}
2764			} else {
2765				atomic_clear_long(pte, bit);
2766			}
2767			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2768		}
2769		PMAP_UNLOCK(pv->pv_pmap);
2770	}
2771	if (bit == PG_RW)
2772		vm_page_flag_clear(m, PG_WRITEABLE);
2773}
2774
2775/*
2776 *      pmap_page_protect:
2777 *
2778 *      Lower the permission for all mappings to a given page.
2779 */
2780void
2781pmap_page_protect(vm_page_t m, vm_prot_t prot)
2782{
2783	if ((prot & VM_PROT_WRITE) == 0) {
2784		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2785			pmap_clear_ptes(m, PG_RW);
2786		} else {
2787			pmap_remove_all(m);
2788		}
2789	}
2790}
2791
2792/*
2793 *	pmap_ts_referenced:
2794 *
2795 *	Return a count of reference bits for a page, clearing those bits.
2796 *	It is not necessary for every reference bit to be cleared, but it
2797 *	is necessary that 0 only be returned when there are truly no
2798 *	reference bits set.
2799 *
2800 *	XXX: The exact number of bits to check and clear is a matter that
2801 *	should be tested and standardized at some point in the future for
2802 *	optimal aging of shared pages.
2803 */
2804int
2805pmap_ts_referenced(vm_page_t m)
2806{
2807	register pv_entry_t pv, pvf, pvn;
2808	pt_entry_t *pte;
2809	pt_entry_t v;
2810	int rtval = 0;
2811
2812	if (m->flags & PG_FICTITIOUS)
2813		return (rtval);
2814
2815	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2816	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2817
2818		pvf = pv;
2819
2820		do {
2821			pvn = TAILQ_NEXT(pv, pv_list);
2822
2823			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2824
2825			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2826
2827			if (!pmap_track_modified(pv->pv_va))
2828				continue;
2829
2830			PMAP_LOCK(pv->pv_pmap);
2831			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2832
2833			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2834				atomic_clear_long(pte, PG_A);
2835				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2836
2837				rtval++;
2838				if (rtval > 4) {
2839					PMAP_UNLOCK(pv->pv_pmap);
2840					break;
2841				}
2842			}
2843			PMAP_UNLOCK(pv->pv_pmap);
2844		} while ((pv = pvn) != NULL && pv != pvf);
2845	}
2846
2847	return (rtval);
2848}
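
/*
 * Illustrative sketch (not part of this file's build) of the aging scan
 * in pmap_ts_referenced() above: walk the page's pv list, rotate each
 * entry to the tail so that successive calls start with different
 * mappings, count entries whose referenced bit was set while clearing
 * it, and stop early once enough references have been seen.  The xpv_
 * structures and XPG_A are invented stand-ins; locking and TLB
 * invalidation are omitted.
 */
#if 0
#include <sys/queue.h>
#include <stdint.h>

#define XPG_A	0x020ULL

struct xpv_entry {
	uint64_t		*pv_pte;	/* the mapping's PTE */
	TAILQ_ENTRY(xpv_entry)	pv_list;
};
TAILQ_HEAD(xpv_head, xpv_entry);

int
ts_referenced(struct xpv_head *head)
{
	struct xpv_entry *pv, *pvf, *pvn;
	int rtval = 0;

	if ((pv = TAILQ_FIRST(head)) == NULL)
		return (0);
	pvf = pv;
	do {
		pvn = TAILQ_NEXT(pv, pv_list);
		/* Rotate the entry to the tail of the list. */
		TAILQ_REMOVE(head, pv, pv_list);
		TAILQ_INSERT_TAIL(head, pv, pv_list);
		if (*pv->pv_pte & XPG_A) {
			*pv->pv_pte &= ~XPG_A;
			if (++rtval > 4)
				break;
		}
	} while ((pv = pvn) != NULL && pv != pvf);
	return (rtval);
}
#endif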
2849
2850/*
2851 *	Clear the modify bits on the specified physical page.
2852 */
2853void
2854pmap_clear_modify(vm_page_t m)
2855{
2856	pmap_clear_ptes(m, PG_M);
2857}
2858
2859/*
2860 *	pmap_clear_reference:
2861 *
2862 *	Clear the reference bit on the specified physical page.
2863 */
2864void
2865pmap_clear_reference(vm_page_t m)
2866{
2867	pmap_clear_ptes(m, PG_A);
2868}
2869
2870/*
2871 * Miscellaneous support routines follow
2872 */
2873
2874/*
2875 * Map a set of physical memory pages into the kernel virtual
2876 * address space. Return a pointer to where it is mapped. This
2877 * routine is intended to be used for mapping device memory,
2878 * NOT real memory.
2879 */
2880void *
2881pmap_mapdev(pa, size)
2882	vm_paddr_t pa;
2883	vm_size_t size;
2884{
2885	vm_offset_t va, tmpva, offset;
2886
2887	/* If this fits within the direct map window, use it */
2888	if (pa < dmaplimit && (pa + size) < dmaplimit)
2889		return ((void *)PHYS_TO_DMAP(pa));
2890	offset = pa & PAGE_MASK;
2891	size = roundup(offset + size, PAGE_SIZE);
2892	va = kmem_alloc_nofault(kernel_map, size);
2893	if (!va)
2894		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2895	pa = trunc_page(pa);
2896	for (tmpva = va; size > 0; ) {
2897		pmap_kenter(tmpva, pa);
2898		size -= PAGE_SIZE;
2899		tmpva += PAGE_SIZE;
2900		pa += PAGE_SIZE;
2901	}
2902	pmap_invalidate_range(kernel_pmap, va, tmpva);
2903	return ((void *)(va + offset));
2904}
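
/*
 * Illustrative sketch (not part of this file's build) of the address
 * arithmetic in pmap_mapdev() above: split the physical address into a
 * page-aligned base and an offset, round the mapping length up to whole
 * pages, and hand back the start of the mapping plus the original
 * offset.  XPAGE_SIZE is a stand-in for PAGE_SIZE; roundup() and
 * trunc_page() are open-coded.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define XPAGE_SIZE	4096UL
#define XPAGE_MASK	(XPAGE_SIZE - 1)

int
main(void)
{
	uintptr_t pa = 0xfe001234UL;		/* example device address */
	size_t size = 0x300;			/* example mapping length */
	uintptr_t offset, base;
	size_t mapsize;

	offset = pa & XPAGE_MASK;				/* 0x234 */
	mapsize = (offset + size + XPAGE_MASK) & ~XPAGE_MASK;	/* one page */
	base = pa & ~(uintptr_t)XPAGE_MASK;			/* trunc_page(pa) */
	printf("map %#zx bytes at %#lx, return base + %#lx\n",
	    mapsize, (unsigned long)base, (unsigned long)offset);
	return (0);
}
#endif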
2905
2906void
2907pmap_unmapdev(va, size)
2908	vm_offset_t va;
2909	vm_size_t size;
2910{
2911	vm_offset_t base, offset, tmpva;
2912
2913	/* If pmap_mapdev() returned a direct map region, there is nothing to undo */
2914	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2915		return;
2916	base = trunc_page(va);
2917	offset = va & PAGE_MASK;
2918	size = roundup(offset + size, PAGE_SIZE);
2919	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2920		pmap_kremove(tmpva);
2921	pmap_invalidate_range(kernel_pmap, va, tmpva);
2922	kmem_free(kernel_map, base, size);
2923}
2924
2925/*
2926 * perform the pmap work for mincore
2927 */
2928int
2929pmap_mincore(pmap, addr)
2930	pmap_t pmap;
2931	vm_offset_t addr;
2932{
2933	pt_entry_t *ptep, pte;
2934	vm_page_t m;
2935	int val = 0;
2936
2937	PMAP_LOCK(pmap);
2938	ptep = pmap_pte(pmap, addr);
2939	pte = (ptep != NULL) ? *ptep : 0;
2940	PMAP_UNLOCK(pmap);
2941
2942	if (pte != 0) {
2943		vm_paddr_t pa;
2944
2945		val = MINCORE_INCORE;
2946		if ((pte & PG_MANAGED) == 0)
2947			return val;
2948
2949		pa = pte & PG_FRAME;
2950
2951		m = PHYS_TO_VM_PAGE(pa);
2952
2953		/*
2954		 * Modified by us
2955		 */
2956		if (pte & PG_M)
2957			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2958		else {
2959			/*
2960			 * Modified by someone else
2961			 */
2962			vm_page_lock_queues();
2963			if (m->dirty || pmap_is_modified(m))
2964				val |= MINCORE_MODIFIED_OTHER;
2965			vm_page_unlock_queues();
2966		}
2967		/*
2968		 * Referenced by us
2969		 */
2970		if (pte & PG_A)
2971			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2972		else {
2973			/*
2974			 * Referenced by someone else
2975			 */
2976			vm_page_lock_queues();
2977			if ((m->flags & PG_REFERENCED) ||
2978			    pmap_ts_referenced(m)) {
2979				val |= MINCORE_REFERENCED_OTHER;
2980				vm_page_flag_set(m, PG_REFERENCED);
2981			}
2982			vm_page_unlock_queues();
2983		}
2984	}
2985	return val;
2986}
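
/*
 * Illustrative sketch (not part of this file's build) of the flag
 * composition in pmap_mincore() above: a valid PTE means the page is
 * resident, its modified/accessed bits report this pmap's own view, and
 * state observed through other pmaps (or the page's dirty field) is
 * folded in as "other".  The XPG_ and XMINCORE_ macros are invented
 * stand-ins, and the PG_MANAGED early return is omitted.
 */
#if 0
#include <stdint.h>

#define XPG_A				0x020ULL
#define XPG_M				0x040ULL
#define XMINCORE_INCORE			0x01
#define XMINCORE_REFERENCED		0x02
#define XMINCORE_REFERENCED_OTHER	0x04
#define XMINCORE_MODIFIED		0x08
#define XMINCORE_MODIFIED_OTHER		0x10

int
mincore_flags(uint64_t pte, int dirty_elsewhere, int referenced_elsewhere)
{
	int val;

	if (pte == 0)
		return (0);
	val = XMINCORE_INCORE;
	if (pte & XPG_M)
		val |= XMINCORE_MODIFIED | XMINCORE_MODIFIED_OTHER;
	else if (dirty_elsewhere)
		val |= XMINCORE_MODIFIED_OTHER;
	if (pte & XPG_A)
		val |= XMINCORE_REFERENCED | XMINCORE_REFERENCED_OTHER;
	else if (referenced_elsewhere)
		val |= XMINCORE_REFERENCED_OTHER;
	return (val);
}
#endif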
2987
2988void
2989pmap_activate(struct thread *td)
2990{
2991	struct proc *p = td->td_proc;
2992	pmap_t	pmap, oldpmap;
2993	u_int64_t  cr3;
2994
2995	critical_enter();
2996	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2997	oldpmap = PCPU_GET(curpmap);
2998#ifdef SMP
2999	if (oldpmap)	/* XXX FIXME */
3000		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3001	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3002#else
3003	if (oldpmap)	/* XXX FIXME */
3004		oldpmap->pm_active &= ~PCPU_GET(cpumask);
3005	pmap->pm_active |= PCPU_GET(cpumask);
3006#endif
3007	cr3 = vtophys(pmap->pm_pml4);
3008	/* XXXKSE this is wrong.
3009	 * pmap_activate is for the current thread on the current cpu
3010	 */
3011	if (p->p_flag & P_SA) {
3012		/* Make sure all other cr3 entries are updated. */
3013		/* what if they are running?  XXXKSE (maybe abort them) */
3014		FOREACH_THREAD_IN_PROC(p, td) {
3015			td->td_pcb->pcb_cr3 = cr3;
3016		}
3017	} else {
3018		td->td_pcb->pcb_cr3 = cr3;
3019	}
3020	load_cr3(cr3);
3021	critical_exit();
3022}
3023
3024vm_offset_t
3025pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3026{
3027
3028	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3029		return addr;
3030	}
3031
3032	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3033	return addr;
3034}
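
/*
 * Illustrative sketch (not part of this file's build) of the alignment
 * trick used by pmap_addr_hint() above: an address is rounded up to a
 * power-of-two boundary with (addr + (align - 1)) & ~(align - 1).
 * X_NBPDR is a stand-in for NBPDR, the 2MB large-page size on amd64.
 */
#if 0
#include <assert.h>
#include <stdint.h>

#define X_NBPDR	(2UL * 1024 * 1024)

static uintptr_t
round_up(uintptr_t addr, uintptr_t align)
{
	return ((addr + (align - 1)) & ~(align - 1));
}

int
main(void)
{
	assert(round_up(0x200000, X_NBPDR) == 0x200000);  /* already aligned */
	assert(round_up(0x200001, X_NBPDR) == 0x400000);  /* rounds up */
	return (0);
}
#endif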
3035