1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 149058 2005-08-14 20:02:50Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidation or reduced-protection
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and when physical maps must be made correct.
104 */
105
106#include "opt_msgbuf.h"
107
108#include <sys/param.h>
109#include <sys/systm.h>
110#include <sys/kernel.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/msgbuf.h>
115#include <sys/mutex.h>
116#include <sys/proc.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#ifdef SMP
122#include <sys/smp.h>
123#endif
124
125#include <vm/vm.h>
126#include <vm/vm_param.h>
127#include <vm/vm_kern.h>
128#include <vm/vm_page.h>
129#include <vm/vm_map.h>
130#include <vm/vm_object.h>
131#include <vm/vm_extern.h>
132#include <vm/vm_pageout.h>
133#include <vm/vm_pager.h>
134#include <vm/uma.h>
135
136#include <machine/cpu.h>
137#include <machine/cputypes.h>
138#include <machine/md_var.h>
139#include <machine/pcb.h>
140#include <machine/specialreg.h>
141#ifdef SMP
142#include <machine/smp.h>
143#endif
144
145#ifndef PMAP_SHPGPERPROC
146#define PMAP_SHPGPERPROC 200
147#endif
148
149#if defined(DIAGNOSTIC)
150#define PMAP_DIAGNOSTIC
151#endif
152
153#define MINPV 2048
154
155#if !defined(PMAP_DIAGNOSTIC)
156#define PMAP_INLINE __inline
157#else
158#define PMAP_INLINE
159#endif
160
161struct pmap kernel_pmap_store;
162
163vm_paddr_t avail_start;		/* PA of first available physical page */
164vm_paddr_t avail_end;		/* PA of last available physical page */
165vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
166vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
167
168static int nkpt;
169static int ndmpdp;
170static vm_paddr_t dmaplimit;
171vm_offset_t kernel_vm_end;
172pt_entry_t pg_nx;
173
174static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
175static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
176static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
177u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
178
179static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
180static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
181
182/*
183 * Data for the pv entry allocation mechanism
184 */
185static uma_zone_t pvzone;
186static struct vm_object pvzone_obj;
187static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
188int pmap_pagedaemon_waken;
189
190/*
191 * All those kernel PT submaps that BSD is so fond of
192 */
193pt_entry_t *CMAP1 = 0;
194caddr_t CADDR1 = 0;
195struct msgbuf *msgbufp = 0;
196
197/*
198 * Crashdump maps.
199 */
200static caddr_t crashdumpmap;
201
202static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
203static pv_entry_t get_pv_entry(void);
204static void	pmap_clear_ptes(vm_page_t m, long bit);
205
206static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
207		vm_offset_t sva, pd_entry_t ptepde);
208static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
209static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
210		vm_offset_t va);
211static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
212
213static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
214
215static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
216static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m);
217static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t);
218static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
219
220CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
221CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
222
223/*
224 * Move the kernel virtual free pointer to the next
225 * 2MB.  This is used to help improve performance
226 * by using a large (2MB) page for much of the kernel
227 * (.text, .data, .bss)
228 */
229static vm_offset_t
230pmap_kmem_choose(vm_offset_t addr)
231{
232	vm_offset_t newaddr = addr;
233
234	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
235	return newaddr;
236}
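/*
 * Illustrative arithmetic for the rounding above, assuming the usual
 * 2MB NBPDR (0x200000):
 *
 *	addr                                = 0xffffffff80345678
 *	(addr + (NBPDR - 1)) & ~(NBPDR - 1) = 0xffffffff80400000
 *
 * i.e. the kernel virtual free pointer is bumped to the next 2MB
 * boundary so that the region below it can be covered by 2MB pages.
 */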
237
238/********************/
239/* Inline functions */
240/********************/
241
242/* Return a non-clipped PD index for a given VA */
243static __inline vm_pindex_t
244pmap_pde_pindex(vm_offset_t va)
245{
246	return va >> PDRSHIFT;
247}
248
249
250/* Return various clipped indexes for a given VA */
251static __inline vm_pindex_t
252pmap_pte_index(vm_offset_t va)
253{
254
255	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
256}
257
258static __inline vm_pindex_t
259pmap_pde_index(vm_offset_t va)
260{
261
262	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
263}
264
265static __inline vm_pindex_t
266pmap_pdpe_index(vm_offset_t va)
267{
268
269	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
270}
271
272static __inline vm_pindex_t
273pmap_pml4e_index(vm_offset_t va)
274{
275
276	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
277}
278
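/*
 * Sketch of how a virtual address decomposes under the index helpers
 * above: 9 bits per level plus a 12-bit page offset.  For the
 * (hypothetical) canonical address va = 0x0000008080604005:
 *
 *	pmap_pml4e_index(va) = (va >> 39) & 0x1ff = 1
 *	pmap_pdpe_index(va)  = (va >> 30) & 0x1ff = 2
 *	pmap_pde_index(va)   = (va >> 21) & 0x1ff = 3
 *	pmap_pte_index(va)   = (va >> 12) & 0x1ff = 4
 *	page offset          = va & PAGE_MASK     = 0x5
 *
 * pmap_pde_pindex(), by contrast, returns the unclipped va >> PDRSHIFT
 * and is used as the pindex of the page table page that backs va.
 */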
279/* Return a pointer to the PML4 slot that corresponds to a VA */
280static __inline pml4_entry_t *
281pmap_pml4e(pmap_t pmap, vm_offset_t va)
282{
283
284	if (!pmap)
285		return NULL;
286	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
287}
288
289/* Return a pointer to the PDP slot that corresponds to a VA */
290static __inline pdp_entry_t *
291pmap_pdpe(pmap_t pmap, vm_offset_t va)
292{
293	pml4_entry_t *pml4e;
294	pdp_entry_t *pdpe;
295
296	pml4e = pmap_pml4e(pmap, va);
297	if (pml4e == NULL || (*pml4e & PG_V) == 0)
298		return NULL;
299	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
300	return (&pdpe[pmap_pdpe_index(va)]);
301}
302
303/* Return a pointer to the PD slot that corresponds to a VA */
304static __inline pd_entry_t *
305pmap_pde(pmap_t pmap, vm_offset_t va)
306{
307	pdp_entry_t *pdpe;
308	pd_entry_t *pde;
309
310	pdpe = pmap_pdpe(pmap, va);
311	if (pdpe == NULL || (*pdpe & PG_V) == 0)
312		 return NULL;
313	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
314	return (&pde[pmap_pde_index(va)]);
315}
316
317/* Return a pointer to the PT slot that corresponds to a VA */
318static __inline pt_entry_t *
319pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
320{
321	pt_entry_t *pte;
322
323	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
324	return (&pte[pmap_pte_index(va)]);
325}
326
327/* Return a pointer to the PT slot that corresponds to a VA */
328static __inline pt_entry_t *
329pmap_pte(pmap_t pmap, vm_offset_t va)
330{
331	pd_entry_t *pde;
332
333	pde = pmap_pde(pmap, va);
334	if (pde == NULL || (*pde & PG_V) == 0)
335		return NULL;
336	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
337		return ((pt_entry_t *)pde);
338	return (pmap_pde_to_pte(pde, va));
339}
340
341
342static __inline pt_entry_t *
343pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
344{
345	pd_entry_t *pde;
346
347	pde = pmap_pde(pmap, va);
348	if (pde == NULL || (*pde & PG_V) == 0)
349		return NULL;
350	*ptepde = *pde;
351	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
352		return ((pt_entry_t *)pde);
353	return (pmap_pde_to_pte(pde, va));
354}
355
356
357PMAP_INLINE pt_entry_t *
358vtopte(vm_offset_t va)
359{
360	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
361
362	return (PTmap + ((va >> PAGE_SHIFT) & mask));
363}
364
365static __inline pd_entry_t *
366vtopde(vm_offset_t va)
367{
368	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
369
370	return (PDmap + ((va >> PDRSHIFT) & mask));
371}
372
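/*
 * The vtopte()/vtopde() helpers above rely on the recursive PML4 slot
 * (PML4PML4I) that create_pagetables() and pmap_pinit() install: with
 * the PML4 acting as its own PDP, the hardware page walk lands back in
 * the page table pages themselves, so PTmap/PDmap form linear windows
 * onto every PTE/PDE of the current address space.  Roughly:
 *
 *	mask = (1ul << (9 + 9 + 9 + 9)) - 1	(bits 47:12 of va)
 *	pte  = PTmap + ((va >> PAGE_SHIFT) & mask)
 *
 * i.e. the PTE for va is reached by indexing the self-mapped area with
 * va's page number, without going through the direct map.
 */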
373static u_int64_t
374allocpages(int n)
375{
376	u_int64_t ret;
377
378	ret = avail_start;
379	bzero((void *)ret, n * PAGE_SIZE);
380	avail_start += n * PAGE_SIZE;
381	return (ret);
382}
383
384static void
385create_pagetables(void)
386{
387	int i;
388
389	/* Allocate pages */
390	KPTphys = allocpages(NKPT);
391	KPML4phys = allocpages(1);
392	KPDPphys = allocpages(NKPML4E);
393	KPDphys = allocpages(NKPDPE);
394
395	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
396	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
397		ndmpdp = 4;
398	DMPDPphys = allocpages(NDMPML4E);
399	DMPDphys = allocpages(ndmpdp);
400	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
401
402	/* Fill in the underlying page table pages */
403	/* Read-only from zero to physfree */
404	/* XXX not fully used, underneath 2M pages */
405	for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
406		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
407		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
408	}
409
410	/* Now map the page tables at their location within PTmap */
411	for (i = 0; i < NKPT; i++) {
412		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
413		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
414	}
415
416	/* Map from zero to end of allocations under 2M pages */
417	/* This replaces some of the KPTphys entries above */
418	for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
419		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
420		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
421	}
422
423	/* And connect up the PD to the PDP */
424	for (i = 0; i < NKPDPE; i++) {
425		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
426		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
427	}
428
429
430	/* Now set up the direct map space using 2MB pages */
431	for (i = 0; i < NPDEPG * ndmpdp; i++) {
432		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
433		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
434	}
435
436	/* And the direct map space's PDP */
437	for (i = 0; i < ndmpdp; i++) {
438		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
439		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
440	}
441
442	/* And recursively map PML4 to itself in order to get PTmap */
443	((pml4_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
444	((pml4_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
445
446	/* Connect the Direct Map slot up to the PML4 */
447	((pml4_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
448	((pml4_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
449
450	/* Connect the KVA slot up to the PML4 */
451	((pml4_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
452	((pml4_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
453}
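/*
 * Sizing sketch for the direct map built above (illustrative numbers;
 * e.g. a machine with 8GB of physical memory):
 *
 *	ndmpdp    = howmany(ptoa(Maxmem), NBPDP) = 8	(1GB per PDPE)
 *	DMPDphys  = start of 8 PD pages, each holding 512 2MB mappings
 *	dmaplimit = 8GB, the first physical address the direct map
 *		    does not cover
 *
 * Machines with less memory still get ndmpdp = 4, presumably so that
 * addresses just below 4GB (e.g. memory-mapped devices) remain
 * reachable through the direct map.
 */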
454
455/*
456 *	Bootstrap the system enough to run with virtual memory.
457 *
458 *	On amd64 this is called after mapping has already been enabled
459 *	and just syncs the pmap module with what has already been done.
460 *	[We can't call it easily with mapping off since the kernel is not
461 *	mapped with PA == VA, hence we would have to relocate every address
462 *	from the linked base (virtual) address "KERNBASE" to the actual
463 *	(physical) address starting relative to 0]
464 */
465void
466pmap_bootstrap(firstaddr)
467	vm_paddr_t *firstaddr;
468{
469	vm_offset_t va;
470	pt_entry_t *pte, *unused;
471
472	avail_start = *firstaddr;
473
474	/*
475	 * Create an initial set of page tables to run the kernel in.
476	 */
477	create_pagetables();
478	*firstaddr = avail_start;
479
480	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
481	virtual_avail = pmap_kmem_choose(virtual_avail);
482
483	virtual_end = VM_MAX_KERNEL_ADDRESS;
484
485
486	/* XXX do %cr0 as well */
487	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
488	load_cr3(KPML4phys);
489
490	/*
491	 * Initialize the kernel pmap (which is statically allocated).
492	 */
493	PMAP_LOCK_INIT(kernel_pmap);
494	kernel_pmap->pm_pml4 = (pml4_entry_t *) (KERNBASE + KPML4phys);
495	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
496	TAILQ_INIT(&kernel_pmap->pm_pvlist);
497	nkpt = NKPT;
498
499	/*
500	 * Reserve some special page table entries/VA space for temporary
501	 * mapping of pages.
502	 */
503#define	SYSMAP(c, p, v, n)	\
504	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
505
506	va = virtual_avail;
507	pte = vtopte(va);
508
509	/*
510	 * CMAP1 is only used for the memory test.
511	 */
512	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
513
514	/*
515	 * Crashdump maps.
516	 */
517	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
518
519	/*
520	 * msgbufp is used to map the system message buffer.
521	 */
522	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
523
524	virtual_avail = va;
525
526	*CMAP1 = 0;
527
528	invltlb();
529}
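/*
 * The SYSMAP() macro used in pmap_bootstrap() simply carves matching
 * PTE/VA pairs out of the reserved region.  For example, the CMAP1
 * line expands (with c = caddr_t, p = CMAP1, v = CADDR1, n = 1) to:
 *
 *	CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP1 = pte; pte += 1;
 *
 * leaving CMAP1 pointing at the PTE that maps the page at CADDR1.
 */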
530
531/*
532 *	Initialize a vm_page's machine-dependent fields.
533 */
534void
535pmap_page_init(vm_page_t m)
536{
537
538	TAILQ_INIT(&m->md.pv_list);
539	m->md.pv_list_count = 0;
540}
541
542/*
543 *	Initialize the pmap module.
544 *	Called by vm_init, to initialize any structures that the pmap
545 *	system needs to map virtual memory.
546 */
547void
548pmap_init(void)
549{
550
551	/*
552	 * init the pv free list
553	 */
554	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
555	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
556	uma_prealloc(pvzone, MINPV);
557}
558
559/*
560 * Initialize the address space (zone) for the pv_entries.  Set a
561 * high water mark so that the system can recover from excessive
562 * numbers of pv entries.
563 */
564void
565pmap_init2()
566{
567	int shpgperproc = PMAP_SHPGPERPROC;
568
569	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
570	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
571	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
572	pv_entry_high_water = 9 * (pv_entry_max / 10);
573	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
574}
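/*
 * Rough example of the sizing above, assuming the default
 * PMAP_SHPGPERPROC of 200, a maxproc of 1000 and one million physical
 * pages (illustrative values only):
 *
 *	pv_entry_max        = 200 * 1000 + 1000000 = 1200000
 *	pv_entry_high_water = 9 * (1200000 / 10)   = 1080000
 *
 * Crossing the high-water mark makes get_pv_entry() wake the
 * pagedaemon so pv entries can be reclaimed before the zone limit is
 * reached.
 */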
575
576
577/***************************************************
578 * Low level helper routines.....
579 ***************************************************/
580
581#if defined(PMAP_DIAGNOSTIC)
582
583/*
584 * This code checks for non-writeable/modified pages.
585 * This should be an invalid condition.
586 */
587static int
588pmap_nw_modified(pt_entry_t ptea)
589{
590	int pte;
591
592	pte = (int) ptea;
593
594	if ((pte & (PG_M|PG_RW)) == PG_M)
595		return 1;
596	else
597		return 0;
598}
599#endif
600
601
602/*
603 * This routine reports whether the given address lies outside the
604 * region(s) of memory that should not be tested for the modified bit.
605 */
606static PMAP_INLINE int
607pmap_track_modified(vm_offset_t va)
608{
609	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
610		return 1;
611	else
612		return 0;
613}
614
615#ifdef SMP
616/*
617 * For SMP, these functions have to use the IPI mechanism for coherence.
618 */
619void
620pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
621{
622	u_int cpumask;
623	u_int other_cpus;
624
625	if (smp_started) {
626		if (!(read_rflags() & PSL_I))
627			panic("%s: interrupts disabled", __func__);
628		mtx_lock_spin(&smp_ipi_mtx);
629	} else
630		critical_enter();
631	/*
632	 * We need to disable interrupt preemption but MUST NOT have
633	 * interrupts disabled here.
634	 * XXX we may need to hold schedlock to get a coherent pm_active
635	 * XXX critical sections disable interrupts again
636	 */
637	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
638		invlpg(va);
639		smp_invlpg(va);
640	} else {
641		cpumask = PCPU_GET(cpumask);
642		other_cpus = PCPU_GET(other_cpus);
643		if (pmap->pm_active & cpumask)
644			invlpg(va);
645		if (pmap->pm_active & other_cpus)
646			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
647	}
648	if (smp_started)
649		mtx_unlock_spin(&smp_ipi_mtx);
650	else
651		critical_exit();
652}
653
654void
655pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
656{
657	u_int cpumask;
658	u_int other_cpus;
659	vm_offset_t addr;
660
661	if (smp_started) {
662		if (!(read_rflags() & PSL_I))
663			panic("%s: interrupts disabled", __func__);
664		mtx_lock_spin(&smp_ipi_mtx);
665	} else
666		critical_enter();
667	/*
668	 * We need to disable interrupt preemption but MUST NOT have
669	 * interrupts disabled here.
670	 * XXX we may need to hold schedlock to get a coherent pm_active
671	 * XXX critical sections disable interrupts again
672	 */
673	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
674		for (addr = sva; addr < eva; addr += PAGE_SIZE)
675			invlpg(addr);
676		smp_invlpg_range(sva, eva);
677	} else {
678		cpumask = PCPU_GET(cpumask);
679		other_cpus = PCPU_GET(other_cpus);
680		if (pmap->pm_active & cpumask)
681			for (addr = sva; addr < eva; addr += PAGE_SIZE)
682				invlpg(addr);
683		if (pmap->pm_active & other_cpus)
684			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
685			    sva, eva);
686	}
687	if (smp_started)
688		mtx_unlock_spin(&smp_ipi_mtx);
689	else
690		critical_exit();
691}
692
693void
694pmap_invalidate_all(pmap_t pmap)
695{
696	u_int cpumask;
697	u_int other_cpus;
698
699	if (smp_started) {
700		if (!(read_rflags() & PSL_I))
701			panic("%s: interrupts disabled", __func__);
702		mtx_lock_spin(&smp_ipi_mtx);
703	} else
704		critical_enter();
705	/*
706	 * We need to disable interrupt preemption but MUST NOT have
707	 * interrupts disabled here.
708	 * XXX we may need to hold schedlock to get a coherent pm_active
709	 * XXX critical sections disable interrupts again
710	 */
711	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
712		invltlb();
713		smp_invltlb();
714	} else {
715		cpumask = PCPU_GET(cpumask);
716		other_cpus = PCPU_GET(other_cpus);
717		if (pmap->pm_active & cpumask)
718			invltlb();
719		if (pmap->pm_active & other_cpus)
720			smp_masked_invltlb(pmap->pm_active & other_cpus);
721	}
722	if (smp_started)
723		mtx_unlock_spin(&smp_ipi_mtx);
724	else
725		critical_exit();
726}
727#else /* !SMP */
728/*
729 * Normal, non-SMP, invalidation functions.
730 * We inline these within pmap.c for speed.
731 */
732PMAP_INLINE void
733pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
734{
735
736	if (pmap == kernel_pmap || pmap->pm_active)
737		invlpg(va);
738}
739
740PMAP_INLINE void
741pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
742{
743	vm_offset_t addr;
744
745	if (pmap == kernel_pmap || pmap->pm_active)
746		for (addr = sva; addr < eva; addr += PAGE_SIZE)
747			invlpg(addr);
748}
749
750PMAP_INLINE void
751pmap_invalidate_all(pmap_t pmap)
752{
753
754	if (pmap == kernel_pmap || pmap->pm_active)
755		invltlb();
756}
757#endif /* !SMP */
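/*
 * Typical usage pattern for the invalidation primitives above: update
 * the PTE first, then shoot down the stale TLB entries, picking the
 * per-page, ranged or full flush according to how much changed.  A
 * minimal sketch (hypothetical helper; clearing PG_RW chosen only for
 * illustration):
 */
#if 0
static void
example_write_protect(pmap_t pmap, vm_offset_t va, pt_entry_t *pte)
{

	/* Clear the writable bit, then invalidate the stale TLB entry. */
	atomic_clear_long(pte, PG_RW);
	pmap_invalidate_page(pmap, va);
}
#endif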
758
759/*
760 * Are we current address space or kernel?
761 */
762static __inline int
763pmap_is_current(pmap_t pmap)
764{
765	return (pmap == kernel_pmap ||
766	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
767}
768
769/*
770 *	Routine:	pmap_extract
771 *	Function:
772 *		Extract the physical page address associated
773 *		with the given map/virtual_address pair.
774 */
775vm_paddr_t
776pmap_extract(pmap_t pmap, vm_offset_t va)
777{
778	vm_paddr_t rtval;
779	pt_entry_t *pte;
780	pd_entry_t pde, *pdep;
781
782	rtval = 0;
783	PMAP_LOCK(pmap);
784	pdep = pmap_pde(pmap, va);
785	if (pdep != NULL) {
786		pde = *pdep;
787		if (pde) {
788			if ((pde & PG_PS) != 0) {
789				rtval = (pde & ~PDRMASK) | (va & PDRMASK);
790				PMAP_UNLOCK(pmap);
791				return rtval;
792			}
793			pte = pmap_pde_to_pte(pdep, va);
794			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
795		}
796	}
797	PMAP_UNLOCK(pmap);
798	return (rtval);
799}
800
801/*
802 *	Routine:	pmap_extract_and_hold
803 *	Function:
804 *		Atomically extract and hold the physical page
805 *		with the given pmap and virtual address pair
806 *		if that mapping permits the given protection.
807 */
808vm_page_t
809pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
810{
811	pd_entry_t pde, *pdep;
812	pt_entry_t pte;
813	vm_page_t m;
814
815	m = NULL;
816	vm_page_lock_queues();
817	PMAP_LOCK(pmap);
818	pdep = pmap_pde(pmap, va);
819	if (pdep != NULL && (pde = *pdep)) {
820		if (pde & PG_PS) {
821			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
822				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
823				    (va & PDRMASK));
824				vm_page_hold(m);
825			}
826		} else {
827			pte = *pmap_pde_to_pte(pdep, va);
828			if ((pte & PG_V) &&
829			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
830				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
831				vm_page_hold(m);
832			}
833		}
834	}
835	vm_page_unlock_queues();
836	PMAP_UNLOCK(pmap);
837	return (m);
838}
839
840vm_paddr_t
841pmap_kextract(vm_offset_t va)
842{
843	pd_entry_t *pde;
844	vm_paddr_t pa;
845
846	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
847		pa = DMAP_TO_PHYS(va);
848	} else {
849		pde = vtopde(va);
850		if (*pde & PG_PS) {
851			pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
852		} else {
853			pa = *vtopte(va);
854			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
855		}
856	}
857	return pa;
858}
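/*
 * Worked example of the 2MB-page case above, with illustrative values
 * (NBPDR = 0x200000, a PDE whose frame is 0x40000000):
 *
 *	va                  = 0xffffffff80234567
 *	*pde & ~(NBPDR - 1) = 0x0000000040000000
 *	va & (NBPDR - 1)    =           0x034567
 *	pa                  = 0x0000000040034567
 *
 * For direct-map addresses the translation is pure arithmetic:
 * DMAP_TO_PHYS(va) is just va - DMAP_MIN_ADDRESS.
 */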
859
860/***************************************************
861 * Low level mapping routines.....
862 ***************************************************/
863
864/*
865 * Add a wired page to the kva.
866 * Note: not SMP coherent.
867 */
868PMAP_INLINE void
869pmap_kenter(vm_offset_t va, vm_paddr_t pa)
870{
871	pt_entry_t *pte;
872
873	pte = vtopte(va);
874	pte_store(pte, pa | PG_RW | PG_V | PG_G);
875}
876
877/*
878 * Remove a page from the kernel pagetables.
879 * Note: not SMP coherent.
880 */
881PMAP_INLINE void
882pmap_kremove(vm_offset_t va)
883{
884	pt_entry_t *pte;
885
886	pte = vtopte(va);
887	pte_clear(pte);
888}
889
890/*
891 *	Used to map a range of physical addresses into kernel
892 *	virtual address space.
893 *
894 *	The value passed in '*virt' is a suggested virtual address for
895 *	the mapping. Architectures which can support a direct-mapped
896 *	physical to virtual region can return the appropriate address
897 *	within that region, leaving '*virt' unchanged. Other
898 *	architectures should map the pages starting at '*virt' and
899 *	update '*virt' with the first usable address after the mapped
900 *	region.
901 */
902vm_offset_t
903pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
904{
905	return PHYS_TO_DMAP(start);
906}
907
908
909/*
910 * Add a list of wired pages to the kva.
911 * This routine is only used for temporary
912 * kernel mappings that do not need to have
913 * page modification or references recorded.
914 * Note that old mappings are simply written
915 * over.  The page *must* be wired.
916 * Note: SMP coherent.  Uses a ranged shootdown IPI.
917 */
918void
919pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
920{
921	vm_offset_t va;
922
923	va = sva;
924	while (count-- > 0) {
925		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
926		va += PAGE_SIZE;
927		m++;
928	}
929	pmap_invalidate_range(kernel_pmap, sva, va);
930}
931
932/*
933 * This routine tears out page mappings from the
934 * kernel -- it is meant only for temporary mappings.
935 * Note: SMP coherent.  Uses a ranged shootdown IPI.
936 */
937void
938pmap_qremove(vm_offset_t sva, int count)
939{
940	vm_offset_t va;
941
942	va = sva;
943	while (count-- > 0) {
944		pmap_kremove(va);
945		va += PAGE_SIZE;
946	}
947	pmap_invalidate_range(kernel_pmap, sva, va);
948}
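/*
 * pmap_qenter()/pmap_qremove() are meant for short-lived batches of
 * kernel mappings (e.g. pager I/O).  A minimal usage sketch, assuming
 * the caller already owns a suitable KVA range "kva" and an array of
 * held pages (hypothetical variables):
 *
 *	pmap_qenter(kva, pages, npages);	map the batch, one IPI
 *	... do I/O on the buffer at kva ...
 *	pmap_qremove(kva, npages);		unmap, shoot down again
 */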
949
950/***************************************************
951 * Page table page management routines.....
952 ***************************************************/
953
954/*
955 * This routine unholds page table pages, and if the hold count
956 * drops to zero, then it decrements the wire count.
957 */
958static PMAP_INLINE int
959pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
960{
961
962	--m->wire_count;
963	if (m->wire_count == 0)
964		return _pmap_unwire_pte_hold(pmap, va, m);
965	else
966		return 0;
967}
968
969static int
970_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
971{
972	vm_offset_t pteva;
973
974	/*
975	 * unmap the page table page
976	 */
977	if (m->pindex >= (NUPDE + NUPDPE)) {
978		/* PDP page */
979		pml4_entry_t *pml4;
980		pml4 = pmap_pml4e(pmap, va);
981		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
982		*pml4 = 0;
983	} else if (m->pindex >= NUPDE) {
984		/* PD page */
985		pdp_entry_t *pdp;
986		pdp = pmap_pdpe(pmap, va);
987		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
988		*pdp = 0;
989	} else {
990		/* PTE page */
991		pd_entry_t *pd;
992		pd = pmap_pde(pmap, va);
993		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
994		*pd = 0;
995	}
996	--pmap->pm_stats.resident_count;
997	if (m->pindex < NUPDE) {
998		/* We just released a PT, unhold the matching PD */
999		vm_page_t pdpg;
1000
1001		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1002		pmap_unwire_pte_hold(pmap, va, pdpg);
1003	}
1004	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1005		/* We just released a PD, unhold the matching PDP */
1006		vm_page_t pdppg;
1007
1008		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1009		pmap_unwire_pte_hold(pmap, va, pdppg);
1010	}
1011
1012	/*
1013	 * Do an invltlb to make the invalidated mapping
1014	 * take effect immediately.
1015	 */
1016	pmap_invalidate_page(pmap, pteva);
1017
1018	vm_page_free_zero(m);
1019	atomic_subtract_int(&cnt.v_wire_count, 1);
1020	return 1;
1021}
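/*
 * A pmap's page table pages are distinguished purely by pindex (they
 * belong to no VM object), using the ranges assumed above and in
 * _pmap_allocpte():
 *
 *	[0, NUPDE)			page table (PT) pages
 *	[NUPDE, NUPDE + NUPDPE)		page directory (PD) pages
 *	[NUPDE + NUPDPE, ...)		page directory pointer (PDP) pages
 *
 * which is why releasing a PT page may cascade into unholding its PD
 * page, and releasing a PD page into unholding its PDP page.
 */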
1022
1023/*
1024 * After removing a page table entry, this routine is used to
1025 * conditionally free the page, and manage the hold/wire counts.
1026 */
1027static int
1028pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde)
1029{
1030	vm_page_t mpte;
1031
1032	if (va >= VM_MAXUSER_ADDRESS)
1033		return 0;
1034	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1035	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1036	return pmap_unwire_pte_hold(pmap, va, mpte);
1037}
1038
1039void
1040pmap_pinit0(pmap)
1041	struct pmap *pmap;
1042{
1043
1044	PMAP_LOCK_INIT(pmap);
1045	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1046	pmap->pm_active = 0;
1047	TAILQ_INIT(&pmap->pm_pvlist);
1048	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1049}
1050
1051/*
1052 * Initialize a preallocated and zeroed pmap structure,
1053 * such as one in a vmspace structure.
1054 */
1055void
1056pmap_pinit(pmap)
1057	register struct pmap *pmap;
1058{
1059	vm_page_t pml4pg;
1060	static vm_pindex_t color;
1061
1062	PMAP_LOCK_INIT(pmap);
1063
1064	/*
1065	 * allocate the page directory page
1066	 */
1067	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1068	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1069		VM_WAIT;
1070
1071	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1072
1073	if ((pml4pg->flags & PG_ZERO) == 0)
1074		pagezero(pmap->pm_pml4);
1075
1076	/* Wire in kernel global address entries. */
1077	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1078	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1079
1080	/* install self-referential address mapping entry(s) */
1081	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1082
1083	pmap->pm_active = 0;
1084	TAILQ_INIT(&pmap->pm_pvlist);
1085	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1086}
1087
1088/*
1089 * This routine is called when the page table page needed for a
1090 * mapping is not resident and must be allocated.
1091 *
1092 * Note: If a page allocation fails at page table level two or three,
1093 * one or two pages may be held during the wait, only to be released
1094 * afterwards.  It is easy to argue that this conservative approach
1095 * avoids race conditions.
1096 */
1097static vm_page_t
1098_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1099{
1100	vm_page_t m, pdppg, pdpg;
1101
1102	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1103	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1104	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1105
1106	/*
1107	 * Allocate a page table page.
1108	 */
1109	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1110	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1111		if (flags & M_WAITOK) {
1112			PMAP_UNLOCK(pmap);
1113			vm_page_unlock_queues();
1114			VM_WAIT;
1115			vm_page_lock_queues();
1116			PMAP_LOCK(pmap);
1117		}
1118
1119		/*
1120		 * Indicate the need to retry.  While waiting, the page table
1121		 * page may have been allocated.
1122		 */
1123		return (NULL);
1124	}
1125	if ((m->flags & PG_ZERO) == 0)
1126		pmap_zero_page(m);
1127
1128	/*
1129	 * Map the pagetable page into the process address space, if
1130	 * it isn't already there.
1131	 */
1132
1133	pmap->pm_stats.resident_count++;
1134
1135	if (ptepindex >= (NUPDE + NUPDPE)) {
1136		pml4_entry_t *pml4;
1137		vm_pindex_t pml4index;
1138
1139		/* Wire up a new PDPE page */
1140		pml4index = ptepindex - (NUPDE + NUPDPE);
1141		pml4 = &pmap->pm_pml4[pml4index];
1142		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1143
1144	} else if (ptepindex >= NUPDE) {
1145		vm_pindex_t pml4index;
1146		vm_pindex_t pdpindex;
1147		pml4_entry_t *pml4;
1148		pdp_entry_t *pdp;
1149
1150		/* Wire up a new PDE page */
1151		pdpindex = ptepindex - NUPDE;
1152		pml4index = pdpindex >> NPML4EPGSHIFT;
1153
1154		pml4 = &pmap->pm_pml4[pml4index];
1155		if ((*pml4 & PG_V) == 0) {
1156			/* Have to allocate a new pdp, recurse */
1157			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1158			    flags) == NULL) {
1159				--m->wire_count;
1160				vm_page_free(m);
1161				return (NULL);
1162			}
1163		} else {
1164			/* Add reference to pdp page */
1165			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1166			pdppg->wire_count++;
1167		}
1168		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1169
1170		/* Now find the pdp page */
1171		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1172		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1173
1174	} else {
1175		vm_pindex_t pml4index;
1176		vm_pindex_t pdpindex;
1177		pml4_entry_t *pml4;
1178		pdp_entry_t *pdp;
1179		pd_entry_t *pd;
1180
1181		/* Wire up a new PTE page */
1182		pdpindex = ptepindex >> NPDPEPGSHIFT;
1183		pml4index = pdpindex >> NPML4EPGSHIFT;
1184
1185		/* First, find the pdp and check that it's valid. */
1186		pml4 = &pmap->pm_pml4[pml4index];
1187		if ((*pml4 & PG_V) == 0) {
1188			/* Have to allocate a new pd, recurse */
1189			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1190			    flags) == NULL) {
1191				--m->wire_count;
1192				vm_page_free(m);
1193				return (NULL);
1194			}
1195			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1196			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1197		} else {
1198			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1199			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1200			if ((*pdp & PG_V) == 0) {
1201				/* Have to allocate a new pd, recurse */
1202				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1203				    flags) == NULL) {
1204					--m->wire_count;
1205					vm_page_free(m);
1206					return (NULL);
1207				}
1208			} else {
1209				/* Add reference to the pd page */
1210				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1211				pdpg->wire_count++;
1212			}
1213		}
1214		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1215
1216		/* Now we know where the page directory page is */
1217		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1218		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1219	}
1220
1221	return m;
1222}
1223
1224static vm_page_t
1225pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1226{
1227	vm_pindex_t ptepindex;
1228	pd_entry_t *pd;
1229	vm_page_t m;
1230
1231	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1232	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1233	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1234
1235	/*
1236	 * Calculate pagetable page index
1237	 */
1238	ptepindex = pmap_pde_pindex(va);
1239retry:
1240	/*
1241	 * Get the page directory entry
1242	 */
1243	pd = pmap_pde(pmap, va);
1244
1245	/*
1246	 * This supports switching from a 2MB page to a
1247	 * normal 4K page.
1248	 */
1249	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1250		*pd = 0;
1251		pd = 0;
1252		pmap_invalidate_all(kernel_pmap);
1253	}
1254
1255	/*
1256	 * If the page table page is mapped, we just increment the
1257	 * hold count, and activate it.
1258	 */
1259	if (pd != 0 && (*pd & PG_V) != 0) {
1260		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1261		m->wire_count++;
1262	} else {
1263		/*
1264		 * Here if the pte page isn't mapped, or if it has been
1265		 * deallocated.
1266		 */
1267		m = _pmap_allocpte(pmap, ptepindex, flags);
1268		if (m == NULL && (flags & M_WAITOK))
1269			goto retry;
1270	}
1271	return (m);
1272}
1273
1274
1275/***************************************************
1276 * Pmap allocation/deallocation routines.
1277 ***************************************************/
1278
1279/*
1280 * Release any resources held by the given physical map.
1281 * Called when a pmap initialized by pmap_pinit is being released.
1282 * Should only be called if the map contains no valid mappings.
1283 */
1284void
1285pmap_release(pmap_t pmap)
1286{
1287	vm_page_t m;
1288
1289	KASSERT(pmap->pm_stats.resident_count == 0,
1290	    ("pmap_release: pmap resident count %ld != 0",
1291	    pmap->pm_stats.resident_count));
1292
1293	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1294
1295	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1296	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1297	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1298
1299	vm_page_lock_queues();
1300	m->wire_count--;
1301	atomic_subtract_int(&cnt.v_wire_count, 1);
1302	vm_page_free_zero(m);
1303	vm_page_unlock_queues();
1304	PMAP_LOCK_DESTROY(pmap);
1305}
1306
1307static int
1308kvm_size(SYSCTL_HANDLER_ARGS)
1309{
1310	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1311
1312	return sysctl_handle_long(oidp, &ksize, 0, req);
1313}
1314SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1315    0, 0, kvm_size, "IU", "Size of KVM");
1316
1317static int
1318kvm_free(SYSCTL_HANDLER_ARGS)
1319{
1320	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1321
1322	return sysctl_handle_long(oidp, &kfree, 0, req);
1323}
1324SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1325    0, 0, kvm_free, "IU", "Amount of KVM free");
1326
1327/*
1328 * grow the number of kernel page table entries, if needed
1329 */
1330void
1331pmap_growkernel(vm_offset_t addr)
1332{
1333	vm_paddr_t paddr;
1334	vm_page_t nkpg;
1335	pd_entry_t *pde, newpdir;
1336	pdp_entry_t newpdp;
1337
1338	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1339	if (kernel_vm_end == 0) {
1340		kernel_vm_end = KERNBASE;
1341		nkpt = 0;
1342		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1343			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1344			nkpt++;
1345		}
1346	}
1347	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1348	while (kernel_vm_end < addr) {
1349		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1350		if (pde == NULL) {
1351			/* We need a new PDP entry */
1352			nkpg = vm_page_alloc(NULL, nkpt,
1353			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1354			if (!nkpg)
1355				panic("pmap_growkernel: no memory to grow kernel");
1356			pmap_zero_page(nkpg);
1357			paddr = VM_PAGE_TO_PHYS(nkpg);
1358			newpdp = (pdp_entry_t)
1359				(paddr | PG_V | PG_RW | PG_A | PG_M);
1360			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1361			continue; /* try again */
1362		}
1363		if ((*pde & PG_V) != 0) {
1364			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1365			continue;
1366		}
1367
1368		/*
1369		 * This index is bogus, but out of the way
1370		 */
1371		nkpg = vm_page_alloc(NULL, nkpt,
1372		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1373		if (!nkpg)
1374			panic("pmap_growkernel: no memory to grow kernel");
1375
1376		nkpt++;
1377
1378		pmap_zero_page(nkpg);
1379		paddr = VM_PAGE_TO_PHYS(nkpg);
1380		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1381		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1382
1383		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1384	}
1385}
1386
1387
1388/***************************************************
1389 * page management routines.
1390 ***************************************************/
1391
1392/*
1393 * free the pv_entry back to the free list
1394 */
1395static PMAP_INLINE void
1396free_pv_entry(pv_entry_t pv)
1397{
1398	pv_entry_count--;
1399	uma_zfree(pvzone, pv);
1400}
1401
1402/*
1403 * Get a new pv_entry, allocating a block from the system
1404 * when needed.
1405 * The memory allocation bypasses the malloc code because of
1406 * the possibility of allocations at interrupt time.
1407 */
1408static pv_entry_t
1409get_pv_entry(void)
1410{
1411	pv_entry_count++;
1412	if (pv_entry_high_water &&
1413		(pv_entry_count > pv_entry_high_water) &&
1414		(pmap_pagedaemon_waken == 0)) {
1415		pmap_pagedaemon_waken = 1;
1416		wakeup (&vm_pages_needed);
1417	}
1418	return uma_zalloc(pvzone, M_NOWAIT);
1419}
1420
1421
1422static void
1423pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1424{
1425	pv_entry_t pv;
1426
1427	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1428	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1429	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1430		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1431			if (pmap == pv->pv_pmap && va == pv->pv_va)
1432				break;
1433		}
1434	} else {
1435		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1436			if (va == pv->pv_va)
1437				break;
1438		}
1439	}
1440	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1441	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1442	m->md.pv_list_count--;
1443	if (TAILQ_EMPTY(&m->md.pv_list))
1444		vm_page_flag_clear(m, PG_WRITEABLE);
1445	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1446	free_pv_entry(pv);
1447}
1448
1449/*
1450 * Create a pv entry for page at pa for
1451 * (pmap, va).
1452 */
1453static void
1454pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1455{
1456	pv_entry_t pv;
1457
1458	pv = get_pv_entry();
1459	pv->pv_va = va;
1460	pv->pv_pmap = pmap;
1461
1462	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1463	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1464	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1465	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1466	m->md.pv_list_count++;
1467}
1468
1469/*
1470 * pmap_remove_pte: do the things to unmap a page in a process
1471 */
1472static int
1473pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde)
1474{
1475	pt_entry_t oldpte;
1476	vm_page_t m;
1477
1478	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1479	oldpte = pte_load_clear(ptq);
1480	if (oldpte & PG_W)
1481		pmap->pm_stats.wired_count -= 1;
1482	/*
1483	 * Machines that don't support invlpg also don't support
1484	 * PG_G.
1485	 */
1486	if (oldpte & PG_G)
1487		pmap_invalidate_page(kernel_pmap, va);
1488	pmap->pm_stats.resident_count -= 1;
1489	if (oldpte & PG_MANAGED) {
1490		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
1491		if (oldpte & PG_M) {
1492#if defined(PMAP_DIAGNOSTIC)
1493			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1494				printf(
1495	"pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1496				    va, oldpte);
1497			}
1498#endif
1499			if (pmap_track_modified(va))
1500				vm_page_dirty(m);
1501		}
1502		if (oldpte & PG_A)
1503			vm_page_flag_set(m, PG_REFERENCED);
1504		pmap_remove_entry(pmap, m, va);
1505	}
1506	return (pmap_unuse_pt(pmap, va, ptepde));
1507}
1508
1509/*
1510 * Remove a single page from a process address space
1511 */
1512static void
1513pmap_remove_page(pmap_t pmap, vm_offset_t va)
1514{
1515	pd_entry_t ptepde;
1516	pt_entry_t *pte;
1517
1518	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1519	pte = pmap_pte_pde(pmap, va, &ptepde);
1520	if (pte == NULL || (*pte & PG_V) == 0)
1521		return;
1522	pmap_remove_pte(pmap, pte, va, ptepde);
1523	pmap_invalidate_page(pmap, va);
1524}
1525
1526/*
1527 *	Remove the given range of addresses from the specified map.
1528 *
1529 *	It is assumed that the start and end are properly
1530 *	rounded to the page size.
1531 */
1532void
1533pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1534{
1535	vm_offset_t va_next;
1536	pml4_entry_t *pml4e;
1537	pdp_entry_t *pdpe;
1538	pd_entry_t ptpaddr, *pde;
1539	pt_entry_t *pte;
1540	int anyvalid;
1541
1542	/*
1543	 * Perform an unsynchronized read.  This is, however, safe.
1544	 */
1545	if (pmap->pm_stats.resident_count == 0)
1546		return;
1547
1548	anyvalid = 0;
1549
1550	vm_page_lock_queues();
1551	PMAP_LOCK(pmap);
1552
1553	/*
1554	 * Special handling for removing a single page: a very
1555	 * common operation for which it is easy to short-circuit
1556	 * some code.
1557	 */
1558	if (sva + PAGE_SIZE == eva) {
1559		pde = pmap_pde(pmap, sva);
1560		if (pde && (*pde & PG_PS) == 0) {
1561			pmap_remove_page(pmap, sva);
1562			goto out;
1563		}
1564	}
1565
1566	for (; sva < eva; sva = va_next) {
1567
1568		if (pmap->pm_stats.resident_count == 0)
1569			break;
1570
1571		pml4e = pmap_pml4e(pmap, sva);
1572		if (pml4e == 0) {
1573			va_next = (sva + NBPML4) & ~PML4MASK;
1574			continue;
1575		}
1576
1577		pdpe = pmap_pdpe(pmap, sva);
1578		if (pdpe == 0) {
1579			va_next = (sva + NBPDP) & ~PDPMASK;
1580			continue;
1581		}
1582
1583		/*
1584		 * Calculate index for next page table.
1585		 */
1586		va_next = (sva + NBPDR) & ~PDRMASK;
1587
1588		pde = pmap_pde(pmap, sva);
1589		if (pde == 0)
1590			continue;
1591		ptpaddr = *pde;
1592
1593		/*
1594		 * Weed out invalid mappings.
1595		 */
1596		if (ptpaddr == 0)
1597			continue;
1598
1599		/*
1600		 * Check for large page.
1601		 */
1602		if ((ptpaddr & PG_PS) != 0) {
1603			*pde = 0;
1604			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1605			anyvalid = 1;
1606			continue;
1607		}
1608
1609		/*
1610		 * Limit our scan to either the end of the va represented
1611		 * by the current page table page, or to the end of the
1612		 * range being removed.
1613		 */
1614		if (va_next > eva)
1615			va_next = eva;
1616
1617		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1618		    sva += PAGE_SIZE) {
1619			if (*pte == 0)
1620				continue;
1621			anyvalid = 1;
1622			if (pmap_remove_pte(pmap, pte, sva, ptpaddr))
1623				break;
1624		}
1625	}
1626out:
1627	vm_page_unlock_queues();
1628	if (anyvalid)
1629		pmap_invalidate_all(pmap);
1630	PMAP_UNLOCK(pmap);
1631}
1632
1633/*
1634 *	Routine:	pmap_remove_all
1635 *	Function:
1636 *		Removes this physical page from
1637 *		all physical maps in which it resides.
1638 *		Reflects back modify bits to the pager.
1639 *
1640 *	Notes:
1641 *		Original versions of this routine were very
1642 *		inefficient because they iteratively called
1643 *		pmap_remove (slow...)
1644 */
1645
1646void
1647pmap_remove_all(vm_page_t m)
1648{
1649	register pv_entry_t pv;
1650	pt_entry_t *pte, tpte;
1651	pd_entry_t ptepde;
1652
1653#if defined(PMAP_DIAGNOSTIC)
1654	/*
1655	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1656	 */
1657	if (m->flags & PG_FICTITIOUS) {
1658		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx",
1659		    VM_PAGE_TO_PHYS(m));
1660	}
1661#endif
1662	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1663	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1664		PMAP_LOCK(pv->pv_pmap);
1665		pv->pv_pmap->pm_stats.resident_count--;
1666		pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde);
1667		tpte = pte_load_clear(pte);
1668		if (tpte & PG_W)
1669			pv->pv_pmap->pm_stats.wired_count--;
1670		if (tpte & PG_A)
1671			vm_page_flag_set(m, PG_REFERENCED);
1672
1673		/*
1674		 * Update the vm_page_t clean and reference bits.
1675		 */
1676		if (tpte & PG_M) {
1677#if defined(PMAP_DIAGNOSTIC)
1678			if (pmap_nw_modified((pt_entry_t) tpte)) {
1679				printf(
1680	"pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1681				    pv->pv_va, tpte);
1682			}
1683#endif
1684			if (pmap_track_modified(pv->pv_va))
1685				vm_page_dirty(m);
1686		}
1687		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1688		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1689		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1690		m->md.pv_list_count--;
1691		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde);
1692		PMAP_UNLOCK(pv->pv_pmap);
1693		free_pv_entry(pv);
1694	}
1695	vm_page_flag_clear(m, PG_WRITEABLE);
1696}
1697
1698/*
1699 *	Set the physical protection on the
1700 *	specified range of this map as requested.
1701 */
1702void
1703pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1704{
1705	vm_offset_t va_next;
1706	pml4_entry_t *pml4e;
1707	pdp_entry_t *pdpe;
1708	pd_entry_t ptpaddr, *pde;
1709	pt_entry_t *pte;
1710	int anychanged;
1711
1712	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1713		pmap_remove(pmap, sva, eva);
1714		return;
1715	}
1716
1717	if (prot & VM_PROT_WRITE)
1718		return;
1719
1720	anychanged = 0;
1721
1722	vm_page_lock_queues();
1723	PMAP_LOCK(pmap);
1724	for (; sva < eva; sva = va_next) {
1725
1726		pml4e = pmap_pml4e(pmap, sva);
1727		if (pml4e == 0) {
1728			va_next = (sva + NBPML4) & ~PML4MASK;
1729			continue;
1730		}
1731
1732		pdpe = pmap_pdpe(pmap, sva);
1733		if (pdpe == 0) {
1734			va_next = (sva + NBPDP) & ~PDPMASK;
1735			continue;
1736		}
1737
1738		va_next = (sva + NBPDR) & ~PDRMASK;
1739
1740		pde = pmap_pde(pmap, sva);
1741		if (pde == NULL)
1742			continue;
1743		ptpaddr = *pde;
1744
1745		/*
1746		 * Weed out invalid mappings.
1747		 */
1748		if (ptpaddr == 0)
1749			continue;
1750
1751		/*
1752		 * Check for large page.
1753		 */
1754		if ((ptpaddr & PG_PS) != 0) {
1755			*pde &= ~(PG_M|PG_RW);
1756			anychanged = 1;
1757			continue;
1758		}
1759
1760		if (va_next > eva)
1761			va_next = eva;
1762
1763		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
1764		    sva += PAGE_SIZE) {
1765			pt_entry_t obits, pbits;
1766			vm_page_t m;
1767
1768retry:
1769			obits = pbits = *pte;
1770			if (pbits & PG_MANAGED) {
1771				m = NULL;
1772				if (pbits & PG_A) {
1773					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
1774					vm_page_flag_set(m, PG_REFERENCED);
1775					pbits &= ~PG_A;
1776				}
1777				if ((pbits & PG_M) != 0 &&
1778				    pmap_track_modified(sva)) {
1779					if (m == NULL)
1780						m = PHYS_TO_VM_PAGE(pbits &
1781						    PG_FRAME);
1782					vm_page_dirty(m);
1783				}
1784			}
1785
1786			pbits &= ~(PG_RW | PG_M);
1787
1788			if (pbits != obits) {
1789				if (!atomic_cmpset_long(pte, obits, pbits))
1790					goto retry;
1791				if (obits & PG_G)
1792					pmap_invalidate_page(pmap, sva);
1793				else
1794					anychanged = 1;
1795			}
1796		}
1797	}
1798	vm_page_unlock_queues();
1799	if (anychanged)
1800		pmap_invalidate_all(pmap);
1801	PMAP_UNLOCK(pmap);
1802}
1803
1804/*
1805 *	Insert the given physical page (p) at
1806 *	the specified virtual address (v) in the
1807 *	target physical map with the protection requested.
1808 *
1809 *	If specified, the page will be wired down, meaning
1810 *	that the related pte can not be reclaimed.
1811 *
1812 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1813 *	or lose information.  That is, this routine must actually
1814 *	insert this page into the given map NOW.
1815 */
1816void
1817pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1818	   boolean_t wired)
1819{
1820	vm_paddr_t pa;
1821	register pt_entry_t *pte;
1822	vm_paddr_t opa;
1823	pt_entry_t origpte, newpte;
1824	vm_page_t mpte, om;
1825
1826	va = trunc_page(va);
1827#ifdef PMAP_DIAGNOSTIC
1828	if (va > VM_MAX_KERNEL_ADDRESS)
1829		panic("pmap_enter: toobig");
1830	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1831		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
1832#endif
1833
1834	mpte = NULL;
1835
1836	vm_page_lock_queues();
1837	PMAP_LOCK(pmap);
1838
1839	/*
1840	 * In the case that a page table page is not
1841	 * resident, we are creating it here.
1842	 */
1843	if (va < VM_MAXUSER_ADDRESS) {
1844		mpte = pmap_allocpte(pmap, va, M_WAITOK);
1845	}
1846#if 0 && defined(PMAP_DIAGNOSTIC)
1847	else {
1848		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1849		origpte = *pdeaddr;
1850		if ((origpte & PG_V) == 0) {
1851			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
1852				origpte, va);
1853		}
1854	}
1855#endif
1856
1857	pte = pmap_pte(pmap, va);
1858
1859	/*
1860	 * Page Directory table entry not valid, we need a new PT page
1861	 */
1862	if (pte == NULL)
1863		panic("pmap_enter: invalid page directory va=%#lx\n", va);
1864
1865	pa = VM_PAGE_TO_PHYS(m);
1866	om = NULL;
1867	origpte = *pte;
1868	opa = origpte & PG_FRAME;
1869
1870	if (origpte & PG_PS)
1871		panic("pmap_enter: attempted pmap_enter on 2MB page");
1872
1873	/*
1874	 * Mapping has not changed, must be protection or wiring change.
1875	 */
1876	if (origpte && (opa == pa)) {
1877		/*
1878		 * Wiring change, just update stats. We don't worry about
1879		 * wiring PT pages as they remain resident as long as there
1880		 * are valid mappings in them. Hence, if a user page is wired,
1881		 * the PT page will be also.
1882		 */
1883		if (wired && ((origpte & PG_W) == 0))
1884			pmap->pm_stats.wired_count++;
1885		else if (!wired && (origpte & PG_W))
1886			pmap->pm_stats.wired_count--;
1887
1888#if defined(PMAP_DIAGNOSTIC)
1889		if (pmap_nw_modified((pt_entry_t) origpte)) {
1890			printf(
1891	"pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1892			    va, origpte);
1893		}
1894#endif
1895
1896		/*
1897		 * Remove extra pte reference
1898		 */
1899		if (mpte)
1900			mpte->wire_count--;
1901
1902		/*
1903		 * We might be turning off write access to the page,
1904		 * so we go ahead and sense modify status.
1905		 */
1906		if (origpte & PG_MANAGED) {
1907			om = m;
1908			pa |= PG_MANAGED;
1909		}
1910		goto validate;
1911	}
1912	/*
1913	 * Mapping has changed, invalidate old range and fall through to
1914	 * handle validating new mapping.
1915	 */
1916	if (opa) {
1917		if (origpte & PG_W)
1918			pmap->pm_stats.wired_count--;
1919		if (origpte & PG_MANAGED) {
1920			om = PHYS_TO_VM_PAGE(opa);
1921			pmap_remove_entry(pmap, om, va);
1922		}
1923		if (mpte != NULL) {
1924			mpte->wire_count--;
1925			KASSERT(mpte->wire_count > 0,
1926			    ("pmap_enter: missing reference to page table page,"
1927			     " va: 0x%lx", va));
1928		}
1929	} else
1930		pmap->pm_stats.resident_count++;
1931
1932	/*
1933	 * Enter on the PV list if part of our managed memory.
1934	 */
1935	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1936		pmap_insert_entry(pmap, va, m);
1937		pa |= PG_MANAGED;
1938	}
1939
1940	/*
1941	 * Increment counters
1942	 */
1943	if (wired)
1944		pmap->pm_stats.wired_count++;
1945
1946validate:
1947	/*
1948	 * Now validate mapping with desired protection/wiring.
1949	 */
1950	newpte = (pt_entry_t)(pa | PG_V);
1951	if ((prot & VM_PROT_WRITE) != 0)
1952		newpte |= PG_RW;
1953	if ((prot & VM_PROT_EXECUTE) == 0)
1954		newpte |= pg_nx;
1955	if (wired)
1956		newpte |= PG_W;
1957	if (va < VM_MAXUSER_ADDRESS)
1958		newpte |= PG_U;
1959	if (pmap == kernel_pmap)
1960		newpte |= PG_G;
1961
1962	/*
1963	 * if the mapping or permission bits are different, we need
1964	 * to update the pte.
1965	 */
1966	if ((origpte & ~(PG_M|PG_A)) != newpte) {
1967		if (origpte & PG_MANAGED) {
1968			origpte = pte_load_store(pte, newpte | PG_A);
1969			if ((origpte & PG_M) && pmap_track_modified(va))
1970				vm_page_dirty(om);
1971			if (origpte & PG_A)
1972				vm_page_flag_set(om, PG_REFERENCED);
1973		} else
1974			pte_store(pte, newpte | PG_A);
1975		if (origpte) {
1976			pmap_invalidate_page(pmap, va);
1977		}
1978	}
1979	vm_page_unlock_queues();
1980	PMAP_UNLOCK(pmap);
1981}
1982
1983/*
1984 * this code makes some *MAJOR* assumptions:
1985 * 1. Current pmap & pmap exists.
1986 * 2. Not wired.
1987 * 3. Read access.
1988 * 4. No page table pages.
1989 * 5. Page IS managed.
1990 * but is *MUCH* faster than pmap_enter...
1991 */
1992
1993vm_page_t
1994pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
1995{
1996	pt_entry_t *pte;
1997	vm_paddr_t pa;
1998
1999	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2000	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2001	PMAP_LOCK(pmap);
2002
2003	/*
2004	 * In the case that a page table page is not
2005	 * resident, we are creating it here.
2006	 */
2007	if (va < VM_MAXUSER_ADDRESS) {
2008		vm_pindex_t ptepindex;
2009		pd_entry_t *ptepa;
2010
2011		/*
2012		 * Calculate pagetable page index
2013		 */
2014		ptepindex = pmap_pde_pindex(va);
2015		if (mpte && (mpte->pindex == ptepindex)) {
2016			mpte->wire_count++;
2017		} else {
2018	retry:
2019			/*
2020			 * Get the page directory entry
2021			 */
2022			ptepa = pmap_pde(pmap, va);
2023
2024			/*
2025			 * If the page table page is mapped, we just increment
2026			 * the hold count, and activate it.
2027			 */
2028			if (ptepa && (*ptepa & PG_V) != 0) {
2029				if (*ptepa & PG_PS)
2030					panic("pmap_enter_quick: unexpected mapping into 2MB page");
2031				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
2032				mpte->wire_count++;
2033			} else {
2034				mpte = _pmap_allocpte(pmap, ptepindex,
2035				    M_NOWAIT);
2036				if (mpte == NULL) {
2037					PMAP_UNLOCK(pmap);
2038					vm_page_busy(m);
2039					vm_page_unlock_queues();
2040					VM_OBJECT_UNLOCK(m->object);
2041					VM_WAIT;
2042					VM_OBJECT_LOCK(m->object);
2043					vm_page_lock_queues();
2044					vm_page_wakeup(m);
2045					PMAP_LOCK(pmap);
2046					goto retry;
2047				}
2048			}
2049		}
2050	} else {
2051		mpte = NULL;
2052	}
2053
2054	/*
2055	 * This call to vtopte makes the assumption that we are
2056	 * entering the page into the current pmap.  In order to support
2057	 * quick entry into any pmap, one would likely use pmap_pte.
2058	 * But that isn't as quick as vtopte.
2059	 */
2060	pte = vtopte(va);
2061	if (*pte) {
2062		if (mpte != NULL) {
2063			pmap_unwire_pte_hold(pmap, va, mpte);
2064			mpte = NULL;
2065		}
2066		goto out;
2067	}
2068
2069	/*
2070	 * Enter on the PV list if part of our managed memory.  Note that the
2071	 * pv lists are protected by the page queues lock, which the caller
2072	 * holds across this call.
2073	 */
2074	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2075		pmap_insert_entry(pmap, va, m);
2076
2077	/*
2078	 * Increment counters
2079	 */
2080	pmap->pm_stats.resident_count++;
2081
2082	pa = VM_PAGE_TO_PHYS(m);
2083
2084	/*
2085	 * Now validate mapping with RO protection
2086	 */
2087	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2088		pte_store(pte, pa | PG_V | PG_U);
2089	else
2090		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2091out:
2092	PMAP_UNLOCK(pmap);
2093	return mpte;
2094}
2095
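/*
 * Illustrative sketch (hypothetical helper, not part of the original source):
 * the usual calling pattern for pmap_enter_quick() when prefaulting a run of
 * resident pages.  The returned page table page is fed back in as "mpte" so
 * that consecutive calls within the same page table page avoid the PDE
 * lookup.  The caller is assumed to hold the page queues lock and the
 * object lock of the pages, as asserted at the top of pmap_enter_quick().
 */
static void
example_prefault_run(pmap_t pmap, vm_offset_t va, vm_page_t *pages, int npages)
{
	vm_page_t mpte;
	int i;

	mpte = NULL;
	for (i = 0; i < npages; i++) {
		/* Read-only, unwired mapping of a managed page. */
		mpte = pmap_enter_quick(pmap, va + i * PAGE_SIZE, pages[i],
		    mpte);
	}
}
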
2096/*
2097 * Make a temporary mapping for a physical address.  This is only intended
2098 * to be used for panic dumps.
2099 */
2100void *
2101pmap_kenter_temporary(vm_paddr_t pa, int i)
2102{
2103	vm_offset_t va;
2104
2105	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2106	pmap_kenter(va, pa);
2107	invlpg(va);
2108	return ((void *)crashdumpmap);
2109}
2110
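/*
 * Illustrative sketch (not part of the original source): how a dump routine
 * might use pmap_kenter_temporary() to inspect an arbitrary physical page.
 * Index 0 maps the page at the base of crashdumpmap, which is also the
 * address returned.  The helper name is hypothetical.
 */
static void
example_dump_one_page(vm_paddr_t pa, char *buf)
{
	void *va;

	va = pmap_kenter_temporary(pa, 0);
	bcopy(va, buf, PAGE_SIZE);	/* Copy the page out through the mapping. */
}
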
2111/*
2112 * This code maps large physical mmap regions into the
2113 * processor address space.  Note that some shortcuts
2114 * are taken, but the code works.
2115 */
2116void
2117pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2118		    vm_object_t object, vm_pindex_t pindex,
2119		    vm_size_t size)
2120{
2121	vm_page_t p;
2122
2123	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2124	KASSERT(object->type == OBJT_DEVICE,
2125	    ("pmap_object_init_pt: non-device object"));
2126	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2127		int i;
2128		vm_page_t m[1];
2129		int npdes;
2130		pd_entry_t ptepa, *pde;
2131
2132		PMAP_LOCK(pmap);
2133		pde = pmap_pde(pmap, addr);
2134		if (pde != 0 && (*pde & PG_V) != 0)
2135			goto out;
2136		PMAP_UNLOCK(pmap);
2137retry:
2138		p = vm_page_lookup(object, pindex);
2139		if (p != NULL) {
2140			vm_page_lock_queues();
2141			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2142				goto retry;
2143		} else {
2144			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2145			if (p == NULL)
2146				return;
2147			m[0] = p;
2148
2149			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2150				vm_page_lock_queues();
2151				vm_page_free(p);
2152				vm_page_unlock_queues();
2153				return;
2154			}
2155
2156			p = vm_page_lookup(object, pindex);
2157			vm_page_lock_queues();
2158			vm_page_wakeup(p);
2159		}
2160		vm_page_unlock_queues();
2161
2162		ptepa = VM_PAGE_TO_PHYS(p);
2163		if (ptepa & (NBPDR - 1))
2164			return;
2165
2166		p->valid = VM_PAGE_BITS_ALL;
2167
2168		PMAP_LOCK(pmap);
2169		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2170		npdes = size >> PDRSHIFT;
2171		for(i = 0; i < npdes; i++) {
2172			pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS);
2173			ptepa += NBPDR;
2174			pde++;
2175		}
2176		pmap_invalidate_all(pmap);
2177out:
2178		PMAP_UNLOCK(pmap);
2179	}
2180}
2181
2182/*
2183 *	Routine:	pmap_change_wiring
2184 *	Function:	Change the wiring attribute for a map/virtual-address
2185 *			pair.
2186 *	In/out conditions:
2187 *			The mapping must already exist in the pmap.
2188 */
2189void
2190pmap_change_wiring(pmap, va, wired)
2191	register pmap_t pmap;
2192	vm_offset_t va;
2193	boolean_t wired;
2194{
2195	register pt_entry_t *pte;
2196
2197	/*
2198	 * Wiring is not a hardware characteristic, so there is no need to
2199	 * invalidate the TLB.
2200	 */
2201	PMAP_LOCK(pmap);
2202	pte = pmap_pte(pmap, va);
2203	if (wired && (*pte & PG_W) == 0) {
2204		pmap->pm_stats.wired_count++;
2205		atomic_set_long(pte, PG_W);
2206	} else if (!wired && (*pte & PG_W) != 0) {
2207		pmap->pm_stats.wired_count--;
2208		atomic_clear_long(pte, PG_W);
2209	}
2210	PMAP_UNLOCK(pmap);
2211}
2212
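/*
 * Illustrative sketch (hypothetical helper, not part of the original source):
 * toggling the wired attribute of an existing mapping.  The mapping must
 * already be present; pmap_change_wiring() only flips PG_W and adjusts the
 * pmap's wired_count, it never creates or destroys mappings.
 */
static void
example_wire_unwire(pmap_t pmap, vm_offset_t va)
{

	pmap_change_wiring(pmap, va, TRUE);	/* Wire the page at va. */
	pmap_change_wiring(pmap, va, FALSE);	/* And unwire it again. */
}
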
2213
2214
2215/*
2216 *	Copy the range specified by src_addr/len
2217 *	from the source map to the range dst_addr/len
2218 *	in the destination map.
2219 *
2220 *	This routine is only advisory and need not do anything.
2221 */
2222
2223void
2224pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2225	  vm_offset_t src_addr)
2226{
2227	vm_offset_t addr;
2228	vm_offset_t end_addr = src_addr + len;
2229	vm_offset_t va_next;
2230	vm_page_t m;
2231
2232	if (dst_addr != src_addr)
2233		return;
2234
2235	if (!pmap_is_current(src_pmap))
2236		return;
2237
2238	vm_page_lock_queues();
2239	if (dst_pmap < src_pmap) {
2240		PMAP_LOCK(dst_pmap);
2241		PMAP_LOCK(src_pmap);
2242	} else {
2243		PMAP_LOCK(src_pmap);
2244		PMAP_LOCK(dst_pmap);
2245	}
2246	for (addr = src_addr; addr < end_addr; addr = va_next) {
2247		pt_entry_t *src_pte, *dst_pte;
2248		vm_page_t dstmpte, srcmpte;
2249		pml4_entry_t *pml4e;
2250		pdp_entry_t *pdpe;
2251		pd_entry_t srcptepaddr, *pde;
2252
2253		if (addr >= UPT_MIN_ADDRESS)
2254			panic("pmap_copy: invalid to pmap_copy page tables");
2255
2256		/*
2257		 * Don't let optional prefaulting of pages make us go
2258		 * way below the low water mark of free pages or way
2259		 * above the high water mark of used pv entries.
2260		 */
2261		if (cnt.v_free_count < cnt.v_free_reserved ||
2262		    pv_entry_count > pv_entry_high_water)
2263			break;
2264
2265		pml4e = pmap_pml4e(src_pmap, addr);
2266		if (pml4e == 0) {
2267			va_next = (addr + NBPML4) & ~PML4MASK;
2268			continue;
2269		}
2270
2271		pdpe = pmap_pdpe(src_pmap, addr);
2272		if (pdpe == 0) {
2273			va_next = (addr + NBPDP) & ~PDPMASK;
2274			continue;
2275		}
2276
2277		va_next = (addr + NBPDR) & ~PDRMASK;
2278
2279		pde = pmap_pde(src_pmap, addr);
2280		if (pde)
2281			srcptepaddr = *pde;
2282		else
2283			continue;
2284		if (srcptepaddr == 0)
2285			continue;
2286
2287		if (srcptepaddr & PG_PS) {
2288			pde = pmap_pde(dst_pmap, addr);
2289			if (pde == 0) {
2290				/*
2291				 * XXX should do an allocpte here to
2292				 * instantiate the pde
2293				 */
2294				continue;
2295			}
2296			if (*pde == 0) {
2297				*pde = srcptepaddr;
2298				dst_pmap->pm_stats.resident_count +=
2299				    NBPDR / PAGE_SIZE;
2300			}
2301			continue;
2302		}
2303
2304		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
2305		if (srcmpte->wire_count == 0)
2306			panic("pmap_copy: source page table page is unused");
2307
2308		if (va_next > end_addr)
2309			va_next = end_addr;
2310
2311		src_pte = vtopte(addr);
2312		while (addr < va_next) {
2313			pt_entry_t ptetemp;
2314			ptetemp = *src_pte;
2315			/*
2316			 * We only virtual-copy managed pages.
2317			 */
2318			if ((ptetemp & PG_MANAGED) != 0) {
2319				/*
2320				 * We have to check after allocpte for the
2321				 * pte still being around...  allocpte can
2322				 * block.
2323				 */
2324				dstmpte = pmap_allocpte(dst_pmap, addr,
2325				    M_NOWAIT);
2326				if (dstmpte == NULL)
2327					break;
2328				dst_pte = (pt_entry_t *)
2329				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2330				dst_pte = &dst_pte[pmap_pte_index(addr)];
2331				if (*dst_pte == 0) {
2332					/*
2333					 * Clear the modified and
2334					 * accessed (referenced) bits
2335					 * during the copy.
2336					 */
2337					m = PHYS_TO_VM_PAGE(ptetemp & PG_FRAME);
2338					*dst_pte = ptetemp & ~(PG_M | PG_A);
2339					dst_pmap->pm_stats.resident_count++;
2340					pmap_insert_entry(dst_pmap, addr, m);
2341				} else
2342					pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2343				if (dstmpte->wire_count >= srcmpte->wire_count)
2344					break;
2345			}
2346			addr += PAGE_SIZE;
2347			src_pte++;
2348		}
2349	}
2350	vm_page_unlock_queues();
2351	PMAP_UNLOCK(src_pmap);
2352	PMAP_UNLOCK(dst_pmap);
2353}
2354
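/*
 * Illustrative sketch (not part of the original source): pmap_copy() is
 * advisory, so a fork-style caller would typically mirror the parent's
 * mappings over the same virtual range and simply accept whatever subset the
 * routine manages to copy.  The helper and its argument names are
 * hypothetical.
 */
static void
example_copy_user_range(pmap_t dst, pmap_t src, vm_offset_t start,
    vm_offset_t end)
{

	/* Identical source and destination addresses are required. */
	pmap_copy(dst, src, start, end - start, start);
}
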
2355/*
2356 *	pmap_zero_page zeros the specified hardware page through the
2357 *	direct map, using pagezero to clear its contents.
2358 */
2359void
2360pmap_zero_page(vm_page_t m)
2361{
2362	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2363
2364	pagezero((void *)va);
2365}
2366
2367/*
2368 *	pmap_zero_page_area zeros part of the specified hardware page
2369 *	through the direct map, using pagezero or bzero to clear its contents.
2370 *
2371 *	off and size may not cover an area beyond a single hardware page.
2372 */
2373void
2374pmap_zero_page_area(vm_page_t m, int off, int size)
2375{
2376	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2377
2378	if (off == 0 && size == PAGE_SIZE)
2379		pagezero((void *)va);
2380	else
2381		bzero((char *)va + off, size);
2382}
2383
2384/*
2385 *	pmap_zero_page_idle zeros the specified hardware page through the
2386 *	direct map, using pagezero to clear its contents.  This
2387 *	is intended to be called from the vm_pagezero process only and
2388 *	outside of Giant.
2389 */
2390void
2391pmap_zero_page_idle(vm_page_t m)
2392{
2393	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2394
2395	pagezero((void *)va);
2396}
2397
2398/*
2399 *	pmap_copy_page copies the specified (machine independent)
2400 *	page through the direct map, using pagecopy to copy
2401 *	the contents, one machine dependent page at a
2402 *	time.
2403 */
2404void
2405pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2406{
2407	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2408	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2409
2410	pagecopy((void *)src, (void *)dst);
2411}
2412
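/*
 * Illustrative sketch (hypothetical helper, not part of the original source):
 * the zero/copy primitives above all go through the direct map, so a caller
 * simply hands them vm_page_t's; no temporary kernel mappings are involved.
 */
static void
example_scrub_and_clone(vm_page_t scratch, vm_page_t src, vm_page_t dst)
{

	pmap_zero_page(scratch);		/* Clear a whole page. */
	pmap_zero_page_area(scratch, 0, 128);	/* Or just part of one. */
	pmap_copy_page(src, dst);		/* Copy a full page. */
}
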
2413/*
2414 * Returns true if the pmap's pv is one of the first
2415 * 16 pvs linked to from this page.  This count may
2416 * be changed upwards or downwards in the future; it
2417 * is only necessary that true be returned for a small
2418 * subset of pmaps for proper page aging.
2419 */
2420boolean_t
2421pmap_page_exists_quick(pmap, m)
2422	pmap_t pmap;
2423	vm_page_t m;
2424{
2425	pv_entry_t pv;
2426	int loops = 0;
2427
2428	if (m->flags & PG_FICTITIOUS)
2429		return FALSE;
2430
2431	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2432	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2433		if (pv->pv_pmap == pmap) {
2434			return TRUE;
2435		}
2436		loops++;
2437		if (loops >= 16)
2438			break;
2439	}
2440	return (FALSE);
2441}
2442
2443#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2444/*
2445 * Remove all pages from the specified address space;
2446 * this aids process exit speeds.  Also, this code
2447 * is special-cased for the current process only, but
2448 * can have the more generic (and slightly slower)
2449 * mode enabled.  This is much faster than pmap_remove
2450 * in the case of running down an entire address space.
2451 */
2452void
2453pmap_remove_pages(pmap, sva, eva)
2454	pmap_t pmap;
2455	vm_offset_t sva, eva;
2456{
2457	pt_entry_t *pte, tpte;
2458	vm_page_t m;
2459	pv_entry_t pv, npv;
2460
2461#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2462	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2463		printf("warning: pmap_remove_pages called with non-current pmap\n");
2464		return;
2465	}
2466#endif
2467	vm_page_lock_queues();
2468	PMAP_LOCK(pmap);
2469	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2470
2471		if (pv->pv_va >= eva || pv->pv_va < sva) {
2472			npv = TAILQ_NEXT(pv, pv_plist);
2473			continue;
2474		}
2475
2476#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2477		pte = vtopte(pv->pv_va);
2478#else
2479		pte = pmap_pte(pmap, pv->pv_va);
2480#endif
2481		tpte = *pte;
2482
2483		if (tpte == 0) {
2484			printf("TPTE at %p  IS ZERO @ VA %08lx\n",
2485							pte, pv->pv_va);
2486			panic("bad pte");
2487		}
2488
2489/*
2490 * We cannot remove wired pages from a process' mapping at this time
2491 */
2492		if (tpte & PG_W) {
2493			npv = TAILQ_NEXT(pv, pv_plist);
2494			continue;
2495		}
2496
2497		m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2498		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2499		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2500		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2501
2502		KASSERT(m < &vm_page_array[vm_page_array_size],
2503			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2504
2505		pmap->pm_stats.resident_count--;
2506
2507		pte_clear(pte);
2508
2509		/*
2510		 * Update the vm_page_t clean and reference bits.
2511		 */
2512		if (tpte & PG_M) {
2513			vm_page_dirty(m);
2514		}
2515
2516		npv = TAILQ_NEXT(pv, pv_plist);
2517		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2518
2519		m->md.pv_list_count--;
2520		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2521		if (TAILQ_EMPTY(&m->md.pv_list))
2522			vm_page_flag_clear(m, PG_WRITEABLE);
2523
2524		pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va));
2525		free_pv_entry(pv);
2526	}
2527	pmap_invalidate_all(pmap);
2528	PMAP_UNLOCK(pmap);
2529	vm_page_unlock_queues();
2530}
2531
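/*
 * Illustrative sketch (not part of the original source): running down the
 * whole user portion of the current process' address space, as a process
 * teardown path might do.  The range used here is an assumption for the
 * example; the function itself only removes unwired, pv-tracked mappings
 * that fall inside [sva, eva).
 */
static void
example_reap_current_process(void)
{

	pmap_remove_pages(vmspace_pmap(curthread->td_proc->p_vmspace),
	    0, VM_MAXUSER_ADDRESS);
}
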
2532/*
2533 *	pmap_is_modified:
2534 *
2535 *	Return whether or not the specified physical page was modified
2536 *	in any physical maps.
2537 */
2538boolean_t
2539pmap_is_modified(vm_page_t m)
2540{
2541	pv_entry_t pv;
2542	pt_entry_t *pte;
2543	boolean_t rv;
2544
2545	rv = FALSE;
2546	if (m->flags & PG_FICTITIOUS)
2547		return (rv);
2548
2549	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2550	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2551		/*
2552		 * Addresses whose modify status is not tracked
2553		 * (e.g., those within the clean submap) are
2554		 * treated as never modified.
2555		 */
2556		if (!pmap_track_modified(pv->pv_va))
2557			continue;
2558		PMAP_LOCK(pv->pv_pmap);
2559		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2560		rv = (*pte & PG_M) != 0;
2561		PMAP_UNLOCK(pv->pv_pmap);
2562		if (rv)
2563			break;
2564	}
2565	return (rv);
2566}
2567
2568/*
2569 *	pmap_is_prefaultable:
2570 *
2571 *	Return whether or not the specified virtual address is eligible
2572 *	for prefault.
2573 */
2574boolean_t
2575pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2576{
2577	pd_entry_t *pde;
2578	pt_entry_t *pte;
2579	boolean_t rv;
2580
2581	rv = FALSE;
2582	PMAP_LOCK(pmap);
2583	pde = pmap_pde(pmap, addr);
2584	if (pde != NULL && (*pde & PG_V)) {
2585		pte = vtopte(addr);
2586		rv = (*pte & PG_V) == 0;
2587	}
2588	PMAP_UNLOCK(pmap);
2589	return (rv);
2590}
2591
2592/*
2593 *	Clear the given bit in each of the given page's ptes.
2594 */
2595static __inline void
2596pmap_clear_ptes(vm_page_t m, long bit)
2597{
2598	register pv_entry_t pv;
2599	pt_entry_t pbits, *pte;
2600
2601	if ((m->flags & PG_FICTITIOUS) ||
2602	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2603		return;
2604
2605	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2606	/*
2607	 * Loop over all current mappings, setting/clearing as appropriate.
2608	 * If setting RO, do we need to clear the VAC?
2609	 */
2610	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2611		/*
2612		 * Don't write-protect pager mappings.
2613		 */
2614		if (bit == PG_RW) {
2615			if (!pmap_track_modified(pv->pv_va))
2616				continue;
2617		}
2618
2619		PMAP_LOCK(pv->pv_pmap);
2620		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2621retry:
2622		pbits = *pte;
2623		if (pbits & bit) {
2624			if (bit == PG_RW) {
2625				if (!atomic_cmpset_long(pte, pbits,
2626				    pbits & ~(PG_RW | PG_M)))
2627					goto retry;
2628				if (pbits & PG_M) {
2629					vm_page_dirty(m);
2630				}
2631			} else {
2632				atomic_clear_long(pte, bit);
2633			}
2634			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2635		}
2636		PMAP_UNLOCK(pv->pv_pmap);
2637	}
2638	if (bit == PG_RW)
2639		vm_page_flag_clear(m, PG_WRITEABLE);
2640}
2641
2642/*
2643 *	pmap_page_protect:
2644 *
2645 *	Lower the permission for all mappings to a given page.
2646 */
2647void
2648pmap_page_protect(vm_page_t m, vm_prot_t prot)
2649{
2650	if ((prot & VM_PROT_WRITE) == 0) {
2651		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2652			pmap_clear_ptes(m, PG_RW);
2653		} else {
2654			pmap_remove_all(m);
2655		}
2656	}
2657}
2658
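/*
 * Illustrative sketch (hypothetical helper, not part of the original source):
 * revoking write access to a managed page versus removing it entirely.  The
 * page queues lock is assumed to be required here, matching the assertion in
 * pmap_clear_ptes() above.
 */
static void
example_downgrade_page(vm_page_t m)
{

	vm_page_lock_queues();
	/* Keep the page mapped, but read-only everywhere. */
	pmap_page_protect(m, VM_PROT_READ);
	/* Removing all permissions removes every mapping of the page. */
	pmap_page_protect(m, VM_PROT_NONE);
	vm_page_unlock_queues();
}
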
2659/*
2660 *	pmap_ts_referenced:
2661 *
2662 *	Return a count of reference bits for a page, clearing those bits.
2663 *	It is not necessary for every reference bit to be cleared, but it
2664 *	is necessary that 0 only be returned when there are truly no
2665 *	reference bits set.
2666 *
2667 *	XXX: The exact number of bits to check and clear is a matter that
2668 *	should be tested and standardized at some point in the future for
2669 *	optimal aging of shared pages.
2670 */
2671int
2672pmap_ts_referenced(vm_page_t m)
2673{
2674	register pv_entry_t pv, pvf, pvn;
2675	pt_entry_t *pte;
2676	pt_entry_t v;
2677	int rtval = 0;
2678
2679	if (m->flags & PG_FICTITIOUS)
2680		return (rtval);
2681
2682	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2683	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2684
2685		pvf = pv;
2686
2687		do {
2688			pvn = TAILQ_NEXT(pv, pv_list);
2689
2690			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2691
2692			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2693
2694			if (!pmap_track_modified(pv->pv_va))
2695				continue;
2696
2697			PMAP_LOCK(pv->pv_pmap);
2698			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2699
2700			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2701				atomic_clear_long(pte, PG_A);
2702				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2703
2704				rtval++;
2705				if (rtval > 4) {
2706					PMAP_UNLOCK(pv->pv_pmap);
2707					break;
2708				}
2709			}
2710			PMAP_UNLOCK(pv->pv_pmap);
2711		} while ((pv = pvn) != NULL && pv != pvf);
2712	}
2713
2714	return (rtval);
2715}
2716
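/*
 * Illustrative sketch (not part of the original source): how a page-aging
 * loop might consume pmap_ts_referenced().  A non-zero return means the page
 * was referenced since the last scan; the exact count is only a hint.  The
 * helper name is hypothetical, and the page queues lock is taken to satisfy
 * the assertion in pmap_ts_referenced().
 */
static int
example_page_was_referenced(vm_page_t m)
{
	int refs;

	vm_page_lock_queues();
	refs = pmap_ts_referenced(m);	/* Also clears the PG_A bits found. */
	vm_page_unlock_queues();
	return (refs != 0);
}
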
2717/*
2718 *	Clear the modify bits on the specified physical page.
2719 */
2720void
2721pmap_clear_modify(vm_page_t m)
2722{
2723	pmap_clear_ptes(m, PG_M);
2724}
2725
2726/*
2727 *	pmap_clear_reference:
2728 *
2729 *	Clear the reference bit on the specified physical page.
2730 */
2731void
2732pmap_clear_reference(vm_page_t m)
2733{
2734	pmap_clear_ptes(m, PG_A);
2735}
2736
2737/*
2738 * Miscellaneous support routines follow
2739 */
2740
2741/*
2742 * Map a set of physical memory pages into the kernel virtual
2743 * address space. Return a pointer to where it is mapped. This
2744 * routine is intended to be used for mapping device memory,
2745 * NOT real memory.
2746 */
2747void *
2748pmap_mapdev(pa, size)
2749	vm_paddr_t pa;
2750	vm_size_t size;
2751{
2752	vm_offset_t va, tmpva, offset;
2753
2754	/* If this fits within the direct map window, use it */
2755	if (pa < dmaplimit && (pa + size) < dmaplimit)
2756		return ((void *)PHYS_TO_DMAP(pa));
2757	offset = pa & PAGE_MASK;
2758	size = roundup(offset + size, PAGE_SIZE);
2759	va = kmem_alloc_nofault(kernel_map, size);
2760	if (!va)
2761		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2762	pa = trunc_page(pa);
2763	for (tmpva = va; size > 0; ) {
2764		pmap_kenter(tmpva, pa);
2765		size -= PAGE_SIZE;
2766		tmpva += PAGE_SIZE;
2767		pa += PAGE_SIZE;
2768	}
2769	pmap_invalidate_range(kernel_pmap, va, tmpva);
2770	return ((void *)(va + offset));
2771}
2772
2773void
2774pmap_unmapdev(va, size)
2775	vm_offset_t va;
2776	vm_size_t size;
2777{
2778	vm_offset_t base, offset, tmpva;
2779
2780	/* If pmap_mapdev() handed out a direct map address, do nothing */
2781	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2782		return;
2783	base = trunc_page(va);
2784	offset = va & PAGE_MASK;
2785	size = roundup(offset + size, PAGE_SIZE);
2786	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2787		pmap_kremove(tmpva);
2788	pmap_invalidate_range(kernel_pmap, va, tmpva);
2789	kmem_free(kernel_map, base, size);
2790}
2791
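/*
 * Illustrative sketch (hypothetical values, not part of the original source):
 * mapping a device register window and tearing it down again.  Physical
 * addresses below the direct map limit come back as direct map pointers and
 * are ignored by pmap_unmapdev(); others get fresh kernel virtual space that
 * must be released.
 */
static void
example_map_registers(vm_paddr_t bar_pa, vm_size_t bar_size)
{
	void *regs;

	regs = pmap_mapdev(bar_pa, bar_size);
	/* ... program the device through "regs" ... */
	pmap_unmapdev((vm_offset_t)regs, bar_size);
}
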
2792/*
2793 * Perform the pmap work for mincore().
2794 */
2795int
2796pmap_mincore(pmap, addr)
2797	pmap_t pmap;
2798	vm_offset_t addr;
2799{
2800	pt_entry_t *ptep, pte;
2801	vm_page_t m;
2802	int val = 0;
2803
2804	PMAP_LOCK(pmap);
2805	ptep = pmap_pte(pmap, addr);
2806	pte = (ptep != NULL) ? *ptep : 0;
2807	PMAP_UNLOCK(pmap);
2808
2809	if (pte != 0) {
2810		vm_paddr_t pa;
2811
2812		val = MINCORE_INCORE;
2813		if ((pte & PG_MANAGED) == 0)
2814			return val;
2815
2816		pa = pte & PG_FRAME;
2817
2818		m = PHYS_TO_VM_PAGE(pa);
2819
2820		/*
2821		 * Modified by us
2822		 */
2823		if (pte & PG_M)
2824			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2825		else {
2826			/*
2827			 * Modified by someone else
2828			 */
2829			vm_page_lock_queues();
2830			if (m->dirty || pmap_is_modified(m))
2831				val |= MINCORE_MODIFIED_OTHER;
2832			vm_page_unlock_queues();
2833		}
2834		/*
2835		 * Referenced by us
2836		 */
2837		if (pte & PG_A)
2838			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2839		else {
2840			/*
2841			 * Referenced by someone else
2842			 */
2843			vm_page_lock_queues();
2844			if ((m->flags & PG_REFERENCED) ||
2845			    pmap_ts_referenced(m)) {
2846				val |= MINCORE_REFERENCED_OTHER;
2847				vm_page_flag_set(m, PG_REFERENCED);
2848			}
2849			vm_page_unlock_queues();
2850		}
2851	}
2852	return val;
2853}
2854
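/*
 * Illustrative sketch (not part of the original source): interpreting the
 * flags returned by pmap_mincore() for a single address.  The helper name
 * is hypothetical.
 */
static void
example_query_address(pmap_t pmap, vm_offset_t addr)
{
	int val;

	val = pmap_mincore(pmap, addr);
	if (val & MINCORE_INCORE)
		printf("resident");
	if (val & MINCORE_MODIFIED)
		printf(", modified by this pmap");
	if (val & MINCORE_REFERENCED)
		printf(", referenced by this pmap");
	printf("\n");
}
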
2855void
2856pmap_activate(struct thread *td)
2857{
2858	struct proc *p = td->td_proc;
2859	pmap_t	pmap, oldpmap;
2860	u_int64_t  cr3;
2861
2862	critical_enter();
2863	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2864	oldpmap = PCPU_GET(curpmap);
2865#ifdef SMP
2866	if (oldpmap)	/* XXX FIXME */
2867		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2868	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2869#else
2870	if (oldpmap)	/* XXX FIXME */
2871		oldpmap->pm_active &= ~PCPU_GET(cpumask);
2872	pmap->pm_active |= PCPU_GET(cpumask);
2873#endif
2874	cr3 = vtophys(pmap->pm_pml4);
2875	/* XXXKSE this is wrong.
2876	 * pmap_activate is for the current thread on the current cpu
2877	 */
2878	if (p->p_flag & P_SA) {
2879		/* Make sure all other cr3 entries are updated. */
2880		/* what if they are running?  XXXKSE (maybe abort them) */
2881		FOREACH_THREAD_IN_PROC(p, td) {
2882			td->td_pcb->pcb_cr3 = cr3;
2883		}
2884	} else {
2885		td->td_pcb->pcb_cr3 = cr3;
2886	}
2887	load_cr3(cr3);
2888	critical_exit();
2889}
2890
2891vm_offset_t
2892pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2893{
2894
2895	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
2896		return addr;
2897	}
2898
2899	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
2900	return addr;
2901}
2902
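/*
 * Worked example of the rounding above (assuming NBPDR == 2MB, i.e.
 * 0x200000, as on amd64): a hint of 0x00601000 becomes
 * (0x00601000 + 0x1fffff) & ~0x1fffff == 0x00800000, the next 2MB
 * boundary, so that large device objects can later be mapped with PG_PS
 * superpage mappings.
 */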