1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 157443 2006-04-03 21:16:10Z peter $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidations expensive,
99 * this module may delay invalidation or reduced-protection
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_cpu.h"
107#include "opt_pmap.h"
108#include "opt_msgbuf.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/lock.h>
116#include <sys/malloc.h>
117#include <sys/mman.h>
118#include <sys/msgbuf.h>
119#include <sys/mutex.h>
120#include <sys/proc.h>
121#include <sys/sx.h>
122#include <sys/vmmeter.h>
123#include <sys/sched.h>
124#include <sys/sysctl.h>
125#ifdef SMP
126#include <sys/smp.h>
127#endif
128
129#include <vm/vm.h>
130#include <vm/vm_param.h>
131#include <vm/vm_kern.h>
132#include <vm/vm_page.h>
133#include <vm/vm_map.h>
134#include <vm/vm_object.h>
135#include <vm/vm_extern.h>
136#include <vm/vm_pageout.h>
137#include <vm/vm_pager.h>
138#include <vm/uma.h>
139
140#include <machine/cpu.h>
141#include <machine/cputypes.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144#include <machine/specialreg.h>
145#ifdef SMP
146#include <machine/smp.h>
147#endif
148
149#ifdef XBOX
150#include <machine/xbox.h>
151#endif
152
153#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
154#define CPU_ENABLE_SSE
155#endif
156
157#ifndef PMAP_SHPGPERPROC
158#define PMAP_SHPGPERPROC 200
159#endif
160
161#if defined(DIAGNOSTIC)
162#define PMAP_DIAGNOSTIC
163#endif
164
165#if !defined(PMAP_DIAGNOSTIC)
166#define PMAP_INLINE __inline
167#else
168#define PMAP_INLINE
169#endif
170
171/*
172 * Get PDEs and PTEs for user/kernel address space
173 */
174#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
175#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
176
177#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
178#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
179#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
180#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
181#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
182
183#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
184    atomic_clear_int((u_int *)(pte), PG_W))
185#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
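/*
 * For illustration: on non-PAE i386, PDRSHIFT is 22, so for a user
 * address such as 0xbfbfe000,
 *
 *	pmap_pde(pmap, 0xbfbfe000) == &pmap->pm_pdir[0xbfbfe000 >> 22]
 *				   == &pmap->pm_pdir[0x2fe]
 *
 * and pmap_pde_v() simply tests PG_V in that entry.
 */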
186
187struct pmap kernel_pmap_store;
188LIST_HEAD(pmaplist, pmap);
189static struct pmaplist allpmaps;
190static struct mtx allpmaps_lock;
191
192vm_paddr_t avail_end;	/* PA of last available physical page */
193vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
194vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
195int pgeflag = 0;		/* PG_G or-in */
196int pseflag = 0;		/* PG_PS or-in */
197
198static int nkpt;
199vm_offset_t kernel_vm_end;
200extern u_int32_t KERNend;
201
202#ifdef PAE
203static uma_zone_t pdptzone;
204#endif
205
206/*
207 * Data for the pv entry allocation mechanism
208 */
209static uma_zone_t pvzone;
210static struct vm_object pvzone_obj;
211static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
212
213/*
214 * All those kernel PT submaps that BSD is so fond of
215 */
216struct sysmaps {
217	struct	mtx lock;
218	pt_entry_t *CMAP1;
219	pt_entry_t *CMAP2;
220	caddr_t	CADDR1;
221	caddr_t	CADDR2;
222};
223static struct sysmaps sysmaps_pcpu[MAXCPU];
224pt_entry_t *CMAP1 = 0;
225static pt_entry_t *CMAP3;
226caddr_t CADDR1 = 0, ptvmmap = 0;
227static caddr_t CADDR3;
228struct msgbuf *msgbufp = 0;
229
230/*
231 * Crashdump maps.
232 */
233static caddr_t crashdumpmap;
234
235#ifdef SMP
236extern pt_entry_t *SMPpt;
237#endif
238static pt_entry_t *PMAP1 = 0, *PMAP2;
239static pt_entry_t *PADDR1 = 0, *PADDR2;
240#ifdef SMP
241static int PMAP1cpu;
242static int PMAP1changedcpu;
243SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
244	   &PMAP1changedcpu, 0,
245	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
246#endif
247static int PMAP1changed;
248SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
249	   &PMAP1changed, 0,
250	   "Number of times pmap_pte_quick changed PMAP1");
251static int PMAP1unchanged;
252SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
253	   &PMAP1unchanged, 0,
254	   "Number of times pmap_pte_quick didn't change PMAP1");
255static struct mtx PMAP2mutex;
256
257static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
258static pv_entry_t get_pv_entry(pmap_t locked_pmap);
259static void	pmap_clear_ptes(vm_page_t m, int bit);
260
261static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
262static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
263static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
264					vm_offset_t va);
265static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
266static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
267    vm_page_t m);
268
269static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
270
271static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
272static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
273static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
274static void pmap_pte_release(pt_entry_t *pte);
275static int pmap_unuse_pt(pmap_t, vm_offset_t);
276static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
277#ifdef PAE
278static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
279#endif
280
281CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
282CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
283
284/*
285 * Move the kernel virtual free pointer to the next
286 * 4MB.  This is used to help improve performance
287 * by using a large (4MB) page for much of the kernel
288 * (.text, .data, .bss)
289 */
290static vm_offset_t
291pmap_kmem_choose(vm_offset_t addr)
292{
293	vm_offset_t newaddr = addr;
294
295#ifndef DISABLE_PSE
296	if (cpu_feature & CPUID_PSE)
297		newaddr = (addr + PDRMASK) & ~PDRMASK;
298#endif
299	return newaddr;
300}
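/*
 * For illustration: with PSE available on non-PAE i386, PDRMASK is
 * 0x3fffff, so an addr of 0xc0531000 is rounded up to the next 4MB
 * boundary, 0xc0800000.  Without CPUID_PSE the address is returned
 * unchanged.
 */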
301
302/*
303 *	Bootstrap the system enough to run with virtual memory.
304 *
305 *	On the i386 this is called after mapping has already been enabled
306 *	and just syncs the pmap module with what has already been done.
307 *	[We can't call it easily with mapping off since the kernel is not
308 *	mapped with PA == VA, hence we would have to relocate every address
309 *	from the linked base (virtual) address "KERNBASE" to the actual
310 *	(physical) address starting relative to 0]
311 */
312void
313pmap_bootstrap(firstaddr, loadaddr)
314	vm_paddr_t firstaddr;
315	vm_paddr_t loadaddr;
316{
317	vm_offset_t va;
318	pt_entry_t *pte, *unused;
319	struct sysmaps *sysmaps;
320	int i;
321
322	/*
323	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
324	 * large. It should instead be correctly calculated in locore.s and
325	 * not based on 'first' (which is a physical address, not a virtual
326	 * address, for the start of unused physical memory). The kernel
327	 * page tables are NOT double mapped and thus should not be included
328	 * in this calculation.
329	 */
330	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
331	virtual_avail = pmap_kmem_choose(virtual_avail);
332
333	virtual_end = VM_MAX_KERNEL_ADDRESS;
334
335	/*
336	 * Initialize the kernel pmap (which is statically allocated).
337	 */
338	PMAP_LOCK_INIT(kernel_pmap);
339	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
340#ifdef PAE
341	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
342#endif
343	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
344	TAILQ_INIT(&kernel_pmap->pm_pvlist);
345	LIST_INIT(&allpmaps);
346	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
347	mtx_lock_spin(&allpmaps_lock);
348	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
349	mtx_unlock_spin(&allpmaps_lock);
350	nkpt = NKPT;
351
352	/*
353	 * Reserve some special page table entries/VA space for temporary
354	 * mapping of pages.
355	 */
356#define	SYSMAP(c, p, v, n)	\
357	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
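/*
 * For illustration, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to
 *
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 *
 * i.e. each invocation hands out n pages of KVA and the pte(s) that
 * map them.
 */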
358
359	va = virtual_avail;
360	pte = vtopte(va);
361
362	/*
363	 * CMAP1/CMAP2 are used for zeroing and copying pages.
364	 * CMAP3 is used for the idle process page zeroing.
365	 */
366	for (i = 0; i < MAXCPU; i++) {
367		sysmaps = &sysmaps_pcpu[i];
368		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
369		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
370		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
371	}
372	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
373	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
374	*CMAP3 = 0;
375
376	/*
377	 * Crashdump maps.
378	 */
379	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
380
381	/*
382	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
383	 */
384	SYSMAP(caddr_t, unused, ptvmmap, 1)
385
386	/*
387	 * msgbufp is used to map the system message buffer.
388	 */
389	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
390
391	/*
392	 * ptemap is used for pmap_pte_quick
393	 */
394	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
395	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
396
397	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
398
399	virtual_avail = va;
400
401	*CMAP1 = 0;
402
403#ifdef XBOX
404	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
405	 * an early stage of boot, we cannot yet neatly map video memory ... :-(
406	 * Better fixes are very welcome! */
407	if (!arch_i386_is_xbox)
408#endif
409	for (i = 0; i < NKPT; i++)
410		PTD[i] = 0;
411
412	/* Turn on PG_G on kernel page(s) */
413	pmap_set_pg();
414}
415
416/*
417 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
418 */
419void
420pmap_set_pg(void)
421{
422	pd_entry_t pdir;
423	pt_entry_t *pte;
424	vm_offset_t va, endva;
425	int i;
426
427	if (pgeflag == 0)
428		return;
429
430	i = KERNLOAD/NBPDR;
431	endva = KERNBASE + KERNend;
432
433	if (pseflag) {
434		va = KERNBASE + KERNLOAD;
435		while (va  < endva) {
436			pdir = kernel_pmap->pm_pdir[KPTDI+i];
437			pdir |= pgeflag;
438			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
439			invltlb();	/* Play it safe, invltlb() every time */
440			i++;
441			va += NBPDR;
442		}
443	} else {
444		va = (vm_offset_t)btext;
445		while (va < endva) {
446			pte = vtopte(va);
447			if (*pte)
448				*pte |= pgeflag;
449			invltlb();	/* Play it safe, invltlb() every time */
450			va += PAGE_SIZE;
451		}
452	}
453}
454
455/*
456 * Initialize a vm_page's machine-dependent fields.
457 */
458void
459pmap_page_init(vm_page_t m)
460{
461
462	TAILQ_INIT(&m->md.pv_list);
463	m->md.pv_list_count = 0;
464}
465
466#ifdef PAE
467
468static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
469
470static void *
471pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
472{
473	*flags = UMA_SLAB_PRIV;
474	return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
475	    1, 0));
476}
477#endif
478
479/*
480 *	Initialize the pmap module.
481 *	Called by vm_init, to initialize any structures that the pmap
482 *	system needs to map virtual memory.
483 */
484void
485pmap_init(void)
486{
487	int shpgperproc = PMAP_SHPGPERPROC;
488
489	/*
490	 * Initialize the address space (zone) for the pv entries.  Set a
491	 * high water mark so that the system can recover from excessive
492	 * numbers of pv entries.
493	 */
494	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
495	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
496	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
497	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
498	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
499	pv_entry_high_water = 9 * (pv_entry_max / 10);
500	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
501
502#ifdef PAE
503	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
504	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
505	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
506	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
507#endif
508}
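/*
 * The limits computed above may be overridden at boot time via loader
 * tunables, e.g. in /boot/loader.conf (the values below are only
 * examples):
 *
 *	vm.pmap.shpgperproc="300"
 *	vm.pmap.pv_entries="2000000"
 */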
509
510
511/***************************************************
512 * Low level helper routines.....
513 ***************************************************/
514
515
516/*
517 * this routine returns non-zero if the modified bit should be tracked
518 * for the given address, i.e. if it lies outside the clean submap.
519 */
520static PMAP_INLINE int
521pmap_track_modified(vm_offset_t va)
522{
523	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
524		return 1;
525	else
526		return 0;
527}
528
529#ifdef SMP
530/*
531 * For SMP, these functions have to use the IPI mechanism for coherence.
532 */
533void
534pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
535{
536	u_int cpumask;
537	u_int other_cpus;
538
539	if (smp_started) {
540		if (!(read_eflags() & PSL_I))
541			panic("%s: interrupts disabled", __func__);
542		mtx_lock_spin(&smp_ipi_mtx);
543	} else
544		critical_enter();
545	/*
546	 * We need to disable interrupt preemption but MUST NOT have
547	 * interrupts disabled here.
548	 * XXX we may need to hold schedlock to get a coherent pm_active
549	 * XXX critical sections disable interrupts again
550	 */
551	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
552		invlpg(va);
553		smp_invlpg(va);
554	} else {
555		cpumask = PCPU_GET(cpumask);
556		other_cpus = PCPU_GET(other_cpus);
557		if (pmap->pm_active & cpumask)
558			invlpg(va);
559		if (pmap->pm_active & other_cpus)
560			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
561	}
562	if (smp_started)
563		mtx_unlock_spin(&smp_ipi_mtx);
564	else
565		critical_exit();
566}
567
568void
569pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
570{
571	u_int cpumask;
572	u_int other_cpus;
573	vm_offset_t addr;
574
575	if (smp_started) {
576		if (!(read_eflags() & PSL_I))
577			panic("%s: interrupts disabled", __func__);
578		mtx_lock_spin(&smp_ipi_mtx);
579	} else
580		critical_enter();
581	/*
582	 * We need to disable interrupt preemption but MUST NOT have
583	 * interrupts disabled here.
584	 * XXX we may need to hold schedlock to get a coherent pm_active
585	 * XXX critical sections disable interrupts again
586	 */
587	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
588		for (addr = sva; addr < eva; addr += PAGE_SIZE)
589			invlpg(addr);
590		smp_invlpg_range(sva, eva);
591	} else {
592		cpumask = PCPU_GET(cpumask);
593		other_cpus = PCPU_GET(other_cpus);
594		if (pmap->pm_active & cpumask)
595			for (addr = sva; addr < eva; addr += PAGE_SIZE)
596				invlpg(addr);
597		if (pmap->pm_active & other_cpus)
598			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
599			    sva, eva);
600	}
601	if (smp_started)
602		mtx_unlock_spin(&smp_ipi_mtx);
603	else
604		critical_exit();
605}
606
607void
608pmap_invalidate_all(pmap_t pmap)
609{
610	u_int cpumask;
611	u_int other_cpus;
612
613	if (smp_started) {
614		if (!(read_eflags() & PSL_I))
615			panic("%s: interrupts disabled", __func__);
616		mtx_lock_spin(&smp_ipi_mtx);
617	} else
618		critical_enter();
619	/*
620	 * We need to disable interrupt preemption but MUST NOT have
621	 * interrupts disabled here.
622	 * XXX we may need to hold schedlock to get a coherent pm_active
623	 * XXX critical sections disable interrupts again
624	 */
625	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
626		invltlb();
627		smp_invltlb();
628	} else {
629		cpumask = PCPU_GET(cpumask);
630		other_cpus = PCPU_GET(other_cpus);
631		if (pmap->pm_active & cpumask)
632			invltlb();
633		if (pmap->pm_active & other_cpus)
634			smp_masked_invltlb(pmap->pm_active & other_cpus);
635	}
636	if (smp_started)
637		mtx_unlock_spin(&smp_ipi_mtx);
638	else
639		critical_exit();
640}
641#else /* !SMP */
642/*
643 * Normal, non-SMP, 486+ invalidation functions.
644 * We inline these within pmap.c for speed.
645 */
646PMAP_INLINE void
647pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
648{
649
650	if (pmap == kernel_pmap || pmap->pm_active)
651		invlpg(va);
652}
653
654PMAP_INLINE void
655pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
656{
657	vm_offset_t addr;
658
659	if (pmap == kernel_pmap || pmap->pm_active)
660		for (addr = sva; addr < eva; addr += PAGE_SIZE)
661			invlpg(addr);
662}
663
664PMAP_INLINE void
665pmap_invalidate_all(pmap_t pmap)
666{
667
668	if (pmap == kernel_pmap || pmap->pm_active)
669		invltlb();
670}
671#endif /* !SMP */
672
673/*
674 * Are we current address space or kernel?  N.B. We return FALSE when
675 * a pmap's page table is in use because a kernel thread is borrowing
676 * it.  The borrowed page table can change spontaneously, making any
677 * dependence on its continued use subject to a race condition.
678 */
679static __inline int
680pmap_is_current(pmap_t pmap)
681{
682
683	return (pmap == kernel_pmap ||
684		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
685	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
686}
687
688/*
689 * If the given pmap is not the current or kernel pmap, the returned pte must
690 * be released by passing it to pmap_pte_release().
691 */
692pt_entry_t *
693pmap_pte(pmap_t pmap, vm_offset_t va)
694{
695	pd_entry_t newpf;
696	pd_entry_t *pde;
697
698	pde = pmap_pde(pmap, va);
699	if (*pde & PG_PS)
700		return (pde);
701	if (*pde != 0) {
702		/* are we current address space or kernel? */
703		if (pmap_is_current(pmap))
704			return (vtopte(va));
705		mtx_lock(&PMAP2mutex);
706		newpf = *pde & PG_FRAME;
707		if ((*PMAP2 & PG_FRAME) != newpf) {
708			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
709			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
710		}
711		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
712	}
713	return (0);
714}
715
716/*
717 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
718 * being NULL.
719 */
720static __inline void
721pmap_pte_release(pt_entry_t *pte)
722{
723
724	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
725		mtx_unlock(&PMAP2mutex);
726}
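/*
 * Sketch of the intended calling pattern for pmap_pte()/pmap_pte_release()
 * (pmap_extract() below is a real caller and holds the pmap lock while
 * doing so):
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 *		pmap_pte_release(pte);
 *	}
 */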
727
728static __inline void
729invlcaddr(void *caddr)
730{
731
732	invlpg((u_int)caddr);
733}
734
735/*
736 * Super fast pmap_pte routine best used when scanning
737 * the pv lists.  This eliminates many coarse-grained
738 * invltlb calls.  Note that many of the pv list
739 * scans are across different pmaps.  It is very wasteful
740 * to do an entire invltlb for checking a single mapping.
741 *
742 * If the given pmap is not the current pmap, vm_page_queue_mtx
743 * must be held and curthread pinned to a CPU.
744 */
745static pt_entry_t *
746pmap_pte_quick(pmap_t pmap, vm_offset_t va)
747{
748	pd_entry_t newpf;
749	pd_entry_t *pde;
750
751	pde = pmap_pde(pmap, va);
752	if (*pde & PG_PS)
753		return (pde);
754	if (*pde != 0) {
755		/* are we current address space or kernel? */
756		if (pmap_is_current(pmap))
757			return (vtopte(va));
758		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
759		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
760		newpf = *pde & PG_FRAME;
761		if ((*PMAP1 & PG_FRAME) != newpf) {
762			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
763#ifdef SMP
764			PMAP1cpu = PCPU_GET(cpuid);
765#endif
766			invlcaddr(PADDR1);
767			PMAP1changed++;
768		} else
769#ifdef SMP
770		if (PMAP1cpu != PCPU_GET(cpuid)) {
771			PMAP1cpu = PCPU_GET(cpuid);
772			invlcaddr(PADDR1);
773			PMAP1changedcpu++;
774		} else
775#endif
776			PMAP1unchanged++;
777		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
778	}
779	return (0);
780}
781
782/*
783 *	Routine:	pmap_extract
784 *	Function:
785 *		Extract the physical page address associated
786 *		with the given map/virtual_address pair.
787 */
788vm_paddr_t
789pmap_extract(pmap_t pmap, vm_offset_t va)
790{
791	vm_paddr_t rtval;
792	pt_entry_t *pte;
793	pd_entry_t pde;
794
795	rtval = 0;
796	PMAP_LOCK(pmap);
797	pde = pmap->pm_pdir[va >> PDRSHIFT];
798	if (pde != 0) {
799		if ((pde & PG_PS) != 0) {
800			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
801			PMAP_UNLOCK(pmap);
802			return rtval;
803		}
804		pte = pmap_pte(pmap, va);
805		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
806		pmap_pte_release(pte);
807	}
808	PMAP_UNLOCK(pmap);
809	return (rtval);
810}
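/*
 * Example of the 4MB-page case above, with illustrative non-PAE values:
 * a pde of 0x08000083 (PG_V | PG_RW | PG_PS) maps a superpage at
 * physical 0x08000000, so for va 0xa0123456 the routine returns
 * (0x08000000 | 0x00123456) == 0x08123456.
 */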
811
812/*
813 *	Routine:	pmap_extract_and_hold
814 *	Function:
815 *		Atomically extract and hold the physical page
816 *		with the given pmap and virtual address pair
817 *		if that mapping permits the given protection.
818 */
819vm_page_t
820pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
821{
822	pd_entry_t pde;
823	pt_entry_t pte;
824	vm_page_t m;
825
826	m = NULL;
827	vm_page_lock_queues();
828	PMAP_LOCK(pmap);
829	pde = *pmap_pde(pmap, va);
830	if (pde != 0) {
831		if (pde & PG_PS) {
832			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
833				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
834				    (va & PDRMASK));
835				vm_page_hold(m);
836			}
837		} else {
838			sched_pin();
839			pte = *pmap_pte_quick(pmap, va);
840			if (pte != 0 &&
841			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
842				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
843				vm_page_hold(m);
844			}
845			sched_unpin();
846		}
847	}
848	vm_page_unlock_queues();
849	PMAP_UNLOCK(pmap);
850	return (m);
851}
852
853/***************************************************
854 * Low level mapping routines.....
855 ***************************************************/
856
857/*
858 * Add a wired page to the kva.
859 * Note: not SMP coherent.
860 */
861PMAP_INLINE void
862pmap_kenter(vm_offset_t va, vm_paddr_t pa)
863{
864	pt_entry_t *pte;
865
866	pte = vtopte(va);
867	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
868}
869
870/*
871 * Remove a page from the kernel pagetables.
872 * Note: not SMP coherent.
873 */
874PMAP_INLINE void
875pmap_kremove(vm_offset_t va)
876{
877	pt_entry_t *pte;
878
879	pte = vtopte(va);
880	pte_clear(pte);
881}
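/*
 * Because neither pmap_kenter() nor pmap_kremove() shoots down other
 * CPUs' TLBs, a hypothetical caller needing a coherent transient
 * mapping must invalidate explicitly, e.g.:
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);
 *	... use the mapping ...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 */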
882
883/*
884 *	Used to map a range of physical addresses into kernel
885 *	virtual address space.
886 *
887 *	The value passed in '*virt' is a suggested virtual address for
888 *	the mapping. Architectures which can support a direct-mapped
889 *	physical to virtual region can return the appropriate address
890 *	within that region, leaving '*virt' unchanged. Other
891 *	architectures should map the pages starting at '*virt' and
892 *	update '*virt' with the first usable address after the mapped
893 *	region.
894 */
895vm_offset_t
896pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
897{
898	vm_offset_t va, sva;
899
900	va = sva = *virt;
901	while (start < end) {
902		pmap_kenter(va, start);
903		va += PAGE_SIZE;
904		start += PAGE_SIZE;
905	}
906	pmap_invalidate_range(kernel_pmap, sva, va);
907	*virt = va;
908	return (sva);
909}
910
911
912/*
913 * Add a list of wired pages to the kva.
914 * This routine is only used for temporary
915 * kernel mappings that do not need to have
916 * page modification or references recorded.
917 * Note that old mappings are simply written
918 * over.  The page *must* be wired.
919 * Note: SMP coherent.  Uses a ranged shootdown IPI.
920 */
921void
922pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
923{
924	vm_offset_t va;
925
926	va = sva;
927	while (count-- > 0) {
928		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
929		va += PAGE_SIZE;
930		m++;
931	}
932	pmap_invalidate_range(kernel_pmap, sva, va);
933}
934
935/*
936 * This routine tears out page mappings from the
937 * kernel -- it is meant only for temporary mappings.
938 * Note: SMP coherent.  Uses a ranged shootdown IPI.
939 */
940void
941pmap_qremove(vm_offset_t sva, int count)
942{
943	vm_offset_t va;
944
945	va = sva;
946	while (count-- > 0) {
947		pmap_kremove(va);
948		va += PAGE_SIZE;
949	}
950	pmap_invalidate_range(kernel_pmap, sva, va);
951}
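/*
 * Hypothetical sketch of borrowing KVA for n wired pages with the two
 * routines above (pmap_pinit() below does essentially this for the
 * page directory pages):
 *
 *	va = kmem_alloc_nofault(kernel_map, n * PAGE_SIZE);
 *	pmap_qenter(va, pages, n);
 *	... access the pages through (void *)va ...
 *	pmap_qremove(va, n);
 */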
952
953/***************************************************
954 * Page table page management routines.....
955 ***************************************************/
956
957/*
958 * This routine drops a page table page's wire count; once the count
959 * reaches zero, the page is unmapped and freed.
960 */
961static PMAP_INLINE int
962pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
963{
964
965	--m->wire_count;
966	if (m->wire_count == 0)
967		return _pmap_unwire_pte_hold(pmap, m);
968	else
969		return 0;
970}
971
972static int
973_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
974{
975	vm_offset_t pteva;
976
977	/*
978	 * unmap the page table page
979	 */
980	pmap->pm_pdir[m->pindex] = 0;
981	--pmap->pm_stats.resident_count;
982
983	/*
984	 * Do an invltlb to make the invalidated mapping
985	 * take effect immediately.
986	 */
987	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
988	pmap_invalidate_page(pmap, pteva);
989
990	vm_page_free_zero(m);
991	atomic_subtract_int(&cnt.v_wire_count, 1);
992	return 1;
993}
994
995/*
996 * After removing a page table entry, this routine is used to
997 * conditionally free the page, and manage the hold/wire counts.
998 */
999static int
1000pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
1001{
1002	pd_entry_t ptepde;
1003	vm_page_t mpte;
1004
1005	if (va >= VM_MAXUSER_ADDRESS)
1006		return 0;
1007	ptepde = *pmap_pde(pmap, va);
1008	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1009	return pmap_unwire_pte_hold(pmap, mpte);
1010}
1011
1012void
1013pmap_pinit0(pmap)
1014	struct pmap *pmap;
1015{
1016
1017	PMAP_LOCK_INIT(pmap);
1018	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1019#ifdef PAE
1020	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1021#endif
1022	pmap->pm_active = 0;
1023	PCPU_SET(curpmap, pmap);
1024	TAILQ_INIT(&pmap->pm_pvlist);
1025	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1026	mtx_lock_spin(&allpmaps_lock);
1027	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1028	mtx_unlock_spin(&allpmaps_lock);
1029}
1030
1031/*
1032 * Initialize a preallocated and zeroed pmap structure,
1033 * such as one in a vmspace structure.
1034 */
1035void
1036pmap_pinit(pmap)
1037	register struct pmap *pmap;
1038{
1039	vm_page_t m, ptdpg[NPGPTD];
1040	vm_paddr_t pa;
1041	static int color;
1042	int i;
1043
1044	PMAP_LOCK_INIT(pmap);
1045
1046	/*
1047	 * No need to allocate page table space yet but we do need a valid
1048	 * page directory table.
1049	 */
1050	if (pmap->pm_pdir == NULL) {
1051		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1052		    NBPTD);
1053#ifdef PAE
1054		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1055		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1056		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1057		    ("pmap_pinit: pdpt misaligned"));
1058		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1059		    ("pmap_pinit: pdpt above 4g"));
1060#endif
1061	}
1062
1063	/*
1064	 * allocate the page directory page(s)
1065	 */
1066	for (i = 0; i < NPGPTD;) {
1067		m = vm_page_alloc(NULL, color++,
1068		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1069		    VM_ALLOC_ZERO);
1070		if (m == NULL)
1071			VM_WAIT;
1072		else {
1073			ptdpg[i++] = m;
1074		}
1075	}
1076
1077	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1078
1079	for (i = 0; i < NPGPTD; i++) {
1080		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1081			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1082	}
1083
1084	mtx_lock_spin(&allpmaps_lock);
1085	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1086	mtx_unlock_spin(&allpmaps_lock);
1087	/* Wire in kernel global address entries. */
1088	/* XXX copies current process, does not fill in MPPTDI */
1089	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1090#ifdef SMP
1091	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1092#endif
1093
1094	/* install self-referential address mapping entry(s) */
1095	for (i = 0; i < NPGPTD; i++) {
1096		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1097		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1098#ifdef PAE
1099		pmap->pm_pdpt[i] = pa | PG_V;
1100#endif
1101	}
1102
1103	pmap->pm_active = 0;
1104	TAILQ_INIT(&pmap->pm_pvlist);
1105	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1106}
1107
1108/*
1109 * this routine is called when the needed page table page is not
1110 * resident; it allocates, wires, and maps a new one.
1111 */
1112static vm_page_t
1113_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1114{
1115	vm_paddr_t ptepa;
1116	vm_page_t m;
1117
1118	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1119	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1120	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1121
1122	/*
1123	 * Allocate a page table page.
1124	 */
1125	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1126	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1127		if (flags & M_WAITOK) {
1128			PMAP_UNLOCK(pmap);
1129			vm_page_unlock_queues();
1130			VM_WAIT;
1131			vm_page_lock_queues();
1132			PMAP_LOCK(pmap);
1133		}
1134
1135		/*
1136		 * Indicate the need to retry.  While waiting, the page table
1137		 * page may have been allocated.
1138		 */
1139		return (NULL);
1140	}
1141	if ((m->flags & PG_ZERO) == 0)
1142		pmap_zero_page(m);
1143
1144	/*
1145	 * Map the pagetable page into the process address space, if
1146	 * it isn't already there.
1147	 */
1148
1149	pmap->pm_stats.resident_count++;
1150
1151	ptepa = VM_PAGE_TO_PHYS(m);
1152	pmap->pm_pdir[ptepindex] =
1153		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1154
1155	return m;
1156}
1157
1158static vm_page_t
1159pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1160{
1161	unsigned ptepindex;
1162	pd_entry_t ptepa;
1163	vm_page_t m;
1164
1165	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1166	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1167	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1168
1169	/*
1170	 * Calculate pagetable page index
1171	 */
1172	ptepindex = va >> PDRSHIFT;
1173retry:
1174	/*
1175	 * Get the page directory entry
1176	 */
1177	ptepa = pmap->pm_pdir[ptepindex];
1178
1179	/*
1180	 * This supports switching from a 4MB page to a
1181	 * normal 4K page.
1182	 */
1183	if (ptepa & PG_PS) {
1184		pmap->pm_pdir[ptepindex] = 0;
1185		ptepa = 0;
1186		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1187		pmap_invalidate_all(kernel_pmap);
1188	}
1189
1190	/*
1191	 * If the page table page is mapped, we just increment the
1192	 * hold count, and activate it.
1193	 */
1194	if (ptepa) {
1195		m = PHYS_TO_VM_PAGE(ptepa);
1196		m->wire_count++;
1197	} else {
1198		/*
1199		 * Here if the pte page isn't mapped, or if it has
1200		 * been deallocated.
1201		 */
1202		m = _pmap_allocpte(pmap, ptepindex, flags);
1203		if (m == NULL && (flags & M_WAITOK))
1204			goto retry;
1205	}
1206	return (m);
1207}
1208
1209
1210/***************************************************
1211 * Pmap allocation/deallocation routines.
1212 ***************************************************/
1213
1214#ifdef SMP
1215/*
1216 * Deal with an SMP shootdown of other users of the pmap that we are
1217 * trying to dispose of.  This can be a bit hairy.
1218 */
1219static u_int *lazymask;
1220static u_int lazyptd;
1221static volatile u_int lazywait;
1222
1223void pmap_lazyfix_action(void);
1224
1225void
1226pmap_lazyfix_action(void)
1227{
1228	u_int mymask = PCPU_GET(cpumask);
1229
1230#ifdef COUNT_IPIS
1231	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1232#endif
1233	if (rcr3() == lazyptd)
1234		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1235	atomic_clear_int(lazymask, mymask);
1236	atomic_store_rel_int(&lazywait, 1);
1237}
1238
1239static void
1240pmap_lazyfix_self(u_int mymask)
1241{
1242
1243	if (rcr3() == lazyptd)
1244		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1245	atomic_clear_int(lazymask, mymask);
1246}
1247
1248
1249static void
1250pmap_lazyfix(pmap_t pmap)
1251{
1252	u_int mymask;
1253	u_int mask;
1254	register u_int spins;
1255
1256	while ((mask = pmap->pm_active) != 0) {
1257		spins = 50000000;
1258		mask = mask & -mask;	/* Find least significant set bit */
1259		mtx_lock_spin(&smp_ipi_mtx);
1260#ifdef PAE
1261		lazyptd = vtophys(pmap->pm_pdpt);
1262#else
1263		lazyptd = vtophys(pmap->pm_pdir);
1264#endif
1265		mymask = PCPU_GET(cpumask);
1266		if (mask == mymask) {
1267			lazymask = &pmap->pm_active;
1268			pmap_lazyfix_self(mymask);
1269		} else {
1270			atomic_store_rel_int((u_int *)&lazymask,
1271			    (u_int)&pmap->pm_active);
1272			atomic_store_rel_int(&lazywait, 0);
1273			ipi_selected(mask, IPI_LAZYPMAP);
1274			while (lazywait == 0) {
1275				ia32_pause();
1276				if (--spins == 0)
1277					break;
1278			}
1279		}
1280		mtx_unlock_spin(&smp_ipi_mtx);
1281		if (spins == 0)
1282			printf("pmap_lazyfix: spun for 50000000\n");
1283	}
1284}
1285
1286#else	/* SMP */
1287
1288/*
1289 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1290 * unlikely to have to even execute this code, including the fact
1291 * that the cleanup is deferred until the parent does a wait(2), which
1292 * means that another userland process has run.
1293 */
1294static void
1295pmap_lazyfix(pmap_t pmap)
1296{
1297	u_int cr3;
1298
1299	cr3 = vtophys(pmap->pm_pdir);
1300	if (cr3 == rcr3()) {
1301		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1302		pmap->pm_active &= ~(PCPU_GET(cpumask));
1303	}
1304}
1305#endif	/* SMP */
1306
1307/*
1308 * Release any resources held by the given physical map.
1309 * Called when a pmap initialized by pmap_pinit is being released.
1310 * Should only be called if the map contains no valid mappings.
1311 */
1312void
1313pmap_release(pmap_t pmap)
1314{
1315	vm_page_t m, ptdpg[NPGPTD];
1316	int i;
1317
1318	KASSERT(pmap->pm_stats.resident_count == 0,
1319	    ("pmap_release: pmap resident count %ld != 0",
1320	    pmap->pm_stats.resident_count));
1321
1322	pmap_lazyfix(pmap);
1323	mtx_lock_spin(&allpmaps_lock);
1324	LIST_REMOVE(pmap, pm_list);
1325	mtx_unlock_spin(&allpmaps_lock);
1326
1327	for (i = 0; i < NPGPTD; i++)
1328		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1329
1330	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1331	    sizeof(*pmap->pm_pdir));
1332#ifdef SMP
1333	pmap->pm_pdir[MPPTDI] = 0;
1334#endif
1335
1336	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1337
1338	vm_page_lock_queues();
1339	for (i = 0; i < NPGPTD; i++) {
1340		m = ptdpg[i];
1341#ifdef PAE
1342		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1343		    ("pmap_release: got wrong ptd page"));
1344#endif
1345		m->wire_count--;
1346		atomic_subtract_int(&cnt.v_wire_count, 1);
1347		vm_page_free_zero(m);
1348	}
1349	vm_page_unlock_queues();
1350	PMAP_LOCK_DESTROY(pmap);
1351}
1352
1353static int
1354kvm_size(SYSCTL_HANDLER_ARGS)
1355{
1356	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1357
1358	return sysctl_handle_long(oidp, &ksize, 0, req);
1359}
1360SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1361    0, 0, kvm_size, "IU", "Size of KVM");
1362
1363static int
1364kvm_free(SYSCTL_HANDLER_ARGS)
1365{
1366	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1367
1368	return sysctl_handle_long(oidp, &kfree, 0, req);
1369}
1370SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1371    0, 0, kvm_free, "IU", "Amount of KVM free");
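/*
 * Both values are exported read-only and can be inspected from userland,
 * e.g. with "sysctl vm.kvm_size vm.kvm_free".
 */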
1372
1373/*
1374 * grow the number of kernel page table entries, if needed
1375 */
1376void
1377pmap_growkernel(vm_offset_t addr)
1378{
1379	struct pmap *pmap;
1380	vm_paddr_t ptppaddr;
1381	vm_page_t nkpg;
1382	pd_entry_t newpdir;
1383	pt_entry_t *pde;
1384
1385	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1386	if (kernel_vm_end == 0) {
1387		kernel_vm_end = KERNBASE;
1388		nkpt = 0;
1389		while (pdir_pde(PTD, kernel_vm_end)) {
1390			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1391			nkpt++;
1392			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1393				kernel_vm_end = kernel_map->max_offset;
1394				break;
1395			}
1396		}
1397	}
1398	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1399	if (addr - 1 >= kernel_map->max_offset)
1400		addr = kernel_map->max_offset;
1401	while (kernel_vm_end < addr) {
1402		if (pdir_pde(PTD, kernel_vm_end)) {
1403			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1404			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1405				kernel_vm_end = kernel_map->max_offset;
1406				break;
1407			}
1408			continue;
1409		}
1410
1411		/*
1412		 * This index is bogus, but out of the way
1413		 */
1414		nkpg = vm_page_alloc(NULL, nkpt,
1415		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1416		if (!nkpg)
1417			panic("pmap_growkernel: no memory to grow kernel");
1418
1419		nkpt++;
1420
1421		pmap_zero_page(nkpg);
1422		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1423		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1424		pdir_pde(PTD, kernel_vm_end) = newpdir;
1425
1426		mtx_lock_spin(&allpmaps_lock);
1427		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1428			pde = pmap_pde(pmap, kernel_vm_end);
1429			pde_store(pde, newpdir);
1430		}
1431		mtx_unlock_spin(&allpmaps_lock);
1432		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1433		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1434			kernel_vm_end = kernel_map->max_offset;
1435			break;
1436		}
1437	}
1438}
1439
1440
1441/***************************************************
1442 * page management routines.
1443 ***************************************************/
1444
1445/*
1446 * free the pv_entry back to the free list
1447 */
1448static PMAP_INLINE void
1449free_pv_entry(pv_entry_t pv)
1450{
1451	pv_entry_count--;
1452	uma_zfree(pvzone, pv);
1453}
1454
1455/*
1456 * get a new pv_entry, allocating a block from the system
1457 * when needed.
1458 */
1459static pv_entry_t
1460get_pv_entry(pmap_t locked_pmap)
1461{
1462	static const struct timeval printinterval = { 60, 0 };
1463	static struct timeval lastprint;
1464	struct vpgqueues *vpq;
1465	pmap_t pmap;
1466	pt_entry_t *pte, tpte;
1467	pv_entry_t allocated_pv, next_pv, pv;
1468	vm_offset_t va;
1469	vm_page_t m;
1470
1471	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1472	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1473	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
1474	if (allocated_pv != NULL) {
1475		pv_entry_count++;
1476		if (pv_entry_count > pv_entry_high_water)
1477			pagedaemon_wakeup();
1478		else
1479			return (allocated_pv);
1480	}
1481
1482	/*
1483	 * Reclaim pv entries: At first, destroy mappings to inactive
1484	 * pages.  After that, if a pv entry is still needed, destroy
1485	 * mappings to active pages.
1486	 */
1487	if (ratecheck(&lastprint, &printinterval))
1488		printf("Approaching the limit on PV entries, "
1489		    "increase the vm.pmap.shpgperproc tunable.\n");
1490	vpq = &vm_page_queues[PQ_INACTIVE];
1491retry:
1492	sched_pin();
1493	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1494		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1495			continue;
1496		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1497			va = pv->pv_va;
1498			pmap = pv->pv_pmap;
1499			/* Avoid deadlock and lock recursion. */
1500			if (pmap > locked_pmap)
1501				PMAP_LOCK(pmap);
1502			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1503				continue;
1504			pmap->pm_stats.resident_count--;
1505			pte = pmap_pte_quick(pmap, va);
1506			tpte = pte_load_clear(pte);
1507			KASSERT((tpte & PG_W) == 0,
1508			    ("get_pv_entry: wired pte %#jx", (uintmax_t)tpte));
1509			if (tpte & PG_A)
1510				vm_page_flag_set(m, PG_REFERENCED);
1511			if (tpte & PG_M) {
1512				KASSERT((tpte & PG_RW),
1513	("get_pv_entry: modified page not writable: va: %#x, pte: %#jx",
1514				    va, (uintmax_t)tpte));
1515				if (pmap_track_modified(va))
1516					vm_page_dirty(m);
1517			}
1518			pmap_invalidate_page(pmap, va);
1519			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1520			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1521			if (TAILQ_EMPTY(&m->md.pv_list))
1522				vm_page_flag_clear(m, PG_WRITEABLE);
1523			m->md.pv_list_count--;
1524			pmap_unuse_pt(pmap, va);
1525			if (pmap != locked_pmap)
1526				PMAP_UNLOCK(pmap);
1527			if (allocated_pv == NULL)
1528				allocated_pv = pv;
1529			else
1530				free_pv_entry(pv);
1531		}
1532	}
1533	sched_unpin();
1534	if (allocated_pv == NULL) {
1535		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
1536			vpq = &vm_page_queues[PQ_ACTIVE];
1537			goto retry;
1538		}
1539		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
1540	}
1541	return (allocated_pv);
1542}
1543
1544static void
1545pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1546{
1547	pv_entry_t pv;
1548
1549	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1550	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1551	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1552		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1553			if (pmap == pv->pv_pmap && va == pv->pv_va)
1554				break;
1555		}
1556	} else {
1557		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1558			if (va == pv->pv_va)
1559				break;
1560		}
1561	}
1562	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1563	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1564	m->md.pv_list_count--;
1565	if (TAILQ_EMPTY(&m->md.pv_list))
1566		vm_page_flag_clear(m, PG_WRITEABLE);
1567	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1568	free_pv_entry(pv);
1569}
1570
1571/*
1572 * Create a pv entry for page at pa for
1573 * (pmap, va).
1574 */
1575static void
1576pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1577{
1578	pv_entry_t pv;
1579
1580	pv = get_pv_entry(pmap);
1581	pv->pv_va = va;
1582	pv->pv_pmap = pmap;
1583
1584	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1585	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1586	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1587	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1588	m->md.pv_list_count++;
1589}
1590
1591/*
1592 * Conditionally create a pv entry.
1593 */
1594static boolean_t
1595pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1596{
1597	pv_entry_t pv;
1598
1599	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1600	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1601	if (pv_entry_count < pv_entry_high_water &&
1602	    (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) {
1603		pv_entry_count++;
1604		pv->pv_va = va;
1605		pv->pv_pmap = pmap;
1606		TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1607		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1608		m->md.pv_list_count++;
1609		return (TRUE);
1610	} else
1611		return (FALSE);
1612}
1613
1614/*
1615 * pmap_remove_pte: do the things to unmap a page in a process
1616 */
1617static int
1618pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1619{
1620	pt_entry_t oldpte;
1621	vm_page_t m;
1622
1623	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1624	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1625	oldpte = pte_load_clear(ptq);
1626	if (oldpte & PG_W)
1627		pmap->pm_stats.wired_count -= 1;
1628	/*
1629	 * Machines that don't support invlpg, also don't support
1630	 * PG_G.
1631	 */
1632	if (oldpte & PG_G)
1633		pmap_invalidate_page(kernel_pmap, va);
1634	pmap->pm_stats.resident_count -= 1;
1635	if (oldpte & PG_MANAGED) {
1636		m = PHYS_TO_VM_PAGE(oldpte);
1637		if (oldpte & PG_M) {
1638			KASSERT((oldpte & PG_RW),
1639	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
1640			    va, (uintmax_t)oldpte));
1641			if (pmap_track_modified(va))
1642				vm_page_dirty(m);
1643		}
1644		if (oldpte & PG_A)
1645			vm_page_flag_set(m, PG_REFERENCED);
1646		pmap_remove_entry(pmap, m, va);
1647	}
1648	return (pmap_unuse_pt(pmap, va));
1649}
1650
1651/*
1652 * Remove a single page from a process address space
1653 */
1654static void
1655pmap_remove_page(pmap_t pmap, vm_offset_t va)
1656{
1657	pt_entry_t *pte;
1658
1659	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1660	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1661	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1662	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1663		return;
1664	pmap_remove_pte(pmap, pte, va);
1665	pmap_invalidate_page(pmap, va);
1666}
1667
1668/*
1669 *	Remove the given range of addresses from the specified map.
1670 *
1671 *	It is assumed that the start and end are properly
1672 *	rounded to the page size.
1673 */
1674void
1675pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1676{
1677	vm_offset_t pdnxt;
1678	pd_entry_t ptpaddr;
1679	pt_entry_t *pte;
1680	int anyvalid;
1681
1682	/*
1683	 * Perform an unsynchronized read.  This is, however, safe.
1684	 */
1685	if (pmap->pm_stats.resident_count == 0)
1686		return;
1687
1688	anyvalid = 0;
1689
1690	vm_page_lock_queues();
1691	sched_pin();
1692	PMAP_LOCK(pmap);
1693
1694	/*
1695	 * Special-case the removal of a single page, a very
1696	 * common operation for which we can short-circuit the
1697	 * general loop below.
1698	 */
1699	if ((sva + PAGE_SIZE == eva) &&
1700	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1701		pmap_remove_page(pmap, sva);
1702		goto out;
1703	}
1704
1705	for (; sva < eva; sva = pdnxt) {
1706		unsigned pdirindex;
1707
1708		/*
1709		 * Calculate index for next page table.
1710		 */
1711		pdnxt = (sva + NBPDR) & ~PDRMASK;
1712		if (pmap->pm_stats.resident_count == 0)
1713			break;
1714
1715		pdirindex = sva >> PDRSHIFT;
1716		ptpaddr = pmap->pm_pdir[pdirindex];
1717
1718		/*
1719		 * Weed out invalid mappings. Note: we assume that the page
1720		 * directory table is always allocated, and in kernel virtual.
1721		 */
1722		if (ptpaddr == 0)
1723			continue;
1724
1725		/*
1726		 * Check for large page.
1727		 */
1728		if ((ptpaddr & PG_PS) != 0) {
1729			pmap->pm_pdir[pdirindex] = 0;
1730			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1731			anyvalid = 1;
1732			continue;
1733		}
1734
1735		/*
1736		 * Limit our scan to either the end of the va represented
1737		 * by the current page table page, or to the end of the
1738		 * range being removed.
1739		 */
1740		if (pdnxt > eva)
1741			pdnxt = eva;
1742
1743		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1744		    sva += PAGE_SIZE) {
1745			if (*pte == 0)
1746				continue;
1747
1748			/*
1749			 * The TLB entry for a PG_G mapping is invalidated
1750			 * by pmap_remove_pte().
1751			 */
1752			if ((*pte & PG_G) == 0)
1753				anyvalid = 1;
1754			if (pmap_remove_pte(pmap, pte, sva))
1755				break;
1756		}
1757	}
1758out:
1759	sched_unpin();
1760	vm_page_unlock_queues();
1761	if (anyvalid)
1762		pmap_invalidate_all(pmap);
1763	PMAP_UNLOCK(pmap);
1764}
1765
1766/*
1767 *	Routine:	pmap_remove_all
1768 *	Function:
1769 *		Removes this physical page from
1770 *		all physical maps in which it resides.
1771 *		Reflects back modify bits to the pager.
1772 *
1773 *	Notes:
1774 *		Original versions of this routine were very
1775 *		inefficient because they iteratively called
1776 *		pmap_remove (slow...)
1777 */
1778
1779void
1780pmap_remove_all(vm_page_t m)
1781{
1782	register pv_entry_t pv;
1783	pt_entry_t *pte, tpte;
1784
1785#if defined(PMAP_DIAGNOSTIC)
1786	/*
1787	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1788	 */
1789	if (m->flags & PG_FICTITIOUS) {
1790		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1791		    VM_PAGE_TO_PHYS(m));
1792	}
1793#endif
1794	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1795	sched_pin();
1796	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1797		PMAP_LOCK(pv->pv_pmap);
1798		pv->pv_pmap->pm_stats.resident_count--;
1799		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1800		tpte = pte_load_clear(pte);
1801		if (tpte & PG_W)
1802			pv->pv_pmap->pm_stats.wired_count--;
1803		if (tpte & PG_A)
1804			vm_page_flag_set(m, PG_REFERENCED);
1805
1806		/*
1807		 * Update the vm_page_t clean and reference bits.
1808		 */
1809		if (tpte & PG_M) {
1810			KASSERT((tpte & PG_RW),
1811	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
1812			    pv->pv_va, (uintmax_t)tpte));
1813			if (pmap_track_modified(pv->pv_va))
1814				vm_page_dirty(m);
1815		}
1816		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1817		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1818		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1819		m->md.pv_list_count--;
1820		pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
1821		PMAP_UNLOCK(pv->pv_pmap);
1822		free_pv_entry(pv);
1823	}
1824	vm_page_flag_clear(m, PG_WRITEABLE);
1825	sched_unpin();
1826}
1827
1828/*
1829 *	Set the physical protection on the
1830 *	specified range of this map as requested.
1831 */
1832void
1833pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1834{
1835	vm_offset_t pdnxt;
1836	pd_entry_t ptpaddr;
1837	pt_entry_t *pte;
1838	int anychanged;
1839
1840	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1841		pmap_remove(pmap, sva, eva);
1842		return;
1843	}
1844
1845	if (prot & VM_PROT_WRITE)
1846		return;
1847
1848	anychanged = 0;
1849
1850	vm_page_lock_queues();
1851	sched_pin();
1852	PMAP_LOCK(pmap);
1853	for (; sva < eva; sva = pdnxt) {
1854		unsigned obits, pbits, pdirindex;
1855
1856		pdnxt = (sva + NBPDR) & ~PDRMASK;
1857
1858		pdirindex = sva >> PDRSHIFT;
1859		ptpaddr = pmap->pm_pdir[pdirindex];
1860
1861		/*
1862		 * Weed out invalid mappings. Note: we assume that the page
1863		 * directory table is always allocated, and in kernel virtual.
1864		 */
1865		if (ptpaddr == 0)
1866			continue;
1867
1868		/*
1869		 * Check for large page.
1870		 */
1871		if ((ptpaddr & PG_PS) != 0) {
1872			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1873			anychanged = 1;
1874			continue;
1875		}
1876
1877		if (pdnxt > eva)
1878			pdnxt = eva;
1879
1880		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1881		    sva += PAGE_SIZE) {
1882			vm_page_t m;
1883
1884retry:
1885			/*
1886			 * Regardless of whether a pte is 32 or 64 bits in
1887			 * size, PG_RW, PG_A, and PG_M are among the least
1888			 * significant 32 bits.
1889			 */
1890			obits = pbits = *(u_int *)pte;
1891			if (pbits & PG_MANAGED) {
1892				m = NULL;
1893				if (pbits & PG_A) {
1894					m = PHYS_TO_VM_PAGE(*pte);
1895					vm_page_flag_set(m, PG_REFERENCED);
1896					pbits &= ~PG_A;
1897				}
1898				if ((pbits & PG_M) != 0 &&
1899				    pmap_track_modified(sva)) {
1900					if (m == NULL)
1901						m = PHYS_TO_VM_PAGE(*pte);
1902					vm_page_dirty(m);
1903				}
1904			}
1905
1906			pbits &= ~(PG_RW | PG_M);
1907
1908			if (pbits != obits) {
1909				if (!atomic_cmpset_int((u_int *)pte, obits,
1910				    pbits))
1911					goto retry;
1912				if (obits & PG_G)
1913					pmap_invalidate_page(pmap, sva);
1914				else
1915					anychanged = 1;
1916			}
1917		}
1918	}
1919	sched_unpin();
1920	vm_page_unlock_queues();
1921	if (anychanged)
1922		pmap_invalidate_all(pmap);
1923	PMAP_UNLOCK(pmap);
1924}
1925
1926/*
1927 *	Insert the given physical page (p) at
1928 *	the specified virtual address (v) in the
1929 *	target physical map with the protection requested.
1930 *
1931 *	If specified, the page will be wired down, meaning
1932 *	that the related pte can not be reclaimed.
1933 *
1934 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1935 *	or lose information.  That is, this routine must actually
1936 *	insert this page into the given map NOW.
1937 */
1938void
1939pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1940	   boolean_t wired)
1941{
1942	vm_paddr_t pa;
1943	register pt_entry_t *pte;
1944	vm_paddr_t opa;
1945	pt_entry_t origpte, newpte;
1946	vm_page_t mpte, om;
1947	boolean_t invlva;
1948
1949	va &= PG_FRAME;
1950#ifdef PMAP_DIAGNOSTIC
1951	if (va > VM_MAX_KERNEL_ADDRESS)
1952		panic("pmap_enter: toobig");
1953	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1954		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1955#endif
1956
1957	mpte = NULL;
1958
1959	vm_page_lock_queues();
1960	PMAP_LOCK(pmap);
1961	sched_pin();
1962
1963	/*
1964	 * In the case that a page table page is not
1965	 * resident, we are creating it here.
1966	 */
1967	if (va < VM_MAXUSER_ADDRESS) {
1968		mpte = pmap_allocpte(pmap, va, M_WAITOK);
1969	}
1970#if 0 && defined(PMAP_DIAGNOSTIC)
1971	else {
1972		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1973		origpte = *pdeaddr;
1974		if ((origpte & PG_V) == 0) {
1975			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
1976				pmap->pm_pdir[PTDPTDI], origpte, va);
1977		}
1978	}
1979#endif
1980
1981	pte = pmap_pte_quick(pmap, va);
1982
1983	/*
1984	 * Page directory entry not valid; we need a new PT page.
1985	 */
1986	if (pte == NULL) {
1987		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
1988			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
1989	}
1990
1991	pa = VM_PAGE_TO_PHYS(m);
1992	om = NULL;
1993	origpte = *pte;
1994	opa = origpte & PG_FRAME;
1995
1996	if (origpte & PG_PS) {
1997		/*
1998		 * Yes, I know this will truncate upper address bits for PAE,
1999		 * but I'm actually more interested in the lower bits
2000		 */
2001		printf("pmap_enter: va %p, pte %p, origpte %p\n",
2002		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
2003		panic("pmap_enter: attempted pmap_enter on 4MB page");
2004	}
2005
2006	/*
2007	 * Mapping has not changed, must be protection or wiring change.
2008	 */
2009	if (origpte && (opa == pa)) {
2010		/*
2011		 * Wiring change; just update stats.  We don't worry about
2012		 * wiring PT pages, as they remain resident as long as there
2013		 * are valid mappings in them.  Hence, if a user page is wired,
2014		 * the PT page will be wired as well.
2015		 */
2016		if (wired && ((origpte & PG_W) == 0))
2017			pmap->pm_stats.wired_count++;
2018		else if (!wired && (origpte & PG_W))
2019			pmap->pm_stats.wired_count--;
2020
2021		/*
2022		 * Remove extra pte reference
2023		 */
2024		if (mpte)
2025			mpte->wire_count--;
2026
2027		/*
2028		 * We might be turning off write access to the page,
2029		 * so we go ahead and sense modify status.
2030		 */
2031		if (origpte & PG_MANAGED) {
2032			om = m;
2033			pa |= PG_MANAGED;
2034		}
2035		goto validate;
2036	}
2037	/*
2038	 * Mapping has changed, invalidate old range and fall through to
2039	 * handle validating new mapping.
2040	 */
2041	if (opa) {
2042		if (origpte & PG_W)
2043			pmap->pm_stats.wired_count--;
2044		if (origpte & PG_MANAGED) {
2045			om = PHYS_TO_VM_PAGE(opa);
2046			pmap_remove_entry(pmap, om, va);
2047		}
2048		if (mpte != NULL) {
2049			mpte->wire_count--;
2050			KASSERT(mpte->wire_count > 0,
2051			    ("pmap_enter: missing reference to page table page,"
2052			     " va: 0x%x", va));
2053		}
2054	} else
2055		pmap->pm_stats.resident_count++;
2056
2057	/*
2058	 * Enter on the PV list if part of our managed memory.
2059	 */
2060	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2061		pmap_insert_entry(pmap, va, m);
2062		pa |= PG_MANAGED;
2063	}
2064
2065	/*
2066	 * Increment counters
2067	 */
2068	if (wired)
2069		pmap->pm_stats.wired_count++;
2070
2071validate:
2072	/*
2073	 * Now validate mapping with desired protection/wiring.
2074	 */
2075	newpte = (pt_entry_t)(pa | PG_V);
2076	if ((prot & VM_PROT_WRITE) != 0)
2077		newpte |= PG_RW;
2078	if (wired)
2079		newpte |= PG_W;
2080	if (va < VM_MAXUSER_ADDRESS)
2081		newpte |= PG_U;
2082	if (pmap == kernel_pmap)
2083		newpte |= pgeflag;
2084
2085	/*
2086	 * if the mapping or permission bits are different, we need
2087	 * to update the pte.
2088	 */
2089	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2090		if (origpte & PG_V) {
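			/*
			 * The old mapping was valid: atomically swap in the
			 * new pte and examine the returned bits.  A set
			 * PG_A or PG_M indicates that a TLB entry for the
			 * old mapping may still exist, so invlva decides
			 * below whether the page must be flushed.
			 */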
2091			invlva = FALSE;
2092			origpte = pte_load_store(pte, newpte | PG_A);
2093			if (origpte & PG_A) {
2094				if (origpte & PG_MANAGED)
2095					vm_page_flag_set(om, PG_REFERENCED);
2096				if (opa != VM_PAGE_TO_PHYS(m))
2097					invlva = TRUE;
2098			}
2099			if (origpte & PG_M) {
2100				KASSERT((origpte & PG_RW),
2101	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
2102				    va, (uintmax_t)origpte));
2103				if ((origpte & PG_MANAGED) &&
2104				    pmap_track_modified(va))
2105					vm_page_dirty(om);
2106				if ((prot & VM_PROT_WRITE) == 0)
2107					invlva = TRUE;
2108			}
2109			if (invlva)
2110				pmap_invalidate_page(pmap, va);
2111		} else
2112			pte_store(pte, newpte | PG_A);
2113	}
2114	sched_unpin();
2115	vm_page_unlock_queues();
2116	PMAP_UNLOCK(pmap);
2117}
2118
2119/*
2120 * this code makes some *MAJOR* assumptions:
2121 * 1. The pmap is the current pmap and it exists.
2122 * 2. Not wired.
2123 * 3. Read access.
2124 * 4. No page table pages.
2125 * but is *MUCH* faster than pmap_enter...
2126 */
2127
2128vm_page_t
2129pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2130    vm_page_t mpte)
2131{
2132	pt_entry_t *pte;
2133	vm_paddr_t pa;
2134
2135	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2136	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2137	PMAP_LOCK(pmap);
2138
2139	/*
2140	 * In the case that a page table page is not
2141	 * resident, we are creating it here.
2142	 */
2143	if (va < VM_MAXUSER_ADDRESS) {
2144		unsigned ptepindex;
2145		pd_entry_t ptepa;
2146
2147		/*
2148		 * Calculate the page table page index
2149		 */
2150		ptepindex = va >> PDRSHIFT;
2151		if (mpte && (mpte->pindex == ptepindex)) {
2152			mpte->wire_count++;
2153		} else {
2154retry:
2155			/*
2156			 * Get the page directory entry
2157			 */
2158			ptepa = pmap->pm_pdir[ptepindex];
2159
2160			/*
2161			 * If the page table page is mapped, we just
2162			 * increment its wire count.
2163			 */
2164			if (ptepa) {
2165				if (ptepa & PG_PS)
2166					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2167				mpte = PHYS_TO_VM_PAGE(ptepa);
2168				mpte->wire_count++;
2169			} else {
2170				mpte = _pmap_allocpte(pmap, ptepindex,
2171				    M_NOWAIT);
2172				if (mpte == NULL) {
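					/*
					 * The M_NOWAIT allocation failed:
					 * busy the page, drop the locks,
					 * wait for free pages, and retry
					 * from the pde lookup above.
					 */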
2173					PMAP_UNLOCK(pmap);
2174					vm_page_busy(m);
2175					vm_page_unlock_queues();
2176					VM_OBJECT_UNLOCK(m->object);
2177					VM_WAIT;
2178					VM_OBJECT_LOCK(m->object);
2179					vm_page_lock_queues();
2180					vm_page_wakeup(m);
2181					PMAP_LOCK(pmap);
2182					goto retry;
2183				}
2184			}
2185		}
2186	} else {
2187		mpte = NULL;
2188	}
2189
2190	/*
2191	 * This call to vtopte makes the assumption that we are
2192	 * entering the page into the current pmap.  In order to support
2193	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2194	 * But that isn't as quick as vtopte.
2195	 */
2196	pte = vtopte(va);
2197	if (*pte) {
2198		if (mpte != NULL) {
2199			pmap_unwire_pte_hold(pmap, mpte);
2200			mpte = NULL;
2201		}
2202		goto out;
2203	}
2204
2205	/*
2206	 * Enter on the PV list if part of our managed memory.  The pv lists
2207	 * are protected by the page queues lock, which this function asserts
2208	 * is held on entry.
2209	 */
2210	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2211		pmap_insert_entry(pmap, va, m);
2212
2213	/*
2214	 * Increment counters
2215	 */
2216	pmap->pm_stats.resident_count++;
2217
2218	pa = VM_PAGE_TO_PHYS(m);
2219
2220	/*
2221	 * Now validate mapping with RO protection
2222	 */
2223	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2224		pte_store(pte, pa | PG_V | PG_U);
2225	else
2226		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2227out:
2228	PMAP_UNLOCK(pmap);
2229	return mpte;
2230}
2231
2232/*
2233 * Make a temporary mapping for a physical address.  This is only intended
2234 * to be used for panic dumps.
2235 */
2236void *
2237pmap_kenter_temporary(vm_paddr_t pa, int i)
2238{
2239	vm_offset_t va;
2240
2241	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2242	pmap_kenter(va, pa);
2243	invlpg(va);
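	/*
	 * The page is mapped at crashdumpmap + i * PAGE_SIZE, but the
	 * base of the crashdump window is what is returned; callers
	 * presumably index into the window themselves.
	 */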
2244	return ((void *)crashdumpmap);
2245}
2246
2247/*
2248 * This code maps large physical mmap regions into the
2249 * processor address space.  Note that some shortcuts
2250 * are taken, but the code works.
2251 */
2252void
2253pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2254		    vm_object_t object, vm_pindex_t pindex,
2255		    vm_size_t size)
2256{
2257	vm_page_t p;
2258
2259	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2260	KASSERT(object->type == OBJT_DEVICE,
2261	    ("pmap_object_init_pt: non-device object"));
2262	if (pseflag &&
2263	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
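		/*
		 * Only 4MB-aligned regions whose size is a multiple of
		 * 4MB are mapped here, and only when PSE is available;
		 * anything else is simply left to be faulted in one page
		 * at a time.
		 */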
2264		int i;
2265		vm_page_t m[1];
2266		unsigned int ptepindex;
2267		int npdes;
2268		pd_entry_t ptepa;
2269
2270		PMAP_LOCK(pmap);
2271		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2272			goto out;
2273		PMAP_UNLOCK(pmap);
2274retry:
2275		p = vm_page_lookup(object, pindex);
2276		if (p != NULL) {
2277			vm_page_lock_queues();
2278			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2279				goto retry;
2280		} else {
2281			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2282			if (p == NULL)
2283				return;
2284			m[0] = p;
2285
2286			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2287				vm_page_lock_queues();
2288				vm_page_free(p);
2289				vm_page_unlock_queues();
2290				return;
2291			}
2292
2293			p = vm_page_lookup(object, pindex);
2294			vm_page_lock_queues();
2295			vm_page_wakeup(p);
2296		}
2297		vm_page_unlock_queues();
2298
2299		ptepa = VM_PAGE_TO_PHYS(p);
2300		if (ptepa & (NBPDR - 1))
2301			return;
2302
2303		p->valid = VM_PAGE_BITS_ALL;
2304
2305		PMAP_LOCK(pmap);
2306		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2307		npdes = size >> PDRSHIFT;
2308		for(i = 0; i < npdes; i++) {
2309			pde_store(&pmap->pm_pdir[ptepindex],
2310			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
2311			ptepa += NBPDR;
2312			ptepindex += 1;
2313		}
2314		pmap_invalidate_all(pmap);
2315out:
2316		PMAP_UNLOCK(pmap);
2317	}
2318}
2319
2320/*
2321 *	Routine:	pmap_change_wiring
2322 *	Function:	Change the wiring attribute for a map/virtual-address
2323 *			pair.
2324 *	In/out conditions:
2325 *			The mapping must already exist in the pmap.
2326 */
2327void
2328pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2332{
2333	register pt_entry_t *pte;
2334
2335	PMAP_LOCK(pmap);
2336	pte = pmap_pte(pmap, va);
2337
2338	if (wired && !pmap_pte_w(pte))
2339		pmap->pm_stats.wired_count++;
2340	else if (!wired && pmap_pte_w(pte))
2341		pmap->pm_stats.wired_count--;
2342
2343	/*
2344	 * Wiring is not a hardware characteristic so there is no need to
2345	 * invalidate TLB.
2346	 */
2347	pmap_pte_set_w(pte, wired);
2348	pmap_pte_release(pte);
2349	PMAP_UNLOCK(pmap);
2350}
2351
2352
2353
2354/*
2355 *	Copy the range specified by src_addr/len
2356 *	from the source map to the range dst_addr/len
2357 *	in the destination map.
2358 *
2359 *	This routine is only advisory and need not do anything.
2360 */
2361
2362void
2363pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2364	  vm_offset_t src_addr)
2365{
2366	vm_offset_t addr;
2367	vm_offset_t end_addr = src_addr + len;
2368	vm_offset_t pdnxt;
2369
2370	if (dst_addr != src_addr)
2371		return;
2372
2373	if (!pmap_is_current(src_pmap))
2374		return;
2375
2376	vm_page_lock_queues();
2377	if (dst_pmap < src_pmap) {
2378		PMAP_LOCK(dst_pmap);
2379		PMAP_LOCK(src_pmap);
2380	} else {
2381		PMAP_LOCK(src_pmap);
2382		PMAP_LOCK(dst_pmap);
2383	}
2384	sched_pin();
2385	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2386		pt_entry_t *src_pte, *dst_pte;
2387		vm_page_t dstmpte, srcmpte;
2388		pd_entry_t srcptepaddr;
2389		unsigned ptepindex;
2390
2391		if (addr >= UPT_MIN_ADDRESS)
2392			panic("pmap_copy: invalid to pmap_copy page tables");
2393
2394		pdnxt = (addr + NBPDR) & ~PDRMASK;
2395		ptepindex = addr >> PDRSHIFT;
2396
2397		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2398		if (srcptepaddr == 0)
2399			continue;
2400
2401		if (srcptepaddr & PG_PS) {
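			/*
			 * A 4MB mapping is copied by aliasing the source
			 * pde in the destination pmap, but only if the
			 * destination slot is still empty.
			 */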
2402			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2403				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2404				dst_pmap->pm_stats.resident_count +=
2405				    NBPDR / PAGE_SIZE;
2406			}
2407			continue;
2408		}
2409
2410		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2411		if (srcmpte->wire_count == 0)
2412			panic("pmap_copy: source page table page is unused");
2413
2414		if (pdnxt > end_addr)
2415			pdnxt = end_addr;
2416
2417		src_pte = vtopte(addr);
2418		while (addr < pdnxt) {
2419			pt_entry_t ptetemp;
2420			ptetemp = *src_pte;
2421			/*
2422			 * We only make virtual copies of managed pages.
2423			 */
2424			if ((ptetemp & PG_MANAGED) != 0) {
2425				/*
2426				 * We have to check after allocpte for the
2427				 * pte still being around...  allocpte can
2428				 * block.
2429				 */
2430				dstmpte = pmap_allocpte(dst_pmap, addr,
2431				    M_NOWAIT);
2432				if (dstmpte == NULL)
2433					break;
2434				dst_pte = pmap_pte_quick(dst_pmap, addr);
2435				if (*dst_pte == 0 &&
2436				    pmap_try_insert_pv_entry(dst_pmap, addr,
2437				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2438					/*
2439					 * Clear the modified and
2440					 * accessed (referenced) bits
2441					 * during the copy.
2442					 */
2443					*dst_pte = ptetemp & ~(PG_M | PG_A);
2444					dst_pmap->pm_stats.resident_count++;
2445	 			} else
2446					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2447				if (dstmpte->wire_count >= srcmpte->wire_count)
2448					break;
2449			}
2450			addr += PAGE_SIZE;
2451			src_pte++;
2452		}
2453	}
2454	sched_unpin();
2455	vm_page_unlock_queues();
2456	PMAP_UNLOCK(src_pmap);
2457	PMAP_UNLOCK(dst_pmap);
2458}
2459
2460static __inline void
2461pagezero(void *page)
2462{
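	/*
	 * Pick the best zeroing routine available: sse2_pagezero() when
	 * SSE2 is present, i686_pagezero() on other 686-class CPUs, and
	 * plain bzero() otherwise.
	 */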
2463#if defined(I686_CPU)
2464	if (cpu_class == CPUCLASS_686) {
2465#if defined(CPU_ENABLE_SSE)
2466		if (cpu_feature & CPUID_SSE2)
2467			sse2_pagezero(page);
2468		else
2469#endif
2470			i686_pagezero(page);
2471	} else
2472#endif
2473		bzero(page, PAGE_SIZE);
2474}
2475
2476/*
2477 *	pmap_zero_page zeros the specified hardware page by mapping
2478 *	the page into KVM and using bzero to clear its contents.
2479 */
2480void
2481pmap_zero_page(vm_page_t m)
2482{
2483	struct sysmaps *sysmaps;
2484
2485	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2486	mtx_lock(&sysmaps->lock);
2487	if (*sysmaps->CMAP2)
2488		panic("pmap_zero_page: CMAP2 busy");
2489	sched_pin();
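	/*
	 * Borrow this CPU's CMAP2/CADDR2 window: map the page, flush the
	 * stale translation, zero it, and tear the mapping down again.
	 * sched_pin() keeps the thread on this CPU so the per-CPU window
	 * and its TLB entry stay consistent.
	 */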
2490	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2491	invlcaddr(sysmaps->CADDR2);
2492	pagezero(sysmaps->CADDR2);
2493	*sysmaps->CMAP2 = 0;
2494	sched_unpin();
2495	mtx_unlock(&sysmaps->lock);
2496}
2497
2498/*
2499 *	pmap_zero_page_area zeros the specified hardware page by mapping
2500 *	the page into KVM and using bzero to clear its contents.
2501 *
2502 *	off and size may not cover an area beyond a single hardware page.
2503 */
2504void
2505pmap_zero_page_area(vm_page_t m, int off, int size)
2506{
2507	struct sysmaps *sysmaps;
2508
2509	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2510	mtx_lock(&sysmaps->lock);
2511	if (*sysmaps->CMAP2)
2512		panic("pmap_zero_page_area: CMAP2 busy");
2513	sched_pin();
2514	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2515	invlcaddr(sysmaps->CADDR2);
2516	if (off == 0 && size == PAGE_SIZE)
2517		pagezero(sysmaps->CADDR2);
2518	else
2519		bzero((char *)sysmaps->CADDR2 + off, size);
2520	*sysmaps->CMAP2 = 0;
2521	sched_unpin();
2522	mtx_unlock(&sysmaps->lock);
2523}
2524
2525/*
2526 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2527 *	the page into KVM and using bzero to clear its contents.  This
2528 *	is intended to be called from the vm_pagezero process only and
2529 *	outside of Giant.
2530 */
2531void
2532pmap_zero_page_idle(vm_page_t m)
2533{
2534
2535	if (*CMAP3)
2536		panic("pmap_zero_page_idle: CMAP3 busy");
2537	sched_pin();
2538	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2539	invlcaddr(CADDR3);
2540	pagezero(CADDR3);
2541	*CMAP3 = 0;
2542	sched_unpin();
2543}
2544
2545/*
2546 *	pmap_copy_page copies the specified (machine independent)
2547 *	page by mapping the page into virtual memory and using
2548 *	bcopy to copy the page, one machine dependent page at a
2549 *	time.
2550 */
2551void
2552pmap_copy_page(vm_page_t src, vm_page_t dst)
2553{
2554	struct sysmaps *sysmaps;
2555
2556	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2557	mtx_lock(&sysmaps->lock);
2558	if (*sysmaps->CMAP1)
2559		panic("pmap_copy_page: CMAP1 busy");
2560	if (*sysmaps->CMAP2)
2561		panic("pmap_copy_page: CMAP2 busy");
2562	sched_pin();
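	/*
	 * CMAP1 maps the source page read-only and CMAP2 maps the
	 * destination read/write; both windows are private to this
	 * (pinned) CPU.
	 */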
2563	invlpg((u_int)sysmaps->CADDR1);
2564	invlpg((u_int)sysmaps->CADDR2);
2565	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2566	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2567	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2568	*sysmaps->CMAP1 = 0;
2569	*sysmaps->CMAP2 = 0;
2570	sched_unpin();
2571	mtx_unlock(&sysmaps->lock);
2572}
2573
2574/*
2575 * Returns true if the pmap's pv is one of the first
2576 * 16 pvs linked to from this page.  This count may
2577 * be changed upwards or downwards in the future; it
2578 * is only necessary that true be returned for a small
2579 * subset of pmaps for proper page aging.
2580 */
2581boolean_t
2582pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2585{
2586	pv_entry_t pv;
2587	int loops = 0;
2588
2589	if (m->flags & PG_FICTITIOUS)
2590		return FALSE;
2591
2592	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2593	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2594		if (pv->pv_pmap == pmap) {
2595			return TRUE;
2596		}
2597		loops++;
2598		if (loops >= 16)
2599			break;
2600	}
2601	return (FALSE);
2602}
2603
2604#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2605/*
2606 * Remove all pages from the specified address space; this aids
2607 * process exit speeds.  Also, this code is special-cased for
2608 * the current process only, but can have the more generic (and
2609 * slightly slower) mode enabled.  This is much faster than
2610 * pmap_remove in the case of running down an entire address
2611 * space.
2612 */
2613void
2614pmap_remove_pages(pmap_t pmap)
2615{
2616	pt_entry_t *pte, tpte;
2617	vm_page_t m;
2618	pv_entry_t pv, npv;
2619
2620#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2621	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2622		printf("warning: pmap_remove_pages called with non-current pmap\n");
2623		return;
2624	}
2625#endif
2626	vm_page_lock_queues();
2627	PMAP_LOCK(pmap);
2628	sched_pin();
2629	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
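		/*
		 * Walk the pmap's own pv list instead of scanning the
		 * page tables; each pv names one managed mapping to tear
		 * down.  Wired mappings are skipped below.
		 */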
2630
2631#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2632		pte = vtopte(pv->pv_va);
2633#else
2634		pte = pmap_pte_quick(pmap, pv->pv_va);
2635#endif
2636		tpte = *pte;
2637
2638		if (tpte == 0) {
2639			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2640							pte, pv->pv_va);
2641			panic("bad pte");
2642		}
2643
2644/*
2645 * We cannot remove wired pages from a process' mapping at this time
2646 */
2647		if (tpte & PG_W) {
2648			npv = TAILQ_NEXT(pv, pv_plist);
2649			continue;
2650		}
2651
2652		m = PHYS_TO_VM_PAGE(tpte);
2653		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2654		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2655		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2656
2657		KASSERT(m < &vm_page_array[vm_page_array_size],
2658			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2659
2660		pmap->pm_stats.resident_count--;
2661
2662		pte_clear(pte);
2663
2664		/*
2665		 * Update the vm_page_t clean and reference bits.
2666		 */
2667		if (tpte & PG_M) {
2668			vm_page_dirty(m);
2669		}
2670
2671		npv = TAILQ_NEXT(pv, pv_plist);
2672		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2673
2674		m->md.pv_list_count--;
2675		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2676		if (TAILQ_EMPTY(&m->md.pv_list))
2677			vm_page_flag_clear(m, PG_WRITEABLE);
2678
2679		pmap_unuse_pt(pmap, pv->pv_va);
2680		free_pv_entry(pv);
2681	}
2682	sched_unpin();
2683	pmap_invalidate_all(pmap);
2684	PMAP_UNLOCK(pmap);
2685	vm_page_unlock_queues();
2686}
2687
2688/*
2689 *	pmap_is_modified:
2690 *
2691 *	Return whether or not the specified physical page was modified
2692 *	in any physical maps.
2693 */
2694boolean_t
2695pmap_is_modified(vm_page_t m)
2696{
2697	pv_entry_t pv;
2698	pt_entry_t *pte;
2699	boolean_t rv;
2700
2701	rv = FALSE;
2702	if (m->flags & PG_FICTITIOUS)
2703		return (rv);
2704
2705	sched_pin();
2706	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2707	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2708		/*
2709		 * Mappings that are not tracked for modification
2710		 * (see pmap_track_modified()) are never reported
2711		 * as modified.
2712		 */
2713		if (!pmap_track_modified(pv->pv_va))
2714			continue;
2715		PMAP_LOCK(pv->pv_pmap);
2716		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2717		rv = (*pte & PG_M) != 0;
2718		PMAP_UNLOCK(pv->pv_pmap);
2719		if (rv)
2720			break;
2721	}
2722	sched_unpin();
2723	return (rv);
2724}
2725
2726/*
2727 *	pmap_is_prefaultable:
2728 *
2729 *	Return whether or not the specified virtual address is eligible
2730 *	for prefault.
2731 */
2732boolean_t
2733pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2734{
2735	pt_entry_t *pte;
2736	boolean_t rv;
2737
2738	rv = FALSE;
2739	PMAP_LOCK(pmap);
2740	if (*pmap_pde(pmap, addr)) {
2741		pte = vtopte(addr);
2742		rv = *pte == 0;
2743	}
2744	PMAP_UNLOCK(pmap);
2745	return (rv);
2746}
2747
2748/*
2749 *	Clear the given bit in each of the given page's ptes.  The bit is
2750 *	expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
2751 *	size, only a bit within the least significant 32 can be cleared.
2752 */
2753static __inline void
2754pmap_clear_ptes(vm_page_t m, int bit)
2755{
2756	register pv_entry_t pv;
2757	pt_entry_t pbits, *pte;
2758
2759	if ((m->flags & PG_FICTITIOUS) ||
2760	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2761		return;
2762
2763	sched_pin();
2764	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2765	/*
2766	 * Loop over all current mappings, setting/clearing as appropriate.
2767	 * (When setting RO, do we need to clear the VAC?)
2768	 */
2769	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2770		/*
2771		 * don't write protect pager mappings
2772		 */
2773		if (bit == PG_RW) {
2774			if (!pmap_track_modified(pv->pv_va))
2775				continue;
2776		}
2777
2778		PMAP_LOCK(pv->pv_pmap);
2779		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2780retry:
2781		pbits = *pte;
2782		if (pbits & bit) {
2783			if (bit == PG_RW) {
2784				/*
2785				 * Regardless of whether a pte is 32 or 64 bits
2786				 * in size, PG_RW and PG_M are among the least
2787				 * significant 32 bits.
2788				 */
2789				if (!atomic_cmpset_int((u_int *)pte, pbits,
2790				    pbits & ~(PG_RW | PG_M)))
2791					goto retry;
2792				if (pbits & PG_M) {
2793					vm_page_dirty(m);
2794				}
2795			} else {
2796				atomic_clear_int((u_int *)pte, bit);
2797			}
2798			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2799		}
2800		PMAP_UNLOCK(pv->pv_pmap);
2801	}
2802	if (bit == PG_RW)
2803		vm_page_flag_clear(m, PG_WRITEABLE);
2804	sched_unpin();
2805}
2806
2807/*
2808 *      pmap_page_protect:
2809 *
2810 *      Lower the permission for all mappings to a given page.
2811 */
2812void
2813pmap_page_protect(vm_page_t m, vm_prot_t prot)
2814{
2815	if ((prot & VM_PROT_WRITE) == 0) {
2816		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2817			pmap_clear_ptes(m, PG_RW);
2818		} else {
2819			pmap_remove_all(m);
2820		}
2821	}
2822}
2823
2824/*
2825 *	pmap_ts_referenced:
2826 *
2827 *	Return a count of reference bits for a page, clearing those bits.
2828 *	It is not necessary for every reference bit to be cleared, but it
2829 *	is necessary that 0 only be returned when there are truly no
2830 *	reference bits set.
2831 *
2832 *	XXX: The exact number of bits to check and clear is a matter that
2833 *	should be tested and standardized at some point in the future for
2834 *	optimal aging of shared pages.
2835 */
2836int
2837pmap_ts_referenced(vm_page_t m)
2838{
2839	register pv_entry_t pv, pvf, pvn;
2840	pt_entry_t *pte;
2841	pt_entry_t v;
2842	int rtval = 0;
2843
2844	if (m->flags & PG_FICTITIOUS)
2845		return (rtval);
2846
2847	sched_pin();
2848	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2849	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2850
2851		pvf = pv;
2852
2853		do {
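			/*
			 * Each visited pv is rotated to the tail of the
			 * list so that successive calls sample different
			 * mappings of a widely shared page.
			 */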
2854			pvn = TAILQ_NEXT(pv, pv_list);
2855
2856			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2857
2858			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2859
2860			if (!pmap_track_modified(pv->pv_va))
2861				continue;
2862
2863			PMAP_LOCK(pv->pv_pmap);
2864			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2865
2866			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2867				atomic_clear_int((u_int *)pte, PG_A);
2868				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2869
2870				rtval++;
2871				if (rtval > 4) {
2872					PMAP_UNLOCK(pv->pv_pmap);
2873					break;
2874				}
2875			}
2876			PMAP_UNLOCK(pv->pv_pmap);
2877		} while ((pv = pvn) != NULL && pv != pvf);
2878	}
2879	sched_unpin();
2880
2881	return (rtval);
2882}
2883
2884/*
2885 *	Clear the modify bits on the specified physical page.
2886 */
2887void
2888pmap_clear_modify(vm_page_t m)
2889{
2890	pmap_clear_ptes(m, PG_M);
2891}
2892
2893/*
2894 *	pmap_clear_reference:
2895 *
2896 *	Clear the reference bit on the specified physical page.
2897 */
2898void
2899pmap_clear_reference(vm_page_t m)
2900{
2901	pmap_clear_ptes(m, PG_A);
2902}
2903
2904/*
2905 * Miscellaneous support routines follow
2906 */
2907
2908/*
2909 * Map a set of physical memory pages into the kernel virtual
2910 * address space. Return a pointer to where it is mapped. This
2911 * routine is intended to be used for mapping device memory,
2912 * NOT real memory.
2913 */
2914void *
2915pmap_mapdev(vm_paddr_t pa, vm_size_t size)
2918{
2919	vm_offset_t va, tmpva, offset;
2920
2921	offset = pa & PAGE_MASK;
2922	size = roundup(offset + size, PAGE_SIZE);
2923	pa = pa & PG_FRAME;
2924
2925	if (pa < KERNLOAD && pa + size <= KERNLOAD)
2926		va = KERNBASE + pa;
2927	else
2928		va = kmem_alloc_nofault(kernel_map, size);
2929	if (!va)
2930		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2931
2932	for (tmpva = va; size > 0; ) {
2933		pmap_kenter(tmpva, pa);
2934		size -= PAGE_SIZE;
2935		tmpva += PAGE_SIZE;
2936		pa += PAGE_SIZE;
2937	}
2938	pmap_invalidate_range(kernel_pmap, va, tmpva);
2939	return ((void *)(va + offset));
2940}
2941
2942void
2943pmap_unmapdev(vm_offset_t va, vm_size_t size)
2946{
2947	vm_offset_t base, offset, tmpva;
2948
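	/*
	 * Addresses inside the statically mapped kernel range were handed
	 * out by pmap_mapdev() without allocating KVA, so there is
	 * nothing to unmap or free here.
	 */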
2949	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2950		return;
2951	base = va & PG_FRAME;
2952	offset = va & PAGE_MASK;
2953	size = roundup(offset + size, PAGE_SIZE);
2954	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2955		pmap_kremove(tmpva);
2956	pmap_invalidate_range(kernel_pmap, va, tmpva);
2957	kmem_free(kernel_map, base, size);
2958}
2959
2960/*
2961 * Perform the pmap work for mincore(2).
2962 */
2963int
2964pmap_mincore(pmap_t pmap, vm_offset_t addr)
2967{
2968	pt_entry_t *ptep, pte;
2969	vm_page_t m;
2970	int val = 0;
2971
2972	PMAP_LOCK(pmap);
2973	ptep = pmap_pte(pmap, addr);
2974	pte = (ptep != NULL) ? *ptep : 0;
2975	pmap_pte_release(ptep);
2976	PMAP_UNLOCK(pmap);
2977
2978	if (pte != 0) {
2979		vm_paddr_t pa;
2980
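		/*
		 * Translate the pte and vm_page state into mincore(2)
		 * flags: PG_M and PG_A in our pte set both the plain and
		 * the *_OTHER modified/referenced flags; otherwise the
		 * vm_page dirty bits and the other pmaps are consulted
		 * for the *_OTHER variants alone.
		 */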
2981		val = MINCORE_INCORE;
2982		if ((pte & PG_MANAGED) == 0)
2983			return val;
2984
2985		pa = pte & PG_FRAME;
2986
2987		m = PHYS_TO_VM_PAGE(pa);
2988
2989		/*
2990		 * Modified by us
2991		 */
2992		if (pte & PG_M)
2993			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2994		else {
2995			/*
2996			 * Modified by someone else
2997			 */
2998			vm_page_lock_queues();
2999			if (m->dirty || pmap_is_modified(m))
3000				val |= MINCORE_MODIFIED_OTHER;
3001			vm_page_unlock_queues();
3002		}
3003		/*
3004		 * Referenced by us
3005		 */
3006		if (pte & PG_A)
3007			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3008		else {
3009			/*
3010			 * Referenced by someone else
3011			 */
3012			vm_page_lock_queues();
3013			if ((m->flags & PG_REFERENCED) ||
3014			    pmap_ts_referenced(m)) {
3015				val |= MINCORE_REFERENCED_OTHER;
3016				vm_page_flag_set(m, PG_REFERENCED);
3017			}
3018			vm_page_unlock_queues();
3019		}
3020	}
3021	return val;
3022}
3023
3024void
3025pmap_activate(struct thread *td)
3026{
3027	pmap_t	pmap, oldpmap;
3028	u_int32_t  cr3;
3029
3030	critical_enter();
3031	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3032	oldpmap = PCPU_GET(curpmap);
3033#if defined(SMP)
3034	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3035	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3036#else
3037	oldpmap->pm_active &= ~1;
3038	pmap->pm_active |= 1;
3039#endif
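	/*
	 * Switch address spaces: record the new top-level table (the
	 * pdpt under PAE, the page directory otherwise) in the pcb and
	 * load it into %cr3 on this CPU.
	 */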
3040#ifdef PAE
3041	cr3 = vtophys(pmap->pm_pdpt);
3042#else
3043	cr3 = vtophys(pmap->pm_pdir);
3044#endif
3045	/*
3046	 * pmap_activate is for the current thread on the current cpu
3047	 */
3048	td->td_pcb->pcb_cr3 = cr3;
3049	load_cr3(cr3);
3050	PCPU_SET(curpmap, pmap);
3051	critical_exit();
3052}
3053
3054vm_offset_t
3055pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3056{
3057
3058	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3059		return addr;
3060	}
3061
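	/*
	 * Round the hint up to a 4MB boundary so that a sufficiently
	 * large device mapping can be backed by PG_PS superpages (see
	 * pmap_object_init_pt()).
	 */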
3062	addr = (addr + PDRMASK) & ~PDRMASK;
3063	return addr;
3064}
3065
3066
3067#if defined(PMAP_DEBUG)
3068int pmap_pid_dump(int pid)
3069{
3070	pmap_t pmap;
3071	struct proc *p;
3072	int npte = 0;
3073	int index;
3074
3075	sx_slock(&allproc_lock);
3076	LIST_FOREACH(p, &allproc, p_list) {
3077		if (p->p_pid != pid)
3078			continue;
3079
3080		if (p->p_vmspace) {
3081			int i,j;
3082			index = 0;
3083			pmap = vmspace_pmap(p->p_vmspace);
3084			for (i = 0; i < NPDEPTD; i++) {
3085				pd_entry_t *pde;
3086				pt_entry_t *pte;
3087				vm_offset_t base = i << PDRSHIFT;
3088
3089				pde = &pmap->pm_pdir[i];
3090				if (pde && pmap_pde_v(pde)) {
3091					for (j = 0; j < NPTEPG; j++) {
3092						vm_offset_t va = base + (j << PAGE_SHIFT);
3093						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3094							if (index) {
3095								index = 0;
3096								printf("\n");
3097							}
3098							sx_sunlock(&allproc_lock);
3099							return npte;
3100						}
3101						pte = pmap_pte(pmap, va);
3102						if (pte && pmap_pte_v(pte)) {
3103							pt_entry_t pa;
3104							vm_page_t m;
3105							pa = *pte;
3106							m = PHYS_TO_VM_PAGE(pa);
3107							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3108								va, pa, m->hold_count, m->wire_count, m->flags);
3109							npte++;
3110							index++;
3111							if (index >= 2) {
3112								index = 0;
3113								printf("\n");
3114							} else {
3115								printf(" ");
3116							}
3117						}
3118					}
3119				}
3120			}
3121		}
3122	}
3123	sx_sunlock(&allproc_lock);
3124	return npte;
3125}
3126#endif
3127
3128#if defined(DEBUG)
3129
3130static void	pads(pmap_t pm);
3131void		pmap_pvdump(vm_offset_t pa);
3132
3133/* print address space of pmap */
3134static void
3135pads(pmap_t pm)
3137{
3138	int i, j;
3139	vm_paddr_t va;
3140	pt_entry_t *ptep;
3141
3142	if (pm == kernel_pmap)
3143		return;
3144	for (i = 0; i < NPDEPTD; i++)
3145		if (pm->pm_pdir[i])
3146			for (j = 0; j < NPTEPG; j++) {
3147				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3148				if (pm == kernel_pmap && va < KERNBASE)
3149					continue;
3150				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3151					continue;
3152				ptep = pmap_pte(pm, va);
3153				if (pmap_pte_v(ptep))
3154					printf("%x:%x ", va, *ptep);
3155			}
3156
3157}
3158
3159void
3160pmap_pvdump(vm_paddr_t pa)
3162{
3163	pv_entry_t pv;
3164	vm_page_t m;
3165
3166	printf("pa %x", pa);
3167	m = PHYS_TO_VM_PAGE(pa);
3168	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3169		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3170		pads(pv->pv_pmap);
3171	}
3172	printf(" ");
3173}
3174#endif
3175