1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 */
43/*-
44 * Copyright (c) 2003 Networks Associates Technology, Inc.
45 * All rights reserved.
46 *
47 * This software was developed for the FreeBSD Project by Jake Burkholder,
48 * Safeport Network Services, and Network Associates Laboratories, the
49 * Security Research Division of Network Associates, Inc. under
50 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
51 * CHATS research program.
52 *
53 * Redistribution and use in source and binary forms, with or without
54 * modification, are permitted provided that the following conditions
55 * are met:
56 * 1. Redistributions of source code must retain the above copyright
57 *    notice, this list of conditions and the following disclaimer.
58 * 2. Redistributions in binary form must reproduce the above copyright
59 *    notice, this list of conditions and the following disclaimer in the
60 *    documentation and/or other materials provided with the distribution.
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72 * SUCH DAMAGE.
73 */
74
75#include <sys/cdefs.h>
76__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 130386 2004-06-12 20:01:48Z alc $");
77
78/*
79 *	Manages physical address maps.
80 *
81 *	In addition to hardware address maps, this
82 *	module is called upon to provide software-use-only
83 *	maps which may or may not be stored in the same
84 *	form as hardware maps.  These pseudo-maps are
85 *	used to store intermediate results from copy
86 *	operations to and from address spaces.
87 *
88 *	Since the information managed by this module is
89 *	also stored by the logical address mapping module,
90 *	this module may throw away valid virtual-to-physical
91 *	mappings at almost any time.  However, invalidations
92 *	of virtual-to-physical mappings must be done as
93 *	requested.
94 *
95 *	In order to cope with hardware architectures which
96 *	make virtual-to-physical map invalidates expensive,
97 *	this module may delay invalidation or protection-reduction
98 *	operations until such time as they are actually
99 *	necessary.  This module is given full information as
100 *	to which processors are currently using which maps,
101 *	and to when physical maps must be made correct.
102 */
103
104#include "opt_cpu.h"
105#include "opt_pmap.h"
106#include "opt_msgbuf.h"
107#include "opt_kstack_pages.h"
108
109#include <sys/param.h>
110#include <sys/systm.h>
111#include <sys/kernel.h>
112#include <sys/lock.h>
113#include <sys/mman.h>
114#include <sys/msgbuf.h>
115#include <sys/mutex.h>
116#include <sys/proc.h>
117#include <sys/sx.h>
118#include <sys/user.h>
119#include <sys/vmmeter.h>
120#include <sys/sched.h>
121#include <sys/sysctl.h>
122#ifdef SMP
123#include <sys/smp.h>
124#endif
125
126#include <vm/vm.h>
127#include <vm/vm_param.h>
128#include <vm/vm_kern.h>
129#include <vm/vm_page.h>
130#include <vm/vm_map.h>
131#include <vm/vm_object.h>
132#include <vm/vm_extern.h>
133#include <vm/vm_pageout.h>
134#include <vm/vm_pager.h>
135#include <vm/uma.h>
136
137#include <machine/cpu.h>
138#include <machine/cputypes.h>
139#include <machine/md_var.h>
140#include <machine/specialreg.h>
141#ifdef SMP
142#include <machine/smp.h>
143#endif
144
145#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU)
146#define CPU_ENABLE_SSE
147#endif
148#if defined(CPU_DISABLE_SSE)
149#undef CPU_ENABLE_SSE
150#endif
151
152#define PMAP_KEEP_PDIRS
153#ifndef PMAP_SHPGPERPROC
154#define PMAP_SHPGPERPROC 200
155#endif
156
157#if defined(DIAGNOSTIC)
158#define PMAP_DIAGNOSTIC
159#endif
160
161#define MINPV 2048
162
163#if !defined(PMAP_DIAGNOSTIC)
164#define PMAP_INLINE __inline
165#else
166#define PMAP_INLINE
167#endif
168
169/*
170 * Get PDEs and PTEs for user/kernel address space
171 */
172#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
173#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
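/*
 * For example, pmap_pde(pmap, va) evaluates to &pm_pdir[va >> PDRSHIFT],
 * the page directory entry covering va: each entry covers 4MB on
 * non-PAE i386 (PDRSHIFT == 22) and 2MB under PAE (PDRSHIFT == 21).
 */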
174
175#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
176#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
177#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
178#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
179#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
180
181#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
182    atomic_clear_int((u_int *)(pte), PG_W))
183#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
184
185struct pmap kernel_pmap_store;
186LIST_HEAD(pmaplist, pmap);
187static struct pmaplist allpmaps;
188static struct mtx allpmaps_lock;
189#ifdef SMP
190static struct mtx lazypmap_lock;
191#endif
192
193vm_paddr_t avail_end;	/* PA of last available physical page */
194vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
195vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
196static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
197int pgeflag = 0;		/* PG_G or-in */
198int pseflag = 0;		/* PG_PS or-in */
199
200static int nkpt;
201vm_offset_t kernel_vm_end;
202extern u_int32_t KERNend;
203
204#ifdef PAE
205static uma_zone_t pdptzone;
206#endif
207
208/*
209 * Data for the pv entry allocation mechanism
210 */
211static uma_zone_t pvzone;
212static struct vm_object pvzone_obj;
213static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
214int pmap_pagedaemon_waken;
215
216/*
217 * All those kernel PT submaps that BSD is so fond of
218 */
219pt_entry_t *CMAP1 = 0;
220static pt_entry_t *CMAP2, *CMAP3, *ptmmap;
221caddr_t CADDR1 = 0, ptvmmap = 0;
222static caddr_t CADDR2, CADDR3;
223static struct mtx CMAPCADDR12_lock;
224static pt_entry_t *msgbufmap;
225struct msgbuf *msgbufp = 0;
226
227/*
228 * Crashdump maps.
229 */
230static pt_entry_t *pt_crashdumpmap;
231static caddr_t crashdumpmap;
232
233#ifdef SMP
234extern pt_entry_t *SMPpt;
235#endif
236static pt_entry_t *PMAP1 = 0, *PMAP2;
237static pt_entry_t *PADDR1 = 0, *PADDR2;
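/*
 * PMAP1 and PMAP2 point at two kernel PTEs reserved in pmap_bootstrap();
 * PADDR1 and PADDR2 are the virtual addresses those PTEs map (typed as
 * pt_entry_t * so the mapped page table page can be indexed directly).
 * They act as one-page windows for temporarily mapping another pmap's
 * page table pages: pmap_pte() uses PMAP2/PADDR2 under Giant, while
 * pmap_pte_quick() uses PMAP1/PADDR1 under the page queues lock with
 * the thread pinned to its CPU.
 */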
238#ifdef SMP
239static int PMAP1cpu;
240static int PMAP1changedcpu;
241SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
242	   &PMAP1changedcpu, 0,
243	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
244#endif
245static int PMAP1changed;
246SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
247	   &PMAP1changed, 0,
248	   "Number of times pmap_pte_quick changed PMAP1");
249static int PMAP1unchanged;
250SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
251	   &PMAP1unchanged, 0,
252	   "Number of times pmap_pte_quick didn't change PMAP1");
253
254static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
255static pv_entry_t get_pv_entry(void);
256static void	pmap_clear_ptes(vm_page_t m, int bit)
257    __always_inline;
258
259static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
260static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
261static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
262					vm_offset_t va);
263static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
264		vm_page_t mpte, vm_page_t m);
265
266static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
267
268static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex);
269static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
270static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
271static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
272#ifdef PAE
273static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
274#endif
275
276CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
277CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
278
279/*
280 * Move the kernel virtual free pointer to the next
281 * 4MB.  This is used to help improve performance
282 * by using a large (4MB) page for much of the kernel
283 * (.text, .data, .bss)
284 */
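/*
 * With 4MB pages (non-PAE), (addr + PDRMASK) & ~PDRMASK rounds addr up
 * to the next 4MB boundary; e.g. 0xc0345678 becomes 0xc0400000.
 */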
285static vm_offset_t
286pmap_kmem_choose(vm_offset_t addr)
287{
288	vm_offset_t newaddr = addr;
289
290#ifndef DISABLE_PSE
291	if (cpu_feature & CPUID_PSE)
292		newaddr = (addr + PDRMASK) & ~PDRMASK;
293#endif
294	return newaddr;
295}
296
297/*
298 *	Bootstrap the system enough to run with virtual memory.
299 *
300 *	On the i386 this is called after mapping has already been enabled
301 *	and just syncs the pmap module with what has already been done.
302 *	[We can't call it easily with mapping off since the kernel is not
303 *	mapped with PA == VA, hence we would have to relocate every address
304 *	from the linked base (virtual) address "KERNBASE" to the actual
305 *	(physical) address starting relative to 0]
306 */
307void
308pmap_bootstrap(firstaddr, loadaddr)
309	vm_paddr_t firstaddr;
310	vm_paddr_t loadaddr;
311{
312	vm_offset_t va;
313	pt_entry_t *pte;
314	int i;
315
316	/*
317	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
318	 * large. It should instead be correctly calculated in locore.s and
319	 * not based on 'first' (which is a physical address, not a virtual
320	 * address, for the start of unused physical memory). The kernel
321	 * page tables are NOT double mapped and thus should not be included
322	 * in this calculation.
323	 */
324	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
325	virtual_avail = pmap_kmem_choose(virtual_avail);
326
327	virtual_end = VM_MAX_KERNEL_ADDRESS;
328
329	/*
330	 * Initialize the kernel pmap (which is statically allocated).
331	 */
332	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
333#ifdef PAE
334	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
335#endif
336	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
337	TAILQ_INIT(&kernel_pmap->pm_pvlist);
338	LIST_INIT(&allpmaps);
339#ifdef SMP
340	mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN);
341#endif
342	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
343	mtx_lock_spin(&allpmaps_lock);
344	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
345	mtx_unlock_spin(&allpmaps_lock);
346	nkpt = NKPT;
347
348	/*
349	 * Reserve some special page table entries/VA space for temporary
350	 * mapping of pages.
351	 */
352#define	SYSMAP(c, p, v, n)	\
353	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
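/*
 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) expands to
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 * reserving one page of KVA at CADDR1 whose PTE is reachable via CMAP1.
 */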
354
355	va = virtual_avail;
356	pte = vtopte(va);
357
358	/*
359	 * CMAP1/CMAP2 are used for zeroing and copying pages.
360	 * CMAP3 is used for the idle process page zeroing.
361	 */
362	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
363	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
364	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
365	*CMAP3 = 0;
366
367	mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF);
368
369	/*
370	 * Crashdump maps.
371	 */
372	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
373
374	/*
375	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
376	 * XXX ptmmap is not used.
377	 */
378	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
379
380	/*
381	 * msgbufp is used to map the system message buffer.
382	 * XXX msgbufmap is not used.
383	 */
384	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
385	       atop(round_page(MSGBUF_SIZE)))
386
387	/*
388	 * ptemap is used for pmap_pte_quick
389	 */
390	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
391	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
392
393	virtual_avail = va;
394
395	*CMAP1 = *CMAP2 = 0;
396	for (i = 0; i < NKPT; i++)
397		PTD[i] = 0;
398
399	/* Turn on PG_G on kernel page(s) */
400	pmap_set_pg();
401}
402
403/*
404 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
405 */
406void
407pmap_set_pg(void)
408{
409	pd_entry_t pdir;
410	pt_entry_t *pte;
411	vm_offset_t va, endva;
412	int i;
413
414	if (pgeflag == 0)
415		return;
416
417	i = KERNLOAD/NBPDR;
418	endva = KERNBASE + KERNend;
419
420	if (pseflag) {
421		va = KERNBASE + KERNLOAD;
422		while (va  < endva) {
423			pdir = kernel_pmap->pm_pdir[KPTDI+i];
424			pdir |= pgeflag;
425			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
426			invltlb();	/* Play it safe, invltlb() every time */
427			i++;
428			va += NBPDR;
429		}
430	} else {
431		va = (vm_offset_t)btext;
432		while (va < endva) {
433			pte = vtopte(va);
434			if (*pte)
435				*pte |= pgeflag;
436			invltlb();	/* Play it safe, invltlb() every time */
437			va += PAGE_SIZE;
438		}
439	}
440}
441
442#ifdef PAE
443static void *
444pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
445{
446	*flags = UMA_SLAB_PRIV;
447	return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0));
448}
449#endif
450
451/*
452 *	Initialize the pmap module.
453 *	Called by vm_init to initialize any structures that the pmap
454 *	system needs to map virtual memory.
455 *	pmap_init has been enhanced to support discontiguous physical
456 *	memory in a fairly consistent way.
457 */
458void
459pmap_init(void)
460{
461	int i;
462
463	/*
464	 * Allocate memory for random pmap data structures.  Includes the
465	 * pv_head_table.
466	 */
467
468	for(i = 0; i < vm_page_array_size; i++) {
469		vm_page_t m;
470
471		m = &vm_page_array[i];
472		TAILQ_INIT(&m->md.pv_list);
473		m->md.pv_list_count = 0;
474	}
475
476	/*
477	 * init the pv free list
478	 */
479	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
480	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
481	uma_prealloc(pvzone, MINPV);
482
483#ifdef PAE
484	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
485	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
486	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
487	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
488#endif
489
490	/*
491	 * Now it is safe to enable pv_table recording.
492	 */
493	pmap_initialized = TRUE;
494}
495
496/*
497 * Initialize the address space (zone) for the pv_entries.  Set a
498 * high water mark so that the system can recover from excessive
499 * numbers of pv entries.
500 */
501void
502pmap_init2()
503{
504	int shpgperproc = PMAP_SHPGPERPROC;
505
506	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
507	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
508	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
509	pv_entry_high_water = 9 * (pv_entry_max / 10);
510	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
511}
512
513
514/***************************************************
515 * Low level helper routines.....
516 ***************************************************/
517
518#if defined(PMAP_DIAGNOSTIC)
519
520/*
521 * This code checks for pages that are modified but not writable,
522 * which should be an invalid (impossible) condition.
523 */
524static int
525pmap_nw_modified(pt_entry_t ptea)
526{
527	int pte;
528
529	pte = (int) ptea;
530
531	if ((pte & (PG_M|PG_RW)) == PG_M)
532		return 1;
533	else
534		return 0;
535}
536#endif
537
538
539/*
540 * this routine reports whether a virtual address should be tracked
541 * for the modified bit; addresses inside the kernel's "clean" submap
542 * (kmi.clean_sva to kmi.clean_eva) are not.
542 */
543static PMAP_INLINE int
544pmap_track_modified(vm_offset_t va)
545{
546	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
547		return 1;
548	else
549		return 0;
550}
551
552#ifdef I386_CPU
553/*
554 * i386 only has "invalidate everything" and no SMP to worry about.
555 */
556PMAP_INLINE void
557pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
558{
559
560	if (pmap == kernel_pmap || pmap->pm_active)
561		invltlb();
562}
563
564PMAP_INLINE void
565pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
566{
567
568	if (pmap == kernel_pmap || pmap->pm_active)
569		invltlb();
570}
571
572PMAP_INLINE void
573pmap_invalidate_all(pmap_t pmap)
574{
575
576	if (pmap == kernel_pmap || pmap->pm_active)
577		invltlb();
578}
579#else /* !I386_CPU */
580#ifdef SMP
581/*
582 * For SMP, these functions have to use the IPI mechanism for coherence.
583 */
584void
585pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
586{
587	u_int cpumask;
588	u_int other_cpus;
589
590	if (smp_started) {
591		if (!(read_eflags() & PSL_I))
592			panic("%s: interrupts disabled", __func__);
593		mtx_lock_spin(&smp_tlb_mtx);
594	} else
595		critical_enter();
596	/*
597	 * We need to disable interrupt preemption but MUST NOT have
598	 * interrupts disabled here.
599	 * XXX we may need to hold schedlock to get a coherent pm_active
600	 * XXX critical sections disable interrupts again
601	 */
602	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
603		invlpg(va);
604		smp_invlpg(va);
605	} else {
606		cpumask = PCPU_GET(cpumask);
607		other_cpus = PCPU_GET(other_cpus);
608		if (pmap->pm_active & cpumask)
609			invlpg(va);
610		if (pmap->pm_active & other_cpus)
611			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
612	}
613	if (smp_started)
614		mtx_unlock_spin(&smp_tlb_mtx);
615	else
616		critical_exit();
617}
618
619void
620pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
621{
622	u_int cpumask;
623	u_int other_cpus;
624	vm_offset_t addr;
625
626	if (smp_started) {
627		if (!(read_eflags() & PSL_I))
628			panic("%s: interrupts disabled", __func__);
629		mtx_lock_spin(&smp_tlb_mtx);
630	} else
631		critical_enter();
632	/*
633	 * We need to disable interrupt preemption but MUST NOT have
634	 * interrupts disabled here.
635	 * XXX we may need to hold schedlock to get a coherent pm_active
636	 * XXX critical sections disable interrupts again
637	 */
638	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
639		for (addr = sva; addr < eva; addr += PAGE_SIZE)
640			invlpg(addr);
641		smp_invlpg_range(sva, eva);
642	} else {
643		cpumask = PCPU_GET(cpumask);
644		other_cpus = PCPU_GET(other_cpus);
645		if (pmap->pm_active & cpumask)
646			for (addr = sva; addr < eva; addr += PAGE_SIZE)
647				invlpg(addr);
648		if (pmap->pm_active & other_cpus)
649			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
650			    sva, eva);
651	}
652	if (smp_started)
653		mtx_unlock_spin(&smp_tlb_mtx);
654	else
655		critical_exit();
656}
657
658void
659pmap_invalidate_all(pmap_t pmap)
660{
661	u_int cpumask;
662	u_int other_cpus;
663
664	if (smp_started) {
665		if (!(read_eflags() & PSL_I))
666			panic("%s: interrupts disabled", __func__);
667		mtx_lock_spin(&smp_tlb_mtx);
668	} else
669		critical_enter();
670	/*
671	 * We need to disable interrupt preemption but MUST NOT have
672	 * interrupts disabled here.
673	 * XXX we may need to hold schedlock to get a coherent pm_active
674	 * XXX critical sections disable interrupts again
675	 */
676	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
677		invltlb();
678		smp_invltlb();
679	} else {
680		cpumask = PCPU_GET(cpumask);
681		other_cpus = PCPU_GET(other_cpus);
682		if (pmap->pm_active & cpumask)
683			invltlb();
684		if (pmap->pm_active & other_cpus)
685			smp_masked_invltlb(pmap->pm_active & other_cpus);
686	}
687	if (smp_started)
688		mtx_unlock_spin(&smp_tlb_mtx);
689	else
690		critical_exit();
691}
692#else /* !SMP */
693/*
694 * Normal, non-SMP, 486+ invalidation functions.
695 * We inline these within pmap.c for speed.
696 */
697PMAP_INLINE void
698pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
699{
700
701	if (pmap == kernel_pmap || pmap->pm_active)
702		invlpg(va);
703}
704
705PMAP_INLINE void
706pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
707{
708	vm_offset_t addr;
709
710	if (pmap == kernel_pmap || pmap->pm_active)
711		for (addr = sva; addr < eva; addr += PAGE_SIZE)
712			invlpg(addr);
713}
714
715PMAP_INLINE void
716pmap_invalidate_all(pmap_t pmap)
717{
718
719	if (pmap == kernel_pmap || pmap->pm_active)
720		invltlb();
721}
722#endif /* !SMP */
723#endif /* !I386_CPU */
724
725/*
726 * Are we current address space or kernel?  N.B. We return FALSE when
727 * a pmap's page table is in use because a kernel thread is borrowing
728 * it.  The borrowed page table can change spontaneously, making any
729 * dependence on its continued use subject to a race condition.
730 */
731static __inline int
732pmap_is_current(pmap_t pmap)
733{
734
735	return (pmap == kernel_pmap ||
736		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
737	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
738}
739
740/*
741 * If the given pmap is not the current pmap, Giant must be held.
742 */
743pt_entry_t *
744pmap_pte(pmap_t pmap, vm_offset_t va)
745{
746	pd_entry_t newpf;
747	pd_entry_t *pde;
748
749	pde = pmap_pde(pmap, va);
750	if (*pde & PG_PS)
751		return (pde);
752	if (*pde != 0) {
753		/* are we current address space or kernel? */
754		if (pmap_is_current(pmap))
755			return (vtopte(va));
756		GIANT_REQUIRED;
757		newpf = *pde & PG_FRAME;
758		if ((*PMAP2 & PG_FRAME) != newpf) {
759			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
760			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
761		}
762		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
763	}
764	return (0);
765}
766
767static __inline void
768invlcaddr(void *caddr)
769{
770#ifdef I386_CPU
771	invltlb();
772#else
773	invlpg((u_int)caddr);
774#endif
775}
776
777/*
778 * Super fast pmap_pte routine best used when scanning
779 * the pv lists.  This eliminates many coarse-grained
780 * invltlb calls.  Note that many of the pv list
781 * scans are across different pmaps.  It is very wasteful
782 * to do an entire invltlb for checking a single mapping.
783 *
784 * If the given pmap is not the current pmap, vm_page_queue_mtx
785 * must be held and curthread pinned to a CPU.
786 */
787static pt_entry_t *
788pmap_pte_quick(pmap_t pmap, vm_offset_t va)
789{
790	pd_entry_t newpf;
791	pd_entry_t *pde;
792
793	pde = pmap_pde(pmap, va);
794	if (*pde & PG_PS)
795		return (pde);
796	if (*pde != 0) {
797		/* are we current address space or kernel? */
798		if (pmap_is_current(pmap))
799			return (vtopte(va));
800		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
801		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
802		newpf = *pde & PG_FRAME;
803		if ((*PMAP1 & PG_FRAME) != newpf) {
804			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
805#ifdef SMP
806			PMAP1cpu = PCPU_GET(cpuid);
807#endif
808			invlcaddr(PADDR1);
809			PMAP1changed++;
810		} else
811#ifdef SMP
812		if (PMAP1cpu != PCPU_GET(cpuid)) {
813			PMAP1cpu = PCPU_GET(cpuid);
814			invlcaddr(PADDR1);
815			PMAP1changedcpu++;
816		} else
817#endif
818			PMAP1unchanged++;
819		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
820	}
821	return (0);
822}
823
824/*
825 *	Routine:	pmap_extract
826 *	Function:
827 *		Extract the physical page address associated
828 *		with the given map/virtual_address pair.
829 */
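/*
 * For a 4MB (PG_PS) mapping the physical address is computed as
 * (pde & ~PDRMASK) | (va & PDRMASK); e.g. a superpage frame at
 * 0x00800000 with a va offset of 0x123456 yields 0x00923456.
 */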
830vm_paddr_t
831pmap_extract(pmap, va)
832	register pmap_t pmap;
833	vm_offset_t va;
834{
835	vm_paddr_t rtval;
836	pt_entry_t *pte;
837	pd_entry_t pde;
838
839	if (pmap == 0)
840		return 0;
841	pde = pmap->pm_pdir[va >> PDRSHIFT];
842	if (pde != 0) {
843		if ((pde & PG_PS) != 0) {
844			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
845			return rtval;
846		}
847		pte = pmap_pte(pmap, va);
848		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
849		return rtval;
850	}
851	return 0;
852
853}
854
855/*
856 *	Routine:	pmap_extract_and_hold
857 *	Function:
858 *		Atomically extract and hold the physical page
859 *		with the given pmap and virtual address pair
860 *		if that mapping permits the given protection.
861 */
862vm_page_t
863pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
864{
865	vm_paddr_t pa;
866	vm_page_t m;
867
868	m = NULL;
869	mtx_lock(&Giant);
870	if ((pa = pmap_extract(pmap, va)) != 0) {
871		m = PHYS_TO_VM_PAGE(pa);
872		vm_page_lock_queues();
873		vm_page_hold(m);
874		vm_page_unlock_queues();
875	}
876	mtx_unlock(&Giant);
877	return (m);
878}
879
880/***************************************************
881 * Low level mapping routines.....
882 ***************************************************/
883
884/*
885 * Add a wired page to the kva.
886 * Note: not SMP coherent.
887 */
888PMAP_INLINE void
889pmap_kenter(vm_offset_t va, vm_paddr_t pa)
890{
891	pt_entry_t *pte;
892
893	pte = vtopte(va);
894	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
895}
896
897/*
898 * Remove a page from the kernel pagetables.
899 * Note: not SMP coherent.
900 */
901PMAP_INLINE void
902pmap_kremove(vm_offset_t va)
903{
904	pt_entry_t *pte;
905
906	pte = vtopte(va);
907	pte_clear(pte);
908}
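/*
 * Because pmap_kenter() and pmap_kremove() do not perform TLB
 * shootdowns, callers that need SMP-coherent kernel mappings should
 * either invalidate explicitly (pmap_invalidate_page/range) or use
 * pmap_qenter()/pmap_qremove() below, which issue a ranged shootdown.
 */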
909
910/*
911 *	Used to map a range of physical addresses into kernel
912 *	virtual address space.
913 *
914 *	The value passed in '*virt' is a suggested virtual address for
915 *	the mapping. Architectures which can support a direct-mapped
916 *	physical to virtual region can return the appropriate address
917 *	within that region, leaving '*virt' unchanged. Other
918 *	architectures should map the pages starting at '*virt' and
919 *	update '*virt' with the first usable address after the mapped
920 *	region.
921 */
922vm_offset_t
923pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
924{
925	vm_offset_t va, sva;
926
927	va = sva = *virt;
928	while (start < end) {
929		pmap_kenter(va, start);
930		va += PAGE_SIZE;
931		start += PAGE_SIZE;
932	}
933	pmap_invalidate_range(kernel_pmap, sva, va);
934	*virt = va;
935	return (sva);
936}
937
938
939/*
940 * Add a list of wired pages to the kva.  This routine is only used
941 * for temporary kernel mappings that do not need to have page
942 * modification or references recorded.
944 * Note that old mappings are simply written
945 * over.  The page *must* be wired.
946 * Note: SMP coherent.  Uses a ranged shootdown IPI.
947 */
948void
949pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
950{
951	vm_offset_t va;
952
953	va = sva;
954	while (count-- > 0) {
955		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
956		va += PAGE_SIZE;
957		m++;
958	}
959	pmap_invalidate_range(kernel_pmap, sva, va);
960}
961
962/*
963 * This routine tears out page mappings from the
964 * kernel -- it is meant only for temporary mappings.
965 * Note: SMP coherent.  Uses a ranged shootdown IPI.
966 */
967void
968pmap_qremove(vm_offset_t sva, int count)
969{
970	vm_offset_t va;
971
972	va = sva;
973	while (count-- > 0) {
974		pmap_kremove(va);
975		va += PAGE_SIZE;
976	}
977	pmap_invalidate_range(kernel_pmap, sva, va);
978}
979
980/***************************************************
981 * Page table page management routines.....
982 ***************************************************/
983
984/*
985 * This routine unholds page table pages, and if the hold count
986 * drops to zero, then it decrements the wire count.
987 */
988static int
989_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
990{
991
992	while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
993		vm_page_lock_queues();
994
995	if (m->hold_count == 0) {
996		vm_offset_t pteva;
997		/*
998		 * unmap the page table page
999		 */
1000		pmap->pm_pdir[m->pindex] = 0;
1001		--pmap->pm_stats.resident_count;
1002		/*
1003		 * We never unwire a kernel page table page, making a
1004		 * check for the kernel_pmap unnecessary.
1005		 */
1006		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) {
1007			/*
1008			 * Do an invltlb to make the invalidated mapping
1009			 * take effect immediately.
1010			 */
1011			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1012			pmap_invalidate_page(pmap, pteva);
1013		}
1014
1015		/*
1016		 * If the page is finally unwired, simply free it.
1017		 */
1018		--m->wire_count;
1019		if (m->wire_count == 0) {
1020			vm_page_busy(m);
1021			vm_page_free_zero(m);
1022			atomic_subtract_int(&cnt.v_wire_count, 1);
1023		}
1024		return 1;
1025	}
1026	return 0;
1027}
1028
1029static PMAP_INLINE int
1030pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1031{
1032	vm_page_unhold(m);
1033	if (m->hold_count == 0)
1034		return _pmap_unwire_pte_hold(pmap, m);
1035	else
1036		return 0;
1037}
1038
1039/*
1040 * After removing a page table entry, this routine is used to
1041 * conditionally free the page, and manage the hold/wire counts.
1042 */
1043static int
1044pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1045{
1046
1047	if (va >= VM_MAXUSER_ADDRESS)
1048		return 0;
1049
1050	return pmap_unwire_pte_hold(pmap, mpte);
1051}
1052
1053void
1054pmap_pinit0(pmap)
1055	struct pmap *pmap;
1056{
1057
1058	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1059#ifdef PAE
1060	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1061#endif
1062	pmap->pm_active = 0;
1063	PCPU_SET(curpmap, pmap);
1064	TAILQ_INIT(&pmap->pm_pvlist);
1065	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1066	mtx_lock_spin(&allpmaps_lock);
1067	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1068	mtx_unlock_spin(&allpmaps_lock);
1069}
1070
1071/*
1072 * Initialize a preallocated and zeroed pmap structure,
1073 * such as one in a vmspace structure.
1074 */
1075void
1076pmap_pinit(pmap)
1077	register struct pmap *pmap;
1078{
1079	vm_page_t m, ptdpg[NPGPTD];
1080	vm_paddr_t pa;
1081	static int color;
1082	int i;
1083
1084	/*
1085	 * No need to allocate page table space yet but we do need a valid
1086	 * page directory table.
1087	 */
1088	if (pmap->pm_pdir == NULL) {
1089		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1090		    NBPTD);
1091#ifdef PAE
1092		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1093		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1094		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1095		    ("pmap_pinit: pdpt misaligned"));
1096		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1097		    ("pmap_pinit: pdpt above 4g"));
1098#endif
1099	}
1100
1101	/*
1102	 * allocate the page directory page(s)
1103	 */
1104	for (i = 0; i < NPGPTD;) {
1105		m = vm_page_alloc(NULL, color++,
1106		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1107		    VM_ALLOC_ZERO);
1108		if (m == NULL)
1109			VM_WAIT;
1110		else {
1111			ptdpg[i++] = m;
1112		}
1113	}
1114
1115	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1116
1117	for (i = 0; i < NPGPTD; i++) {
1118		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1119			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1120	}
1121
1122	mtx_lock_spin(&allpmaps_lock);
1123	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1124	mtx_unlock_spin(&allpmaps_lock);
1125	/* Wire in kernel global address entries. */
1126	/* XXX copies current process, does not fill in MPPTDI */
1127	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1128#ifdef SMP
1129	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1130#endif
1131
1132	/* install self-referential address mapping entry(s) */
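	/*
	 * Mapping the page directory page(s) into themselves at PTDPTDI
	 * makes every page table page of this pmap appear within a fixed
	 * virtual address range whenever the pmap is active; vtopte()
	 * and the PTmap rely on this recursive mapping.
	 */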
1133	for (i = 0; i < NPGPTD; i++) {
1134		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1135		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1136#ifdef PAE
1137		pmap->pm_pdpt[i] = pa | PG_V;
1138#endif
1139	}
1140
1141	pmap->pm_active = 0;
1142	TAILQ_INIT(&pmap->pm_pvlist);
1143	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1144}
1145
1146/*
1147 * this routine is called if the needed page table page is not
1148 * already mapped; it allocates and installs a new one.
1149 */
1150static vm_page_t
1151_pmap_allocpte(pmap, ptepindex)
1152	pmap_t	pmap;
1153	unsigned ptepindex;
1154{
1155	vm_paddr_t ptepa;
1156	vm_page_t m;
1157
1158	/*
1159	 * Allocate a page table page.
1160	 */
1161	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1162	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1163		VM_WAIT;
1164		/*
1165		 * Indicate the need to retry.  While waiting, the page table
1166		 * page may have been allocated.
1167		 */
1168		return (NULL);
1169	}
1170	if ((m->flags & PG_ZERO) == 0)
1171		pmap_zero_page(m);
1172
1173	KASSERT(m->queue == PQ_NONE,
1174		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1175
1176	/*
1177	 * Increment the hold count for the page table page
1178	 * (denoting a new mapping.)
1179	 */
1180	m->hold_count++;
1181
1182	/*
1183	 * Map the pagetable page into the process address space, if
1184	 * it isn't already there.
1185	 */
1186
1187	pmap->pm_stats.resident_count++;
1188
1189	ptepa = VM_PAGE_TO_PHYS(m);
1190	pmap->pm_pdir[ptepindex] =
1191		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1192
1193	vm_page_lock_queues();
1194	vm_page_wakeup(m);
1195	vm_page_unlock_queues();
1196
1197	return m;
1198}
1199
1200static vm_page_t
1201pmap_allocpte(pmap_t pmap, vm_offset_t va)
1202{
1203	unsigned ptepindex;
1204	pd_entry_t ptepa;
1205	vm_page_t m;
1206
1207	/*
1208	 * Calculate pagetable page index
1209	 */
1210	ptepindex = va >> PDRSHIFT;
1211retry:
1212	/*
1213	 * Get the page directory entry
1214	 */
1215	ptepa = pmap->pm_pdir[ptepindex];
1216
1217	/*
1218	 * This supports switching from a 4MB page to a
1219	 * normal 4K page.
1220	 */
1221	if (ptepa & PG_PS) {
1222		pmap->pm_pdir[ptepindex] = 0;
1223		ptepa = 0;
1224		pmap_invalidate_all(kernel_pmap);
1225	}
1226
1227	/*
1228	 * If the page table page is mapped, we just increment the
1229	 * hold count, and activate it.
1230	 */
1231	if (ptepa) {
1232		m = PHYS_TO_VM_PAGE(ptepa);
1233		m->hold_count++;
1234	} else {
1235		/*
1236		 * Here if the pte page isn't mapped, or if it has
1237		 * been deallocated.
1238		 */
1239		m = _pmap_allocpte(pmap, ptepindex);
1240		if (m == NULL)
1241			goto retry;
1242	}
1243	return (m);
1244}
1245
1246
1247/***************************************************
1248 * Pmap allocation/deallocation routines.
1249 ***************************************************/
1250
1251#ifdef SMP
1252/*
1253 * Deal with a SMP shootdown of other users of the pmap that we are
1254 * trying to dispose of.  This can be a bit hairy.
1255 */
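/*
 * lazyptd holds the %cr3 value of the pmap being torn down, lazymask
 * points at its pm_active bitmask, and lazywait is the flag a targeted
 * CPU sets once it has processed the IPI (switching back to its own
 * page tables if it was still using the old ones).  pmap_lazyfix()
 * below deals with the remaining CPUs one at a time.
 */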
1256static u_int *lazymask;
1257static u_int lazyptd;
1258static volatile u_int lazywait;
1259
1260void pmap_lazyfix_action(void);
1261
1262void
1263pmap_lazyfix_action(void)
1264{
1265	u_int mymask = PCPU_GET(cpumask);
1266
1267	if (rcr3() == lazyptd)
1268		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1269	atomic_clear_int(lazymask, mymask);
1270	atomic_store_rel_int(&lazywait, 1);
1271}
1272
1273static void
1274pmap_lazyfix_self(u_int mymask)
1275{
1276
1277	if (rcr3() == lazyptd)
1278		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1279	atomic_clear_int(lazymask, mymask);
1280}
1281
1282
1283static void
1284pmap_lazyfix(pmap_t pmap)
1285{
1286	u_int mymask = PCPU_GET(cpumask);
1287	u_int mask;
1288	register u_int spins;
1289
1290	while ((mask = pmap->pm_active) != 0) {
1291		spins = 50000000;
1292		mask = mask & -mask;	/* Find least significant set bit */
1293		mtx_lock_spin(&lazypmap_lock);
1294#ifdef PAE
1295		lazyptd = vtophys(pmap->pm_pdpt);
1296#else
1297		lazyptd = vtophys(pmap->pm_pdir);
1298#endif
1299		if (mask == mymask) {
1300			lazymask = &pmap->pm_active;
1301			pmap_lazyfix_self(mymask);
1302		} else {
1303			atomic_store_rel_int((u_int *)&lazymask,
1304			    (u_int)&pmap->pm_active);
1305			atomic_store_rel_int(&lazywait, 0);
1306			ipi_selected(mask, IPI_LAZYPMAP);
1307			while (lazywait == 0) {
1308				ia32_pause();
1309				if (--spins == 0)
1310					break;
1311			}
1312		}
1313		mtx_unlock_spin(&lazypmap_lock);
1314		if (spins == 0)
1315			printf("pmap_lazyfix: spun for 50000000\n");
1316	}
1317}
1318
1319#else	/* SMP */
1320
1321/*
1322 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1323 * unlikely to have to even execute this code, including the fact
1324 * that the cleanup is deferred until the parent does a wait(2), which
1325 * means that another userland process has run.
1326 */
1327static void
1328pmap_lazyfix(pmap_t pmap)
1329{
1330	u_int cr3;
1331
1332	cr3 = vtophys(pmap->pm_pdir);
1333	if (cr3 == rcr3()) {
1334		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1335		pmap->pm_active &= ~(PCPU_GET(cpumask));
1336	}
1337}
1338#endif	/* SMP */
1339
1340/*
1341 * Release any resources held by the given physical map.
1342 * Called when a pmap initialized by pmap_pinit is being released.
1343 * Should only be called if the map contains no valid mappings.
1344 */
1345void
1346pmap_release(pmap_t pmap)
1347{
1348	vm_page_t m, ptdpg[NPGPTD];
1349	int i;
1350
1351	KASSERT(pmap->pm_stats.resident_count == 0,
1352	    ("pmap_release: pmap resident count %ld != 0",
1353	    pmap->pm_stats.resident_count));
1354
1355	pmap_lazyfix(pmap);
1356	mtx_lock_spin(&allpmaps_lock);
1357	LIST_REMOVE(pmap, pm_list);
1358	mtx_unlock_spin(&allpmaps_lock);
1359
1360	for (i = 0; i < NPGPTD; i++)
1361		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1362
1363	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1364	    sizeof(*pmap->pm_pdir));
1365#ifdef SMP
1366	pmap->pm_pdir[MPPTDI] = 0;
1367#endif
1368
1369	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1370
1371	vm_page_lock_queues();
1372	for (i = 0; i < NPGPTD; i++) {
1373		m = ptdpg[i];
1374#ifdef PAE
1375		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1376		    ("pmap_release: got wrong ptd page"));
1377#endif
1378		m->wire_count--;
1379		atomic_subtract_int(&cnt.v_wire_count, 1);
1380		vm_page_free_zero(m);
1381	}
1382	vm_page_unlock_queues();
1383}
1384
1385static int
1386kvm_size(SYSCTL_HANDLER_ARGS)
1387{
1388	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1389
1390	return sysctl_handle_long(oidp, &ksize, 0, req);
1391}
1392SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1393    0, 0, kvm_size, "IU", "Size of KVM");
1394
1395static int
1396kvm_free(SYSCTL_HANDLER_ARGS)
1397{
1398	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1399
1400	return sysctl_handle_long(oidp, &kfree, 0, req);
1401}
1402SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1403    0, 0, kvm_free, "IU", "Amount of KVM free");
1404
1405/*
1406 * grow the number of kernel page table entries, if needed
1407 */
1408void
1409pmap_growkernel(vm_offset_t addr)
1410{
1411	struct pmap *pmap;
1412	int s;
1413	vm_paddr_t ptppaddr;
1414	vm_page_t nkpg;
1415	pd_entry_t newpdir;
1416	pt_entry_t *pde;
1417
1418	s = splhigh();
1419	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1420	if (kernel_vm_end == 0) {
1421		kernel_vm_end = KERNBASE;
1422		nkpt = 0;
1423		while (pdir_pde(PTD, kernel_vm_end)) {
1424			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1425			nkpt++;
1426		}
1427	}
1428	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1429	while (kernel_vm_end < addr) {
1430		if (pdir_pde(PTD, kernel_vm_end)) {
1431			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1432			continue;
1433		}
1434
1435		/*
1436		 * This index is bogus, but out of the way
1437		 */
1438		nkpg = vm_page_alloc(NULL, nkpt,
1439		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1440		if (!nkpg)
1441			panic("pmap_growkernel: no memory to grow kernel");
1442
1443		nkpt++;
1444
1445		pmap_zero_page(nkpg);
1446		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1447		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1448		pdir_pde(PTD, kernel_vm_end) = newpdir;
1449
1450		mtx_lock_spin(&allpmaps_lock);
1451		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1452			pde = pmap_pde(pmap, kernel_vm_end);
1453			pde_store(pde, newpdir);
1454		}
1455		mtx_unlock_spin(&allpmaps_lock);
1456		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1457	}
1458	splx(s);
1459}
1460
1461
1462/***************************************************
1463 * page management routines.
1464 ***************************************************/
1465
1466/*
1467 * free the pv_entry back to the free list
1468 */
1469static PMAP_INLINE void
1470free_pv_entry(pv_entry_t pv)
1471{
1472	pv_entry_count--;
1473	uma_zfree(pvzone, pv);
1474}
1475
1476/*
1477 * get a new pv_entry, allocating a block from the system
1478 * when needed.
1479 * the memory allocation is performed bypassing the malloc code
1480 * because of the possibility of allocations at interrupt time.
1481 */
1482static pv_entry_t
1483get_pv_entry(void)
1484{
1485	pv_entry_count++;
1486	if (pv_entry_high_water &&
1487		(pv_entry_count > pv_entry_high_water) &&
1488		(pmap_pagedaemon_waken == 0)) {
1489		pmap_pagedaemon_waken = 1;
1490		wakeup (&vm_pages_needed);
1491	}
1492	return uma_zalloc(pvzone, M_NOWAIT);
1493}
1494
1495/*
1496 * Remove the pv entry for (pmap, va).  We search whichever pv list
1497 * is likely to be shorter -- the page's or the pmap's -- unlink the
1498 * entry from both lists, and free the now unused entry.
1500 */
1501
1502static int
1503pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1504{
1505	pv_entry_t pv;
1506	int rtval;
1507	int s;
1508
1509	s = splvm();
1510	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1511	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1512		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1513			if (pmap == pv->pv_pmap && va == pv->pv_va)
1514				break;
1515		}
1516	} else {
1517		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1518			if (va == pv->pv_va)
1519				break;
1520		}
1521	}
1522
1523	rtval = 0;
1524	if (pv) {
1525		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1526		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1527		m->md.pv_list_count--;
1528		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1529			vm_page_flag_clear(m, PG_WRITEABLE);
1530
1531		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1532		free_pv_entry(pv);
1533	}
1534
1535	splx(s);
1536	return rtval;
1537}
1538
1539/*
1540 * Create a pv entry for page at pa for
1541 * (pmap, va).
1542 */
1543static void
1544pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1545{
1546
1547	int s;
1548	pv_entry_t pv;
1549
1550	s = splvm();
1551	pv = get_pv_entry();
1552	pv->pv_va = va;
1553	pv->pv_pmap = pmap;
1554	pv->pv_ptem = mpte;
1555
1556	vm_page_lock_queues();
1557	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1558	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1559	m->md.pv_list_count++;
1560
1561	vm_page_unlock_queues();
1562	splx(s);
1563}
1564
1565/*
1566 * pmap_remove_pte: do the things to unmap a page in a process
1567 */
1568static int
1569pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1570{
1571	pt_entry_t oldpte;
1572	vm_page_t m, mpte;
1573
1574	oldpte = pte_load_clear(ptq);
1575	if (oldpte & PG_W)
1576		pmap->pm_stats.wired_count -= 1;
1577	/*
1578	 * Machines that don't support invlpg, also don't support
1579	 * PG_G.
1580	 */
1581	if (oldpte & PG_G)
1582		pmap_invalidate_page(kernel_pmap, va);
1583	pmap->pm_stats.resident_count -= 1;
1584	if (oldpte & PG_MANAGED) {
1585		m = PHYS_TO_VM_PAGE(oldpte);
1586		if (oldpte & PG_M) {
1587#if defined(PMAP_DIAGNOSTIC)
1588			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1589				printf(
1590	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1591				    va, oldpte);
1592			}
1593#endif
1594			if (pmap_track_modified(va))
1595				vm_page_dirty(m);
1596		}
1597		if (oldpte & PG_A)
1598			vm_page_flag_set(m, PG_REFERENCED);
1599		return pmap_remove_entry(pmap, m, va);
1600	} else {
1601		mpte = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va));
1602		return pmap_unuse_pt(pmap, va, mpte);
1603	}
1604}
1605
1606/*
1607 * Remove a single page from a process address space
1608 */
1609static void
1610pmap_remove_page(pmap_t pmap, vm_offset_t va)
1611{
1612	pt_entry_t *pte;
1613
1614	if ((pte = pmap_pte(pmap, va)) == NULL || *pte == 0)
1615		return;
1616	pmap_remove_pte(pmap, pte, va);
1617	pmap_invalidate_page(pmap, va);
1618}
1619
1620/*
1621 *	Remove the given range of addresses from the specified map.
1622 *
1623 *	It is assumed that the start and end are properly
1624 *	rounded to the page size.
1625 */
1626void
1627pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1628{
1629	vm_offset_t pdnxt;
1630	pd_entry_t ptpaddr;
1631	pt_entry_t *pte;
1632	int anyvalid;
1633
1634	if (pmap == NULL)
1635		return;
1636
1637	if (pmap->pm_stats.resident_count == 0)
1638		return;
1639
1640	/*
1641	 * Special handling for removing a single page: a very common
1642	 * operation for which we can easily short-circuit some code.
1644	 */
1645	if ((sva + PAGE_SIZE == eva) &&
1646	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1647		pmap_remove_page(pmap, sva);
1648		return;
1649	}
1650
1651	anyvalid = 0;
1652
1653	for (; sva < eva; sva = pdnxt) {
1654		unsigned pdirindex;
1655
1656		/*
1657		 * Calculate index for next page table.
1658		 */
1659		pdnxt = (sva + NBPDR) & ~PDRMASK;
1660		if (pmap->pm_stats.resident_count == 0)
1661			break;
1662
1663		pdirindex = sva >> PDRSHIFT;
1664		ptpaddr = pmap->pm_pdir[pdirindex];
1665
1666		/*
1667		 * Weed out invalid mappings. Note: we assume that the page
1668		 * directory table is always allocated, and in kernel virtual.
1669		 */
1670		if (ptpaddr == 0)
1671			continue;
1672
1673		/*
1674		 * Check for large page.
1675		 */
1676		if ((ptpaddr & PG_PS) != 0) {
1677			pmap->pm_pdir[pdirindex] = 0;
1678			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1679			anyvalid = 1;
1680			continue;
1681		}
1682
1683		/*
1684		 * Limit our scan to either the end of the va represented
1685		 * by the current page table page, or to the end of the
1686		 * range being removed.
1687		 */
1688		if (pdnxt > eva)
1689			pdnxt = eva;
1690
1691		for (; sva != pdnxt; sva += PAGE_SIZE) {
1692			if ((pte = pmap_pte(pmap, sva)) == NULL ||
1693			    *pte == 0)
1694				continue;
1695			anyvalid = 1;
1696			if (pmap_remove_pte(pmap, pte, sva))
1697				break;
1698		}
1699	}
1700
1701	if (anyvalid)
1702		pmap_invalidate_all(pmap);
1703}
1704
1705/*
1706 *	Routine:	pmap_remove_all
1707 *	Function:
1708 *		Removes this physical page from
1709 *		all physical maps in which it resides.
1710 *		Reflects back modify bits to the pager.
1711 *
1712 *	Notes:
1713 *		Original versions of this routine were very
1714 *		inefficient because they iteratively called
1715 *		pmap_remove (slow...)
1716 */
1717
1718void
1719pmap_remove_all(vm_page_t m)
1720{
1721	register pv_entry_t pv;
1722	pt_entry_t *pte, tpte;
1723	int s;
1724
1725#if defined(PMAP_DIAGNOSTIC)
1726	/*
1727	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1728	 */
1729	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1730		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1731		    VM_PAGE_TO_PHYS(m));
1732	}
1733#endif
1734	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1735	s = splvm();
1736	sched_pin();
1737	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1738		pv->pv_pmap->pm_stats.resident_count--;
1739		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1740		tpte = pte_load_clear(pte);
1741		if (tpte & PG_W)
1742			pv->pv_pmap->pm_stats.wired_count--;
1743		if (tpte & PG_A)
1744			vm_page_flag_set(m, PG_REFERENCED);
1745
1746		/*
1747		 * Update the vm_page_t clean and reference bits.
1748		 */
1749		if (tpte & PG_M) {
1750#if defined(PMAP_DIAGNOSTIC)
1751			if (pmap_nw_modified((pt_entry_t) tpte)) {
1752				printf(
1753	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1754				    pv->pv_va, tpte);
1755			}
1756#endif
1757			if (pmap_track_modified(pv->pv_va))
1758				vm_page_dirty(m);
1759		}
1760		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1761		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1762		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1763		m->md.pv_list_count--;
1764		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1765		free_pv_entry(pv);
1766	}
1767	vm_page_flag_clear(m, PG_WRITEABLE);
1768	sched_unpin();
1769	splx(s);
1770}
1771
1772/*
1773 *	Set the physical protection on the
1774 *	specified range of this map as requested.
1775 */
1776void
1777pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1778{
1779	vm_offset_t pdnxt;
1780	pd_entry_t ptpaddr;
1781	int anychanged;
1782
1783	if (pmap == NULL)
1784		return;
1785
1786	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1787		pmap_remove(pmap, sva, eva);
1788		return;
1789	}
1790
1791	if (prot & VM_PROT_WRITE)
1792		return;
1793
1794	anychanged = 0;
1795
1796	for (; sva < eva; sva = pdnxt) {
1797		unsigned pdirindex;
1798
1799		pdnxt = (sva + NBPDR) & ~PDRMASK;
1800
1801		pdirindex = sva >> PDRSHIFT;
1802		ptpaddr = pmap->pm_pdir[pdirindex];
1803
1804		/*
1805		 * Weed out invalid mappings. Note: we assume that the page
1806		 * directory table is always allocated, and in kernel virtual.
1807		 */
1808		if (ptpaddr == 0)
1809			continue;
1810
1811		/*
1812		 * Check for large page.
1813		 */
1814		if ((ptpaddr & PG_PS) != 0) {
1815			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1816			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1817			anychanged = 1;
1818			continue;
1819		}
1820
1821		if (pdnxt > eva)
1822			pdnxt = eva;
1823
1824		for (; sva != pdnxt; sva += PAGE_SIZE) {
1825			pt_entry_t pbits;
1826			pt_entry_t *pte;
1827			vm_page_t m;
1828
1829			if ((pte = pmap_pte(pmap, sva)) == NULL)
1830				continue;
1831			pbits = *pte;
1832			if (pbits & PG_MANAGED) {
1833				m = NULL;
1834				if (pbits & PG_A) {
1835					m = PHYS_TO_VM_PAGE(pbits);
1836					vm_page_flag_set(m, PG_REFERENCED);
1837					pbits &= ~PG_A;
1838				}
1839				if ((pbits & PG_M) != 0 &&
1840				    pmap_track_modified(sva)) {
1841					if (m == NULL)
1842						m = PHYS_TO_VM_PAGE(pbits);
1843					vm_page_dirty(m);
1844					pbits &= ~PG_M;
1845				}
1846			}
1847
1848			pbits &= ~PG_RW;
1849
1850			if (pbits != *pte) {
1851				pte_store(pte, pbits);
1852				anychanged = 1;
1853			}
1854		}
1855	}
1856	if (anychanged)
1857		pmap_invalidate_all(pmap);
1858}
1859
1860/*
1861 *	Insert the given physical page (p) at
1862 *	the specified virtual address (v) in the
1863 *	target physical map with the protection requested.
1864 *
1865 *	If specified, the page will be wired down, meaning
1866 *	that the related pte can not be reclaimed.
1867 *
1868 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1869 *	or lose information.  That is, this routine must actually
1870 *	insert this page into the given map NOW.
1871 */
1872void
1873pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1874	   boolean_t wired)
1875{
1876	vm_paddr_t pa;
1877	register pt_entry_t *pte;
1878	vm_paddr_t opa;
1879	pt_entry_t origpte, newpte;
1880	vm_page_t mpte;
1881
1882	if (pmap == NULL)
1883		return;
1884
1885	va &= PG_FRAME;
1886#ifdef PMAP_DIAGNOSTIC
1887	if (va > VM_MAX_KERNEL_ADDRESS)
1888		panic("pmap_enter: toobig");
1889	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1890		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1891#endif
1892
1893	mpte = NULL;
1894	/*
1895	 * In the case that a page table page is not
1896	 * resident, we are creating it here.
1897	 */
1898	if (va < VM_MAXUSER_ADDRESS) {
1899		mpte = pmap_allocpte(pmap, va);
1900	}
1901#if 0 && defined(PMAP_DIAGNOSTIC)
1902	else {
1903		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1904		origpte = *pdeaddr;
1905		if ((origpte & PG_V) == 0) {
1906			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
1907				pmap->pm_pdir[PTDPTDI], origpte, va);
1908		}
1909	}
1910#endif
1911
1912	pte = pmap_pte(pmap, va);
1913
1914	/*
1915	 * Page Directory table entry not valid, we need a new PT page
1916	 */
1917	if (pte == NULL) {
1918		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
1919			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
1920	}
1921
1922	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
1923	origpte = *pte;
1924	opa = origpte & PG_FRAME;
1925
1926	if (origpte & PG_PS) {
1927		/*
1928		 * Yes, I know this will truncate upper address bits for PAE,
1929		 * but I'm actually more interested in the lower bits
1930		 */
1931		printf("pmap_enter: va %p, pte %p, origpte %p\n",
1932		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
1933		panic("pmap_enter: attempted pmap_enter on 4MB page");
1934	}
1935
1936	/*
1937	 * Mapping has not changed, must be protection or wiring change.
1938	 */
1939	if (origpte && (opa == pa)) {
1940		/*
1941		 * Wiring change, just update stats. We don't worry about
1942		 * wiring PT pages as they remain resident as long as there
1943		 * are valid mappings in them. Hence, if a user page is wired,
1944		 * the PT page will be also.
1945		 */
1946		if (wired && ((origpte & PG_W) == 0))
1947			pmap->pm_stats.wired_count++;
1948		else if (!wired && (origpte & PG_W))
1949			pmap->pm_stats.wired_count--;
1950
1951#if defined(PMAP_DIAGNOSTIC)
1952		if (pmap_nw_modified((pt_entry_t) origpte)) {
1953			printf(
1954	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
1955			    va, origpte);
1956		}
1957#endif
1958
1959		/*
1960		 * Remove extra pte reference
1961		 */
1962		if (mpte)
1963			mpte->hold_count--;
1964
1965		/*
1966		 * We might be turning off write access to the page,
1967		 * so we go ahead and sense modify status.
1968		 */
1969		if (origpte & PG_MANAGED) {
1970			if ((origpte & PG_M) && pmap_track_modified(va)) {
1971				vm_page_t om;
1972				om = PHYS_TO_VM_PAGE(opa);
1973				vm_page_dirty(om);
1974			}
1975			pa |= PG_MANAGED;
1976		}
1977		goto validate;
1978	}
1979	/*
1980	 * Mapping has changed, invalidate old range and fall through to
1981	 * handle validating new mapping.
1982	 */
1983	if (opa) {
1984		int err;
1985		vm_page_lock_queues();
1986		err = pmap_remove_pte(pmap, pte, va);
1987		vm_page_unlock_queues();
1988		if (err)
1989			panic("pmap_enter: pte vanished, va: 0x%x", va);
1990	}
1991
1992	/*
1993	 * Enter on the PV list if part of our managed memory. Note that we
1994	 * raise IPL while manipulating pv_table since pmap_enter can be
1995	 * called at interrupt time.
1996	 */
1997	if (pmap_initialized &&
1998	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1999		pmap_insert_entry(pmap, va, mpte, m);
2000		pa |= PG_MANAGED;
2001	}
2002
2003	/*
2004	 * Increment counters
2005	 */
2006	pmap->pm_stats.resident_count++;
2007	if (wired)
2008		pmap->pm_stats.wired_count++;
2009
2010validate:
2011	/*
2012	 * Now validate mapping with desired protection/wiring.
2013	 */
2014	newpte = (pt_entry_t)(pa | PG_V);
2015	if ((prot & VM_PROT_WRITE) != 0)
2016		newpte |= PG_RW;
2017	if (wired)
2018		newpte |= PG_W;
2019	if (va < VM_MAXUSER_ADDRESS)
2020		newpte |= PG_U;
2021	if (pmap == kernel_pmap)
2022		newpte |= pgeflag;
2023
2024	/*
2025	 * if the mapping or permission bits are different, we need
2026	 * to update the pte.
2027	 */
2028	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2029		pte_store(pte, newpte | PG_A);
2030		/*if (origpte)*/ {
2031			pmap_invalidate_page(pmap, va);
2032		}
2033	}
2034}
2035
2036/*
2037 * This code makes some *MAJOR* assumptions:
2038 * 1. The current pmap and the given pmap exist.
2039 * 2. The mapping is not wired.
2040 * 3. Read access only.
2041 * 4. No page table pages.
2042 * 5. The TLB flush is deferred to the calling procedure.
2043 * 6. The page IS managed.
2044 * Given these, it is *MUCH* faster than pmap_enter.
2045 */
2046
2047vm_page_t
2048pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2049{
2050	pt_entry_t *pte;
2051	vm_paddr_t pa;
2052
2053	/*
2054	 * In the case that a page table page is not
2055	 * resident, we are creating it here.
2056	 */
2057	if (va < VM_MAXUSER_ADDRESS) {
2058		unsigned ptepindex;
2059		pd_entry_t ptepa;
2060
2061		/*
2062		 * Calculate pagetable page index
2063		 */
2064		ptepindex = va >> PDRSHIFT;
2065		if (mpte && (mpte->pindex == ptepindex)) {
2066			mpte->hold_count++;
2067		} else {
2068retry:
2069			/*
2070			 * Get the page directory entry
2071			 */
2072			ptepa = pmap->pm_pdir[ptepindex];
2073
2074			/*
2075			 * If the page table page is mapped, we just increment
2076			 * the hold count, and activate it.
2077			 */
2078			if (ptepa) {
2079				if (ptepa & PG_PS)
2080					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2081				mpte = PHYS_TO_VM_PAGE(ptepa);
2082				mpte->hold_count++;
2083			} else {
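				/*
				 * _pmap_allocpte() may sleep and return
				 * NULL; in that case re-read the PDE and
				 * retry.
				 */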
2084				mpte = _pmap_allocpte(pmap, ptepindex);
2085				if (mpte == NULL)
2086					goto retry;
2087			}
2088		}
2089	} else {
2090		mpte = NULL;
2091	}
2092
2093	/*
2094	 * This call to vtopte makes the assumption that we are
2095	 * entering the page into the current pmap.  In order to support
2096	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2097	 * But that isn't as quick as vtopte.
2098	 */
2099	pte = vtopte(va);
2100	if (*pte) {
2101		if (mpte != NULL) {
2102			vm_page_lock_queues();
2103			pmap_unwire_pte_hold(pmap, mpte);
2104			vm_page_unlock_queues();
2105		}
2106		return (NULL);
2107	}
2108
2109	/*
2110	 * Enter on the PV list if part of our managed memory. Note that we
2111	 * raise IPL while manipulating pv_table since pmap_enter can be
2112	 * called at interrupt time.
2113	 */
2114	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2115		pmap_insert_entry(pmap, va, mpte, m);
2116
2117	/*
2118	 * Increment counters
2119	 */
2120	pmap->pm_stats.resident_count++;
2121
2122	pa = VM_PAGE_TO_PHYS(m);
2123
2124	/*
2125	 * Now validate mapping with RO protection
2126	 */
2127	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2128		pte_store(pte, pa | PG_V | PG_U);
2129	else
2130		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2131
2132	return (mpte);
2133}
2134
2135/*
2136 * Make a temporary mapping for a physical address.  This is only intended
2137 * to be used for panic dumps.
2138 */
2139void *
2140pmap_kenter_temporary(vm_paddr_t pa, int i)
2141{
2142	vm_offset_t va;
2143
2144	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2145	pmap_kenter(va, pa);
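	/*
	 * Kernels built with I386_CPU support cannot assume the invlpg
	 * instruction exists, so they flush the entire TLB instead.
	 */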
2146#ifndef I386_CPU
2147	invlpg(va);
2148#else
2149	invltlb();
2150#endif
2151	return ((void *)crashdumpmap);
2152}
2153
2154/*
2155 * This code maps large physical mmap regions of device memory into
2156 * the processor address space using 4MB (PG_PS) page directory
2157 * entries.  Only PDE-aligned regions are handled; others are skipped.
2158 */
2159void
2160pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2161		    vm_object_t object, vm_pindex_t pindex,
2162		    vm_size_t size)
2163{
2164	vm_page_t p;
2165
2166	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2167	KASSERT(object->type == OBJT_DEVICE,
2168	    ("pmap_object_init_pt: non-device object"));
2169	if (pseflag &&
2170	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2171		int i;
2172		vm_page_t m[1];
2173		unsigned int ptepindex;
2174		int npdes;
2175		pd_entry_t ptepa;
2176
2177		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2178			return;
2179retry:
2180		p = vm_page_lookup(object, pindex);
2181		if (p != NULL) {
2182			vm_page_lock_queues();
2183			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2184				goto retry;
2185		} else {
2186			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2187			if (p == NULL)
2188				return;
2189			m[0] = p;
2190
2191			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2192				vm_page_lock_queues();
2193				vm_page_free(p);
2194				vm_page_unlock_queues();
2195				return;
2196			}
2197
2198			p = vm_page_lookup(object, pindex);
2199			vm_page_lock_queues();
2200			vm_page_wakeup(p);
2201		}
2202		vm_page_unlock_queues();
2203
2204		ptepa = VM_PAGE_TO_PHYS(p);
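		/*
		 * The backing physical memory must be aligned to an NBPDR
		 * (2/4MB) boundary for a PG_PS mapping to be usable.
		 */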
2205		if (ptepa & (NBPDR - 1))
2206			return;
2207
2208		p->valid = VM_PAGE_BITS_ALL;
2209
2210		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2211		npdes = size >> PDRSHIFT;
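		/*
		 * Install one 4MB (PG_PS) page directory entry for each
		 * NBPDR-sized chunk of the region.
		 */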
2212		for (i = 0; i < npdes; i++) {
2213			pde_store(&pmap->pm_pdir[ptepindex],
2214			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
2215			ptepa += NBPDR;
2216			ptepindex += 1;
2217		}
2218		pmap_invalidate_all(pmap);
2219	}
2220}
2221
2222/*
2223 *	Routine:	pmap_change_wiring
2224 *	Function:	Change the wiring attribute for a map/virtual-address
2225 *			pair.
2226 *	In/out conditions:
2227 *			The mapping must already exist in the pmap.
2228 */
2229void
2230pmap_change_wiring(pmap, va, wired)
2231	register pmap_t pmap;
2232	vm_offset_t va;
2233	boolean_t wired;
2234{
2235	register pt_entry_t *pte;
2236
2237	if (pmap == NULL)
2238		return;
2239
2240	pte = pmap_pte(pmap, va);
2241
2242	if (wired && !pmap_pte_w(pte))
2243		pmap->pm_stats.wired_count++;
2244	else if (!wired && pmap_pte_w(pte))
2245		pmap->pm_stats.wired_count--;
2246
2247	/*
2248	 * Wiring is not a hardware characteristic so there is no need to
2249	 * invalidate TLB.
2250	 */
2251	pmap_pte_set_w(pte, wired);
2252}
2253
2254
2255
2256/*
2257 *	Copy the range specified by src_addr/len
2258 *	from the source map to the range dst_addr/len
2259 *	in the destination map.
2260 *
2261 *	This routine is only advisory and need not do anything.
2262 */
2263
2264void
2265pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2266	  vm_offset_t src_addr)
2267{
2268	vm_offset_t addr;
2269	vm_offset_t end_addr = src_addr + len;
2270	vm_offset_t pdnxt;
2271	vm_page_t m;
2272
2273	if (dst_addr != src_addr)
2274		return;
2275
2276	if (!pmap_is_current(src_pmap))
2277		return;
2278
2279	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2280		pt_entry_t *src_pte, *dst_pte;
2281		vm_page_t dstmpte, srcmpte;
2282		pd_entry_t srcptepaddr;
2283		unsigned ptepindex;
2284
2285		if (addr >= UPT_MIN_ADDRESS)
2286			panic("pmap_copy: invalid to pmap_copy page tables\n");
2287
2288		/*
2289		 * Don't let optional prefaulting of pages make us go
2290		 * way below the low water mark of free pages or way
2291		 * above the high water mark of used pv entries.
2292		 */
2293		if (cnt.v_free_count < cnt.v_free_reserved ||
2294		    pv_entry_count > pv_entry_high_water)
2295			break;
2296
2297		pdnxt = (addr + NBPDR) & ~PDRMASK;
2298		ptepindex = addr >> PDRSHIFT;
2299
2300		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2301		if (srcptepaddr == 0)
2302			continue;
2303
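		/*
		 * A 4MB (PG_PS) mapping is copied by sharing the page
		 * directory entry itself; no page table page is involved.
		 */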
2304		if (srcptepaddr & PG_PS) {
2305			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2306				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2307				dst_pmap->pm_stats.resident_count +=
2308				    NBPDR / PAGE_SIZE;
2309			}
2310			continue;
2311		}
2312
2313		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2314		if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY))
2315			continue;
2316
2317		if (pdnxt > end_addr)
2318			pdnxt = end_addr;
2319
2320		src_pte = vtopte(addr);
2321		while (addr < pdnxt) {
2322			pt_entry_t ptetemp;
2323			ptetemp = *src_pte;
2324			/*
2325			 * We only copy mappings of managed pages.
2326			 */
2327			if ((ptetemp & PG_MANAGED) != 0) {
2328				/*
2329				 * pmap_allocpte() can block, so re-check
2330				 * that the source pte is still valid
2331				 * after it returns.
2332				 */
2333				dstmpte = pmap_allocpte(dst_pmap, addr);
2334				dst_pte = pmap_pte(dst_pmap, addr);
2335				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2336					/*
2337					 * Clear the modified and
2338					 * accessed (referenced) bits
2339					 * during the copy.
2340					 */
2341					m = PHYS_TO_VM_PAGE(ptetemp);
2342					*dst_pte = ptetemp & ~(PG_M | PG_A);
2343					dst_pmap->pm_stats.resident_count++;
2344					pmap_insert_entry(dst_pmap, addr,
2345						dstmpte, m);
2346				} else {
2347					vm_page_lock_queues();
2348					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2349					vm_page_unlock_queues();
2350				}
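				/*
				 * Stop once the destination PT page holds
				 * as many mappings as the source PT page.
				 */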
2351				if (dstmpte->hold_count >= srcmpte->hold_count)
2352					break;
2353			}
2354			addr += PAGE_SIZE;
2355			src_pte++;
2356		}
2357	}
2358}
2359
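/*
 * Zero a page of kernel virtual memory, using sse2_pagezero() or
 * i686_pagezero() on 686-class CPUs when available and falling back
 * to bzero() otherwise.
 */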
2360static __inline void
2361pagezero(void *page)
2362{
2363#if defined(I686_CPU)
2364	if (cpu_class == CPUCLASS_686) {
2365#if defined(CPU_ENABLE_SSE)
2366		if (cpu_feature & CPUID_SSE2)
2367			sse2_pagezero(page);
2368		else
2369#endif
2370			i686_pagezero(page);
2371	} else
2372#endif
2373		bzero(page, PAGE_SIZE);
2374}
2375
2376/*
2377 *	pmap_zero_page zeros the specified hardware page by mapping
2378 *	the page into KVM and using bzero to clear its contents.
2379 */
2380void
2381pmap_zero_page(vm_page_t m)
2382{
2383
2384	mtx_lock(&CMAPCADDR12_lock);
2385	if (*CMAP2)
2386		panic("pmap_zero_page: CMAP2 busy");
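	/*
	 * sched_pin() keeps this thread on one CPU so that invalidating
	 * CADDR2 in the local TLB is sufficient.
	 */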
2387	sched_pin();
2388	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2389	invlcaddr(CADDR2);
2390	pagezero(CADDR2);
2391	*CMAP2 = 0;
2392	sched_unpin();
2393	mtx_unlock(&CMAPCADDR12_lock);
2394}
2395
2396/*
2397 *	pmap_zero_page_area zeros the specified area of a hardware page
2398 *	by mapping the page into KVM and using bzero to clear its contents.
2399 *
2400 *	off and size may not cover an area beyond a single hardware page.
2401 */
2402void
2403pmap_zero_page_area(vm_page_t m, int off, int size)
2404{
2405
2406	mtx_lock(&CMAPCADDR12_lock);
2407	if (*CMAP2)
2408		panic("pmap_zero_page_area: CMAP2 busy");
2409	sched_pin();
2410	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2411	invlcaddr(CADDR2);
2412	if (off == 0 && size == PAGE_SIZE)
2413		pagezero(CADDR2);
2414	else
2415		bzero((char *)CADDR2 + off, size);
2416	*CMAP2 = 0;
2417	sched_unpin();
2418	mtx_unlock(&CMAPCADDR12_lock);
2419}
2420
2421/*
2422 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2423 *	the page into KVM and using bzero to clear its contents.  This
2424 *	is intended to be called from the vm_pagezero process only and
2425 *	outside of Giant.
2426 */
2427void
2428pmap_zero_page_idle(vm_page_t m)
2429{
2430
2431	if (*CMAP3)
2432		panic("pmap_zero_page_idle: CMAP3 busy");
2433	sched_pin();
2434	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2435	invlcaddr(CADDR3);
2436	pagezero(CADDR3);
2437	*CMAP3 = 0;
2438	sched_unpin();
2439}
2440
2441/*
2442 *	pmap_copy_page copies the specified (machine independent)
2443 *	page by mapping the page into virtual memory and using
2444 *	bcopy to copy the page, one machine dependent page at a
2445 *	time.
2446 */
2447void
2448pmap_copy_page(vm_page_t src, vm_page_t dst)
2449{
2450
2451	mtx_lock(&CMAPCADDR12_lock);
2452	if (*CMAP1)
2453		panic("pmap_copy_page: CMAP1 busy");
2454	if (*CMAP2)
2455		panic("pmap_copy_page: CMAP2 busy");
2456	sched_pin();
2457#ifdef I386_CPU
2458	invltlb();
2459#else
2460	invlpg((u_int)CADDR1);
2461	invlpg((u_int)CADDR2);
2462#endif
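	/*
	 * CMAP1 maps the source page read-only (no PG_RW); CMAP2 maps
	 * the destination writable.
	 */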
2463	*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2464	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2465	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2466	*CMAP1 = 0;
2467	*CMAP2 = 0;
2468	sched_unpin();
2469	mtx_unlock(&CMAPCADDR12_lock);
2470}
2471
2472/*
2473 * Returns true if the pmap's pv is one of the first
2474 * 16 pvs linked to from this page.  This count may
2475 * be changed upwards or downwards in the future; it
2476 * is only necessary that true be returned for a small
2477 * subset of pmaps for proper page aging.
2478 */
2479boolean_t
2480pmap_page_exists_quick(pmap, m)
2481	pmap_t pmap;
2482	vm_page_t m;
2483{
2484	pv_entry_t pv;
2485	int loops = 0;
2486	int s;
2487
2488	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2489		return FALSE;
2490
2491	s = splvm();
2492	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2493	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2494		if (pv->pv_pmap == pmap) {
2495			splx(s);
2496			return TRUE;
2497		}
2498		loops++;
2499		if (loops >= 16)
2500			break;
2501	}
2502	splx(s);
2503	return (FALSE);
2504}
2505
2506#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2507/*
2508 * Remove all pages from the specified address space;
2509 * this aids process exit speeds.  Also, this code
2510 * is special cased for current process only, but
2511 * can have the more generic (and slightly slower)
2512 * mode enabled.  This is much faster than pmap_remove
2513 * in the case of running down an entire address space.
2514 */
2515void
2516pmap_remove_pages(pmap, sva, eva)
2517	pmap_t pmap;
2518	vm_offset_t sva, eva;
2519{
2520	pt_entry_t *pte, tpte;
2521	vm_page_t m;
2522	pv_entry_t pv, npv;
2523	int s;
2524
2525#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2526	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2527		printf("warning: pmap_remove_pages called with non-current pmap\n");
2528		return;
2529	}
2530#endif
2531	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2532	s = splvm();
2533	sched_pin();
2534	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2535
2536		if (pv->pv_va >= eva || pv->pv_va < sva) {
2537			npv = TAILQ_NEXT(pv, pv_plist);
2538			continue;
2539		}
2540
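		/*
		 * With the curproc-only restriction, the recursive page
		 * table mapping (vtopte) can be used directly.
		 */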
2541#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2542		pte = vtopte(pv->pv_va);
2543#else
2544		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2545#endif
2546		tpte = *pte;
2547
2548		if (tpte == 0) {
2549			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2550							pte, pv->pv_va);
2551			panic("bad pte");
2552		}
2553
2554/*
2555 * We cannot remove wired pages from a process' mapping at this time
2556 */
2557		if (tpte & PG_W) {
2558			npv = TAILQ_NEXT(pv, pv_plist);
2559			continue;
2560		}
2561
2562		m = PHYS_TO_VM_PAGE(tpte);
2563		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2564		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2565		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2566
2567		KASSERT(m < &vm_page_array[vm_page_array_size],
2568			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2569
2570		pv->pv_pmap->pm_stats.resident_count--;
2571
2572		pte_clear(pte);
2573
2574		/*
2575		 * Update the vm_page_t clean and reference bits.
2576		 */
2577		if (tpte & PG_M) {
2578			vm_page_dirty(m);
2579		}
2580
2581		npv = TAILQ_NEXT(pv, pv_plist);
2582		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2583
2584		m->md.pv_list_count--;
2585		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2586		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2587			vm_page_flag_clear(m, PG_WRITEABLE);
2588		}
2589
2590		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2591		free_pv_entry(pv);
2592	}
2593	sched_unpin();
2594	splx(s);
2595	pmap_invalidate_all(pmap);
2596}
2597
2598/*
2599 *	pmap_is_modified:
2600 *
2601 *	Return whether or not the specified physical page was modified
2602 *	in any physical maps.
2603 */
2604boolean_t
2605pmap_is_modified(vm_page_t m)
2606{
2607	pv_entry_t pv;
2608	pt_entry_t *pte;
2609	int s;
2610
2611	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2612		return FALSE;
2613
2614	s = splvm();
2615	sched_pin();
2616	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2617	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2618		/*
2619		 * Skip mappings whose modified state is not
2620		 * tracked, i.e. addresses within the kernel's
2621		 * clean submap.
2622		 */
2623		if (!pmap_track_modified(pv->pv_va))
2624			continue;
2625#if defined(PMAP_DIAGNOSTIC)
2626		if (!pv->pv_pmap) {
2627			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2628			continue;
2629		}
2630#endif
2631		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2632		if (*pte & PG_M) {
2633			sched_unpin();
2634			splx(s);
2635			return TRUE;
2636		}
2637	}
2638	sched_unpin();
2639	splx(s);
2640	return (FALSE);
2641}
2642
2643/*
2644 *	pmap_is_prefaultable:
2645 *
2646 *	Return whether or not the specified virtual address is eligible
2647 *	for prefault.
2648 */
2649boolean_t
2650pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2651{
2652	pt_entry_t *pte;
2653
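	/*
	 * An address is prefaultable only if its page table page exists
	 * and no mapping is already present there.
	 */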
2654	if ((*pmap_pde(pmap, addr)) == 0)
2655		return (FALSE);
2656	pte = vtopte(addr);
2657	if (*pte)
2658		return (FALSE);
2659	return (TRUE);
2660}
2661
2662/*
2663 *	Clear the given bit in each of the given page's ptes.
2664 */
2665static __inline void
2666pmap_clear_ptes(vm_page_t m, int bit)
2667{
2668	register pv_entry_t pv;
2669	pt_entry_t pbits, *pte;
2670	int s;
2671
2672	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2673	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2674		return;
2675
2676	s = splvm();
2677	sched_pin();
2678	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2679	/*
2680	 * Loop over all current mappings, setting/clearing as appropriate.
2681	 * (If setting RO, do we need to clear the VAC?)
2682	 */
2683	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2684		/*
2685		 * don't write protect pager mappings
2686		 */
2687		if (bit == PG_RW) {
2688			if (!pmap_track_modified(pv->pv_va))
2689				continue;
2690		}
2691
2692#if defined(PMAP_DIAGNOSTIC)
2693		if (!pv->pv_pmap) {
2694			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2695			continue;
2696		}
2697#endif
2698
2699		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2700		pbits = *pte;
2701		if (pbits & bit) {
2702			if (bit == PG_RW) {
2703				if (pbits & PG_M) {
2704					vm_page_dirty(m);
2705				}
2706				pte_store(pte, pbits & ~(PG_M|PG_RW));
2707			} else {
2708				pte_store(pte, pbits & ~bit);
2709			}
2710			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2711		}
2712	}
2713	if (bit == PG_RW)
2714		vm_page_flag_clear(m, PG_WRITEABLE);
2715	sched_unpin();
2716	splx(s);
2717}
2718
2719/*
2720 *      pmap_page_protect:
2721 *
2722 *      Lower the permission for all mappings to a given page.
2723 */
2724void
2725pmap_page_protect(vm_page_t m, vm_prot_t prot)
2726{
2727	if ((prot & VM_PROT_WRITE) == 0) {
2728		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2729			pmap_clear_ptes(m, PG_RW);
2730		} else {
2731			pmap_remove_all(m);
2732		}
2733	}
2734}
2735
2736/*
2737 *	pmap_ts_referenced:
2738 *
2739 *	Return a count of reference bits for a page, clearing those bits.
2740 *	It is not necessary for every reference bit to be cleared, but it
2741 *	is necessary that 0 only be returned when there are truly no
2742 *	reference bits set.
2743 *
2744 *	XXX: The exact number of bits to check and clear is a matter that
2745 *	should be tested and standardized at some point in the future for
2746 *	optimal aging of shared pages.
2747 */
2748int
2749pmap_ts_referenced(vm_page_t m)
2750{
2751	register pv_entry_t pv, pvf, pvn;
2752	pt_entry_t *pte;
2753	pt_entry_t v;
2754	int s;
2755	int rtval = 0;
2756
2757	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2758		return (rtval);
2759
2760	s = splvm();
2761	sched_pin();
2762	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2763	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2764
2765		pvf = pv;
2766
2767		do {
2768			pvn = TAILQ_NEXT(pv, pv_list);
2769
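			/*
			 * Rotate this pv to the tail of the list so that
			 * successive calls examine other mappings first.
			 */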
2770			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2771
2772			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2773
2774			if (!pmap_track_modified(pv->pv_va))
2775				continue;
2776
2777			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2778
2779			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2780				pte_store(pte, v & ~PG_A);
2781				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2782
2783				rtval++;
2784				if (rtval > 4) {
2785					break;
2786				}
2787			}
2788		} while ((pv = pvn) != NULL && pv != pvf);
2789	}
2790	sched_unpin();
2791	splx(s);
2792
2793	return (rtval);
2794}
2795
2796/*
2797 *	Clear the modify bits on the specified physical page.
2798 */
2799void
2800pmap_clear_modify(vm_page_t m)
2801{
2802	pmap_clear_ptes(m, PG_M);
2803}
2804
2805/*
2806 *	pmap_clear_reference:
2807 *
2808 *	Clear the reference bit on the specified physical page.
2809 */
2810void
2811pmap_clear_reference(vm_page_t m)
2812{
2813	pmap_clear_ptes(m, PG_A);
2814}
2815
2816/*
2817 * Miscellaneous support routines follow
2818 */
2819
2820/*
2821 * Map a set of physical memory pages into the kernel virtual
2822 * address space. Return a pointer to where it is mapped. This
2823 * routine is intended to be used for mapping device memory,
2824 * NOT real memory.
2825 */
2826void *
2827pmap_mapdev(pa, size)
2828	vm_paddr_t pa;
2829	vm_size_t size;
2830{
2831	vm_offset_t va, tmpva, offset;
2832
2833	offset = pa & PAGE_MASK;
2834	size = roundup(offset + size, PAGE_SIZE);
2835	pa = pa & PG_FRAME;
2836
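	/*
	 * Physical addresses below KERNLOAD fall within the kernel's
	 * static KERNBASE mapping, so no new KVA is needed for them.
	 */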
2837	if (pa < KERNLOAD && pa + size <= KERNLOAD)
2838		va = KERNBASE + pa;
2839	else
2840		va = kmem_alloc_nofault(kernel_map, size);
2841	if (!va)
2842		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2843
2844	for (tmpva = va; size > 0; ) {
2845		pmap_kenter(tmpva, pa);
2846		size -= PAGE_SIZE;
2847		tmpva += PAGE_SIZE;
2848		pa += PAGE_SIZE;
2849	}
2850	pmap_invalidate_range(kernel_pmap, va, tmpva);
2851	return ((void *)(va + offset));
2852}
2853
2854void
2855pmap_unmapdev(va, size)
2856	vm_offset_t va;
2857	vm_size_t size;
2858{
2859	vm_offset_t base, offset, tmpva;
2860
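	/*
	 * Mappings within the static KERNBASE region were not allocated
	 * by pmap_mapdev(), so there is nothing to unmap or free.
	 */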
2861	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2862		return;
2863	base = va & PG_FRAME;
2864	offset = va & PAGE_MASK;
2865	size = roundup(offset + size, PAGE_SIZE);
2866	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2867		pmap_kremove(tmpva);
2868	pmap_invalidate_range(kernel_pmap, va, tmpva);
2869	kmem_free(kernel_map, base, size);
2870}
2871
2872/*
2873 * perform the pmap work for mincore
2874 */
2875int
2876pmap_mincore(pmap, addr)
2877	pmap_t pmap;
2878	vm_offset_t addr;
2879{
2880	pt_entry_t *ptep, pte;
2881	vm_page_t m;
2882	int val = 0;
2883
2884	ptep = pmap_pte(pmap, addr);
2885	if (ptep == 0) {
2886		return 0;
2887	}
2888
2889	if ((pte = *ptep) != 0) {
2890		vm_paddr_t pa;
2891
2892		val = MINCORE_INCORE;
2893		if ((pte & PG_MANAGED) == 0)
2894			return val;
2895
2896		pa = pte & PG_FRAME;
2897
2898		m = PHYS_TO_VM_PAGE(pa);
2899
2900		/*
2901		 * Modified by us
2902		 */
2903		if (pte & PG_M)
2904			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2905		else {
2906			/*
2907			 * Modified by someone else
2908			 */
2909			vm_page_lock_queues();
2910			if (m->dirty || pmap_is_modified(m))
2911				val |= MINCORE_MODIFIED_OTHER;
2912			vm_page_unlock_queues();
2913		}
2914		/*
2915		 * Referenced by us
2916		 */
2917		if (pte & PG_A)
2918			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2919		else {
2920			/*
2921			 * Referenced by someone else
2922			 */
2923			vm_page_lock_queues();
2924			if ((m->flags & PG_REFERENCED) ||
2925			    pmap_ts_referenced(m)) {
2926				val |= MINCORE_REFERENCED_OTHER;
2927				vm_page_flag_set(m, PG_REFERENCED);
2928			}
2929			vm_page_unlock_queues();
2930		}
2931	}
2932	return val;
2933}
2934
2935void
2936pmap_activate(struct thread *td)
2937{
2938	struct proc *p = td->td_proc;
2939	pmap_t	pmap, oldpmap;
2940	u_int32_t  cr3;
2941
2942	critical_enter();
2943	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2944	oldpmap = PCPU_GET(curpmap);
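	/*
	 * Record this CPU as active in the new pmap and inactive in the
	 * old one so that TLB shootdowns target the right CPUs.
	 */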
2945#if defined(SMP)
2946	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2947	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2948#else
2949	oldpmap->pm_active &= ~1;
2950	pmap->pm_active |= 1;
2951#endif
2952#ifdef PAE
2953	cr3 = vtophys(pmap->pm_pdpt);
2954#else
2955	cr3 = vtophys(pmap->pm_pdir);
2956#endif
2957	/* XXXKSE this is wrong.
2958	 * pmap_activate is for the current thread on the current cpu
2959	 */
2960	if (p->p_flag & P_SA) {
2961		/* Make sure all other cr3 entries are updated. */
2962		/* what if they are running?  XXXKSE (maybe abort them) */
2963		FOREACH_THREAD_IN_PROC(p, td) {
2964			td->td_pcb->pcb_cr3 = cr3;
2965		}
2966	} else {
2967		td->td_pcb->pcb_cr3 = cr3;
2968	}
2969	load_cr3(cr3);
2970	PCPU_SET(curpmap, pmap);
2971	critical_exit();
2972}
2973
2974vm_offset_t
2975pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2976{
2977
2978	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
2979		return addr;
2980	}
2981
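	/*
	 * Round the hint up to a PDR (2/4MB) boundary so that device
	 * mappings can use 4MB (PG_PS) page directory entries.
	 */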
2982	addr = (addr + PDRMASK) & ~PDRMASK;
2983	return addr;
2984}
2985
2986
2987#if defined(PMAP_DEBUG)
2988pmap_pid_dump(int pid)
2989{
2990	pmap_t pmap;
2991	struct proc *p;
2992	int npte = 0;
2993	int index;
2994
2995	sx_slock(&allproc_lock);
2996	LIST_FOREACH(p, &allproc, p_list) {
2997		if (p->p_pid != pid)
2998			continue;
2999
3000		if (p->p_vmspace) {
3001			int i,j;
3002			index = 0;
3003			pmap = vmspace_pmap(p->p_vmspace);
3004			for (i = 0; i < NPDEPTD; i++) {
3005				pd_entry_t *pde;
3006				pt_entry_t *pte;
3007				vm_offset_t base = i << PDRSHIFT;
3008
3009				pde = &pmap->pm_pdir[i];
3010				if (pde && pmap_pde_v(pde)) {
3011					for (j = 0; j < NPTEPG; j++) {
3012						vm_offset_t va = base + (j << PAGE_SHIFT);
3013						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3014							if (index) {
3015								index = 0;
3016								printf("\n");
3017							}
3018							sx_sunlock(&allproc_lock);
3019							return npte;
3020						}
3021						pte = pmap_pte(pmap, va);
3022						if (pte && pmap_pte_v(pte)) {
3023							pt_entry_t pa;
3024							vm_page_t m;
3025							pa = *pte;
3026							m = PHYS_TO_VM_PAGE(pa);
3027							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3028								va, pa, m->hold_count, m->wire_count, m->flags);
3029							npte++;
3030							index++;
3031							if (index >= 2) {
3032								index = 0;
3033								printf("\n");
3034							} else {
3035								printf(" ");
3036							}
3037						}
3038					}
3039				}
3040			}
3041		}
3042	}
3043	sx_sunlock(&allproc_lock);
3044	return npte;
3045}
3046#endif
3047
3048#if defined(DEBUG)
3049
3050static void	pads(pmap_t pm);
3051void		pmap_pvdump(vm_offset_t pa);
3052
3053/* Print the address space of a pmap. */
3054static void
3055pads(pm)
3056	pmap_t pm;
3057{
3058	int i, j;
3059	vm_paddr_t va;
3060	pt_entry_t *ptep;
3061
3062	if (pm == kernel_pmap)
3063		return;
3064	for (i = 0; i < NPDEPTD; i++)
3065		if (pm->pm_pdir[i])
3066			for (j = 0; j < NPTEPG; j++) {
3067				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3068				if (pm == kernel_pmap && va < KERNBASE)
3069					continue;
3070				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3071					continue;
3072				ptep = pmap_pte(pm, va);
3073				if (pmap_pte_v(ptep))
3074					printf("%x:%x ", va, *ptep);
3075			}
3076
3077}
3078
3079void
3080pmap_pvdump(pa)
3081	vm_paddr_t pa;
3082{
3083	pv_entry_t pv;
3084	vm_page_t m;
3085
3086	printf("pa %x", pa);
3087	m = PHYS_TO_VM_PAGE(pa);
3088	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3089		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3090		pads(pv->pv_pmap);
3091	}
3092	printf(" ");
3093}
3094#endif
3095