1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 91358 2002-02-27 06:05:24Z peter $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidations expensive,
64 *	this module may delay invalidation or reduced-protection
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and as to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74#include "opt_kstack_pages.h"
75
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/mman.h>
81#include <sys/msgbuf.h>
82#include <sys/mutex.h>
83#include <sys/proc.h>
84#include <sys/sx.h>
85#include <sys/user.h>
86#include <sys/vmmeter.h>
87#include <sys/sysctl.h>
88#if defined(SMP)
89#include <sys/smp.h>
90#endif
91
92#include <vm/vm.h>
93#include <vm/vm_param.h>
94#include <vm/vm_kern.h>
95#include <vm/vm_page.h>
96#include <vm/vm_map.h>
97#include <vm/vm_object.h>
98#include <vm/vm_extern.h>
99#include <vm/vm_pageout.h>
100#include <vm/vm_pager.h>
101#include <vm/vm_zone.h>
102
103#include <machine/cputypes.h>
104#include <machine/md_var.h>
105#include <machine/specialreg.h>
106#if defined(SMP) || defined(APIC_IO)
107#include <machine/apic.h>
108#include <machine/segments.h>
109#include <machine/tss.h>
110#endif /* SMP || APIC_IO */
111
112#define PMAP_KEEP_PDIRS
113#ifndef PMAP_SHPGPERPROC
114#define PMAP_SHPGPERPROC 200
115#endif
116
117#if defined(DIAGNOSTIC)
118#define PMAP_DIAGNOSTIC
119#endif
120
121#define MINPV 2048
122
123#if !defined(PMAP_DIAGNOSTIC)
124#define PMAP_INLINE __inline
125#else
126#define PMAP_INLINE
127#endif
128
129/*
130 * Get PDEs and PTEs for user/kernel address space
131 */
132#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
133#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
134
135#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
136#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
137#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
138#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
139#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
140
141#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
142#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
143
144/*
145 * Given a map and a machine-independent protection code,
146 * convert to an i386 protection code.
147 */
148#define pte_prot(m, p)	(protection_codes[p])
149static int protection_codes[8];
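/*
 * Sketch (descriptive comment, not in the original source):
 * i386_protection_init(), defined later in this file, fills
 * protection_codes[] so that pte_prot() is a simple table lookup.  The
 * i386 MMU only distinguishes read from read/write access, so roughly
 * speaking any combination that includes VM_PROT_WRITE yields PG_RW and
 * every other combination yields 0; PG_V, PG_U and PG_W are added
 * separately by pmap_enter().
 */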
150
151static struct pmap kernel_pmap_store;
152pmap_t kernel_pmap;
153LIST_HEAD(pmaplist, pmap);
154struct pmaplist allpmaps;
155
156vm_offset_t avail_start;	/* PA of first available physical page */
157vm_offset_t avail_end;		/* PA of last available physical page */
158vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
159vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
160static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
161static int pgeflag;		/* PG_G or-in */
162static int pseflag;		/* PG_PS or-in */
163
164static vm_object_t kptobj;
165
166static int nkpt;
167vm_offset_t kernel_vm_end;
168
169/*
170 * Data for the pv entry allocation mechanism
171 */
172static vm_zone_t pvzone;
173static struct vm_zone pvzone_store;
174static struct vm_object pvzone_obj;
175static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
176static int pmap_pagedaemon_waken = 0;
177static struct pv_entry *pvinit;
178
179/*
180 * All those kernel PT submaps that BSD is so fond of
181 */
182pt_entry_t *CMAP1 = 0;
183static pt_entry_t *CMAP2, *ptmmap;
184caddr_t CADDR1 = 0, ptvmmap = 0;
185static caddr_t CADDR2;
186static pt_entry_t *msgbufmap;
187struct msgbuf *msgbufp = 0;
188
189/*
190 * Crashdump maps.
191 */
192static pt_entry_t *pt_crashdumpmap;
193static caddr_t crashdumpmap;
194
195#ifdef SMP
196extern pt_entry_t *SMPpt;
197#endif
198static pt_entry_t *PMAP1 = 0;
199static pt_entry_t *PADDR1 = 0;
200
201static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
202static pt_entry_t *get_ptbase __P((pmap_t pmap));
203static pv_entry_t get_pv_entry __P((void));
204static void	i386_protection_init __P((void));
205static __inline void	pmap_changebit __P((vm_page_t m, int bit, boolean_t setem));
206
207static void	pmap_remove_all __P((vm_page_t m));
208static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
209				      vm_page_t m, vm_page_t mpte));
210static int pmap_remove_pte __P((pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva));
211static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
212static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m,
213					vm_offset_t va));
214static boolean_t pmap_testbit __P((vm_page_t m, int bit));
215static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
216		vm_page_t mpte, vm_page_t m));
217
218static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
219
220static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
221static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
222static pt_entry_t *pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
223static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
224static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
225static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
226
227static pd_entry_t pdir4mb;
228
229/*
230 *	Routine:	pmap_pte
231 *	Function:
232 *		Extract the page table entry associated
233 *		with the given map/virtual_address pair.
234 */
235
236PMAP_INLINE pt_entry_t *
237pmap_pte(pmap, va)
238	register pmap_t pmap;
239	vm_offset_t va;
240{
241	pd_entry_t *pdeaddr;
242
243	if (pmap) {
244		pdeaddr = pmap_pde(pmap, va);
245		if (*pdeaddr & PG_PS)
246			return pdeaddr;
247		if (*pdeaddr) {
248			return get_ptbase(pmap) + i386_btop(va);
249		}
250	}
251	return (0);
252}
253
254/*
255 * Move the kernel virtual free pointer to the next
256 * 4MB.  This is used to help improve performance
257 * by using a large (4MB) page for much of the kernel
258 * (.text, .data, .bss)
259 */
260static vm_offset_t
261pmap_kmem_choose(vm_offset_t addr)
262{
263	vm_offset_t newaddr = addr;
264
265#ifndef DISABLE_PSE
266	if (cpu_feature & CPUID_PSE)
267		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
268#endif
269	return newaddr;
270}
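/*
 * Illustrative example (numbers are hypothetical): with PSE available,
 * NBPDR is 4MB (0x400000), so an addr of 0xc0345000 is rounded up by
 * pmap_kmem_choose() to the next 4MB boundary, 0xc0400000.  Without PSE
 * the address is returned unchanged.
 */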
271
272/*
273 *	Bootstrap the system enough to run with virtual memory.
274 *
275 *	On the i386 this is called after mapping has already been enabled
276 *	and just syncs the pmap module with what has already been done.
277 *	[We can't call it easily with mapping off since the kernel is not
278 *	mapped with PA == VA, hence we would have to relocate every address
279 *	from the linked base (virtual) address "KERNBASE" to the actual
280 *	(physical) address starting relative to 0]
281 */
282void
283pmap_bootstrap(firstaddr, loadaddr)
284	vm_offset_t firstaddr;
285	vm_offset_t loadaddr;
286{
287	vm_offset_t va;
288	pt_entry_t *pte;
289	int i;
290
291	avail_start = firstaddr;
292
293	/*
294	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
295	 * large. It should instead be correctly calculated in locore.s and
296	 * not based on 'first' (which is a physical address, not a virtual
297	 * address, for the start of unused physical memory). The kernel
298	 * page tables are NOT double mapped and thus should not be included
299	 * in this calculation.
300	 */
301	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
302	virtual_avail = pmap_kmem_choose(virtual_avail);
303
304	virtual_end = VM_MAX_KERNEL_ADDRESS;
305
306	/*
307	 * Initialize protection array.
308	 */
309	i386_protection_init();
310
311	/*
312	 * The kernel's pmap is statically allocated so we don't have to use
313	 * pmap_create, which is unlikely to work correctly at this part of
314	 * the boot sequence (XXX and which no longer exists).
315	 */
316	kernel_pmap = &kernel_pmap_store;
317
318	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
319	kernel_pmap->pm_count = 1;
320	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
321	TAILQ_INIT(&kernel_pmap->pm_pvlist);
322	LIST_INIT(&allpmaps);
323	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
324	nkpt = NKPT;
325
326	/*
327	 * Reserve some special page table entries/VA space for temporary
328	 * mapping of pages.
329	 */
330#define	SYSMAP(c, p, v, n)	\
331	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
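/*
 * Expansion sketch (illustrative, not part of the original source): the
 * invocation SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to
 *
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 *
 * i.e. one page of KVA is reserved and the matching kernel PTE slot is
 * remembered so that CADDR1 can later be pointed at any physical page.
 */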
332
333	va = virtual_avail;
334	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
335
336	/*
337	 * CMAP1/CMAP2 are used for zeroing and copying pages.
338	 */
339	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
340	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
341
342	/*
343	 * Crashdump maps.
344	 */
345	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
346
347	/*
348	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
349	 * XXX ptmmap is not used.
350	 */
351	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
352
353	/*
354	 * msgbufp is used to map the system message buffer.
355	 * XXX msgbufmap is not used.
356	 */
357	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
358	       atop(round_page(MSGBUF_SIZE)))
359
360	/*
361	 * ptemap is used for pmap_pte_quick
362	 */
363	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
364
365	virtual_avail = va;
366
367	*CMAP1 = *CMAP2 = 0;
368	for (i = 0; i < NKPT; i++)
369		PTD[i] = 0;
370
371	pgeflag = 0;
372#if /* !defined(SMP) || */ defined(ENABLE_PG_G)
373	if (cpu_feature & CPUID_PGE)
374		pgeflag = PG_G;
375#endif
376
377/*
378 * Initialize the 4MB page size flag
379 */
380	pseflag = 0;
381/*
382 * The 4MB page version of the initial
383 * kernel page mapping.
384 */
385	pdir4mb = 0;
386
387#ifndef DISABLE_PSE
388	if (cpu_feature & CPUID_PSE) {
389		pd_entry_t ptditmp;
390		/*
391		 * Note that we have enabled PSE mode
392		 */
393		pseflag = PG_PS;
394		ptditmp = *(PTmap + i386_btop(KERNBASE));
395		ptditmp &= ~(NBPDR - 1);
396		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
397		pdir4mb = ptditmp;
398	}
399#endif
400#ifndef SMP
401	/*
402	 * Turn on PGE/PSE.  SMP does this later on since the
403	 * 4K page tables are required for AP boot (for now).
404	 * XXX fixme.
405	 */
406	pmap_set_opt();
407#endif
408#ifdef SMP
409	if (cpu_apic_address == 0)
410		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
411	/* local apic is mapped on last page */
412	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
413	    (cpu_apic_address & PG_FRAME));
414#endif
415	cpu_invltlb();
416}
417
418/*
419 * Enable 4MB page mode for MP startup.  Turn on PG_G support.
420 * The BSP will run this after all the APs have started up.
421 */
422void
423pmap_set_opt(void)
424{
425	pt_entry_t *pte;
426	vm_offset_t va;
427
428	if (pgeflag && (cpu_feature & CPUID_PGE))
429		load_cr4(rcr4() | CR4_PGE);
430#ifndef DISABLE_PSE
431	if (pseflag && (cpu_feature & CPUID_PSE))
432		load_cr4(rcr4() | CR4_PSE);
433#endif
434	if (PCPU_GET(cpuid) == 0) {
435#ifndef DISABLE_PSE
436		if (pdir4mb)
437			kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
438#endif
439		if (pgeflag) {
440			/* XXX see earlier comments about virtual_avail */
441			for (va = KERNBASE; va < virtual_avail; va += PAGE_SIZE)
442			{
443				pte = vtopte(va);
444				if (*pte)
445					*pte |= pgeflag;
446			}
447		}
448		/*
449		 * for SMP, this will cause all cpus to reload again, which
450		 * is actually what we want since they now have CR4_PGE on.
451		 */
452		invltlb();
453	} else
454		cpu_invltlb();
455}
456
457/*
458 *	Initialize the pmap module.
459 *	Called by vm_init to initialize any structures that the pmap
460 *	system needs to map virtual memory.
461 *	pmap_init has been enhanced to support, in a fairly consistent
462 *	way, discontiguous physical memory.
463 */
464void
465pmap_init(phys_start, phys_end)
466	vm_offset_t phys_start, phys_end;
467{
468	int i;
469	int initial_pvs;
470
471	/*
472	 * object for kernel page table pages
473	 */
474	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
475
476	/*
477	 * Allocate memory for random pmap data structures.  Includes the
478	 * pv_head_table.
479	 */
480
481	for(i = 0; i < vm_page_array_size; i++) {
482		vm_page_t m;
483
484		m = &vm_page_array[i];
485		TAILQ_INIT(&m->md.pv_list);
486		m->md.pv_list_count = 0;
487	}
488
489	/*
490	 * init the pv free list
491	 */
492	initial_pvs = vm_page_array_size;
493	if (initial_pvs < MINPV)
494		initial_pvs = MINPV;
495	pvzone = &pvzone_store;
496	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
497		initial_pvs * sizeof (struct pv_entry));
498	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
499	    vm_page_array_size);
500
501	/*
502	 * Now it is safe to enable pv_table recording.
503	 */
504	pmap_initialized = TRUE;
505}
506
507/*
508 * Initialize the address space (zone) for the pv_entries.  Set a
509 * high water mark so that the system can recover from excessive
510 * numbers of pv entries.
511 */
512void
513pmap_init2()
514{
515	int shpgperproc = PMAP_SHPGPERPROC;
516
517	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
518	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
519	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
520	pv_entry_high_water = 9 * (pv_entry_max / 10);
521	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
522}
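/*
 * Worked example with hypothetical numbers: with the default
 * PMAP_SHPGPERPROC of 200, a maxproc of 1000 and a vm_page_array_size of
 * 50000 pages (roughly 195MB of RAM), pv_entry_max becomes
 * 200 * 1000 + 50000 = 250000 and pv_entry_high_water becomes
 * 9 * (250000 / 10) = 225000.
 */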
523
524
525/***************************************************
526 * Low level helper routines.....
527 ***************************************************/
528
529#if defined(PMAP_DIAGNOSTIC)
530
531/*
532 * This code checks for pages that are marked modified (PG_M) but not
533 * writable (PG_RW), which should be an invalid condition.
534 */
535static int
536pmap_nw_modified(pt_entry_t ptea)
537{
538	int pte;
539
540	pte = (int) ptea;
541
542	if ((pte & (PG_M|PG_RW)) == PG_M)
543		return 1;
544	else
545		return 0;
546}
547#endif
548
549
550/*
551 * this routine defines the region(s) of memory that should
552 * not be tested for the modified bit.
553 */
554static PMAP_INLINE int
555pmap_track_modified(vm_offset_t va)
556{
557	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
558		return 1;
559	else
560		return 0;
561}
562
563static __inline void
564pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
565{
566#if defined(SMP)
567	u_int cpumask;
568	u_int other_cpus;
569	struct thread *td;
570
571	td = curthread;
572	critical_enter();
573	/*
574	 * We need to disable interrupt preemption but MUST NOT have
575	 * interrupts disabled here.
576	 * XXX we may need to hold schedlock to get a coherent pm_active
577	 */
578	if (td->td_critnest == 1)
579		cpu_critical_exit(td->td_savecrit);
580	if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
581		invlpg(va);	/* global */
582	} else {
583		cpumask = PCPU_GET(cpumask);
584		other_cpus = PCPU_GET(other_cpus);
585		if (pmap->pm_active & cpumask)
586			cpu_invlpg(va);
587		if (pmap->pm_active & other_cpus)
588			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
589	}
590	critical_exit();
591#else
592	if (pmap->pm_active)
593		cpu_invlpg(va);
594#endif
595}
596
597static __inline void
598pmap_invalidate_all(pmap_t pmap)
599{
600#if defined(SMP)
601	u_int cpumask;
602	u_int other_cpus;
603	struct thread *td;
604
605	td = curthread;
606	critical_enter();
607	/*
608	 * We need to disable interrupt preemption but MUST NOT have
609	 * interrupts disabled here.
610	 * XXX we may need to hold schedlock to get a coherent pm_active
611	 */
612	if (td->td_critnest == 1)
613		cpu_critical_exit(td->td_savecrit);
614	if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
615		invltlb();	/* global */
616	} else {
617		cpumask = PCPU_GET(cpumask);
618		other_cpus = PCPU_GET(other_cpus);
619		if (pmap->pm_active & cpumask)
620			cpu_invltlb();
621		if (pmap->pm_active & other_cpus)
622			smp_masked_invltlb(pmap->pm_active & other_cpus);
623	}
624	critical_exit();
625#else
626	if (pmap->pm_active)
627		invltlb();
628#endif
629}
630
631/*
632 * Return an address which is the base of the Virtual mapping of
633 * all the PTEs for the given pmap. Note this doesn't say that
634 * all the PTEs will be present or that the pages there are valid.
635 * The PTEs are made available by the recursive mapping trick.
636 * It will map in the alternate PTE space if needed.
637 */
638static pt_entry_t *
639get_ptbase(pmap)
640	pmap_t pmap;
641{
642	pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
643
644	/* are we current address space or kernel? */
645	if (pmap == kernel_pmap || frame == (PTDpde & PG_FRAME))
646		return PTmap;
647	/* otherwise, we are alternate address space */
648	if (frame != (APTDpde & PG_FRAME)) {
649		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
650		invltlb();
651	}
652	return APTmap;
653}
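/*
 * Sketch of the recursive-mapping arithmetic used above (for
 * illustration; the constants are the ones already used by this file):
 * because pm_pdir[PTDPTDI] points back at the page directory itself, the
 * page tables of the current address space appear as a flat array at
 * PTmap, so the PTE for a virtual address va is simply
 *
 *	PTmap[i386_btop(va)]
 *
 * The alternate address space works the same way through APTmap once
 * APTDpde has been pointed at the other pmap's page directory, which is
 * exactly what the code above does before returning APTmap.
 */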
654
655/*
656 * Super fast pmap_pte routine best used when scanning
657 * the pv lists.  This eliminates many coarse-grained
658 * invltlb calls.  Note that many of the pv list
659 * scans are across different pmaps.  It is very wasteful
660 * to do an entire invltlb for checking a single mapping.
661 */
662
663static pt_entry_t *
664pmap_pte_quick(pmap, va)
665	register pmap_t pmap;
666	vm_offset_t va;
667{
668	pd_entry_t pde, newpf;
669	pde = pmap->pm_pdir[va >> PDRSHIFT];
670	if (pde != 0) {
671		pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
672		unsigned index = i386_btop(va);
673		/* are we current address space or kernel? */
674		if (pmap == kernel_pmap || frame == (PTDpde & PG_FRAME))
675			return PTmap + index;
676		newpf = pde & PG_FRAME;
677		if (((*PMAP1) & PG_FRAME) != newpf) {
678			*PMAP1 = newpf | PG_RW | PG_V;
679			pmap_invalidate_page(pmap, (vm_offset_t) PADDR1);
680		}
681		return PADDR1 + (index & (NPTEPG - 1));
682	}
683	return (0);
684}
685
686/*
687 *	Routine:	pmap_extract
688 *	Function:
689 *		Extract the physical page address associated
690 *		with the given map/virtual_address pair.
691 */
692vm_offset_t
693pmap_extract(pmap, va)
694	register pmap_t pmap;
695	vm_offset_t va;
696{
697	vm_offset_t rtval;	/* XXX FIXME */
698	vm_offset_t pdirindex;
699
700	if (pmap == 0)
701		return 0;
702	pdirindex = va >> PDRSHIFT;
703	rtval = pmap->pm_pdir[pdirindex];
704	if (rtval != 0) {
705		pt_entry_t *pte;
706		if ((rtval & PG_PS) != 0) {
707			rtval &= ~(NBPDR - 1);
708			rtval |= va & (NBPDR - 1);
709			return rtval;
710		}
711		pte = get_ptbase(pmap) + i386_btop(va);
712		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
713		return rtval;
714	}
715	return 0;
716
717}
718
719/***************************************************
720 * Low level mapping routines.....
721 ***************************************************/
722
723/*
724 * add a wired page to the kva
725 */
726PMAP_INLINE void
727pmap_kenter(vm_offset_t va, vm_offset_t pa)
728{
729	pt_entry_t *pte;
730
731	pte = vtopte(va);
732	*pte = pa | PG_RW | PG_V | pgeflag;
733	invlpg(va);
734}
735
736/*
737 * remove a page from the kernel pagetables
738 */
739PMAP_INLINE void
740pmap_kremove(vm_offset_t va)
741{
742	pt_entry_t *pte;
743
744	pte = vtopte(va);
745	*pte = 0;
746	invlpg(va);
747}
748
749/*
750 *	Used to map a range of physical addresses into kernel
751 *	virtual address space.
752 *
753 *	The value passed in '*virt' is a suggested virtual address for
754 *	the mapping. Architectures which can support a direct-mapped
755 *	physical to virtual region can return the appropriate address
756 *	within that region, leaving '*virt' unchanged. Other
757 *	architectures should map the pages starting at '*virt' and
758 *	update '*virt' with the first usable address after the mapped
759 *	region.
760 */
761vm_offset_t
762pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
763{
764	vm_offset_t va, sva;
765
766	va = sva = *virt;
767	while (start < end) {
768		pmap_kenter(va, start);
769		va += PAGE_SIZE;
770		start += PAGE_SIZE;
771	}
772	invlpg_range(sva, end);
773	*virt = va;
774	return (sva);
775}
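/*
 * Usage sketch (hypothetical caller, not from the original source): to
 * map the physical range [start, end) into kernel VA starting at a
 * reserved address reserved_kva, a caller would do roughly
 *
 *	vm_offset_t va = reserved_kva;
 *	vm_offset_t sva = pmap_map(&va, start, end, VM_PROT_ALL);
 *
 * On the i386 there is no direct map, so sva is the first VA of the new
 * mapping and va ends up advanced to the first address past it.
 */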
776
777
778/*
779 * Add a list of wired pages to the kva.
780 * This routine is only used for temporary
781 * kernel mappings that do not need to have
782 * page modification or references recorded.
783 * Note that old mappings are simply written
784 * over.  The page *must* be wired.
785 */
786void
787pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
788{
789	vm_offset_t va, end_va;
790
791	va = sva;
792	end_va = va + count * PAGE_SIZE;
793
794	while (va < end_va) {
795		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
796		va += PAGE_SIZE;
797		m++;
798	}
799	invlpg_range(sva, end_va);
800}
801
802/*
803 * this routine jerks page mappings from the
804 * kernel -- it is meant only for temporary mappings.
805 */
806void
807pmap_qremove(vm_offset_t sva, int count)
808{
809	vm_offset_t va, end_va;
810
811	va = sva;
812	end_va = va + count * PAGE_SIZE;
813
814	while (va < end_va) {
815		pmap_kremove(va);
816		va += PAGE_SIZE;
817	}
818	invlpg_range(sva, end_va);
819}
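/*
 * Typical usage of the pair above, as seen later in this file: the
 * kernel-stack code wires a thread's stack pages into KVA with
 * pmap_qenter(ks, ma, KSTACK_PAGES) and tears the temporary mappings
 * down again with pmap_qremove(ks, KSTACK_PAGES) when the stack is
 * disposed of or swapped out.
 */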
820
821static vm_page_t
822pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
823{
824	vm_page_t m;
825retry:
826	m = vm_page_lookup(object, pindex);
827	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
828		goto retry;
829	return m;
830}
831
832/*
833 * Create the U area for a new process.
834 * This routine directly affects the fork perf for a process.
835 */
836void
837pmap_new_proc(struct proc *p)
838{
839	int i;
840	vm_page_t ma[UAREA_PAGES];
841	vm_object_t upobj;
842	vm_offset_t up;
843	vm_page_t m;
844
845	/*
846	 * allocate object for the upages
847	 */
848	upobj = p->p_upages_obj;
849	if (upobj == NULL) {
850		upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
851		p->p_upages_obj = upobj;
852	}
853
854	/* get a kernel virtual address for the U area for this thread */
855	up = (vm_offset_t)p->p_uarea;
856	if (up == 0) {
857		up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
858		if (up == 0)
859			panic("pmap_new_proc: upage allocation failed");
860		p->p_uarea = (struct user *)up;
861	}
862
863	for (i = 0; i < UAREA_PAGES; i++) {
864		/*
865		 * Get a kernel stack page
866		 */
867		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
868		ma[i] = m;
869
870		/*
871		 * Wire the page
872		 */
873		m->wire_count++;
874		cnt.v_wire_count++;
875
876		vm_page_wakeup(m);
877		vm_page_flag_clear(m, PG_ZERO);
878		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
879		m->valid = VM_PAGE_BITS_ALL;
880	}
881	pmap_qenter(up, ma, UAREA_PAGES);
882}
883
884/*
885 * Dispose of the U area for a process that has exited.
886 * This routine directly impacts the exit perf of a process.
887 */
888void
889pmap_dispose_proc(p)
890	struct proc *p;
891{
892	int i;
893	vm_object_t upobj;
894	vm_offset_t up;
895	vm_page_t m;
896
897	upobj = p->p_upages_obj;
898	up = (vm_offset_t)p->p_uarea;
899	pmap_qremove(up, UAREA_PAGES);
900	for (i = 0; i < UAREA_PAGES; i++) {
901		m = vm_page_lookup(upobj, i);
902		if (m == NULL)
903			panic("pmap_dispose_proc: upage already missing?");
904		vm_page_busy(m);
905		vm_page_unwire(m, 0);
906		vm_page_free(m);
907	}
908}
909
910/*
911 * Allow the U area for a process to be prejudicially paged out.
912 */
913void
914pmap_swapout_proc(p)
915	struct proc *p;
916{
917	int i;
918	vm_object_t upobj;
919	vm_offset_t up;
920	vm_page_t m;
921
922	upobj = p->p_upages_obj;
923	up = (vm_offset_t)p->p_uarea;
924	pmap_qremove(up, UAREA_PAGES);
925	for (i = 0; i < UAREA_PAGES; i++) {
926		m = vm_page_lookup(upobj, i);
927		if (m == NULL)
928			panic("pmap_swapout_proc: upage already missing?");
929		vm_page_dirty(m);
930		vm_page_unwire(m, 0);
931	}
932}
933
934/*
935 * Bring the U area for a specified process back in.
936 */
937void
938pmap_swapin_proc(p)
939	struct proc *p;
940{
941	int i, rv;
942	vm_page_t ma[UAREA_PAGES];
943	vm_object_t upobj;
944	vm_offset_t up;
945	vm_page_t m;
946
947	upobj = p->p_upages_obj;
948	up = (vm_offset_t)p->p_uarea;
949	for (i = 0; i < UAREA_PAGES; i++) {
950		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
951		if (m->valid != VM_PAGE_BITS_ALL) {
952			rv = vm_pager_get_pages(upobj, &m, 1, 0);
953			if (rv != VM_PAGER_OK)
954				panic("pmap_swapin_proc: cannot get upage for proc: %d\n", p->p_pid);
955			m = vm_page_lookup(upobj, i);
956			m->valid = VM_PAGE_BITS_ALL;
957		}
958		ma[i] = m;
959		vm_page_wire(m);
960		vm_page_wakeup(m);
961		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
962	}
963	pmap_qenter(up, ma, UAREA_PAGES);
964}
965
966/*
967 * Create the kernel stack (including pcb for i386) for a new thread.
968 * This routine directly affects the fork performance of a process
969 * and the creation performance of a thread.
970 */
971void
972pmap_new_thread(struct thread *td)
973{
974	int i;
975	vm_page_t ma[KSTACK_PAGES];
976	vm_object_t ksobj;
977	vm_page_t m;
978	vm_offset_t ks;
979
980	/*
981	 * allocate object for the kstack
982	 */
983	ksobj = td->td_kstack_obj;
984	if (ksobj == NULL) {
985		ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
986		td->td_kstack_obj = ksobj;
987	}
988
989	/* get a kernel virtual address for the kstack for this thread */
990	ks = td->td_kstack;
991#ifdef KSTACK_GUARD
992	if (ks == 0) {
993		ks = kmem_alloc_nofault(kernel_map,
994		    (KSTACK_PAGES + 1) * PAGE_SIZE);
995		if (ks == 0)
996			panic("pmap_new_thread: kstack allocation failed");
997		if (*vtopte(ks) != 0)
998			pmap_qremove(ks, 1);
999		ks += PAGE_SIZE;
1000		td->td_kstack = ks;
1001	}
1002#else
1003	if (ks == 0) {
1004		ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
1005		if (ks == 0)
1006			panic("pmap_new_thread: kstack allocation failed");
1007		td->td_kstack = ks;
1008	}
1009#endif
1010	for (i = 0; i < KSTACK_PAGES; i++) {
1011		/*
1012		 * Get a kernel stack page
1013		 */
1014		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1015		ma[i] = m;
1016
1017		/*
1018		 * Wire the page
1019		 */
1020		m->wire_count++;
1021		cnt.v_wire_count++;
1022
1023		vm_page_wakeup(m);
1024		vm_page_flag_clear(m, PG_ZERO);
1025		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1026		m->valid = VM_PAGE_BITS_ALL;
1027	}
1028	pmap_qenter(ks, ma, KSTACK_PAGES);
1029}
1030
1031/*
1032 * Dispose of the kernel stack for a thread that has exited.
1033 * This routine directly impacts the exit perf of a process and thread.
1034 */
1035void
1036pmap_dispose_thread(td)
1037	struct thread *td;
1038{
1039	int i;
1040	vm_object_t ksobj;
1041	vm_offset_t ks;
1042	vm_page_t m;
1043
1044	ksobj = td->td_kstack_obj;
1045	ks = td->td_kstack;
1046	pmap_qremove(ks, KSTACK_PAGES);
1047	for (i = 0; i < KSTACK_PAGES; i++) {
1048		m = vm_page_lookup(ksobj, i);
1049		if (m == NULL)
1050			panic("pmap_dispose_thread: kstack already missing?");
1051		vm_page_busy(m);
1052		vm_page_unwire(m, 0);
1053		vm_page_free(m);
1054	}
1055}
1056
1057/*
1058 * Allow the Kernel stack for a thread to be prejudicially paged out.
1059 */
1060void
1061pmap_swapout_thread(td)
1062	struct thread *td;
1063{
1064	int i;
1065	vm_object_t ksobj;
1066	vm_offset_t ks;
1067	vm_page_t m;
1068
1069	ksobj = td->td_kstack_obj;
1070	ks = td->td_kstack;
1071	pmap_qremove(ks, KSTACK_PAGES);
1072	for (i = 0; i < KSTACK_PAGES; i++) {
1073		m = vm_page_lookup(ksobj, i);
1074		if (m == NULL)
1075			panic("pmap_swapout_thread: kstack already missing?");
1076		vm_page_dirty(m);
1077		vm_page_unwire(m, 0);
1078	}
1079}
1080
1081/*
1082 * Bring the kernel stack for a specified thread back in.
1083 */
1084void
1085pmap_swapin_thread(td)
1086	struct thread *td;
1087{
1088	int i, rv;
1089	vm_page_t ma[KSTACK_PAGES];
1090	vm_object_t ksobj;
1091	vm_offset_t ks;
1092	vm_page_t m;
1093
1094	ksobj = td->td_kstack_obj;
1095	ks = td->td_kstack;
1096	for (i = 0; i < KSTACK_PAGES; i++) {
1097		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1098		if (m->valid != VM_PAGE_BITS_ALL) {
1099			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1100			if (rv != VM_PAGER_OK)
1101				panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1102			m = vm_page_lookup(ksobj, i);
1103			m->valid = VM_PAGE_BITS_ALL;
1104		}
1105		ma[i] = m;
1106		vm_page_wire(m);
1107		vm_page_wakeup(m);
1108		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1109	}
1110	pmap_qenter(ks, ma, KSTACK_PAGES);
1111}
1112
1113/***************************************************
1114 * Page table page management routines.....
1115 ***************************************************/
1116
1117/*
1118 * This routine unholds page table pages, and if the hold count
1119 * drops to zero, then it decrements the wire count.
1120 */
1121static int
1122_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1123{
1124
1125	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1126		;
1127
1128	if (m->hold_count == 0) {
1129		vm_offset_t pteva;
1130		/*
1131		 * unmap the page table page
1132		 */
1133		pmap->pm_pdir[m->pindex] = 0;
1134		--pmap->pm_stats.resident_count;
1135		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1136		    (PTDpde & PG_FRAME)) {
1137			/*
1138			 * Do an invltlb to make the invalidated mapping
1139			 * take effect immediately.
1140			 */
1141			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1142			pmap_invalidate_page(pmap, pteva);
1143		}
1144
1145		if (pmap->pm_ptphint == m)
1146			pmap->pm_ptphint = NULL;
1147
1148		/*
1149		 * If the page is finally unwired, simply free it.
1150		 */
1151		--m->wire_count;
1152		if (m->wire_count == 0) {
1153
1154			vm_page_flash(m);
1155			vm_page_busy(m);
1156			vm_page_free_zero(m);
1157			--cnt.v_wire_count;
1158		}
1159		return 1;
1160	}
1161	return 0;
1162}
1163
1164static PMAP_INLINE int
1165pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1166{
1167	vm_page_unhold(m);
1168	if (m->hold_count == 0)
1169		return _pmap_unwire_pte_hold(pmap, m);
1170	else
1171		return 0;
1172}
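/*
 * Summary of the scheme above (descriptive comment, not in the original
 * source): every user mapping entered into a page table page adds one to
 * that page's hold_count, and the page stays wired while any holds
 * remain.  pmap_unwire_pte_hold() drops one hold; once the count reaches
 * zero, _pmap_unwire_pte_hold() clears the page directory entry, drops
 * the wire count and hands the now empty page table page back to the
 * zero queue.
 */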
1173
1174/*
1175 * After removing a page table entry, this routine is used to
1176 * conditionally free the page, and manage the hold/wire counts.
1177 */
1178static int
1179pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1180{
1181	unsigned ptepindex;
1182	if (va >= VM_MAXUSER_ADDRESS)
1183		return 0;
1184
1185	if (mpte == NULL) {
1186		ptepindex = (va >> PDRSHIFT);
1187		if (pmap->pm_ptphint &&
1188			(pmap->pm_ptphint->pindex == ptepindex)) {
1189			mpte = pmap->pm_ptphint;
1190		} else {
1191			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1192			pmap->pm_ptphint = mpte;
1193		}
1194	}
1195
1196	return pmap_unwire_pte_hold(pmap, mpte);
1197}
1198
1199void
1200pmap_pinit0(pmap)
1201	struct pmap *pmap;
1202{
1203	pmap->pm_pdir =
1204		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1205	pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t)IdlePTD);
1206	invlpg((vm_offset_t)pmap->pm_pdir);
1207	pmap->pm_count = 1;
1208	pmap->pm_ptphint = NULL;
1209	pmap->pm_active = 0;
1210	TAILQ_INIT(&pmap->pm_pvlist);
1211	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1212	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1213}
1214
1215/*
1216 * Initialize a preallocated and zeroed pmap structure,
1217 * such as one in a vmspace structure.
1218 */
1219void
1220pmap_pinit(pmap)
1221	register struct pmap *pmap;
1222{
1223	vm_page_t ptdpg;
1224
1225	/*
1226	 * No need to allocate page table space yet but we do need a valid
1227	 * page directory table.
1228	 */
1229	if (pmap->pm_pdir == NULL)
1230		pmap->pm_pdir =
1231			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1232
1233	/*
1234	 * allocate object for the ptes
1235	 */
1236	if (pmap->pm_pteobj == NULL)
1237		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1238
1239	/*
1240	 * allocate the page directory page
1241	 */
1242	ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
1243			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1244
1245	ptdpg->wire_count = 1;
1246	++cnt.v_wire_count;
1247
1248
1249	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
1250	ptdpg->valid = VM_PAGE_BITS_ALL;
1251
1252	pmap_qenter((vm_offset_t) pmap->pm_pdir, &ptdpg, 1);
1253	if ((ptdpg->flags & PG_ZERO) == 0)
1254		bzero(pmap->pm_pdir, PAGE_SIZE);
1255
1256	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1257	/* Wire in kernel global address entries. */
1258	/* XXX copies current process, does not fill in MPPTDI */
1259	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1260#ifdef SMP
1261	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1262#endif
1263
1264	/* install self-referential address mapping entry */
1265	pmap->pm_pdir[PTDPTDI] =
1266		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1267
1268	pmap->pm_count = 1;
1269	pmap->pm_active = 0;
1270	pmap->pm_ptphint = NULL;
1271	TAILQ_INIT(&pmap->pm_pvlist);
1272	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1273}
1274
1275/*
1276 * Wire in kernel global address entries.  To avoid a race condition
1277 * between pmap initialization and pmap_growkernel, this procedure
1278 * should be called after the vmspace is attached to the process
1279 * but before this pmap is activated.
1280 */
1281void
1282pmap_pinit2(pmap)
1283	struct pmap *pmap;
1284{
1285	/* XXX: Remove this stub when no longer called */
1286}
1287
1288static int
1289pmap_release_free_page(pmap_t pmap, vm_page_t p)
1290{
1291	pd_entry_t *pde = pmap->pm_pdir;
1292	/*
1293	 * This code optimizes the case of freeing non-busy
1294	 * page-table pages.  Those pages are zero now, and
1295	 * might as well be placed directly into the zero queue.
1296	 */
1297	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1298		return 0;
1299
1300	vm_page_busy(p);
1301
1302	/*
1303	 * Remove the page table page from the process's address space.
1304	 */
1305	pde[p->pindex] = 0;
1306	pmap->pm_stats.resident_count--;
1307
1308	if (p->hold_count)  {
1309		panic("pmap_release: freeing held page table page");
1310	}
1311	/*
1312	 * Page directory pages need to have the kernel
1313	 * stuff cleared, so they can go into the zero queue also.
1314	 */
1315	if (p->pindex == PTDPTDI) {
1316		bzero(pde + KPTDI, nkpt * PTESIZE);
1317#ifdef SMP
1318		pde[MPPTDI] = 0;
1319#endif
1320		pde[APTDPTDI] = 0;
1321		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1322	}
1323
1324	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1325		pmap->pm_ptphint = NULL;
1326
1327	p->wire_count--;
1328	cnt.v_wire_count--;
1329	vm_page_free_zero(p);
1330	return 1;
1331}
1332
1333/*
1334 * this routine is called if the page table page is not
1335 * mapped correctly.
1336 */
1337static vm_page_t
1338_pmap_allocpte(pmap, ptepindex)
1339	pmap_t	pmap;
1340	unsigned ptepindex;
1341{
1342	vm_offset_t pteva, ptepa;	/* XXXPA */
1343	vm_page_t m;
1344
1345	/*
1346	 * Find or fabricate a new pagetable page
1347	 */
1348	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1349			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1350
1351	KASSERT(m->queue == PQ_NONE,
1352		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1353
1354	if (m->wire_count == 0)
1355		cnt.v_wire_count++;
1356	m->wire_count++;
1357
1358	/*
1359	 * Increment the hold count for the page table page
1360	 * (denoting a new mapping.)
1361	 */
1362	m->hold_count++;
1363
1364	/*
1365	 * Map the pagetable page into the process address space, if
1366	 * it isn't already there.
1367	 */
1368
1369	pmap->pm_stats.resident_count++;
1370
1371	ptepa = VM_PAGE_TO_PHYS(m);
1372	pmap->pm_pdir[ptepindex] =
1373		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1374
1375	/*
1376	 * Set the page table hint
1377	 */
1378	pmap->pm_ptphint = m;
1379
1380	/*
1381	 * Try to use the new mapping, but if we cannot, then
1382	 * do it with the routine that maps the page explicitly.
1383	 */
1384	if ((m->flags & PG_ZERO) == 0) {
1385		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1386		    (PTDpde & PG_FRAME)) {
1387			pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex);
1388			bzero((caddr_t) pteva, PAGE_SIZE);
1389		} else {
1390			pmap_zero_page(ptepa);
1391		}
1392	}
1393
1394	m->valid = VM_PAGE_BITS_ALL;
1395	vm_page_flag_clear(m, PG_ZERO);
1396	vm_page_flag_set(m, PG_MAPPED);
1397	vm_page_wakeup(m);
1398
1399	return m;
1400}
1401
1402static vm_page_t
1403pmap_allocpte(pmap_t pmap, vm_offset_t va)
1404{
1405	unsigned ptepindex;
1406	pd_entry_t ptepa;
1407	vm_page_t m;
1408
1409	/*
1410	 * Calculate pagetable page index
1411	 */
1412	ptepindex = va >> PDRSHIFT;
1413
1414	/*
1415	 * Get the page directory entry
1416	 */
1417	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1418
1419	/*
1420	 * This supports switching from a 4MB page to a
1421	 * normal 4K page.
1422	 */
1423	if (ptepa & PG_PS) {
1424		pmap->pm_pdir[ptepindex] = 0;
1425		ptepa = 0;
1426		invltlb();
1427	}
1428
1429	/*
1430	 * If the page table page is mapped, we just increment the
1431	 * hold count, and activate it.
1432	 */
1433	if (ptepa) {
1434		/*
1435		 * In order to get the page table page, try the
1436		 * hint first.
1437		 */
1438		if (pmap->pm_ptphint &&
1439			(pmap->pm_ptphint->pindex == ptepindex)) {
1440			m = pmap->pm_ptphint;
1441		} else {
1442			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1443			pmap->pm_ptphint = m;
1444		}
1445		m->hold_count++;
1446		return m;
1447	}
1448	/*
1449	 * Here if the pte page isn't mapped, or if it has been deallocated.
1450	 */
1451	return _pmap_allocpte(pmap, ptepindex);
1452}
1453
1454
1455/***************************************************
1456* Pmap allocation/deallocation routines.
1457 ***************************************************/
1458
1459/*
1460 * Release any resources held by the given physical map.
1461 * Called when a pmap initialized by pmap_pinit is being released.
1462 * Should only be called if the map contains no valid mappings.
1463 */
1464void
1465pmap_release(pmap_t pmap)
1466{
1467	vm_page_t p,n,ptdpg;
1468	vm_object_t object = pmap->pm_pteobj;
1469	int curgeneration;
1470
1471#if defined(DIAGNOSTIC)
1472	if (object->ref_count != 1)
1473		panic("pmap_release: pteobj reference count != 1");
1474#endif
1475
1476	ptdpg = NULL;
1477	LIST_REMOVE(pmap, pm_list);
1478retry:
1479	curgeneration = object->generation;
1480	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1481		n = TAILQ_NEXT(p, listq);
1482		if (p->pindex == PTDPTDI) {
1483			ptdpg = p;
1484			continue;
1485		}
1486		while (!pmap_release_free_page(pmap, p)) {
1487			/* Retry the scan if the object changed while we slept. */
1488			if (object->generation != curgeneration)
1489				goto retry;
1490		}
1491	}
1492
1493	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1494		goto retry;
1495}
1496
1497static int
1498kvm_size(SYSCTL_HANDLER_ARGS)
1499{
1500	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1501
1502        return sysctl_handle_long(oidp, &ksize, 0, req);
1503}
1504SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1505    0, 0, kvm_size, "IU", "Size of KVM");
1506
1507static int
1508kvm_free(SYSCTL_HANDLER_ARGS)
1509{
1510	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1511
1512        return sysctl_handle_long(oidp, &kfree, 0, req);
1513}
1514SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1515    0, 0, kvm_free, "IU", "Amount of KVM free");
1516
1517/*
1518 * grow the number of kernel page table entries, if needed
1519 */
1520void
1521pmap_growkernel(vm_offset_t addr)
1522{
1523	struct pmap *pmap;
1524	int s;
1525	vm_offset_t ptppaddr;
1526	vm_page_t nkpg;
1527	pd_entry_t newpdir;
1528
1529	s = splhigh();
1530	if (kernel_vm_end == 0) {
1531		kernel_vm_end = KERNBASE;
1532		nkpt = 0;
1533		while (pdir_pde(PTD, kernel_vm_end)) {
1534			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1535			nkpt++;
1536		}
1537	}
1538	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1539	while (kernel_vm_end < addr) {
1540		if (pdir_pde(PTD, kernel_vm_end)) {
1541			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1542			continue;
1543		}
1544
1545		/*
1546		 * This index is bogus, but out of the way
1547		 */
1548		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1549		if (!nkpg)
1550			panic("pmap_growkernel: no memory to grow kernel");
1551
1552		nkpt++;
1553
1554		vm_page_wire(nkpg);
1555		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1556		pmap_zero_page(ptppaddr);
1557		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1558		pdir_pde(PTD, kernel_vm_end) = newpdir;
1559
1560		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1561			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1562		}
1563		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1564	}
1565	splx(s);
1566}
1567
1568/*
1569 *	Retire the given physical map from service.
1570 *	Should only be called if the map contains
1571 *	no valid mappings.
1572 */
1573void
1574pmap_destroy(pmap_t pmap)
1575{
1576	int count;
1577
1578	if (pmap == NULL)
1579		return;
1580
1581	count = --pmap->pm_count;
1582	if (count == 0) {
1583		pmap_release(pmap);
1584		panic("destroying a pmap is not yet implemented");
1585	}
1586}
1587
1588/*
1589 *	Add a reference to the specified pmap.
1590 */
1591void
1592pmap_reference(pmap_t pmap)
1593{
1594	if (pmap != NULL) {
1595		pmap->pm_count++;
1596	}
1597}
1598
1599/***************************************************
1600* page management routines.
1601 ***************************************************/
1602
1603/*
1604 * free the pv_entry back to the free list
1605 */
1606static PMAP_INLINE void
1607free_pv_entry(pv_entry_t pv)
1608{
1609	pv_entry_count--;
1610	zfree(pvzone, pv);
1611}
1612
1613/*
1614 * get a new pv_entry, allocating a block from the system
1615 * when needed.
1616 * the memory allocation is performed bypassing the malloc code
1617 * because of the possibility of allocations at interrupt time.
1618 */
1619static pv_entry_t
1620get_pv_entry(void)
1621{
1622	pv_entry_count++;
1623	if (pv_entry_high_water &&
1624		(pv_entry_count > pv_entry_high_water) &&
1625		(pmap_pagedaemon_waken == 0)) {
1626		pmap_pagedaemon_waken = 1;
1627		wakeup (&vm_pages_needed);
1628	}
1629	return zalloc(pvzone);
1630}
1631
1632/*
1633 * This routine is very drastic, but can save the system
1634 * in a pinch.
1635 */
1636void
1637pmap_collect()
1638{
1639	int i;
1640	vm_page_t m;
1641	static int warningdone = 0;
1642
1643	if (pmap_pagedaemon_waken == 0)
1644		return;
1645
1646	if (warningdone < 5) {
1647		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1648		warningdone++;
1649	}
1650
1651	for(i = 0; i < vm_page_array_size; i++) {
1652		m = &vm_page_array[i];
1653		if (m->wire_count || m->hold_count || m->busy ||
1654		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
1655			continue;
1656		pmap_remove_all(m);
1657	}
1658	pmap_pagedaemon_waken = 0;
1659}
1660
1661
1662/*
1663 * Find the pv entry for the given (pmap, va) pair, searching
1664 * whichever of the page's pv list or the pmap's pv list is
1665 * expected to be shorter, then unlink it from both lists and
1666 * free the now unused entry.
1667 */
1668
1669static int
1670pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1671{
1672	pv_entry_t pv;
1673	int rtval;
1674	int s;
1675
1676	s = splvm();
1677	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1678		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1679			if (pmap == pv->pv_pmap && va == pv->pv_va)
1680				break;
1681		}
1682	} else {
1683		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1684			if (va == pv->pv_va)
1685				break;
1686		}
1687	}
1688
1689	rtval = 0;
1690	if (pv) {
1691		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1692		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1693		m->md.pv_list_count--;
1694		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1695			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1696
1697		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1698		free_pv_entry(pv);
1699	}
1700
1701	splx(s);
1702	return rtval;
1703}
1704
1705/*
1706 * Create a pv entry for page at pa for
1707 * (pmap, va).
1708 */
1709static void
1710pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1711{
1712
1713	int s;
1714	pv_entry_t pv;
1715
1716	s = splvm();
1717	pv = get_pv_entry();
1718	pv->pv_va = va;
1719	pv->pv_pmap = pmap;
1720	pv->pv_ptem = mpte;
1721
1722	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1723	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1724	m->md.pv_list_count++;
1725
1726	splx(s);
1727}
1728
1729/*
1730 * pmap_remove_pte: do the things to unmap a page in a process
1731 */
1732static int
1733pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1734{
1735	pt_entry_t oldpte;
1736	vm_page_t m;
1737
1738	oldpte = atomic_readandclear_int(ptq);
1739	if (oldpte & PG_W)
1740		pmap->pm_stats.wired_count -= 1;
1741	/*
1742	 * Machines that don't support invlpg also don't support
1743	 * PG_G.
1744	 */
1745	if (oldpte & PG_G)
1746		invlpg(va);
1747	pmap->pm_stats.resident_count -= 1;
1748	if (oldpte & PG_MANAGED) {
1749		m = PHYS_TO_VM_PAGE(oldpte);
1750		if (oldpte & PG_M) {
1751#if defined(PMAP_DIAGNOSTIC)
1752			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1753				printf(
1754	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1755				    va, oldpte);
1756			}
1757#endif
1758			if (pmap_track_modified(va))
1759				vm_page_dirty(m);
1760		}
1761		if (oldpte & PG_A)
1762			vm_page_flag_set(m, PG_REFERENCED);
1763		return pmap_remove_entry(pmap, m, va);
1764	} else {
1765		return pmap_unuse_pt(pmap, va, NULL);
1766	}
1767
1768	return 0;
1769}
1770
1771/*
1772 * Remove a single page from a process address space
1773 */
1774static void
1775pmap_remove_page(pmap_t pmap, vm_offset_t va)
1776{
1777	register pt_entry_t *ptq;
1778
1779	/*
1780	 * if there is no pte for this address, just skip it!!!
1781	 */
1782	if (*pmap_pde(pmap, va) == 0) {
1783		return;
1784	}
1785
1786	/*
1787	 * get a local va for mappings for this pmap.
1788	 */
1789	ptq = get_ptbase(pmap) + i386_btop(va);
1790	if (*ptq) {
1791		(void) pmap_remove_pte(pmap, ptq, va);
1792		pmap_invalidate_page(pmap, va);
1793	}
1794	return;
1795}
1796
1797/*
1798 *	Remove the given range of addresses from the specified map.
1799 *
1800 *	It is assumed that the start and end are properly
1801 *	rounded to the page size.
1802 */
1803void
1804pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1805{
1806	register pt_entry_t *ptbase;
1807	vm_offset_t pdnxt;
1808	pd_entry_t ptpaddr;
1809	vm_offset_t sindex, eindex;
1810	int anyvalid;
1811
1812	if (pmap == NULL)
1813		return;
1814
1815	if (pmap->pm_stats.resident_count == 0)
1816		return;
1817
1818	/*
1819	 * Special handling for removing a single page: it is a very
1820	 * common operation for which it is easy to short-circuit some
1821	 * code.
1822	 */
1823	if ((sva + PAGE_SIZE == eva) &&
1824	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1825		pmap_remove_page(pmap, sva);
1826		return;
1827	}
1828
1829	anyvalid = 0;
1830
1831	/*
1832	 * Get a local virtual address for the mappings that are being
1833	 * worked with.
1834	 */
1835	ptbase = get_ptbase(pmap);
1836
1837	sindex = i386_btop(sva);
1838	eindex = i386_btop(eva);
1839
1840	for (; sindex < eindex; sindex = pdnxt) {
1841		unsigned pdirindex;
1842
1843		/*
1844		 * Calculate index for next page table.
1845		 */
1846		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1847		if (pmap->pm_stats.resident_count == 0)
1848			break;
1849
1850		pdirindex = sindex / NPDEPG;
1851		ptpaddr = pmap->pm_pdir[pdirindex];
1852		if ((ptpaddr & PG_PS) != 0) {
1853			pmap->pm_pdir[pdirindex] = 0;
1854			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1855			anyvalid++;
1856			continue;
1857		}
1858
1859		/*
1860		 * Weed out invalid mappings. Note: we assume that the page
1861		 * directory table is always allocated, and in kernel virtual.
1862		 */
1863		if (ptpaddr == 0)
1864			continue;
1865
1866		/*
1867		 * Limit our scan to either the end of the va represented
1868		 * by the current page table page, or to the end of the
1869		 * range being removed.
1870		 */
1871		if (pdnxt > eindex) {
1872			pdnxt = eindex;
1873		}
1874
1875		for (; sindex != pdnxt; sindex++) {
1876			vm_offset_t va;
1877			if (ptbase[sindex] == 0) {
1878				continue;
1879			}
1880			va = i386_ptob(sindex);
1881
1882			anyvalid++;
1883			if (pmap_remove_pte(pmap,
1884				ptbase + sindex, va))
1885				break;
1886		}
1887	}
1888
1889	if (anyvalid)
1890		pmap_invalidate_all(pmap);
1891}
1892
1893/*
1894 *	Routine:	pmap_remove_all
1895 *	Function:
1896 *		Removes this physical page from
1897 *		all physical maps in which it resides.
1898 *		Reflects back modify bits to the pager.
1899 *
1900 *	Notes:
1901 *		Original versions of this routine were very
1902 *		inefficient because they iteratively called
1903 *		pmap_remove (slow...)
1904 */
1905
1906static void
1907pmap_remove_all(vm_page_t m)
1908{
1909	register pv_entry_t pv;
1910	pt_entry_t *pte, tpte;
1911	int s;
1912
1913#if defined(PMAP_DIAGNOSTIC)
1914	/*
1915	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1916	 * pages!
1917	 */
1918	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1919		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
1920	}
1921#endif
1922
1923	s = splvm();
1924	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1925		pv->pv_pmap->pm_stats.resident_count--;
1926
1927		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1928
1929		tpte = atomic_readandclear_int(pte);
1930		if (tpte & PG_W)
1931			pv->pv_pmap->pm_stats.wired_count--;
1932
1933		if (tpte & PG_A)
1934			vm_page_flag_set(m, PG_REFERENCED);
1935
1936		/*
1937		 * Update the vm_page_t clean and reference bits.
1938		 */
1939		if (tpte & PG_M) {
1940#if defined(PMAP_DIAGNOSTIC)
1941			if (pmap_nw_modified((pt_entry_t) tpte)) {
1942				printf(
1943	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1944				    pv->pv_va, tpte);
1945			}
1946#endif
1947			if (pmap_track_modified(pv->pv_va))
1948				vm_page_dirty(m);
1949		}
1950		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1951
1952		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1953		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1954		m->md.pv_list_count--;
1955		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1956		free_pv_entry(pv);
1957	}
1958
1959	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1960
1961	splx(s);
1962}
1963
1964/*
1965 *	Set the physical protection on the
1966 *	specified range of this map as requested.
1967 */
1968void
1969pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1970{
1971	register pt_entry_t *ptbase;
1972	vm_offset_t pdnxt;
1973	pd_entry_t ptpaddr;
1974	vm_pindex_t sindex, eindex;
1975	int anychanged;
1976
1977	if (pmap == NULL)
1978		return;
1979
1980	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1981		pmap_remove(pmap, sva, eva);
1982		return;
1983	}
1984
1985	if (prot & VM_PROT_WRITE)
1986		return;
1987
1988	anychanged = 0;
1989
1990	ptbase = get_ptbase(pmap);
1991
1992	sindex = i386_btop(sva);
1993	eindex = i386_btop(eva);
1994
1995	for (; sindex < eindex; sindex = pdnxt) {
1996
1997		unsigned pdirindex;
1998
1999		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2000
2001		pdirindex = sindex / NPDEPG;
2002		ptpaddr = pmap->pm_pdir[pdirindex];
2003		if ((ptpaddr & PG_PS) != 0) {
2004			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2005			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2006			anychanged++;
2007			continue;
2008		}
2009
2010		/*
2011		 * Weed out invalid mappings. Note: we assume that the page
2012		 * directory table is always allocated, and in kernel virtual.
2013		 */
2014		if (ptpaddr == 0)
2015			continue;
2016
2017		if (pdnxt > eindex) {
2018			pdnxt = eindex;
2019		}
2020
2021		for (; sindex != pdnxt; sindex++) {
2022
2023			pt_entry_t pbits;
2024			vm_page_t m;
2025
2026			pbits = ptbase[sindex];
2027
2028			if (pbits & PG_MANAGED) {
2029				m = NULL;
2030				if (pbits & PG_A) {
2031					m = PHYS_TO_VM_PAGE(pbits);
2032					vm_page_flag_set(m, PG_REFERENCED);
2033					pbits &= ~PG_A;
2034				}
2035				if (pbits & PG_M) {
2036					if (pmap_track_modified(i386_ptob(sindex))) {
2037						if (m == NULL)
2038							m = PHYS_TO_VM_PAGE(pbits);
2039						vm_page_dirty(m);
2040						pbits &= ~PG_M;
2041					}
2042				}
2043			}
2044
2045			pbits &= ~PG_RW;
2046
2047			if (pbits != ptbase[sindex]) {
2048				ptbase[sindex] = pbits;
2049				anychanged = 1;
2050			}
2051		}
2052	}
2053	if (anychanged)
2054		pmap_invalidate_all(pmap);
2055}
2056
2057/*
2058 *	Insert the given physical page (p) at
2059 *	the specified virtual address (v) in the
2060 *	target physical map with the protection requested.
2061 *
2062 *	If specified, the page will be wired down, meaning
2063 *	that the related pte can not be reclaimed.
2064 *
2065 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2066 *	or lose information.  That is, this routine must actually
2067 *	insert this page into the given map NOW.
2068 */
2069void
2070pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2071	   boolean_t wired)
2072{
2073	vm_offset_t pa;
2074	register pt_entry_t *pte;
2075	vm_offset_t opa;
2076	pt_entry_t origpte, newpte;
2077	vm_page_t mpte;
2078
2079	if (pmap == NULL)
2080		return;
2081
2082	va &= PG_FRAME;
2083#ifdef PMAP_DIAGNOSTIC
2084	if (va > VM_MAX_KERNEL_ADDRESS)
2085		panic("pmap_enter: toobig");
2086	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2087		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2088#endif
2089
2090	mpte = NULL;
2091	/*
2092	 * In the case that a page table page is not
2093	 * resident, we are creating it here.
2094	 */
2095	if (va < VM_MAXUSER_ADDRESS) {
2096		mpte = pmap_allocpte(pmap, va);
2097	}
2098#if 0 && defined(PMAP_DIAGNOSTIC)
2099	else {
2100		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2101		origpte = *pdeaddr;
2102		if ((origpte & PG_V) == 0) {
2103			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2104				pmap->pm_pdir[PTDPTDI], origpte, va);
2105		}
2106	}
2107#endif
2108
2109	pte = pmap_pte(pmap, va);
2110
2111	/*
2112	 * Page Directory table entry not valid, we need a new PT page
2113	 */
2114	if (pte == NULL) {
2115		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2116			(void *)pmap->pm_pdir[PTDPTDI], va);
2117	}
2118
2119	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2120	origpte = *(vm_offset_t *)pte;
2121	opa = origpte & PG_FRAME;
2122
2123	if (origpte & PG_PS)
2124		panic("pmap_enter: attempted pmap_enter on 4MB page");
2125
2126	/*
2127	 * Mapping has not changed, must be protection or wiring change.
2128	 */
2129	if (origpte && (opa == pa)) {
2130		/*
2131		 * Wiring change, just update stats. We don't worry about
2132		 * wiring PT pages as they remain resident as long as there
2133		 * are valid mappings in them. Hence, if a user page is wired,
2134		 * the PT page will be also.
2135		 */
2136		if (wired && ((origpte & PG_W) == 0))
2137			pmap->pm_stats.wired_count++;
2138		else if (!wired && (origpte & PG_W))
2139			pmap->pm_stats.wired_count--;
2140
2141#if defined(PMAP_DIAGNOSTIC)
2142		if (pmap_nw_modified((pt_entry_t) origpte)) {
2143			printf(
2144	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2145			    va, origpte);
2146		}
2147#endif
2148
2149		/*
2150		 * Remove extra pte reference
2151		 */
2152		if (mpte)
2153			mpte->hold_count--;
2154
2155		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2156			if ((origpte & PG_RW) == 0) {
2157				*pte |= PG_RW;
2158				pmap_invalidate_page(pmap, va);
2159			}
2160			return;
2161		}
2162
2163		/*
2164		 * We might be turning off write access to the page,
2165		 * so we go ahead and sense modify status.
2166		 */
2167		if (origpte & PG_MANAGED) {
2168			if ((origpte & PG_M) && pmap_track_modified(va)) {
2169				vm_page_t om;
2170				om = PHYS_TO_VM_PAGE(opa);
2171				vm_page_dirty(om);
2172			}
2173			pa |= PG_MANAGED;
2174		}
2175		goto validate;
2176	}
2177	/*
2178	 * Mapping has changed, invalidate old range and fall through to
2179	 * handle validating new mapping.
2180	 */
2181	if (opa) {
2182		int err;
2183		err = pmap_remove_pte(pmap, pte, va);
2184		if (err)
2185			panic("pmap_enter: pte vanished, va: 0x%x", va);
2186	}
2187
2188	/*
2189	 * Enter on the PV list if part of our managed memory. Note that we
2190	 * raise IPL while manipulating pv_table since pmap_enter can be
2191	 * called at interrupt time.
2192	 */
2193	if (pmap_initialized &&
2194	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2195		pmap_insert_entry(pmap, va, mpte, m);
2196		pa |= PG_MANAGED;
2197	}
2198
2199	/*
2200	 * Increment counters
2201	 */
2202	pmap->pm_stats.resident_count++;
2203	if (wired)
2204		pmap->pm_stats.wired_count++;
2205
2206validate:
2207	/*
2208	 * Now validate mapping with desired protection/wiring.
2209	 */
2210	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2211
2212	if (wired)
2213		newpte |= PG_W;
2214	if (va < VM_MAXUSER_ADDRESS)
2215		newpte |= PG_U;
2216	if (pmap == kernel_pmap)
2217		newpte |= pgeflag;
2218
2219	/*
2220	 * if the mapping or permission bits are different, we need
2221	 * to update the pte.
2222	 */
2223	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2224		*pte = newpte | PG_A;
2225		/*if (origpte)*/ {
2226			pmap_invalidate_page(pmap, va);
2227		}
2228	}
2229}
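
/*
 * Illustrative only (not part of the original file): a typical caller is
 * the fault handler, which does roughly
 *
 *	pmap_enter(vm_map_pmap(map), trunc_page(vaddr), m, prot, wired);
 *
 * where "map", "vaddr", "m", "prot" and "wired" come from the fault
 * context; see vm_fault() for the real call site.
 */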
2230
2231/*
2232 * This code makes some *MAJOR* assumptions:
2233 * 1. The current pmap and the target pmap exist.
2234 * 2. The mapping is not wired.
2235 * 3. Read access only.
2236 * 4. No page table pages.
2237 * 5. The TLB flush is deferred to the calling procedure.
2238 * 6. The page IS managed.
2239 * Given these assumptions it is *MUCH* faster than pmap_enter.
2240 */
2241
2242static vm_page_t
2243pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2244{
2245	pt_entry_t *pte;
2246	vm_offset_t pa;
2247
2248	/*
2249	 * In the case that a page table page is not
2250	 * resident, we are creating it here.
2251	 */
2252	if (va < VM_MAXUSER_ADDRESS) {
2253		unsigned ptepindex;
2254		pd_entry_t ptepa;
2255
2256		/*
2257		 * Calculate pagetable page index
2258		 */
2259		ptepindex = va >> PDRSHIFT;
2260		if (mpte && (mpte->pindex == ptepindex)) {
2261			mpte->hold_count++;
2262		} else {
2263retry:
2264			/*
2265			 * Get the page directory entry
2266			 */
2267			ptepa = pmap->pm_pdir[ptepindex];
2268
2269			/*
2270			 * If the page table page is mapped, we just increment
2271			 * the hold count, and activate it.
2272			 */
2273			if (ptepa) {
2274				if (ptepa & PG_PS)
2275					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2276				if (pmap->pm_ptphint &&
2277					(pmap->pm_ptphint->pindex == ptepindex)) {
2278					mpte = pmap->pm_ptphint;
2279				} else {
2280					mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2281					pmap->pm_ptphint = mpte;
2282				}
2283				if (mpte == NULL)
2284					goto retry;
2285				mpte->hold_count++;
2286			} else {
2287				mpte = _pmap_allocpte(pmap, ptepindex);
2288			}
2289		}
2290	} else {
2291		mpte = NULL;
2292	}
2293
2294	/*
2295	 * This call to vtopte makes the assumption that we are
2296	 * entering the page into the current pmap.  In order to support
2297	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2298	 * But that isn't as quick as vtopte.
2299	 */
2300	pte = vtopte(va);
2301	if (*pte) {
2302		if (mpte)
2303			pmap_unwire_pte_hold(pmap, mpte);
2304		return 0;
2305	}
2306
2307	/*
2308	 * Enter on the PV list if part of our managed memory. Note that we
2309	 * raise IPL while manipulating pv_table since pmap_enter can be
2310	 * called at interrupt time.
2311	 */
2312	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2313		pmap_insert_entry(pmap, va, mpte, m);
2314
2315	/*
2316	 * Increment counters
2317	 */
2318	pmap->pm_stats.resident_count++;
2319
2320	pa = VM_PAGE_TO_PHYS(m);
2321
2322	/*
2323	 * Now validate mapping with RO protection
2324	 */
2325	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2326		*pte = pa | PG_V | PG_U;
2327	else
2328		*pte = pa | PG_V | PG_U | PG_MANAGED;
2329
2330	return mpte;
2331}
2332
2333/*
2334 * Make a temporary mapping for a physical address.  This is only intended
2335 * to be used for panic dumps.
2336 */
2337void *
2338pmap_kenter_temporary(vm_offset_t pa, int i)
2339{
2340	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2341	invlpg((vm_offset_t)crashdumpmap + (i * PAGE_SIZE));
2342	return ((void *)crashdumpmap);
2343}
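
/*
 * Usage sketch (illustrative, not from this file): a dump routine that
 * walks physical memory might, for each page i of a chunk, do
 *
 *	va = pmap_kenter_temporary(pa + i * PAGE_SIZE, i);
 *
 * and then read page i at (char *)va + i * PAGE_SIZE, since the return
 * value is always the base of crashdumpmap.
 */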
2344
2345#define MAX_INIT_PT (96)
2346/*
2347 * pmap_object_init_pt preloads the ptes for a given object
2348 * into the specified pmap.  This eliminates the blast of soft
2349 * faults on process startup and immediately after an mmap.
2350 */
2351void
2352pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2353		    vm_object_t object, vm_pindex_t pindex,
2354		    vm_size_t size, int limit)
2355{
2356	vm_offset_t tmpidx;
2357	int psize;
2358	vm_page_t p, mpte;
2359	int objpgs;
2360
2361	if (pmap == NULL || object == NULL)
2362		return;
2363
2364	/*
2365	 * This code maps large physical mmap regions into the
2366	 * processor address space.  Note that some shortcuts
2367	 * are taken, but the code works.
2368	 */
2369	if (pseflag && (object->type == OBJT_DEVICE) &&
2370	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2371		int i;
2372		vm_page_t m[1];
2373		unsigned int ptepindex;
2374		int npdes;
2375		pd_entry_t ptepa;
2376
2377		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2378			return;
2379
2380retry:
2381		p = vm_page_lookup(object, pindex);
2382		if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
2383			goto retry;
2384
2385		if (p == NULL) {
2386			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2387			if (p == NULL)
2388				return;
2389			m[0] = p;
2390
2391			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2392				vm_page_free(p);
2393				return;
2394			}
2395
2396			p = vm_page_lookup(object, pindex);
2397			vm_page_wakeup(p);
2398		}
2399
2400		ptepa = VM_PAGE_TO_PHYS(p);
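		/*
		 * The backing physical memory must be 4MB aligned in order
		 * to be mapped with PG_PS page directory entries.
		 */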
2401		if (ptepa & (NBPDR - 1)) {
2402			return;
2403		}
2404
2405		p->valid = VM_PAGE_BITS_ALL;
2406
2407		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2408		npdes = size >> PDRSHIFT;
2409		for(i = 0; i < npdes; i++) {
2410			pmap->pm_pdir[ptepindex] =
2411			    ptepa | PG_U | PG_RW | PG_V | PG_PS;
2412			ptepa += NBPDR;
2413			ptepindex += 1;
2414		}
2415		vm_page_flag_set(p, PG_MAPPED);
2416		invltlb();
2417		return;
2418	}
2419
2420	psize = i386_btop(size);
2421
2422	if ((object->type != OBJT_VNODE) ||
2423	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2424	     (object->resident_page_count > MAX_INIT_PT))) {
2425		return;
2426	}
2427
2428	if (psize + pindex > object->size) {
2429		if (object->size < pindex)
2430			return;
2431		psize = object->size - pindex;
2432	}
2433
2434	mpte = NULL;
2435	/*
2436	 * if we are processing a major portion of the object, then scan the
2437	 * entire thing.
2438	 */
2439	if (psize > (object->resident_page_count >> 2)) {
2440		objpgs = psize;
2441
2442		for (p = TAILQ_FIRST(&object->memq);
2443		    ((objpgs > 0) && (p != NULL));
2444		    p = TAILQ_NEXT(p, listq)) {
2445
2446			tmpidx = p->pindex;
2447			if (tmpidx < pindex) {
2448				continue;
2449			}
2450			tmpidx -= pindex;
2451			if (tmpidx >= psize) {
2452				continue;
2453			}
2454			/*
2455			 * Don't let an madvise-driven prefault allocate pv entries
2456			 * when free pages are below the reserve.
2457			 */
2458			if ((limit & MAP_PREFAULT_MADVISE) &&
2459			    cnt.v_free_count < cnt.v_free_reserved) {
2460				break;
2461			}
2462			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2463				(p->busy == 0) &&
2464			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2465				if ((p->queue - p->pc) == PQ_CACHE)
2466					vm_page_deactivate(p);
2467				vm_page_busy(p);
2468				mpte = pmap_enter_quick(pmap,
2469					addr + i386_ptob(tmpidx), p, mpte);
2470				vm_page_flag_set(p, PG_MAPPED);
2471				vm_page_wakeup(p);
2472			}
2473			objpgs -= 1;
2474		}
2475	} else {
2476		/*
2477		 * else lookup the pages one-by-one.
2478		 */
2479		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2480			/*
2481			 * Don't let an madvise-driven prefault allocate pv entries
2482			 * when free pages are below the reserve.
2483			 */
2484			if ((limit & MAP_PREFAULT_MADVISE) &&
2485			    cnt.v_free_count < cnt.v_free_reserved) {
2486				break;
2487			}
2488			p = vm_page_lookup(object, tmpidx + pindex);
2489			if (p &&
2490			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2491				(p->busy == 0) &&
2492			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2493				if ((p->queue - p->pc) == PQ_CACHE)
2494					vm_page_deactivate(p);
2495				vm_page_busy(p);
2496				mpte = pmap_enter_quick(pmap,
2497					addr + i386_ptob(tmpidx), p, mpte);
2498				vm_page_flag_set(p, PG_MAPPED);
2499				vm_page_wakeup(p);
2500			}
2501		}
2502	}
2503	return;
2504}
2505
2506/*
2507 * pmap_prefault provides a quick way of clustering
2508 * page faults into a process's address space.  It is a "cousin"
2509 * of pmap_object_init_pt, except it runs at page fault time instead
2510 * of mmap time.
2511 */
2512#define PFBAK 4
2513#define PFFOR 4
2514#define PAGEORDER_SIZE (PFBAK+PFFOR)
2515
2516static int pmap_prefault_pageorder[] = {
2517	-PAGE_SIZE, PAGE_SIZE,
2518	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2519	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2520	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2521};
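
/*
 * The table above visits the pages closest to the faulting address first,
 * alternating between the page just before and the page just after it,
 * out to PFBAK/PFFOR pages in each direction.
 */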
2522
2523void
2524pmap_prefault(pmap, addra, entry)
2525	pmap_t pmap;
2526	vm_offset_t addra;
2527	vm_map_entry_t entry;
2528{
2529	int i;
2530	vm_offset_t starta;
2531	vm_offset_t addr;
2532	vm_pindex_t pindex;
2533	vm_page_t m, mpte;
2534	vm_object_t object;
2535
2536	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2537		return;
2538
2539	object = entry->object.vm_object;
2540
2541	starta = addra - PFBAK * PAGE_SIZE;
2542	if (starta < entry->start) {
2543		starta = entry->start;
2544	} else if (starta > addra) {
2545		starta = 0;
2546	}
2547
2548	mpte = NULL;
2549	for (i = 0; i < PAGEORDER_SIZE; i++) {
2550		vm_object_t lobject;
2551		pt_entry_t *pte;
2552
2553		addr = addra + pmap_prefault_pageorder[i];
2554		if (addr > addra + (PFFOR * PAGE_SIZE))
2555			addr = 0;
2556
2557		if (addr < starta || addr >= entry->end)
2558			continue;
2559
2560		if ((*pmap_pde(pmap, addr)) == NULL)
2561			continue;
2562
2563		pte = vtopte(addr);
2564		if (*pte)
2565			continue;
2566
2567		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2568		lobject = object;
2569		for (m = vm_page_lookup(lobject, pindex);
2570		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2571		    lobject = lobject->backing_object) {
2572			if (lobject->backing_object_offset & PAGE_MASK)
2573				break;
2574			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2575			m = vm_page_lookup(lobject->backing_object, pindex);
2576		}
2577
2578		/*
2579		 * Give up when a page is not resident in memory.
2580		 */
2581		if (m == NULL)
2582			break;
2583
2584		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2585			(m->busy == 0) &&
2586		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2587
2588			if ((m->queue - m->pc) == PQ_CACHE) {
2589				vm_page_deactivate(m);
2590			}
2591			vm_page_busy(m);
2592			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2593			vm_page_flag_set(m, PG_MAPPED);
2594			vm_page_wakeup(m);
2595		}
2596	}
2597}
2598
2599/*
2600 *	Routine:	pmap_change_wiring
2601 *	Function:	Change the wiring attribute for a map/virtual-address
2602 *			pair.
2603 *	In/out conditions:
2604 *			The mapping must already exist in the pmap.
2605 */
2606void
2607pmap_change_wiring(pmap, va, wired)
2608	register pmap_t pmap;
2609	vm_offset_t va;
2610	boolean_t wired;
2611{
2612	register pt_entry_t *pte;
2613
2614	if (pmap == NULL)
2615		return;
2616
2617	pte = pmap_pte(pmap, va);
2618
2619	if (wired && !pmap_pte_w(pte))
2620		pmap->pm_stats.wired_count++;
2621	else if (!wired && pmap_pte_w(pte))
2622		pmap->pm_stats.wired_count--;
2623
2624	/*
2625	 * Wiring is not a hardware characteristic so there is no need to
2626	 * invalidate TLB.
2627	 */
2628	pmap_pte_set_w(pte, wired);
2629}
2630
2631
2632
2633/*
2634 *	Copy the range specified by src_addr/len
2635 *	from the source map to the range dst_addr/len
2636 *	in the destination map.
2637 *
2638 *	This routine is only advisory and need not do anything.
2639 */
2640
2641void
2642pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2643	  vm_offset_t src_addr)
2644{
2645	vm_offset_t addr;
2646	vm_offset_t end_addr = src_addr + len;
2647	vm_offset_t pdnxt;
2648	pd_entry_t src_frame, dst_frame;
2649	vm_page_t m;
2650
2651	if (dst_addr != src_addr)
2652		return;
2653
2654	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2655	if (src_frame != (PTDpde & PG_FRAME))
2656		return;
2657
2658	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
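	/*
	 * Walk the source range one page directory entry (4MB of address
	 * space) at a time.
	 */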
2659	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2660		pt_entry_t *src_pte, *dst_pte;
2661		vm_page_t dstmpte, srcmpte;
2662		pd_entry_t srcptepaddr;
2663		unsigned ptepindex;
2664
2665		if (addr >= UPT_MIN_ADDRESS)
2666			panic("pmap_copy: invalid to pmap_copy page tables\n");
2667
2668		/*
2669		 * Don't let optional prefaulting of pages make us go
2670		 * way below the low water mark of free pages or way
2671		 * above high water mark of used pv entries.
2672		 */
2673		if (cnt.v_free_count < cnt.v_free_reserved ||
2674		    pv_entry_count > pv_entry_high_water)
2675			break;
2676
2677		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2678		ptepindex = addr >> PDRSHIFT;
2679
2680		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2681		if (srcptepaddr == 0)
2682			continue;
2683
2684		if (srcptepaddr & PG_PS) {
2685			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2686				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2687				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2688			}
2689			continue;
2690		}
2691
2692		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2693		if ((srcmpte == NULL) ||
2694		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2695			continue;
2696
2697		if (pdnxt > end_addr)
2698			pdnxt = end_addr;
2699
2700		/*
2701		 * Have to recheck this before every avtopte() call below
2702		 * in case we have blocked and something else used APTDpde.
2703		 */
2704		if (dst_frame != (APTDpde & PG_FRAME)) {
2705			APTDpde = dst_frame | PG_RW | PG_V;
2706			invltlb();
2707		}
2708		src_pte = vtopte(addr);
2709		dst_pte = avtopte(addr);
2710		while (addr < pdnxt) {
2711			pt_entry_t ptetemp;
2712			ptetemp = *src_pte;
2713			/*
2714			 * we only virtual copy managed pages
2715			 */
2716			if ((ptetemp & PG_MANAGED) != 0) {
2717				/*
2718				 * We have to check after allocpte for the
2719				 * pte still being around...  allocpte can
2720				 * block.
2721				 */
2722				dstmpte = pmap_allocpte(dst_pmap, addr);
2723				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2724					/*
2725					 * Clear the modified and
2726					 * accessed (referenced) bits
2727					 * during the copy.
2728					 */
2729					m = PHYS_TO_VM_PAGE(ptetemp);
2730					*dst_pte = ptetemp & ~(PG_M | PG_A);
2731					dst_pmap->pm_stats.resident_count++;
2732					pmap_insert_entry(dst_pmap, addr,
2733						dstmpte, m);
2734				} else {
2735					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2736				}
2737				if (dstmpte->hold_count >= srcmpte->hold_count)
2738					break;
2739			}
2740			addr += PAGE_SIZE;
2741			src_pte++;
2742			dst_pte++;
2743		}
2744	}
2745}
2746
2747/*
2748 *	Routine:	pmap_kernel
2749 *	Function:
2750 *		Returns the physical map handle for the kernel.
2751 */
2752pmap_t
2753pmap_kernel()
2754{
2755	return (kernel_pmap);
2756}
2757
2758/*
2759 *	pmap_zero_page zeros the specified hardware page by mapping
2760 *	the page into KVM and using bzero to clear its contents.
2761 */
2762void
2763pmap_zero_page(vm_offset_t phys)
2764{
2765
2766#ifdef SMP
2767	/* XXX overkill, we only want to disable migration here */
2768	/* XXX or maybe not. down the track we have reentrancy issues */
2769	critical_enter();
2770#endif
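	/*
	 * CMAP2/CADDR2 is a pte/virtual-address pair reserved at boot for
	 * use as a scratch kernel mapping.
	 */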
2771	if (*CMAP2)
2772		panic("pmap_zero_page: CMAP2 busy");
2773	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2774	cpu_invlpg((vm_offset_t)CADDR2);	/* SMP: local cpu only */
2775#if defined(I686_CPU)
2776	if (cpu_class == CPUCLASS_686)
2777		i686_pagezero(CADDR2);
2778	else
2779#endif
2780		bzero(CADDR2, PAGE_SIZE);
2781	*CMAP2 = 0;
2782#ifdef SMP
2783	critical_exit();
2784#endif
2785}
2786
2787/*
2788 *	pmap_zero_page_area zeros the specified hardware page by mapping
2789 *	the page into KVM and using bzero to clear its contents.
2790 *
2791 *	off and size may not cover an area beyond a single hardware page.
2792 */
2793void
2794pmap_zero_page_area(vm_offset_t phys, int off, int size)
2795{
2796
2797#ifdef SMP
2798	/* XXX overkill, we only want to disable migration here */
2799	/* XXX or maybe not. down the track we have reentrancy issues */
2800	critical_enter();
2801#endif
2802	if (*CMAP2)
2803		panic("pmap_zero_page_area: CMAP2 busy");
2804	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2805	cpu_invlpg((vm_offset_t)CADDR2);	/* SMP: local cpu only */
2806#if defined(I686_CPU)
2807	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2808		i686_pagezero(CADDR2);
2809	else
2810#endif
2811		bzero((char *)CADDR2 + off, size);
2812	*CMAP2 = 0;
2813#ifdef SMP
2814	critical_exit();
2815#endif
2816}
2817
2818/*
2819 *	pmap_copy_page copies the specified (machine independent)
2820 *	page by mapping the page into virtual memory and using
2821 *	bcopy to copy the page, one machine dependent page at a
2822 *	time.
2823 */
2824void
2825pmap_copy_page(vm_offset_t src, vm_offset_t dst)
2826{
2827
2828#ifdef SMP
2829	/* XXX overkill, we only want to disable migration here */
2830	/* XXX or maybe not. down the track we have reentrancy issues */
2831	critical_enter();
2832#endif
2833	if (*CMAP1)
2834		panic("pmap_copy_page: CMAP1 busy");
2835	if (*CMAP2)
2836		panic("pmap_copy_page: CMAP2 busy");
2837
2838	*CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2839	*CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2840	cpu_invlpg((u_int)CADDR1);		/* SMP: local only */
2841	cpu_invlpg((u_int)CADDR2);		/* SMP: local only */
2842	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2843	*CMAP1 = 0;
2844	*CMAP2 = 0;
2845#ifdef SMP
2846	critical_exit();
2847#endif
2848}
2849
2850
2851/*
2852 *	Routine:	pmap_pageable
2853 *	Function:
2854 *		Make the specified pages (by pmap, offset)
2855 *		pageable (or not) as requested.
2856 *
2857 *		A page which is not pageable may not take
2858 *		a fault; therefore, its page table entry
2859 *		must remain valid for the duration.
2860 *
2861 *		This routine is merely advisory; pmap_enter
2862 *		will specify that these pages are to be wired
2863 *		down (or not) as appropriate.
2864 */
2865void
2866pmap_pageable(pmap, sva, eva, pageable)
2867	pmap_t pmap;
2868	vm_offset_t sva, eva;
2869	boolean_t pageable;
2870{
2871}
2872
2873/*
2874 * this routine returns true if a physical page resides
2875 * in the given pmap.
2876 */
2877boolean_t
2878pmap_page_exists(pmap, m)
2879	pmap_t pmap;
2880	vm_page_t m;
2881{
2882	register pv_entry_t pv;
2883	int s;
2884
2885	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2886		return FALSE;
2887
2888	s = splvm();
2889
2890	/*
2891	 * Check the page's current mappings, returning immediately if one belongs to the given pmap.
2892	 */
2893	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2894		if (pv->pv_pmap == pmap) {
2895			splx(s);
2896			return TRUE;
2897		}
2898	}
2899	splx(s);
2900	return (FALSE);
2901}
2902
2903#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2904/*
2905 * Remove all pages from the specified address space;
2906 * this aids process exit speed.  Also, this code is
2907 * special cased for the current process only, but the
2908 * more generic (and slightly slower) mode can be
2909 * enabled.  This is much faster than pmap_remove in
2910 * the case of running down an entire address space.
2911 */
2912void
2913pmap_remove_pages(pmap, sva, eva)
2914	pmap_t pmap;
2915	vm_offset_t sva, eva;
2916{
2917	pt_entry_t *pte, tpte;
2918	vm_page_t m;
2919	pv_entry_t pv, npv;
2920	int s;
2921
2922#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2923	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2924		printf("warning: pmap_remove_pages called with non-current pmap\n");
2925		return;
2926	}
2927#endif
2928
2929	s = splvm();
2930	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2931
2932		if (pv->pv_va >= eva || pv->pv_va < sva) {
2933			npv = TAILQ_NEXT(pv, pv_plist);
2934			continue;
2935		}
2936
2937#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2938		pte = vtopte(pv->pv_va);
2939#else
2940		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2941#endif
2942		tpte = *pte;
2943
2944		if (tpte == 0) {
2945			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2946							pte, pv->pv_va);
2947			panic("pmap_remove_pages: pte is zero");
2948		}
2949
2950/*
2951 * We cannot remove wired pages from a process' mapping at this time
2952 */
2953		if (tpte & PG_W) {
2954			npv = TAILQ_NEXT(pv, pv_plist);
2955			continue;
2956		}
2957
2958		m = PHYS_TO_VM_PAGE(tpte);
2959		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2960		    ("vm_page_t %p phys_addr mismatch %08x %08x",
2961		    m, m->phys_addr, tpte));
2962
2963		KASSERT(m < &vm_page_array[vm_page_array_size],
2964			("pmap_remove_pages: bad tpte %x", tpte));
2965
2966		pv->pv_pmap->pm_stats.resident_count--;
2967
2968		*pte = 0;
2969
2970		/*
2971		 * Update the vm_page_t clean and reference bits.
2972		 */
2973		if (tpte & PG_M) {
2974			vm_page_dirty(m);
2975		}
2976
2977		npv = TAILQ_NEXT(pv, pv_plist);
2978		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2979
2980		m->md.pv_list_count--;
2981		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2982		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2983			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2984		}
2985
2986		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2987		free_pv_entry(pv);
2988	}
2989	splx(s);
2990	pmap_invalidate_all(pmap);
2991}
2992
2993/*
2994 * pmap_testbit tests bits in ptes.
2995 * Note that the testbit/changebit routines are inline,
2996 * so much of this is evaluated at compile time.
2997 */
2998static boolean_t
2999pmap_testbit(m, bit)
3000	vm_page_t m;
3001	int bit;
3002{
3003	pv_entry_t pv;
3004	pt_entry_t *pte;
3005	int s;
3006
3007	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3008		return FALSE;
3009
3010	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3011		return FALSE;
3012
3013	s = splvm();
3014
3015	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3016		/*
3017		 * If testing the modified or accessed bit, skip
3018		 * mappings whose modified state is not tracked
3019		 * (e.g. the kernel clean submap).
3020		 */
3021		if (bit & (PG_A|PG_M)) {
3022			if (!pmap_track_modified(pv->pv_va))
3023				continue;
3024		}
3025
3026#if defined(PMAP_DIAGNOSTIC)
3027		if (!pv->pv_pmap) {
3028			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3029			continue;
3030		}
3031#endif
3032		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3033		if (*pte & bit) {
3034			splx(s);
3035			return TRUE;
3036		}
3037	}
3038	splx(s);
3039	return (FALSE);
3040}
3041
3042/*
3043 * this routine is used to set or clear the specified bit in every pte mapping the given page
3044 */
3045static __inline void
3046pmap_changebit(vm_page_t m, int bit, boolean_t setem)
3047{
3048	register pv_entry_t pv;
3049	register pt_entry_t *pte;
3050	int s;
3051
3052	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3053		return;
3054
3055	s = splvm();
3056
3057	/*
3058	 * Loop over all current mappings, setting or clearing the bit as
3059	 * appropriate.  (If setting read-only, do we need to clear the VAC?)
3060	 */
3061	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3062		/*
3063		 * don't write protect pager mappings
3064		 */
3065		if (!setem && (bit == PG_RW)) {
3066			if (!pmap_track_modified(pv->pv_va))
3067				continue;
3068		}
3069
3070#if defined(PMAP_DIAGNOSTIC)
3071		if (!pv->pv_pmap) {
3072			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3073			continue;
3074		}
3075#endif
3076
3077		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3078
3079		if (setem) {
3080			*pte |= bit;
3081			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3082		} else {
3083			pt_entry_t pbits = *pte;
3084			if (pbits & bit) {
3085				if (bit == PG_RW) {
3086					if (pbits & PG_M) {
3087						vm_page_dirty(m);
3088					}
3089					*pte = pbits & ~(PG_M|PG_RW);
3090				} else {
3091					*pte = pbits & ~bit;
3092				}
3093				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3094			}
3095		}
3096	}
3097	splx(s);
3098}
3099
3100/*
3101 *      pmap_page_protect:
3102 *
3103 *      Lower the permission for all mappings to a given page.
3104 */
3105void
3106pmap_page_protect(vm_page_t m, vm_prot_t prot)
3107{
3108	if ((prot & VM_PROT_WRITE) == 0) {
3109		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3110			pmap_changebit(m, PG_RW, FALSE);
3111		} else {
3112			pmap_remove_all(m);
3113		}
3114	}
3115}
3116
3117vm_offset_t
3118pmap_phys_address(ppn)
3119	int ppn;
3120{
3121	return (i386_ptob(ppn));
3122}
3123
3124/*
3125 *	pmap_ts_referenced:
3126 *
3127 *	Return the count of reference bits for a page, clearing all of them.
3128 */
3129int
3130pmap_ts_referenced(vm_page_t m)
3131{
3132	register pv_entry_t pv, pvf, pvn;
3133	pt_entry_t *pte;
3134	int s;
3135	int rtval = 0;
3136
3137	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3138		return (rtval);
3139
3140	s = splvm();
3141
3142	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3143
3144		pvf = pv;
3145
3146		do {
3147			pvn = TAILQ_NEXT(pv, pv_list);
3148
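			/*
			 * Move this pv entry to the tail of the list so that
			 * successive calls scan the mappings in a round-robin
			 * fashion.
			 */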
3149			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3150
3151			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3152
3153			if (!pmap_track_modified(pv->pv_va))
3154				continue;
3155
3156			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3157
3158			if (pte && (*pte & PG_A)) {
3159				*pte &= ~PG_A;
3160
3161				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3162
3163				rtval++;
3164				if (rtval > 4) {
3165					break;
3166				}
3167			}
3168		} while ((pv = pvn) != NULL && pv != pvf);
3169	}
3170	splx(s);
3171
3172	return (rtval);
3173}
3174
3175/*
3176 *	pmap_is_modified:
3177 *
3178 *	Return whether or not the specified physical page was modified
3179 *	in any physical maps.
3180 */
3181boolean_t
3182pmap_is_modified(vm_page_t m)
3183{
3184	return pmap_testbit(m, PG_M);
3185}
3186
3187/*
3188 *	Clear the modify bits on the specified physical page.
3189 */
3190void
3191pmap_clear_modify(vm_page_t m)
3192{
3193	pmap_changebit(m, PG_M, FALSE);
3194}
3195
3196/*
3197 *	pmap_clear_reference:
3198 *
3199 *	Clear the reference bit on the specified physical page.
3200 */
3201void
3202pmap_clear_reference(vm_page_t m)
3203{
3204	pmap_changebit(m, PG_A, FALSE);
3205}
3206
3207/*
3208 * Miscellaneous support routines follow
3209 */
3210
3211static void
3212i386_protection_init()
3213{
3214	register int *kp, prot;
3215
3216	kp = protection_codes;
3217	for (prot = 0; prot < 8; prot++) {
3218		switch (prot) {
3219		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3220			/*
3221			 * Read access is also 0. There isn't any execute bit,
3222			 * so just make it readable.
3223			 */
3224		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3225		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3226		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3227			*kp++ = 0;
3228			break;
3229		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3230		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3231		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3232		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3233			*kp++ = PG_RW;
3234			break;
3235		}
3236	}
3237}
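
/*
 * protection_codes[] is indexed by a VM_PROT_* combination (see the
 * pte_prot() macro used in pmap_enter()); writable combinations yield
 * PG_RW and read-only combinations yield 0, since present pages are
 * always readable on the i386.
 */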
3238
3239/*
3240 * Map a set of physical memory pages into the kernel virtual
3241 * address space. Return a pointer to where it is mapped. This
3242 * routine is intended to be used for mapping device memory,
3243 * NOT real memory.
3244 */
3245void *
3246pmap_mapdev(pa, size)
3247	vm_offset_t pa;
3248	vm_size_t size;
3249{
3250	vm_offset_t va, tmpva, offset;
3251	pt_entry_t *pte;
3252
3253	offset = pa & PAGE_MASK;
3254	size = roundup(offset + size, PAGE_SIZE);
3255
3256	GIANT_REQUIRED;
3257
3258	va = kmem_alloc_pageable(kernel_map, size);
3259	if (!va)
3260		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3261
3262	pa = pa & PG_FRAME;
3263	for (tmpva = va; size > 0; ) {
3264		pte = vtopte(tmpva);
3265		*pte = pa | PG_RW | PG_V | pgeflag;
3266		size -= PAGE_SIZE;
3267		tmpva += PAGE_SIZE;
3268	}
3269	invlpg_range(va, tmpva);
3270
3271	return ((void *)(va + offset));
3272}
3273
3274void
3275pmap_unmapdev(va, size)
3276	vm_offset_t va;
3277	vm_size_t size;
3278{
3279	vm_offset_t base, offset, tmpva;
3280	pt_entry_t *pte;
3281
3282	base = va & PG_FRAME;
3283	offset = va & PAGE_MASK;
3284	size = roundup(offset + size, PAGE_SIZE);
3285
3286	for (tmpva = base; size > 0; ) {
3287		pte = vtopte(tmpva);
3288		*pte = 0;
3289		size -= PAGE_SIZE;
3290		tmpva += PAGE_SIZE;
3291	}
3292	invlpg_range(va, tmpva);
3293	kmem_free(kernel_map, base, tmpva - base);	/* "size" was decremented to zero by the loop above */
3294}
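
/*
 * Example (sketch, with hypothetical values): a driver maps a device
 * register window and releases it again on detach, e.g.
 *
 *	void *regs = pmap_mapdev(pa, 0x1000);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, 0x1000);
 *
 * The same offset/size rounding is applied on both sides.
 */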
3295
3296/*
3297 * perform the pmap work for mincore
3298 */
3299int
3300pmap_mincore(pmap, addr)
3301	pmap_t pmap;
3302	vm_offset_t addr;
3303{
3304	pt_entry_t *ptep, pte;
3305	vm_page_t m;
3306	int val = 0;
3307
3308	ptep = pmap_pte(pmap, addr);
3309	if (ptep == 0) {
3310		return 0;
3311	}
3312
3313	if ((pte = *ptep) != 0) {
3314		vm_offset_t pa;
3315
3316		val = MINCORE_INCORE;
3317		if ((pte & PG_MANAGED) == 0)
3318			return val;
3319
3320		pa = pte & PG_FRAME;
3321
3322		m = PHYS_TO_VM_PAGE(pa);
3323
3324		/*
3325		 * Modified by us
3326		 */
3327		if (pte & PG_M)
3328			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3329		/*
3330		 * Modified by someone
3331		 */
3332		else if (m->dirty || pmap_is_modified(m))
3333			val |= MINCORE_MODIFIED_OTHER;
3334		/*
3335		 * Referenced by us
3336		 */
3337		if (pte & PG_A)
3338			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3339
3340		/*
3341		 * Referenced by someone
3342		 */
3343		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3344			val |= MINCORE_REFERENCED_OTHER;
3345			vm_page_flag_set(m, PG_REFERENCED);
3346		}
3347	}
3348	return val;
3349}
3350
3351void
3352pmap_activate(struct thread *td)
3353{
3354	struct proc *p = td->td_proc;
3355	pmap_t	pmap;
3356	u_int32_t  cr3;
3357
3358	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3359#if defined(SMP)
3360	pmap->pm_active |= PCPU_GET(cpumask);
3361#else
3362	pmap->pm_active |= 1;
3363#endif
3364#if defined(SWTCH_OPTIM_STATS)
3365	tlb_flush_count++;
3366#endif
3367	cr3 = vtophys(pmap->pm_pdir);
3368	/* XXXKSE this is wrong.
3369	 * pmap_activate is for the current thread on the current cpu
3370	 */
3371	if (p->p_flag & P_KSES) {
3372		/* Make sure all other cr3 entries are updated. */
3373		/* what if they are running?  XXXKSE (maybe abort them) */
3374		FOREACH_THREAD_IN_PROC(p, td) {
3375			td->td_pcb->pcb_cr3 = cr3;
3376		}
3377	} else {
3378		td->td_pcb->pcb_cr3 = cr3;
3379	}
3380	load_cr3(cr3);
3381}
3382
3383vm_offset_t
3384pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3385{
3386
3387	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3388		return addr;
3389	}
3390
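	/*
	 * Round the hint up to a 4MB boundary so that large OBJT_DEVICE
	 * mappings can later be entered with PG_PS (4MB) page directory
	 * entries (see pmap_object_init_pt()).
	 */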
3391	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3392	return addr;
3393}
3394
3395
3396#if defined(PMAP_DEBUG)
3397pmap_pid_dump(int pid)
3398{
3399	pmap_t pmap;
3400	struct proc *p;
3401	int npte = 0;
3402	int index;
3403
3404	sx_slock(&allproc_lock);
3405	LIST_FOREACH(p, &allproc, p_list) {
3406		if (p->p_pid != pid)
3407			continue;
3408
3409		if (p->p_vmspace) {
3410			int i,j;
3411			index = 0;
3412			pmap = vmspace_pmap(p->p_vmspace);
3413			for (i = 0; i < NPDEPG; i++) {
3414				pd_entry_t *pde;
3415				pt_entry_t *pte;
3416				vm_offset_t base = i << PDRSHIFT;
3417
3418				pde = &pmap->pm_pdir[i];
3419				if (pde && pmap_pde_v(pde)) {
3420					for (j = 0; j < NPTEPG; j++) {
3421						vm_offset_t va = base + (j << PAGE_SHIFT);
3422						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3423							if (index) {
3424								index = 0;
3425								printf("\n");
3426							}
3427							sx_sunlock(&allproc_lock);
3428							return npte;
3429						}
3430						pte = pmap_pte_quick(pmap, va);
3431						if (pte && pmap_pte_v(pte)) {
3432							pt_entry_t pa;
3433							vm_page_t m;
3434							pa = *pte;
3435							m = PHYS_TO_VM_PAGE(pa);
3436							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3437								va, pa, m->hold_count, m->wire_count, m->flags);
3438							npte++;
3439							index++;
3440							if (index >= 2) {
3441								index = 0;
3442								printf("\n");
3443							} else {
3444								printf(" ");
3445							}
3446						}
3447					}
3448				}
3449			}
3450		}
3451	}
3452	sx_sunlock(&allproc_lock);
3453	return npte;
3454}
3455#endif
3456
3457#if defined(DEBUG)
3458
3459static void	pads __P((pmap_t pm));
3460void		pmap_pvdump __P((vm_offset_t pa));
3461
3462/* print address space of pmap*/
3463static void
3464pads(pm)
3465	pmap_t pm;
3466{
3467	int i, j;
3468	vm_offset_t va;
3469	pt_entry_t *ptep;
3470
3471	if (pm == kernel_pmap)
3472		return;
3473	for (i = 0; i < NPDEPG; i++)
3474		if (pm->pm_pdir[i])
3475			for (j = 0; j < NPTEPG; j++) {
3476				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3477				if (pm == kernel_pmap && va < KERNBASE)
3478					continue;
3479				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3480					continue;
3481				ptep = pmap_pte_quick(pm, va);
3482				if (pmap_pte_v(ptep))
3483					printf("%x:%x ", va, *ptep);
3484			}
3485
3486}
3487
3488void
3489pmap_pvdump(pa)
3490	vm_offset_t pa;
3491{
3492	pv_entry_t pv;
3493	vm_page_t m;
3494
3495	printf("pa %x", pa);
3496	m = PHYS_TO_VM_PAGE(pa);
3497	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3498		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3499		pads(pv->pv_pmap);
3500	}
3501	printf(" ");
3502}
3503#endif
3504