pmap.c revision 41370
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 *	$Id: pmap.c,v 1.213 1998/11/24 20:25:52 eivind Exp $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
64 * this module may delay invalidation or protection-reduction
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74
75#include <sys/param.h>
76#include <sys/systm.h>
77#include <sys/proc.h>
78#include <sys/msgbuf.h>
79#include <sys/vmmeter.h>
80#include <sys/mman.h>
81
82#include <vm/vm.h>
83#include <vm/vm_param.h>
84#include <vm/vm_prot.h>
85#include <sys/lock.h>
86#include <vm/vm_kern.h>
87#include <vm/vm_page.h>
88#include <vm/vm_map.h>
89#include <vm/vm_object.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_pageout.h>
92#include <vm/vm_pager.h>
93#include <vm/vm_zone.h>
94
95#include <sys/user.h>
96
97#include <machine/cputypes.h>
98#include <machine/md_var.h>
99#include <machine/specialreg.h>
100#if defined(SMP) || defined(APIC_IO)
101#include <machine/smp.h>
102#include <machine/apic.h>
103#endif /* SMP || APIC_IO */
104
105#define PMAP_KEEP_PDIRS
106#ifndef PMAP_SHPGPERPROC
107#define PMAP_SHPGPERPROC 200
108#endif
109
110#if defined(DIAGNOSTIC)
111#define PMAP_DIAGNOSTIC
112#endif
113
114#define MINPV 2048
115
116#if !defined(PMAP_DIAGNOSTIC)
117#define PMAP_INLINE __inline
118#else
119#define PMAP_INLINE
120#endif
121
122/*
123 * Get PDEs and PTEs for user/kernel address space
124 */
125#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
126#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
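/*
 * On the i386 a virtual address is translated in two levels: bits
 * 31..22 (PDRSHIFT) index the page directory, bits 21..12 index the
 * page table, and bits 11..0 are the byte offset within the 4K page.
 * The two macros above use only the top 10 bits to locate the PDE.
 */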
127
128#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
129#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
130#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
131#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
132#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
133
134#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
135#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
136
137/*
138 * Given a map and a machine-independent protection code,
139 * convert to an i386 protection code.
140 */
141#define pte_prot(m, p)	(protection_codes[p])
142static int protection_codes[8];
143
144#define	pa_index(pa)		atop((pa) - vm_first_phys)
145#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
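/*
 * pv_table is indexed by physical page number relative to the first
 * managed physical address (vm_first_phys); pa_to_pvh() returns the
 * pv_table_t entry that heads the list of all mappings of that page.
 */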
146
147static struct pmap kernel_pmap_store;
148pmap_t kernel_pmap;
149extern pd_entry_t my_idlePTD;
150
151vm_offset_t avail_start;	/* PA of first available physical page */
152vm_offset_t avail_end;		/* PA of last available physical page */
153vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
154vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
155static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
156static vm_offset_t vm_first_phys;
157static int pgeflag;		/* PG_G or-in */
158static int pseflag;		/* PG_PS or-in */
159static int pv_npg;
160
161static vm_object_t kptobj;
162
163static int nkpt;
164vm_offset_t kernel_vm_end;
165
166/*
167 * Data for the pv entry allocation mechanism
168 */
169static vm_zone_t pvzone;
170static struct vm_zone pvzone_store;
171static struct vm_object pvzone_obj;
172static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
173static int pmap_pagedaemon_waken = 0;
174static struct pv_entry *pvinit;
175
176/*
177 * All those kernel PT submaps that BSD is so fond of
178 */
179pt_entry_t *CMAP1 = 0;
180static pt_entry_t *CMAP2, *ptmmap;
181static pv_table_t *pv_table;
182caddr_t CADDR1 = 0, ptvmmap = 0;
183static caddr_t CADDR2;
184static pt_entry_t *msgbufmap;
185struct msgbuf *msgbufp=0;
186
187/*
188 *  PPro_vmtrr
189 */
190struct ppro_vmtrr PPro_vmtrr[NPPROVMTRR];
191
192/* AIO support */
193extern struct vmspace *aiovmspace;
194
195#ifdef SMP
196extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[];
197extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3;
198extern pd_entry_t *IdlePTDS[];
199extern pt_entry_t SMP_prvpt[];
200#endif
201
202#ifdef SMP
203extern unsigned int prv_PPAGE1[];
204extern pt_entry_t *prv_PMAP1;
205#else
206static pt_entry_t *PMAP1 = 0;
207static unsigned *PADDR1 = 0;
208#endif
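/*
 * PMAP1/PADDR1 (or the per-CPU prv_PMAP1/prv_PPAGE1 under SMP) are a
 * reserved kernel PTE and the virtual page it maps.  pmap_pte_quick()
 * points this PTE at a page table page belonging to another pmap so
 * that a single PTE can be examined without switching address spaces.
 */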
209
210static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
211static unsigned * get_ptbase __P((pmap_t pmap));
212static pv_entry_t get_pv_entry __P((void));
213static void	i386_protection_init __P((void));
214static void	pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem));
215
216static PMAP_INLINE int	pmap_is_managed __P((vm_offset_t pa));
217static void	pmap_remove_all __P((vm_offset_t pa));
218static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
219				      vm_offset_t pa, vm_page_t mpte));
220static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq,
221					vm_offset_t sva));
222static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
223static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv,
224					vm_offset_t va));
225static boolean_t pmap_testbit __P((vm_offset_t pa, int bit));
226static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
227		vm_page_t mpte, vm_offset_t pa));
228
229static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
230
231static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
232static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
233static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
234static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
235static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
236static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
237void pmap_collect(void);
238
239static unsigned pdir4mb;
240
241/*
242 *	Routine:	pmap_pte
243 *	Function:
244 *		Extract the page table entry associated
245 *		with the given map/virtual_address pair.
246 */
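/*
 * Note that a 4MB (PG_PS) mapping has no page table page; in that
 * case pmap_pte() returns a pointer to the page directory entry
 * itself.
 */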
247
248PMAP_INLINE unsigned *
249pmap_pte(pmap, va)
250	register pmap_t pmap;
251	vm_offset_t va;
252{
253	unsigned *pdeaddr;
254
255	if (pmap) {
256		pdeaddr = (unsigned *) pmap_pde(pmap, va);
257		if (*pdeaddr & PG_PS)
258			return pdeaddr;
259		if (*pdeaddr) {
260			return get_ptbase(pmap) + i386_btop(va);
261		}
262	}
263	return (0);
264}
265
266/*
267 * Move the kernel virtual free pointer to the next
268 * 4MB.  This is used to help improve performance
269 * by using a large (4MB) page for much of the kernel
270 * (.text, .data, .bss)
271 */
272static vm_offset_t
273pmap_kmem_choose(vm_offset_t addr) {
274	vm_offset_t newaddr = addr;
275#ifndef DISABLE_PSE
276	if (cpu_feature & CPUID_PSE) {
277		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
278	}
279#endif
280	return newaddr;
281}
282
283/*
284 *	Bootstrap the system enough to run with virtual memory.
285 *
286 *	On the i386 this is called after mapping has already been enabled
287 *	and just syncs the pmap module with what has already been done.
288 *	[We can't call it easily with mapping off since the kernel is not
289 *	mapped with PA == VA, hence we would have to relocate every address
290 *	from the linked base (virtual) address "KERNBASE" to the actual
291 *	(physical) address starting relative to 0]
292 */
293void
294pmap_bootstrap(firstaddr, loadaddr)
295	vm_offset_t firstaddr;
296	vm_offset_t loadaddr;
297{
298	vm_offset_t va;
299	pt_entry_t *pte;
300	int i, j;
301
302	avail_start = firstaddr;
303
304	/*
305	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
306	 * large. It should instead be correctly calculated in locore.s and
307	 * not based on 'first' (which is a physical address, not a virtual
308	 * address, for the start of unused physical memory). The kernel
309	 * page tables are NOT double mapped and thus should not be included
310	 * in this calculation.
311	 */
312	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
313	virtual_avail = pmap_kmem_choose(virtual_avail);
314
315	virtual_end = VM_MAX_KERNEL_ADDRESS;
316
317	/*
318	 * Initialize protection array.
319	 */
320	i386_protection_init();
321
322	/*
323	 * The kernel's pmap is statically allocated so we don't have to use
324	 * pmap_create, which is unlikely to work correctly at this part of
325	 * the boot sequence (XXX and which no longer exists).
326	 */
327	kernel_pmap = &kernel_pmap_store;
328
329	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
330
331	kernel_pmap->pm_count = 1;
332	TAILQ_INIT(&kernel_pmap->pm_pvlist);
333	nkpt = NKPT;
334
335	/*
336	 * Reserve some special page table entries/VA space for temporary
337	 * mapping of pages.
338	 */
339#define	SYSMAP(c, p, v, n)	\
340	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
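/*
 * SYSMAP carves (n) pages of kernel virtual address space out of "va"
 * and hands back both the chosen address (v) and a pointer to the first
 * of the corresponding page table entries (p), so the caller can later
 * install mappings by writing those PTEs directly.
 */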
341
342	va = virtual_avail;
343	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
344
345	/*
346	 * CMAP1/CMAP2 are used for zeroing and copying pages.
347	 */
348	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
349	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
350
351	/*
352	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
353	 * XXX ptmmap is not used.
354	 */
355	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
356
357	/*
358	 * msgbufp is used to map the system message buffer.
359	 * XXX msgbufmap is not used.
360	 */
361	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
362	       atop(round_page(MSGBUF_SIZE)))
363
364#if !defined(SMP)
365	/*
366	 * ptemap is used for pmap_pte_quick
367	 */
368	SYSMAP(unsigned *, PMAP1, PADDR1, 1);
369#endif
370
371	virtual_avail = va;
372
373	*(int *) CMAP1 = *(int *) CMAP2 = 0;
374	*(int *) PTD = 0;
375
376
377	pgeflag = 0;
378#if !defined(SMP)
379	if (cpu_feature & CPUID_PGE) {
380		pgeflag = PG_G;
381	}
382#endif
383
384/*
385 * Initialize the 4MB page size flag
386 */
387	pseflag = 0;
388/*
389 * The 4MB page version of the initial
390 * kernel page mapping.
391 */
392	pdir4mb = 0;
393
394#if !defined(DISABLE_PSE)
395	if (cpu_feature & CPUID_PSE) {
396		unsigned ptditmp;
397		/*
398		 * Enable the PSE mode
399		 */
400		load_cr4(rcr4() | CR4_PSE);
401
402		/*
403		 * Note that we have enabled PSE mode
404		 */
405		pseflag = PG_PS;
406		ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE));
407		ptditmp &= ~(NBPDR - 1);
408		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
409		pdir4mb = ptditmp;
410		/*
411		 * We can do the mapping here for the single processor
412		 * case.  We simply ignore the old page table page from
413		 * now on.
414		 */
415#if !defined(SMP)
416		PTD[KPTDI] = (pd_entry_t) ptditmp;
417		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
418		invltlb();
419#endif
420	}
421#endif
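/*
 * pdir4mb now holds a ready-made 4MB PDE covering the kernel's low
 * text/data region.  On SMP the application processors cannot use it
 * until they have enabled PSE themselves, which is why the value is
 * saved here and installed later via pmap_set_opt().
 */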
422
423#ifdef SMP
424	if (cpu_apic_address == 0)
425		panic("pmap_bootstrap: no local apic!");
426
427	/* 0 = private page */
428	/* 1 = page table page */
429	/* 2 = local apic */
430	/* 16-31 = io apics */
431	SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag |
432	    (cpu_apic_address & PG_FRAME));
433
434	for (i = 0; i < mp_napics; i++) {
435		for (j = 0; j < 16; j++) {
436			/* same page frame as a previous IO apic? */
437			if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) ==
438			    (io_apic_address[0] & PG_FRAME)) {
439				ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE];
440				break;
441			}
442			/* use this slot if available */
443			if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) == 0) {
444				SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW |
445				    pgeflag | (io_apic_address[i] & PG_FRAME));
446				ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE];
447				break;
448			}
449		}
450		if (j == 16)
451			panic("no space to map IO apic %d!", i);
452	}
453
454	/* BSP does this itself, AP's get it pre-set */
455	prv_CMAP1 = &SMP_prvpt[3 + UPAGES];
456	prv_CMAP2 = &SMP_prvpt[4 + UPAGES];
457	prv_CMAP3 = &SMP_prvpt[5 + UPAGES];
458	prv_PMAP1 = &SMP_prvpt[6 + UPAGES];
459#endif
460
461	invltlb();
462
463}
464
465void
466getmtrr()
467{
468	int i;
469
470	if (cpu_class == CPUCLASS_686) {
471		for(i = 0; i < NPPROVMTRR; i++) {
472			PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2);
473			PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2);
474		}
475	}
476}
477
478void
479putmtrr()
480{
481	int i;
482
483	if (cpu_class == CPUCLASS_686) {
484		wbinvd();
485		for(i = 0; i < NPPROVMTRR; i++) {
486			wrmsr(PPRO_VMTRRphysBase0 + i * 2, PPro_vmtrr[i].base);
487			wrmsr(PPRO_VMTRRphysMask0 + i * 2, PPro_vmtrr[i].mask);
488		}
489	}
490}
491
492void
493pmap_setvidram(void)
494{
495#if 0
496	if (cpu_class == CPUCLASS_686) {
497		wbinvd();
498		/*
499		 * Set memory between 0-640K to be WB
500		 */
501		wrmsr(0x250, 0x0606060606060606LL);
502		wrmsr(0x258, 0x0606060606060606LL);
503		/*
504		 * Set normal, PC video memory to be WC
505		 */
506		wrmsr(0x259, 0x0101010101010101LL);
507	}
508#endif
509}
510
511void
512pmap_setdevram(unsigned long long basea, vm_offset_t sizea)
513{
514	int i, free, skip;
515	unsigned basepage, basepaget;
516	unsigned long long base;
517	unsigned long long mask;
518
519	if (cpu_class != CPUCLASS_686)
520		return;
521
522	free = -1;
523	skip = 0;
524	basea &= ~0xfff;
525	base = basea | 0x1;
526	mask = (long long) (0xfffffffffLL - ((long) sizea - 1)) | (long long) 0x800;
527	mask &= ~0x7ff;
528
529	basepage = (long long) (base >> 12);
530	for(i = 0; i < NPPROVMTRR; i++) {
531		PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2);
532		PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2);
533		basepaget = (long long) (PPro_vmtrr[i].base >> 12);
534		if (basepage == basepaget)
535			skip = 1;
536		if ((PPro_vmtrr[i].mask & 0x800) == 0) {
537			if (free == -1)
538				free = i;
539		}
540	}
541
542	if (!skip && free != -1) {
543		wbinvd();
544		PPro_vmtrr[free].base = base;
545		PPro_vmtrr[free].mask = mask;
546		wrmsr(PPRO_VMTRRphysBase0 + free * 2, base);
547		wrmsr(PPRO_VMTRRphysMask0 + free * 2, mask);
548		printf(
549	"pmap: added WC mapping at page: 0x%x %x, size: %u mask: 0x%x %x\n",
550		    (u_int)(base >> 32), (u_int)base, sizea,
551		    (u_int)(mask >> 32), (u_int)mask);
552	}
553}
554
555/*
556 * Set 4mb pdir for mp startup, and global flags
557 */
558void
559pmap_set_opt(unsigned *pdir) {
560	int i;
561
562	if (pseflag && (cpu_feature & CPUID_PSE)) {
563		load_cr4(rcr4() | CR4_PSE);
564		if (pdir4mb) {
565			pdir[KPTDI] = pdir4mb;
566		}
567	}
568
569	if (pgeflag && (cpu_feature & CPUID_PGE)) {
570		load_cr4(rcr4() | CR4_PGE);
571		for(i = KPTDI; i < KPTDI + nkpt; i++) {
572			if (pdir[i]) {
573				pdir[i] |= PG_G;
574			}
575		}
576	}
577}
578
579/*
580 * Setup the PTD for the boot processor
581 */
582void
583pmap_set_opt_bsp(void)
584{
585	pmap_set_opt((unsigned *)kernel_pmap->pm_pdir);
586	pmap_set_opt((unsigned *)PTD);
587	invltlb();
588}
589
590/*
591 *	Initialize the pmap module.
592 *	Called by vm_init, to initialize any structures that the pmap
593 *	system needs to map virtual memory.
594 *	pmap_init has been enhanced to support, in a fairly consistent
595 *	way, discontiguous physical memory.
596 */
597void
598pmap_init(phys_start, phys_end)
599	vm_offset_t phys_start, phys_end;
600{
601	vm_offset_t addr;
602	vm_size_t s;
603	int i;
604	int initial_pvs;
605
606	/*
607	 * calculate the number of pv_entries needed
608	 */
609	vm_first_phys = phys_avail[0];
610	for (i = 0; phys_avail[i + 1]; i += 2);
611	pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE;
612
613	/*
614	 * Allocate memory for random pmap data structures.  Includes the
615	 * pv_head_table.
616	 */
617	s = (vm_size_t) (sizeof(pv_table_t) * pv_npg);
618	s = round_page(s);
619
620	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
621	pv_table = (pv_table_t *) addr;
622	for(i = 0; i < pv_npg; i++) {
623		vm_offset_t pa;
624		TAILQ_INIT(&pv_table[i].pv_list);
625		pv_table[i].pv_list_count = 0;
626		pa = vm_first_phys + i * PAGE_SIZE;
627		pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa);
628	}
629
630	/*
631	 * init the pv free list
632	 */
633	initial_pvs = pv_npg;
634	if (initial_pvs < MINPV)
635		initial_pvs = MINPV;
636	pvzone = &pvzone_store;
637	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
638		initial_pvs * sizeof (struct pv_entry));
639	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg);
640	/*
641	 * object for kernel page table pages
642	 */
643	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
644
645	/*
646	 * Now it is safe to enable pv_table recording.
647	 */
648	pmap_initialized = TRUE;
649}
650
651/*
652 * Initialize the address space (zone) for the pv_entries.  Set a
653 * high water mark so that the system can recover from excessive
654 * numbers of pv entries.
655 */
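/*
 * Once pv_entry_count climbs past pv_entry_high_water (90% of the limit
 * computed below), get_pv_entry() wakes the pagedaemon, and
 * pmap_collect() then tears down idle mappings to recover pv entries.
 */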
656void
657pmap_init2() {
658	pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg;
659	pv_entry_high_water = 9 * (pv_entry_max / 10);
660	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
661}
662
663/*
664 *	Used to map a range of physical addresses into kernel
665 *	virtual address space.
666 *
667 *	For now, VM is already on, we only need to map the
668 *	specified memory.
669 */
670vm_offset_t
671pmap_map(virt, start, end, prot)
672	vm_offset_t virt;
673	vm_offset_t start;
674	vm_offset_t end;
675	int prot;
676{
677	while (start < end) {
678		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
679		virt += PAGE_SIZE;
680		start += PAGE_SIZE;
681	}
682	return (virt);
683}
684
685
686/***************************************************
687 * Low level helper routines.....
688 ***************************************************/
689
690#if defined(PMAP_DIAGNOSTIC)
691
692/*
693 * This code checks for non-writeable/modified pages.
694 * This should be an invalid condition.
695 */
696static int
697pmap_nw_modified(pt_entry_t ptea) {
698	int pte;
699
700	pte = (int) ptea;
701
702	if ((pte & (PG_M|PG_RW)) == PG_M)
703		return 1;
704	else
705		return 0;
706}
707#endif
708
709
710/*
711 * This routine decides whether modified-bit tracking applies to a va;
712 * the clean submap (clean_sva..clean_eva) is excluded from tracking.
713 */
714static PMAP_INLINE int
715pmap_track_modified( vm_offset_t va) {
716	if ((va < clean_sva) || (va >= clean_eva))
717		return 1;
718	else
719		return 0;
720}
721
722static PMAP_INLINE void
723invltlb_1pg( vm_offset_t va) {
724#if defined(I386_CPU)
725	if (cpu_class == CPUCLASS_386) {
726		invltlb();
727	} else
728#endif
729	{
730		invlpg(va);
731	}
732}
733
734static PMAP_INLINE void
735invltlb_2pg( vm_offset_t va1, vm_offset_t va2) {
736#if defined(I386_CPU)
737	if (cpu_class == CPUCLASS_386) {
738		invltlb();
739	} else
740#endif
741	{
742		invlpg(va1);
743		invlpg(va2);
744	}
745}
746
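/*
 * get_ptbase() relies on the recursive mapping set up in pmap_pinit():
 * the page directory entry at slot PTDPTDI maps the page tables of the
 * current address space at PTmap.  For any other pmap, its directory is
 * temporarily plugged into the alternate slot (APTDpde) so that its
 * page tables appear at APTmap instead.
 */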
747static unsigned *
748get_ptbase(pmap)
749	pmap_t pmap;
750{
751	unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
752
753	/* are we current address space or kernel? */
754	if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
755		return (unsigned *) PTmap;
756	}
757	/* otherwise, we are alternate address space */
758	if (frame != (((unsigned) APTDpde) & PG_FRAME)) {
759		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
760#if defined(SMP)
761		/* The page directory is not shared between CPUs */
762		cpu_invltlb();
763#else
764		invltlb();
765#endif
766	}
767	return (unsigned *) APTmap;
768}
769
770/*
771 * Super fast pmap_pte routine best used when scanning
772 * the pv lists.  This eliminates many coarse-grained
773 * invltlb calls.  Note that many of the pv list
774 * scans are across different pmaps.  It is very wasteful
775 * to do an entire invltlb for checking a single mapping.
776 */
777
778static unsigned *
779pmap_pte_quick(pmap, va)
780	register pmap_t pmap;
781	vm_offset_t va;
782{
783	unsigned pde, newpf;
784	if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) {
785		unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
786		unsigned index = i386_btop(va);
787		/* are we current address space or kernel? */
788		if ((pmap == kernel_pmap) ||
789			(frame == (((unsigned) PTDpde) & PG_FRAME))) {
790			return (unsigned *) PTmap + index;
791		}
792		newpf = pde & PG_FRAME;
793#ifdef SMP
794		if ( ((* (unsigned *) prv_PMAP1) & PG_FRAME) != newpf) {
795			* (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V;
796			cpu_invlpg(&prv_PPAGE1);
797		}
798		return prv_PPAGE1 + ((unsigned) index & (NPTEPG - 1));
799#else
800		if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) {
801			* (unsigned *) PMAP1 = newpf | PG_RW | PG_V;
802			invltlb_1pg((vm_offset_t) PADDR1);
803		}
804		return PADDR1 + ((unsigned) index & (NPTEPG - 1));
805#endif
806	}
807	return (0);
808}
809
810/*
811 *	Routine:	pmap_extract
812 *	Function:
813 *		Extract the physical page address associated
814 *		with the given map/virtual_address pair.
815 */
816vm_offset_t
817pmap_extract(pmap, va)
818	register pmap_t pmap;
819	vm_offset_t va;
820{
821	vm_offset_t rtval;
822	vm_offset_t pdirindex;
823	pdirindex = va >> PDRSHIFT;
824	if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
825		unsigned *pte;
826		if ((rtval & PG_PS) != 0) {
827			rtval &= ~(NBPDR - 1);
828			rtval |= va & (NBPDR - 1);
829			return rtval;
830		}
831		pte = get_ptbase(pmap) + i386_btop(va);
832		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
833		return rtval;
834	}
835	return 0;
836
837}
838
839/*
840 * determine if a page is managed (memory vs. device)
841 */
842static PMAP_INLINE int
843pmap_is_managed(pa)
844	vm_offset_t pa;
845{
846	int i;
847
848	if (!pmap_initialized)
849		return 0;
850
851	for (i = 0; phys_avail[i + 1]; i += 2) {
852		if (pa < phys_avail[i + 1] && pa >= phys_avail[i])
853			return 1;
854	}
855	return 0;
856}
857
858
859/***************************************************
860 * Low level mapping routines.....
861 ***************************************************/
862
863/*
864 * Add a list of wired pages to the kva.  This routine
865 * is only used for temporary
866 * kernel mappings that do not need to have
867 * page modification or references recorded.
868 * Note that old mappings are simply written
869 * over.  The page *must* be wired.
870 */
871void
872pmap_qenter(va, m, count)
873	vm_offset_t va;
874	vm_page_t *m;
875	int count;
876{
877	int i;
878	register unsigned *pte;
879
880	for (i = 0; i < count; i++) {
881		vm_offset_t tva = va + i * PAGE_SIZE;
882		unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag;
883		unsigned opte;
884		pte = (unsigned *)vtopte(tva);
885		opte = *pte;
886		*pte = npte;
887		if (opte)
888			invltlb_1pg(tva);
889	}
890}
891
892/*
893 * this routine jerks page mappings from the
894 * kernel -- it is meant only for temporary mappings.
895 */
896void
897pmap_qremove(va, count)
898	vm_offset_t va;
899	int count;
900{
901	int i;
902	register unsigned *pte;
903
904	for (i = 0; i < count; i++) {
905		pte = (unsigned *)vtopte(va);
906		*pte = 0;
907		invltlb_1pg(va);
908		va += PAGE_SIZE;
909	}
910}
911
912/*
913 * add a wired page to the kva
914 * note that in order for the mapping to take effect -- you
915 * should do an invltlb after doing the pmap_kenter...
916 */
917PMAP_INLINE void
918pmap_kenter(va, pa)
919	vm_offset_t va;
920	register vm_offset_t pa;
921{
922	register unsigned *pte;
923	unsigned npte, opte;
924
925	npte = pa | PG_RW | PG_V | pgeflag;
926	pte = (unsigned *)vtopte(va);
927	opte = *pte;
928	*pte = npte;
929	if (opte)
930		invltlb_1pg(va);
931}
932
933/*
934 * remove a page from the kernel pagetables
935 */
936PMAP_INLINE void
937pmap_kremove(va)
938	vm_offset_t va;
939{
940	register unsigned *pte;
941
942	pte = (unsigned *)vtopte(va);
943	*pte = 0;
944	invltlb_1pg(va);
945}
946
947static vm_page_t
948pmap_page_lookup(object, pindex)
949	vm_object_t object;
950	vm_pindex_t pindex;
951{
952	vm_page_t m;
953retry:
954	m = vm_page_lookup(object, pindex);
955	if (m && vm_page_sleep(m, "pplookp", NULL))
956		goto retry;
957	return m;
958}
959
960/*
961 * Create the UPAGES for a new process.
962 * This routine directly affects the fork perf for a process.
963 */
964void
965pmap_new_proc(p)
966	struct proc *p;
967{
968	int i, updateneeded;
969	vm_object_t upobj;
970	vm_page_t m;
971	struct user *up;
972	unsigned *ptek, oldpte;
973
974	/*
975	 * allocate object for the upages
976	 */
977	if ((upobj = p->p_upages_obj) == NULL) {
978		upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES);
979		p->p_upages_obj = upobj;
980	}
981
982	/* get a kernel virtual address for the UPAGES for this proc */
983	if ((up = p->p_addr) == NULL) {
984		up = (struct user *) kmem_alloc_pageable(kernel_map,
985				UPAGES * PAGE_SIZE);
986#if !defined(MAX_PERF)
987		if (up == NULL)
988			panic("pmap_new_proc: u_map allocation failed");
989#endif
990		p->p_addr = up;
991	}
992
993	ptek = (unsigned *) vtopte((vm_offset_t) up);
994
995	updateneeded = 0;
996	for(i=0;i<UPAGES;i++) {
997		/*
998		 * Get a kernel stack page
999		 */
1000		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1001
1002		/*
1003		 * Wire the page
1004		 */
1005		m->wire_count++;
1006		cnt.v_wire_count++;
1007
1008		oldpte = *(ptek + i);
1009		/*
1010		 * Enter the page into the kernel address space.
1011		 */
1012		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
1013		if (oldpte) {
1014			if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) {
1015				invlpg((vm_offset_t) up + i * PAGE_SIZE);
1016			} else {
1017				updateneeded = 1;
1018			}
1019		}
1020
1021		vm_page_wakeup(m);
1022		m->flags &= ~PG_ZERO;
1023		m->flags |= PG_MAPPED | PG_WRITEABLE;
1024		m->valid = VM_PAGE_BITS_ALL;
1025	}
1026	if (updateneeded)
1027		invltlb();
1028}
1029
1030/*
1031 * Dispose of the UPAGES for a process that has exited.
1032 * This routine directly impacts the exit perf of a process.
1033 */
1034void
1035pmap_dispose_proc(p)
1036	struct proc *p;
1037{
1038	int i;
1039	vm_object_t upobj;
1040	vm_page_t m;
1041	unsigned *ptek, oldpte;
1042
1043	upobj = p->p_upages_obj;
1044
1045	ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr);
1046	for(i=0;i<UPAGES;i++) {
1047
1048		if ((m = vm_page_lookup(upobj, i)) == NULL)
1049			panic("pmap_dispose_proc: upage already missing???");
1050
1051		m->flags |= PG_BUSY;
1052
1053		oldpte = *(ptek + i);
1054		*(ptek + i) = 0;
1055		if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386))
1056			invlpg((vm_offset_t) p->p_addr + i * PAGE_SIZE);
1057		vm_page_unwire(m, 0);
1058		vm_page_free(m);
1059	}
1060
1061	if (cpu_class <= CPUCLASS_386)
1062		invltlb();
1063}
1064
1065/*
1066 * Allow the UPAGES for a process to be prejudicially paged out.
1067 */
1068void
1069pmap_swapout_proc(p)
1070	struct proc *p;
1071{
1072	int i;
1073	vm_object_t upobj;
1074	vm_page_t m;
1075
1076	upobj = p->p_upages_obj;
1077	/*
1078	 * let the upages be paged
1079	 */
1080	for(i=0;i<UPAGES;i++) {
1081		if ((m = vm_page_lookup(upobj, i)) == NULL)
1082			panic("pmap_swapout_proc: upage already missing???");
1083		m->dirty = VM_PAGE_BITS_ALL;
1084		vm_page_unwire(m, 0);
1085		pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i);
1086	}
1087}
1088
1089/*
1090 * Bring the UPAGES for a specified process back in.
1091 */
1092void
1093pmap_swapin_proc(p)
1094	struct proc *p;
1095{
1096	int i,rv;
1097	vm_object_t upobj;
1098	vm_page_t m;
1099
1100	upobj = p->p_upages_obj;
1101	for(i=0;i<UPAGES;i++) {
1102
1103		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1104
1105		pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE,
1106			VM_PAGE_TO_PHYS(m));
1107
1108		if (m->valid != VM_PAGE_BITS_ALL) {
1109			rv = vm_pager_get_pages(upobj, &m, 1, 0);
1110#if !defined(MAX_PERF)
1111			if (rv != VM_PAGER_OK)
1112				panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid);
1113#endif
1114			m = vm_page_lookup(upobj, i);
1115			m->valid = VM_PAGE_BITS_ALL;
1116		}
1117
1118		vm_page_wire(m);
1119		vm_page_wakeup(m);
1120		m->flags |= PG_MAPPED | PG_WRITEABLE;
1121	}
1122}
1123
1124/***************************************************
1125 * Page table page management routines.....
1126 ***************************************************/
1127
1128/*
1129 * This routine unholds page table pages, and if the hold count
1130 * drops to zero, then it decrements the wire count.
1131 */
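/*
 * For page table pages, hold_count tracks the number of mappings that
 * currently live in the page (pmap_allocpte adds one per new PTE),
 * while wire_count simply keeps the page resident for as long as it is
 * in use as a page table.
 */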
1132static int
1133_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
1134	int s;
1135
1136	while (vm_page_sleep(m, "pmuwpt", NULL));
1137
1138	if (m->hold_count == 0) {
1139		vm_offset_t pteva;
1140		/*
1141		 * unmap the page table page
1142		 */
1143		pmap->pm_pdir[m->pindex] = 0;
1144		--pmap->pm_stats.resident_count;
1145		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1146			(((unsigned) PTDpde) & PG_FRAME)) {
1147			/*
1148			 * Do an invltlb to make the invalidated mapping
1149			 * take effect immediately.
1150			 */
1151			pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex);
1152			invltlb_1pg(pteva);
1153		}
1154
1155		if (pmap->pm_ptphint == m)
1156			pmap->pm_ptphint = NULL;
1157
1158		/*
1159		 * If the page is finally unwired, simply free it.
1160		 */
1161		--m->wire_count;
1162		if (m->wire_count == 0) {
1163
1164			if (m->flags & PG_WANTED) {
1165				m->flags &= ~PG_WANTED;
1166				wakeup(m);
1167			}
1168
1169			m->flags |= PG_BUSY;
1170			vm_page_free_zero(m);
1171			--cnt.v_wire_count;
1172		}
1173		return 1;
1174	}
1175	return 0;
1176}
1177
1178static PMAP_INLINE int
1179pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
1180	vm_page_unhold(m);
1181	if (m->hold_count == 0)
1182		return _pmap_unwire_pte_hold(pmap, m);
1183	else
1184		return 0;
1185}
1186
1187/*
1188 * After removing a page table entry, this routine is used to
1189 * conditionally free the page, and manage the hold/wire counts.
1190 */
1191static int
1192pmap_unuse_pt(pmap, va, mpte)
1193	pmap_t pmap;
1194	vm_offset_t va;
1195	vm_page_t mpte;
1196{
1197	unsigned ptepindex;
1198	if (va >= UPT_MIN_ADDRESS)
1199		return 0;
1200
1201	if (mpte == NULL) {
1202		ptepindex = (va >> PDRSHIFT);
1203		if (pmap->pm_ptphint &&
1204			(pmap->pm_ptphint->pindex == ptepindex)) {
1205			mpte = pmap->pm_ptphint;
1206		} else {
1207			mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1208			pmap->pm_ptphint = mpte;
1209		}
1210	}
1211
1212	return pmap_unwire_pte_hold(pmap, mpte);
1213}
1214
1215#if !defined(SMP)
1216void
1217pmap_pinit0(pmap)
1218	struct pmap *pmap;
1219{
1220	pmap->pm_pdir =
1221		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1222	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1223	pmap->pm_flags = 0;
1224	pmap->pm_count = 1;
1225	pmap->pm_ptphint = NULL;
1226	TAILQ_INIT(&pmap->pm_pvlist);
1227	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1228}
1229#else
1230void
1231pmap_pinit0(pmap)
1232	struct pmap *pmap;
1233{
1234	pmap_pinit(pmap);
1235}
1236#endif
1237
1238/*
1239 * Initialize a preallocated and zeroed pmap structure,
1240 * such as one in a vmspace structure.
1241 */
1242void
1243pmap_pinit(pmap)
1244	register struct pmap *pmap;
1245{
1246	vm_page_t ptdpg;
1247
1248	/*
1249	 * No need to allocate page table space yet but we do need a valid
1250	 * page directory table.
1251	 */
1252	if (pmap->pm_pdir == NULL)
1253		pmap->pm_pdir =
1254			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1255
1256	/*
1257	 * allocate object for the ptes
1258	 */
1259	if (pmap->pm_pteobj == NULL)
1260		pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1);
1261
1262	/*
1263	 * allocate the page directory page
1264	 */
1265retry:
1266	ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI,
1267			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1268
1269	ptdpg->wire_count = 1;
1270	++cnt.v_wire_count;
1271
1272	ptdpg->flags &= ~(PG_MAPPED | PG_BUSY);	/* not mapped normally */
1273	ptdpg->valid = VM_PAGE_BITS_ALL;
1274
1275	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1276	if ((ptdpg->flags & PG_ZERO) == 0)
1277		bzero(pmap->pm_pdir, PAGE_SIZE);
1278
1279	/* wire in kernel global address entries */
1280	/* XXX copies current process, does not fill in MPPTDI */
1281	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1282
1283	/* install self-referential address mapping entry */
1284	*(unsigned *) (pmap->pm_pdir + PTDPTDI) =
1285		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
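	/*
	 * Because of this recursive entry, whenever this page directory is
	 * loaded into %cr3 the process's own page tables become visible as
	 * a linear array at PTmap, and the directory itself appears at PTD.
	 */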
1286
1287	pmap->pm_flags = 0;
1288	pmap->pm_count = 1;
1289	pmap->pm_ptphint = NULL;
1290	TAILQ_INIT(&pmap->pm_pvlist);
1291	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1292}
1293
1294static int
1295pmap_release_free_page(pmap, p)
1296	struct pmap *pmap;
1297	vm_page_t p;
1298{
1299	int s;
1300	unsigned *pde = (unsigned *) pmap->pm_pdir;
1301	/*
1302	 * This code optimizes the case of freeing non-busy
1303	 * page-table pages.  Those pages are zero now, and
1304	 * might as well be placed directly into the zero queue.
1305	 */
1306	if (vm_page_sleep(p, "pmaprl", NULL))
1307		return 0;
1308
1309	p->flags |= PG_BUSY;
1310
1311	/*
1312	 * Remove the page table page from the process's address space.
1313	 */
1314	pde[p->pindex] = 0;
1315	pmap->pm_stats.resident_count--;
1316
1317#if !defined(MAX_PERF)
1318	if (p->hold_count)  {
1319		panic("pmap_release: freeing held page table page");
1320	}
1321#endif
1322	/*
1323	 * Page directory pages need to have the kernel
1324	 * stuff cleared, so they can go into the zero queue also.
1325	 */
1326	if (p->pindex == PTDPTDI) {
1327		bzero(pde + KPTDI, nkpt * PTESIZE);
1328#ifdef SMP
1329		pde[MPPTDI] = 0;
1330#endif
1331		pde[APTDPTDI] = 0;
1332		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1333	}
1334
1335	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1336		pmap->pm_ptphint = NULL;
1337
1338	p->wire_count--;
1339	cnt.v_wire_count--;
1340	vm_page_free_zero(p);
1341	return 1;
1342}
1343
1344/*
1345 * this routine is called if the page table page is not
1346 * mapped correctly.
1347 */
1348static vm_page_t
1349_pmap_allocpte(pmap, ptepindex)
1350	pmap_t	pmap;
1351	unsigned ptepindex;
1352{
1353	vm_offset_t pteva, ptepa;
1354	vm_page_t m;
1355
1356	/*
1357	 * Find or fabricate a new pagetable page
1358	 */
1359	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1360			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1361
1362	if (m->queue != PQ_NONE) {
1363		int s = splvm();
1364		vm_page_unqueue(m);
1365		splx(s);
1366	}
1367
1368	if (m->wire_count == 0)
1369		cnt.v_wire_count++;
1370	m->wire_count++;
1371
1372	/*
1373	 * Increment the hold count for the page table page
1374	 * (denoting a new mapping.)
1375	 */
1376	m->hold_count++;
1377
1378	/*
1379	 * Map the pagetable page into the process address space, if
1380	 * it isn't already there.
1381	 */
1382
1383	pmap->pm_stats.resident_count++;
1384
1385	ptepa = VM_PAGE_TO_PHYS(m);
1386	pmap->pm_pdir[ptepindex] =
1387		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1388
1389	/*
1390	 * Set the page table hint
1391	 */
1392	pmap->pm_ptphint = m;
1393
1394	/*
1395	 * Try to use the new mapping, but if we cannot, then
1396	 * do it with the routine that maps the page explicitly.
1397	 */
1398	if ((m->flags & PG_ZERO) == 0) {
1399		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1400			(((unsigned) PTDpde) & PG_FRAME)) {
1401			pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex);
1402			bzero((caddr_t) pteva, PAGE_SIZE);
1403		} else {
1404			pmap_zero_page(ptepa);
1405		}
1406	}
1407
1408	m->valid = VM_PAGE_BITS_ALL;
1409	m->flags &= ~(PG_ZERO | PG_BUSY);
1410	m->flags |= PG_MAPPED;
1411
1412	return m;
1413}
1414
1415static vm_page_t
1416pmap_allocpte(pmap, va)
1417	pmap_t	pmap;
1418	vm_offset_t va;
1419{
1420	unsigned ptepindex;
1421	vm_offset_t ptepa;
1422	vm_page_t m;
1423
1424	/*
1425	 * Calculate pagetable page index
1426	 */
1427	ptepindex = va >> PDRSHIFT;
1428
1429	/*
1430	 * Get the page directory entry
1431	 */
1432	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1433
1434	/*
1435	 * This supports switching from a 4MB page to a
1436	 * normal 4K page.
1437	 */
1438	if (ptepa & PG_PS) {
1439		pmap->pm_pdir[ptepindex] = 0;
1440		ptepa = 0;
1441		invltlb();
1442	}
1443
1444	/*
1445	 * If the page table page is mapped, we just increment the
1446	 * hold count, and activate it.
1447	 */
1448	if (ptepa) {
1449		/*
1450		 * In order to get the page table page, try the
1451		 * hint first.
1452		 */
1453		if (pmap->pm_ptphint &&
1454			(pmap->pm_ptphint->pindex == ptepindex)) {
1455			m = pmap->pm_ptphint;
1456		} else {
1457			m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1458			pmap->pm_ptphint = m;
1459		}
1460		m->hold_count++;
1461		return m;
1462	}
1463	/*
1464	 * Here if the pte page isn't mapped, or if it has been deallocated.
1465	 */
1466	return _pmap_allocpte(pmap, ptepindex);
1467}
1468
1469
1470/***************************************************
1471 * Pmap allocation/deallocation routines.
1472 ***************************************************/
1473
1474/*
1475 * Release any resources held by the given physical map.
1476 * Called when a pmap initialized by pmap_pinit is being released.
1477 * Should only be called if the map contains no valid mappings.
1478 */
1479void
1480pmap_release(pmap)
1481	register struct pmap *pmap;
1482{
1483	vm_page_t p,n,ptdpg;
1484	vm_object_t object = pmap->pm_pteobj;
1485	int curgeneration;
1486
1487#if defined(DIAGNOSTIC)
1488	if (object->ref_count != 1)
1489		panic("pmap_release: pteobj reference count != 1");
1490#endif
1491
1492	ptdpg = NULL;
1493retry:
1494	curgeneration = object->generation;
1495	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1496		n = TAILQ_NEXT(p, listq);
1497		if (p->pindex == PTDPTDI) {
1498			ptdpg = p;
1499			continue;
1500		}
1501		while (1) {
1502			if (!pmap_release_free_page(pmap, p) &&
1503				(object->generation != curgeneration))
1504				goto retry;
1505		}
1506	}
1507
1508	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1509		goto retry;
1510}
1511
1512/*
1513 * grow the number of kernel page table entries, if needed
1514 */
1515void
1516pmap_growkernel(vm_offset_t addr)
1517{
1518	struct proc *p;
1519	struct pmap *pmap;
1520	int s;
1521	vm_offset_t ptppaddr;
1522	vm_page_t nkpg;
1523#ifdef SMP
1524	int i;
1525#endif
1526	pd_entry_t newpdir;
1527
1528	s = splhigh();
1529	if (kernel_vm_end == 0) {
1530		kernel_vm_end = KERNBASE;
1531		nkpt = 0;
1532		while (pdir_pde(PTD, kernel_vm_end)) {
1533			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1534			nkpt++;
1535		}
1536	}
1537	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1538	while (kernel_vm_end < addr) {
1539		if (pdir_pde(PTD, kernel_vm_end)) {
1540			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1541			continue;
1542		}
1543
1544		/*
1545		 * This index is bogus, but out of the way
1546		 */
1547		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1548#if !defined(MAX_PERF)
1549		if (!nkpg)
1550			panic("pmap_growkernel: no memory to grow kernel");
1551#endif
1552
1553		nkpt++;
1554
1555		vm_page_wire(nkpg);
1556		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1557		pmap_zero_page(ptppaddr);
1558		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1559		pdir_pde(PTD, kernel_vm_end) = newpdir;
1560
1561#ifdef SMP
1562		for (i = 0; i < mp_ncpus; i++) {
1563			if (IdlePTDS[i])
1564				pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir;
1565		}
1566#endif
1567
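		/*
		 * Each pmap carries its own copy of the kernel PDEs (made in
		 * pmap_pinit), so the new kernel page table page has to be
		 * entered into every existing page directory as well.
		 */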
1568		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1569			if (p->p_vmspace) {
1570				pmap = &p->p_vmspace->vm_pmap;
1571				*pmap_pde(pmap, kernel_vm_end) = newpdir;
1572			}
1573		}
1574		if (aiovmspace != NULL) {
1575			pmap = &aiovmspace->vm_pmap;
1576			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1577		}
1578		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1579		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1580	}
1581	splx(s);
1582}
1583
1584/*
1585 *	Retire the given physical map from service.
1586 *	Should only be called if the map contains
1587 *	no valid mappings.
1588 */
1589void
1590pmap_destroy(pmap)
1591	register pmap_t pmap;
1592{
1593	int count;
1594
1595	if (pmap == NULL)
1596		return;
1597
1598	count = --pmap->pm_count;
1599	if (count == 0) {
1600		pmap_release(pmap);
1601#if !defined(MAX_PERF)
1602		panic("destroying a pmap is not yet implemented");
1603#endif
1604	}
1605}
1606
1607/*
1608 *	Add a reference to the specified pmap.
1609 */
1610void
1611pmap_reference(pmap)
1612	pmap_t pmap;
1613{
1614	if (pmap != NULL) {
1615		pmap->pm_count++;
1616	}
1617}
1618
1619/***************************************************
1620 * Page management routines.
1621 ***************************************************/
1622
1623/*
1624 * free the pv_entry back to the free list
1625 */
1626static PMAP_INLINE void
1627free_pv_entry(pv)
1628	pv_entry_t pv;
1629{
1630	pv_entry_count--;
1631	zfreei(pvzone, pv);
1632}
1633
1634/*
1635 * get a new pv_entry, allocating a block from the system
1636 * when needed.
1637 * the memory allocation is performed bypassing the malloc code
1638 * because of the possibility of allocations at interrupt time.
1639 */
1640static pv_entry_t
1641get_pv_entry(void)
1642{
1643	pv_entry_count++;
1644	if (pv_entry_high_water &&
1645		(pv_entry_count > pv_entry_high_water) &&
1646		(pmap_pagedaemon_waken == 0)) {
1647		pmap_pagedaemon_waken = 1;
1648		wakeup (&vm_pages_needed);
1649	}
1650	return zalloci(pvzone);
1651}
1652
1653/*
1654 * This routine is very drastic, but can save the system
1655 * in a pinch.
1656 */
1657void
1658pmap_collect() {
1659	pv_table_t *ppv;
1660	int i;
1661	vm_offset_t pa;
1662	vm_page_t m;
1663	static int warningdone=0;
1664
1665	if (pmap_pagedaemon_waken == 0)
1666		return;
1667
1668	if (warningdone < 5) {
1669		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1670		warningdone++;
1671	}
1672
1673	for(i = 0; i < pv_npg; i++) {
1674		if ((ppv = &pv_table[i]) == 0)
1675			continue;
1676		m = ppv->pv_vm_page;
1677		if ((pa = VM_PAGE_TO_PHYS(m)) == 0)
1678			continue;
1679		if (m->wire_count || m->hold_count || m->busy ||
1680			(m->flags & PG_BUSY))
1681			continue;
1682		pmap_remove_all(pa);
1683	}
1684	pmap_pagedaemon_waken = 0;
1685}
1686
1687
1688/*
1689 * Remove the pv entry for the given (pmap, va) from both the
1690 * per-page pv list and the per-pmap list, and free the now
1691 * unused entry.  Whichever of the two lists is expected to be
1692 * shorter is the one that is searched.
1693 */
1694
1695static int
1696pmap_remove_entry(pmap, ppv, va)
1697	struct pmap *pmap;
1698	pv_table_t *ppv;
1699	vm_offset_t va;
1700{
1701	pv_entry_t pv;
1702	int rtval;
1703	int s;
1704
1705	s = splvm();
1706	if (ppv->pv_list_count < pmap->pm_stats.resident_count) {
1707		for (pv = TAILQ_FIRST(&ppv->pv_list);
1708			pv;
1709			pv = TAILQ_NEXT(pv, pv_list)) {
1710			if (pmap == pv->pv_pmap && va == pv->pv_va)
1711				break;
1712		}
1713	} else {
1714		for (pv = TAILQ_FIRST(&pmap->pm_pvlist);
1715			pv;
1716			pv = TAILQ_NEXT(pv, pv_plist)) {
1717			if (va == pv->pv_va)
1718				break;
1719		}
1720	}
1721
1722	rtval = 0;
1723	if (pv) {
1724
1725		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1726		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
1727		ppv->pv_list_count--;
1728		if (TAILQ_FIRST(&ppv->pv_list) == NULL)
1729			ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
1730
1731		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1732		free_pv_entry(pv);
1733	}
1734
1735	splx(s);
1736	return rtval;
1737}
1738
1739/*
1740 * Create a pv entry for page at pa for
1741 * (pmap, va).
1742 */
1743static void
1744pmap_insert_entry(pmap, va, mpte, pa)
1745	pmap_t pmap;
1746	vm_offset_t va;
1747	vm_page_t mpte;
1748	vm_offset_t pa;
1749{
1750
1751	int s;
1752	pv_entry_t pv;
1753	pv_table_t *ppv;
1754
1755	s = splvm();
1756	pv = get_pv_entry();
1757	pv->pv_va = va;
1758	pv->pv_pmap = pmap;
1759	pv->pv_ptem = mpte;
1760
1761	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1762
1763	ppv = pa_to_pvh(pa);
1764	TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
1765	ppv->pv_list_count++;
1766
1767	splx(s);
1768}
1769
1770/*
1771 * pmap_remove_pte: do the things to unmap a page in a process
1772 */
1773static int
1774pmap_remove_pte(pmap, ptq, va)
1775	struct pmap *pmap;
1776	unsigned *ptq;
1777	vm_offset_t va;
1778{
1779	unsigned oldpte;
1780	pv_table_t *ppv;
1781
1782	oldpte = *ptq;
1783	*ptq = 0;
1784	if (oldpte & PG_W)
1785		pmap->pm_stats.wired_count -= 1;
1786	/*
1787	 * Machines that don't support invlpg, also don't support
1788	 * Machines that don't support invlpg also don't support
1789	 */
1790	if (oldpte & PG_G)
1791		invlpg(va);
1792	pmap->pm_stats.resident_count -= 1;
1793	if (oldpte & PG_MANAGED) {
1794		ppv = pa_to_pvh(oldpte);
1795		if (oldpte & PG_M) {
1796#if defined(PMAP_DIAGNOSTIC)
1797			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1798				printf(
1799	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1800				    va, oldpte);
1801			}
1802#endif
1803			if (pmap_track_modified(va))
1804				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
1805		}
1806		if (oldpte & PG_A)
1807			ppv->pv_vm_page->flags |= PG_REFERENCED;
1808		return pmap_remove_entry(pmap, ppv, va);
1809	} else {
1810		return pmap_unuse_pt(pmap, va, NULL);
1811	}
1812
1813	return 0;
1814}
1815
1816/*
1817 * Remove a single page from a process address space
1818 */
1819static void
1820pmap_remove_page(pmap, va)
1821	struct pmap *pmap;
1822	register vm_offset_t va;
1823{
1824	register unsigned *ptq;
1825
1826	/*
1827	 * if there is no pte for this address, just skip it!!!
1828	 */
1829	if (*pmap_pde(pmap, va) == 0) {
1830		return;
1831	}
1832
1833	/*
1834	 * get a local va for mappings for this pmap.
1835	 */
1836	ptq = get_ptbase(pmap) + i386_btop(va);
1837	if (*ptq) {
1838		(void) pmap_remove_pte(pmap, ptq, va);
1839		invltlb_1pg(va);
1840	}
1841	return;
1842}
1843
1844/*
1845 *	Remove the given range of addresses from the specified map.
1846 *
1847 *	It is assumed that the start and end are properly
1848 *	rounded to the page size.
1849 */
1850void
1851pmap_remove(pmap, sva, eva)
1852	struct pmap *pmap;
1853	register vm_offset_t sva;
1854	register vm_offset_t eva;
1855{
1856	register unsigned *ptbase;
1857	vm_offset_t pdnxt;
1858	vm_offset_t ptpaddr;
1859	vm_offset_t sindex, eindex;
1860	int anyvalid;
1861
1862	if (pmap == NULL)
1863		return;
1864
1865	if (pmap->pm_stats.resident_count == 0)
1866		return;
1867
1868	/*
1869	 * Special handling for removing a single page: this is a very
1870	 * common operation, so it is worth short-circuiting some of
1871	 * the code below.
1872	 */
1873	if (((sva + PAGE_SIZE) == eva) &&
1874		(((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1875		pmap_remove_page(pmap, sva);
1876		return;
1877	}
1878
1879	anyvalid = 0;
1880
1881	/*
1882	 * Get a local virtual address for the mappings that are being
1883	 * worked with.
1884	 */
1885	ptbase = get_ptbase(pmap);
1886
1887	sindex = i386_btop(sva);
1888	eindex = i386_btop(eva);
1889
1890	for (; sindex < eindex; sindex = pdnxt) {
1891		unsigned pdirindex;
1892
1893		/*
1894		 * Calculate index for next page table.
1895		 */
1896		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1897		if (pmap->pm_stats.resident_count == 0)
1898			break;
1899
1900		pdirindex = sindex / NPDEPG;
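		/*
		 * A 4MB (PG_PS) mapping is torn down by clearing the single
		 * page directory entry; there is no page table page to scan
		 * or release in that case.
		 */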
1901		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1902			pmap->pm_pdir[pdirindex] = 0;
1903			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1904			anyvalid++;
1905			continue;
1906		}
1907
1908		/*
1909		 * Weed out invalid mappings. Note: we assume that the page
1910		 * directory table is always allocated, and in kernel virtual.
1911		 */
1912		if (ptpaddr == 0)
1913			continue;
1914
1915		/*
1916		 * Limit our scan to either the end of the va represented
1917		 * by the current page table page, or to the end of the
1918		 * range being removed.
1919		 */
1920		if (pdnxt > eindex) {
1921			pdnxt = eindex;
1922		}
1923
1924		for ( ;sindex != pdnxt; sindex++) {
1925			vm_offset_t va;
1926			if (ptbase[sindex] == 0) {
1927				continue;
1928			}
1929			va = i386_ptob(sindex);
1930
1931			anyvalid++;
1932			if (pmap_remove_pte(pmap,
1933				ptbase + sindex, va))
1934				break;
1935		}
1936	}
1937
1938	if (anyvalid) {
1939		invltlb();
1940	}
1941}
1942
1943/*
1944 *	Routine:	pmap_remove_all
1945 *	Function:
1946 *		Removes this physical page from
1947 *		all physical maps in which it resides.
1948 *		Reflects back modify bits to the pager.
1949 *
1950 *	Notes:
1951 *		Original versions of this routine were very
1952 *		inefficient because they iteratively called
1953 *		pmap_remove (slow...)
1954 */
1955
1956static void
1957pmap_remove_all(pa)
1958	vm_offset_t pa;
1959{
1960	register pv_entry_t pv;
1961	pv_table_t *ppv;
1962	register unsigned *pte, tpte;
1963	int nmodify;
1964	int update_needed;
1965	int s;
1966
1967	nmodify = 0;
1968	update_needed = 0;
1969#if defined(PMAP_DIAGNOSTIC)
1970	/*
1971	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1972	 * pages!
1973	 */
1974	if (!pmap_is_managed(pa)) {
1975		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", pa);
1976	}
1977#endif
1978
1979	s = splvm();
1980	ppv = pa_to_pvh(pa);
1981	while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) {
1982		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1983
1984		pv->pv_pmap->pm_stats.resident_count--;
1985
1986		tpte = *pte;
1987		*pte = 0;
1988		if (tpte & PG_W)
1989			pv->pv_pmap->pm_stats.wired_count--;
1990
1991		if (tpte & PG_A)
1992			ppv->pv_vm_page->flags |= PG_REFERENCED;
1993
1994		/*
1995		 * Update the vm_page_t clean and reference bits.
1996		 */
1997		if (tpte & PG_M) {
1998#if defined(PMAP_DIAGNOSTIC)
1999			if (pmap_nw_modified((pt_entry_t) tpte)) {
2000				printf(
2001	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
2002				    pv->pv_va, tpte);
2003			}
2004#endif
2005			if (pmap_track_modified(pv->pv_va))
2006				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
2007		}
2008		if (!update_needed &&
2009			((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) ||
2010			(pv->pv_pmap == kernel_pmap))) {
2011			update_needed = 1;
2012		}
2013
2014		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2015		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
2016		ppv->pv_list_count--;
2017		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2018		free_pv_entry(pv);
2019	}
2020
2021	ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
2022
2023	if (update_needed)
2024		invltlb();
2025
2026	splx(s);
2027	return;
2028}
2029
2030/*
2031 *	Set the physical protection on the
2032 *	specified range of this map as requested.
2033 */
2034void
2035pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2036{
2037	register unsigned *ptbase;
2038	vm_offset_t pdnxt, ptpaddr;
2039	vm_pindex_t sindex, eindex;
2040	int anychanged;
2041
2042
2043	if (pmap == NULL)
2044		return;
2045
2046	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2047		pmap_remove(pmap, sva, eva);
2048		return;
2049	}
2050
2051	if (prot & VM_PROT_WRITE)
2052		return;
2053
2054	anychanged = 0;
2055
2056	ptbase = get_ptbase(pmap);
2057
2058	sindex = i386_btop(sva);
2059	eindex = i386_btop(eva);
2060
2061	for (; sindex < eindex; sindex = pdnxt) {
2062
2063		unsigned pdirindex;
2064
2065		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2066
2067		pdirindex = sindex / NPDEPG;
2068		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
2069			(unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2070			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2071			anychanged++;
2072			continue;
2073		}
2074
2075		/*
2076		 * Weed out invalid mappings. Note: we assume that the page
2077		 * directory table is always allocated, and in kernel virtual.
2078		 */
2079		if (ptpaddr == 0)
2080			continue;
2081
2082		if (pdnxt > eindex) {
2083			pdnxt = eindex;
2084		}
2085
2086		for (; sindex != pdnxt; sindex++) {
2087
2088			unsigned pbits;
2089			pv_table_t *ppv;
2090
2091			pbits = ptbase[sindex];
2092
2093			if (pbits & PG_MANAGED) {
2094				ppv = NULL;
2095				if (pbits & PG_A) {
2096					ppv = pa_to_pvh(pbits);
2097					ppv->pv_vm_page->flags |= PG_REFERENCED;
2098					pbits &= ~PG_A;
2099				}
2100				if (pbits & PG_M) {
2101					if (pmap_track_modified(i386_ptob(sindex))) {
2102						if (ppv == NULL)
2103							ppv = pa_to_pvh(pbits);
2104						ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
2105						pbits &= ~PG_M;
2106					}
2107				}
2108			}
2109
2110			pbits &= ~PG_RW;
2111
2112			if (pbits != ptbase[sindex]) {
2113				ptbase[sindex] = pbits;
2114				anychanged = 1;
2115			}
2116		}
2117	}
2118	if (anychanged)
2119		invltlb();
2120}
2121
2122/*
2123 *	Insert the given physical page (p) at
2124 *	the specified virtual address (v) in the
2125 *	target physical map with the protection requested.
2126 *
2127 *	If specified, the page will be wired down, meaning
2128 *	that the related pte can not be reclaimed.
2129 *
2130 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2131 *	or lose information.  That is, this routine must actually
2132 *	insert this page into the given map NOW.
2133 */
2134void
2135pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot,
2136	   boolean_t wired)
2137{
2138	register unsigned *pte;
2139	vm_offset_t opa;
2140	vm_offset_t origpte, newpte;
2141	vm_page_t mpte;
2142
2143	if (pmap == NULL)
2144		return;
2145
2146	va &= PG_FRAME;
2147#ifdef PMAP_DIAGNOSTIC
2148	if (va > VM_MAX_KERNEL_ADDRESS)
2149		panic("pmap_enter: toobig");
2150	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2151		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2152#endif
2153
2154	mpte = NULL;
2155	/*
2156	 * In the case that a page table page is not
2157	 * resident, we are creating it here.
2158	 */
2159	if (va < UPT_MIN_ADDRESS) {
2160		mpte = pmap_allocpte(pmap, va);
2161	}
2162#if 0 && defined(PMAP_DIAGNOSTIC)
2163	else {
2164		vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va);
2165		if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) {
2166			panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n",
2167				pmap->pm_pdir[PTDPTDI], origpte, va);
2168		}
2169		if (smp_active) {
2170			pdeaddr = (vm_offset_t *) IdlePTDS[cpuid];
2171			if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) {
2172				if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr))
2173					printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr);
2174				printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr);
2175				panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n",
2176					pmap->pm_pdir[PTDPTDI], newpte, origpte, va);
2177			}
2178		}
2179	}
2180#endif
2181
2182	pte = pmap_pte(pmap, va);
2183
2184#if !defined(MAX_PERF)
2185	/*
2186	 * Page directory table entry is not valid, so we need a new PT page
2187	 */
2188	if (pte == NULL) {
2189		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2190			(void *)pmap->pm_pdir[PTDPTDI], va);
2191	}
2192#endif
2193
2194	origpte = *(vm_offset_t *)pte;
2195	pa &= PG_FRAME;
2196	opa = origpte & PG_FRAME;
2197
2198#if !defined(MAX_PERF)
2199	if (origpte & PG_PS)
2200		panic("pmap_enter: attempted pmap_enter on 4MB page");
2201#endif
2202
2203	/*
2204	 * Mapping has not changed, must be protection or wiring change.
2205	 */
2206	if (origpte && (opa == pa)) {
2207		/*
2208		 * Wiring change, just update stats. We don't worry about
2209		 * wiring PT pages as they remain resident as long as there
2210		 * are valid mappings in them. Hence, if a user page is wired,
2211		 * the PT page will be also.
2212		 */
2213		if (wired && ((origpte & PG_W) == 0))
2214			pmap->pm_stats.wired_count++;
2215		else if (!wired && (origpte & PG_W))
2216			pmap->pm_stats.wired_count--;
2217
2218#if defined(PMAP_DIAGNOSTIC)
2219		if (pmap_nw_modified((pt_entry_t) origpte)) {
2220			printf(
2221	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2222			    va, origpte);
2223		}
2224#endif
2225
2226		/*
2227		 * Remove extra pte reference
2228		 */
2229		if (mpte)
2230			mpte->hold_count--;
2231
2232		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2233			if ((origpte & PG_RW) == 0) {
2234				*pte |= PG_RW;
2235				invltlb_1pg(va);
2236			}
2237			return;
2238		}
2239
2240		/*
2241		 * We might be turning off write access to the page,
2242		 * so we go ahead and sense modify status.
2243		 */
2244		if (origpte & PG_MANAGED) {
2245			if ((origpte & PG_M) && pmap_track_modified(va)) {
2246				pv_table_t *ppv;
2247				ppv = pa_to_pvh(opa);
2248				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
2249			}
2250			pa |= PG_MANAGED;
2251		}
2252		goto validate;
2253	}
2254	/*
2255	 * Mapping has changed, invalidate old range and fall through to
2256	 * handle validating new mapping.
2257	 */
2258	if (opa) {
2259		int err;
2260		err = pmap_remove_pte(pmap, pte, va);
2261#if !defined(MAX_PERF)
2262		if (err)
2263			panic("pmap_enter: pte vanished, va: 0x%x", va);
2264#endif
2265	}
2266
2267	/*
2268	 * Enter on the PV list if part of our managed memory.  Note that we
2269	 * raise IPL while manipulating pv_table since pmap_enter can be
2270	 * called at interrupt time.
2271	 */
2272	if (pmap_is_managed(pa)) {
2273		pmap_insert_entry(pmap, va, mpte, pa);
2274		pa |= PG_MANAGED;
2275	}
2276
2277	/*
2278	 * Increment counters
2279	 */
2280	pmap->pm_stats.resident_count++;
2281	if (wired)
2282		pmap->pm_stats.wired_count++;
2283
2284validate:
2285	/*
2286	 * Now validate mapping with desired protection/wiring.
2287	 */
2288	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2289
2290	if (wired)
2291		newpte |= PG_W;
2292	if (va < UPT_MIN_ADDRESS)
2293		newpte |= PG_U;
2294	if (pmap == kernel_pmap)
2295		newpte |= pgeflag;
2296
2297	/*
2298	 * if the mapping or permission bits are different, we need
2299	 * to update the pte.
2300	 */
2301	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2302		*pte = newpte | PG_A;
2303		if (origpte)
2304			invltlb_1pg(va);
2305	}
2306}
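/*
 * Illustrative usage sketch for pmap_enter() above (the vm_page_t "m",
 * the address "va" and the protection are assumptions for illustration):
 *
 *	pmap_enter(&curproc->p_vmspace->vm_pmap, va,
 *	    VM_PAGE_TO_PHYS(m), VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * Because pmap_enter may not lazy-evaluate, the mapping is present in
 * the page tables as soon as the call returns.
 */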
2307
2308/*
2309 * this code makes some *MAJOR* assumptions:
2310 * 1. Current pmap & pmap exists.
2311 * 2. Not wired.
2312 * 3. Read access.
2313 * 4. No page table pages.
2314 * 5. Tlbflush is deferred to calling procedure.
2315 * 6. Page IS managed.
2316 * but is *MUCH* faster than pmap_enter...
2317 */
2318
2319static vm_page_t
2320pmap_enter_quick(pmap, va, pa, mpte)
2321	register pmap_t pmap;
2322	vm_offset_t va;
2323	register vm_offset_t pa;
2324	vm_page_t mpte;
2325{
2326	register unsigned *pte;
2327
2328	/*
2329	 * In the case that a page table page is not
2330	 * resident, we are creating it here.
2331	 */
2332	if (va < UPT_MIN_ADDRESS) {
2333		unsigned ptepindex;
2334		vm_offset_t ptepa;
2335
2336		/*
2337		 * Calculate pagetable page index
2338		 */
2339		ptepindex = va >> PDRSHIFT;
2340		if (mpte && (mpte->pindex == ptepindex)) {
2341			mpte->hold_count++;
2342		} else {
2343retry:
2344			/*
2345			 * Get the page directory entry
2346			 */
2347			ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
2348
2349			/*
2350			 * If the page table page is mapped, we just increment
2351			 * the hold count, and activate it.
2352			 */
2353			if (ptepa) {
2354#if !defined(MAX_PERF)
2355				if (ptepa & PG_PS)
2356					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2357#endif
2358				if (pmap->pm_ptphint &&
2359					(pmap->pm_ptphint->pindex == ptepindex)) {
2360					mpte = pmap->pm_ptphint;
2361				} else {
2362					mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
2363					pmap->pm_ptphint = mpte;
2364				}
2365				if (mpte == NULL)
2366					goto retry;
2367				mpte->hold_count++;
2368			} else {
2369				mpte = _pmap_allocpte(pmap, ptepindex);
2370			}
2371		}
2372	} else {
2373		mpte = NULL;
2374	}
2375
2376	/*
2377	 * This call to vtopte makes the assumption that we are
2378	 * entering the page into the current pmap.  In order to support
2379	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2380	 * But that isn't as quick as vtopte.
2381	 */
2382	pte = (unsigned *)vtopte(va);
2383	if (*pte) {
2384		if (mpte)
2385			pmap_unwire_pte_hold(pmap, mpte);
2386		return 0;
2387	}
2388
2389	/*
2390	 * Enter on the PV list if part of our managed memory.  Note that we
2391	 * raise IPL while manipulating pv_table since pmap_enter can be
2392	 * called at interrupt time.
2393	 */
2394	pmap_insert_entry(pmap, va, mpte, pa);
2395
2396	/*
2397	 * Increment counters
2398	 */
2399	pmap->pm_stats.resident_count++;
2400
2401	/*
2402	 * Now validate mapping with RO protection
2403	 */
2404	*pte = pa | PG_V | PG_U | PG_MANAGED;
2405
2406	return mpte;
2407}
2408
2409#define MAX_INIT_PT (96)
2410/*
2411 * pmap_object_init_pt preloads the ptes for a given object
2412 * into the specified pmap.  This eliminates the blast of soft
2413 * faults on process startup and immediately after an mmap.
2414 */
2415void
2416pmap_object_init_pt(pmap, addr, object, pindex, size, limit)
2417	pmap_t pmap;
2418	vm_offset_t addr;
2419	vm_object_t object;
2420	vm_pindex_t pindex;
2421	vm_size_t size;
2422	int limit;
2423{
2424	vm_offset_t tmpidx;
2425	int psize;
2426	vm_page_t p, mpte;
2427	int objpgs;
2428
2429	if (!pmap)
2430		return;
2431
2432	/*
2433	 * This code maps large physical mmap regions into the
2434	 * processor address space.  Note that some shortcuts
2435	 * are taken, but the code works.
2436	 */
2437	if (pseflag &&
2438		(object->type == OBJT_DEVICE) &&
2439		((addr & (NBPDR - 1)) == 0) &&
2440		((size & (NBPDR - 1)) == 0) ) {
2441		int i;
2442		int s;
2443		vm_page_t m[1];
2444		unsigned int ptepindex;
2445		int npdes;
2446		vm_offset_t ptepa;
2447
2448		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2449			return;
2450
2451retry:
2452		p = vm_page_lookup(object, pindex);
2453		if (p && vm_page_sleep(p, "init4p", NULL))
2454			goto retry;
2455
2456		if (p == NULL) {
2457			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2458			if (p == NULL)
2459				return;
2460			m[0] = p;
2461
2462			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2463				vm_page_free(p);
2464				return;
2465			}
2466
2467			p = vm_page_lookup(object, pindex);
2468			vm_page_wakeup(p);
2469		}
2470
2471		ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p);
2472		if (ptepa & (NBPDR - 1)) {
2473			return;
2474		}
2475
2476		p->valid = VM_PAGE_BITS_ALL;
2477
2478		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2479		npdes = size >> PDRSHIFT;
2480		for(i=0;i<npdes;i++) {
2481			pmap->pm_pdir[ptepindex] =
2482				(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS);
2483			ptepa += NBPDR;
2484			ptepindex += 1;
2485		}
2486		p->flags |= PG_MAPPED;
2487		invltlb();
2488		return;
2489	}
2490
2491	psize = i386_btop(size);
2492
2493	if ((object->type != OBJT_VNODE) ||
2494		(limit && (psize > MAX_INIT_PT) &&
2495			(object->resident_page_count > MAX_INIT_PT))) {
2496		return;
2497	}
2498
2499	if (psize + pindex > object->size)
2500		psize = object->size - pindex;
2501
2502	mpte = NULL;
2503	/*
2504	 * if we are processing a major portion of the object, then scan the
2505	 * entire thing.
2506	 */
2507	if (psize > (object->size >> 2)) {
2508		objpgs = psize;
2509
2510		for (p = TAILQ_FIRST(&object->memq);
2511		    ((objpgs > 0) && (p != NULL));
2512		    p = TAILQ_NEXT(p, listq)) {
2513
2514			tmpidx = p->pindex;
2515			if (tmpidx < pindex) {
2516				continue;
2517			}
2518			tmpidx -= pindex;
2519			if (tmpidx >= psize) {
2520				continue;
2521			}
2522			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2523				(p->busy == 0) &&
2524			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2525				if ((p->queue - p->pc) == PQ_CACHE)
2526					vm_page_deactivate(p);
2527				p->flags |= PG_BUSY;
2528				mpte = pmap_enter_quick(pmap,
2529					addr + i386_ptob(tmpidx),
2530					VM_PAGE_TO_PHYS(p), mpte);
2531				p->flags |= PG_MAPPED;
2532				vm_page_wakeup(p);
2533			}
2534			objpgs -= 1;
2535		}
2536	} else {
2537		/*
2538		 * else lookup the pages one-by-one.
2539		 */
2540		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2541			p = vm_page_lookup(object, tmpidx + pindex);
2542			if (p &&
2543			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2544				(p->busy == 0) &&
2545			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2546				if ((p->queue - p->pc) == PQ_CACHE)
2547					vm_page_deactivate(p);
2548				p->flags |= PG_BUSY;
2549				mpte = pmap_enter_quick(pmap,
2550					addr + i386_ptob(tmpidx),
2551					VM_PAGE_TO_PHYS(p), mpte);
2552				p->flags |= PG_MAPPED;
2553				vm_page_wakeup(p);
2554			}
2555		}
2556	}
2557	return;
2558}
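/*
 * Illustrative usage sketch for pmap_object_init_pt() above.  The map,
 * object and offset names are assumptions for illustration; an
 * mmap-time caller would preload the resident pages of the backing
 * object roughly like this:
 *
 *	pmap_object_init_pt(map->pmap, start, object,
 *	    OFF_TO_IDX(offset), end - start, 1);
 *
 * With a non-zero limit the preload is skipped for large vnode objects,
 * so the startup work stays roughly bounded by MAX_INIT_PT pages.
 */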
2559
2560/*
2561 * pmap_prefault provides a quick way of clustering
2562 * pagefaults into a process's address space.  It is a "cousin"
2563 * of pmap_object_init_pt, except it runs at page fault time instead
2564 * of mmap time.
2565 */
2566#define PFBAK 4
2567#define PFFOR 4
2568#define PAGEORDER_SIZE (PFBAK+PFFOR)
2569
2570static int pmap_prefault_pageorder[] = {
2571	-PAGE_SIZE, PAGE_SIZE,
2572	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2573	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2574	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2575};
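/*
 * Worked example of the probe order encoded above: for a fault at
 * address A (and PAGE_SIZE == 4K), pmap_prefault() below examines
 * A-4K, A+4K, A-8K, A+8K, A-12K, A+12K, A-16K, A+16K, skipping
 * addresses that fall outside the map entry or are already mapped,
 * and giving up at the first neighbor that is not resident in the
 * backing object.
 */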
2576
2577void
2578pmap_prefault(pmap, addra, entry)
2579	pmap_t pmap;
2580	vm_offset_t addra;
2581	vm_map_entry_t entry;
2582{
2583	int i;
2584	vm_offset_t starta;
2585	vm_offset_t addr;
2586	vm_pindex_t pindex;
2587	vm_page_t m, mpte;
2588	vm_object_t object;
2589
2590	if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap))
2591		return;
2592
2593	object = entry->object.vm_object;
2594
2595	starta = addra - PFBAK * PAGE_SIZE;
2596	if (starta < entry->start) {
2597		starta = entry->start;
2598	} else if (starta > addra) {
2599		starta = 0;
2600	}
2601
2602	mpte = NULL;
2603	for (i = 0; i < PAGEORDER_SIZE; i++) {
2604		vm_object_t lobject;
2605		unsigned *pte;
2606
2607		addr = addra + pmap_prefault_pageorder[i];
2608		if (addr > addra + (PFFOR * PAGE_SIZE))
2609			addr = 0;
2610
2611		if (addr < starta || addr >= entry->end)
2612			continue;
2613
2614		if ((*pmap_pde(pmap, addr)) == 0)
2615			continue;
2616
2617		pte = (unsigned *) vtopte(addr);
2618		if (*pte)
2619			continue;
2620
2621		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2622		lobject = object;
2623		for (m = vm_page_lookup(lobject, pindex);
2624		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2625		    lobject = lobject->backing_object) {
2626			if (lobject->backing_object_offset & PAGE_MASK)
2627				break;
2628			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2629			m = vm_page_lookup(lobject->backing_object, pindex);
2630		}
2631
2632		/*
2633		 * Give up when a page is not in memory.
2634		 */
2635		if (m == NULL)
2636			break;
2637
2638		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2639			(m->busy == 0) &&
2640		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2641
2642			if ((m->queue - m->pc) == PQ_CACHE) {
2643				vm_page_deactivate(m);
2644			}
2645			m->flags |= PG_BUSY;
2646			mpte = pmap_enter_quick(pmap, addr,
2647				VM_PAGE_TO_PHYS(m), mpte);
2648			m->flags |= PG_MAPPED;
2649			vm_page_wakeup(m);
2650		}
2651	}
2652}
2653
2654/*
2655 *	Routine:	pmap_change_wiring
2656 *	Function:	Change the wiring attribute for a map/virtual-address
2657 *			pair.
2658 *	In/out conditions:
2659 *			The mapping must already exist in the pmap.
2660 */
2661void
2662pmap_change_wiring(pmap, va, wired)
2663	register pmap_t pmap;
2664	vm_offset_t va;
2665	boolean_t wired;
2666{
2667	register unsigned *pte;
2668
2669	if (pmap == NULL)
2670		return;
2671
2672	pte = pmap_pte(pmap, va);
2673
2674	if (wired && !pmap_pte_w(pte))
2675		pmap->pm_stats.wired_count++;
2676	else if (!wired && pmap_pte_w(pte))
2677		pmap->pm_stats.wired_count--;
2678
2679	/*
2680	 * Wiring is not a hardware characteristic so there is no need to
2681	 * invalidate TLB.
2682	 */
2683	pmap_pte_set_w(pte, wired);
2684}
2685
2686
2687
2688/*
2689 *	Copy the range specified by src_addr/len
2690 *	from the source map to the range dst_addr/len
2691 *	in the destination map.
2692 *
2693 *	This routine is only advisory and need not do anything.
2694 */
2695
2696void
2697pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
2698	pmap_t dst_pmap, src_pmap;
2699	vm_offset_t dst_addr;
2700	vm_size_t len;
2701	vm_offset_t src_addr;
2702{
2703	vm_offset_t addr;
2704	vm_offset_t end_addr = src_addr + len;
2705	vm_offset_t pdnxt;
2706	unsigned src_frame, dst_frame;
2707
2708	if (dst_addr != src_addr)
2709		return;
2710
2711	src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2712	if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) {
2713		return;
2714	}
2715
2716	dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2717	if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) {
2718		APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
2719		invltlb();
2720	}
2721
2722	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
2723		unsigned *src_pte, *dst_pte;
2724		vm_page_t dstmpte, srcmpte;
2725		vm_offset_t srcptepaddr;
2726		unsigned ptepindex;
2727
2728#if !defined(MAX_PERF)
2729		if (addr >= UPT_MIN_ADDRESS)
2730			panic("pmap_copy: invalid to pmap_copy page tables\n");
2731#endif
2732
2733		/*
2734		 * Don't let optional prefaulting of pages make us go
2735		 * way below the low water mark of free pages or way
2736		 * above the high water mark of used pv entries.
2737		 */
2738		if (cnt.v_free_count < cnt.v_free_reserved ||
2739		    pv_entry_count > pv_entry_high_water)
2740			break;
2741
2742		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2743		ptepindex = addr >> PDRSHIFT;
2744
2745		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
2746		if (srcptepaddr == 0)
2747			continue;
2748
2749		if (srcptepaddr & PG_PS) {
2750			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2751				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
2752				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2753			}
2754			continue;
2755		}
2756
2757		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2758		if ((srcmpte == NULL) ||
2759			(srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2760			continue;
2761
2762		if (pdnxt > end_addr)
2763			pdnxt = end_addr;
2764
2765		src_pte = (unsigned *) vtopte(addr);
2766		dst_pte = (unsigned *) avtopte(addr);
2767		while (addr < pdnxt) {
2768			unsigned ptetemp;
2769			ptetemp = *src_pte;
2770			/*
2771			 * we only virtual copy managed pages
2772			 */
2773			if ((ptetemp & PG_MANAGED) != 0) {
2774				/*
2775				 * We have to check after allocpte for the
2776				 * pte still being around...  allocpte can
2777				 * block.
2778				 */
2779				dstmpte = pmap_allocpte(dst_pmap, addr);
2780				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2781					/*
2782					 * Clear the modified and
2783					 * accessed (referenced) bits
2784					 * during the copy.
2785					 */
2786					*dst_pte = ptetemp & ~(PG_M | PG_A);
2787					dst_pmap->pm_stats.resident_count++;
2788					pmap_insert_entry(dst_pmap, addr,
2789						dstmpte,
2790						(ptetemp & PG_FRAME));
2791	 			} else {
2792					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2793				}
2794				if (dstmpte->hold_count >= srcmpte->hold_count)
2795					break;
2796			}
2797			addr += PAGE_SIZE;
2798			src_pte++;
2799			dst_pte++;
2800		}
2801	}
2802}
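/*
 * Illustrative usage sketch for pmap_copy() above (the map and entry
 * names are assumptions for illustration).  A fork-style caller copies
 * a matching range between the two maps:
 *
 *	pmap_copy(dst_map->pmap, src_map->pmap, entry->start,
 *	    entry->end - entry->start, entry->start);
 *
 * Since the routine is advisory, it bails out when free pages or pv
 * entries run short, and it only pre-copies managed mappings.
 */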
2803
2804/*
2805 *	Routine:	pmap_kernel
2806 *	Function:
2807 *		Returns the physical map handle for the kernel.
2808 */
2809pmap_t
2810pmap_kernel()
2811{
2812	return (kernel_pmap);
2813}
2814
2815/*
2816 *	pmap_zero_page zeros the specified (machine independent)
2817 *	page by mapping the page into virtual memory and using
2818 *	bzero to clear its contents, one machine dependent page
2819 *	at a time.
2820 */
2821void
2822pmap_zero_page(phys)
2823	vm_offset_t phys;
2824{
2825#ifdef SMP
2826#if !defined(MAX_PERF)
2827	if (*(int *) prv_CMAP3)
2828		panic("pmap_zero_page: prv_CMAP3 busy");
2829#endif
2830
2831	*(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2832	cpu_invlpg(&prv_CPAGE3);
2833
2834#if defined(I686_CPU)
2835	if (cpu_class == CPUCLASS_686)
2836		i686_pagezero(&prv_CPAGE3);
2837	else
2838#endif
2839		bzero(&prv_CPAGE3, PAGE_SIZE);
2840
2841	*(int *) prv_CMAP3 = 0;
2842#else
2843#if !defined(MAX_PERF)
2844	if (*(int *) CMAP2)
2845		panic("pmap_zero_page: CMAP2 busy");
2846#endif
2847
2848	*(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2849	if (cpu_class == CPUCLASS_386) {
2850		invltlb();
2851	} else {
2852		invlpg((u_int)CADDR2);
2853	}
2854
2855#if defined(I686_CPU)
2856	if (cpu_class == CPUCLASS_686)
2857		i686_pagezero(CADDR2);
2858	else
2859#endif
2860		bzero(CADDR2, PAGE_SIZE);
2861	*(int *) CMAP2 = 0;
2862#endif
2863}
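/*
 * Illustrative usage sketch for pmap_zero_page() above: the VM system
 * passes a physical address, so a typical caller zeroes a freshly
 * allocated page with
 *
 *	pmap_zero_page(VM_PAGE_TO_PHYS(m));
 *
 * where "m" is an assumed vm_page_t.
 */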
2864
2865/*
2866 *	pmap_copy_page copies the specified (machine independent)
2867 *	page by mapping the page into virtual memory and using
2868 *	bcopy to copy the page, one machine dependent page at a
2869 *	time.
2870 */
2871void
2872pmap_copy_page(src, dst)
2873	vm_offset_t src;
2874	vm_offset_t dst;
2875{
2876#ifdef SMP
2877#if !defined(MAX_PERF)
2878	if (*(int *) prv_CMAP1)
2879		panic("pmap_copy_page: prv_CMAP1 busy");
2880	if (*(int *) prv_CMAP2)
2881		panic("pmap_copy_page: prv_CMAP2 busy");
2882#endif
2883
2884	*(int *) prv_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2885	*(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2886
2887	cpu_invlpg(&prv_CPAGE1);
2888	cpu_invlpg(&prv_CPAGE2);
2889
2890	bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE);
2891
2892	*(int *) prv_CMAP1 = 0;
2893	*(int *) prv_CMAP2 = 0;
2894#else
2895#if !defined(MAX_PERF)
2896	if (*(int *) CMAP1 || *(int *) CMAP2)
2897		panic("pmap_copy_page: CMAP busy");
2898#endif
2899
2900	*(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2901	*(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2902	if (cpu_class == CPUCLASS_386) {
2903		invltlb();
2904	} else {
2905		invlpg((u_int)CADDR1);
2906		invlpg((u_int)CADDR2);
2907	}
2908
2909	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2910
2911	*(int *) CMAP1 = 0;
2912	*(int *) CMAP2 = 0;
2913#endif
2914}
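/*
 * Illustrative usage sketch for pmap_copy_page() above: a copy-on-write
 * style caller duplicates one page frame into another by physical
 * address (the page names are assumptions for illustration):
 *
 *	pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dst_m));
 */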
2915
2916
2917/*
2918 *	Routine:	pmap_pageable
2919 *	Function:
2920 *		Make the specified pages (by pmap, offset)
2921 *		pageable (or not) as requested.
2922 *
2923 *		A page which is not pageable may not take
2924 *		a fault; therefore, its page table entry
2925 *		must remain valid for the duration.
2926 *
2927 *		This routine is merely advisory; pmap_enter
2928 *		will specify that these pages are to be wired
2929 *		down (or not) as appropriate.
2930 */
2931void
2932pmap_pageable(pmap, sva, eva, pageable)
2933	pmap_t pmap;
2934	vm_offset_t sva, eva;
2935	boolean_t pageable;
2936{
2937}
2938
2939/*
2940 * this routine returns true if a physical page resides
2941 * in the given pmap.
2942 */
2943boolean_t
2944pmap_page_exists(pmap, pa)
2945	pmap_t pmap;
2946	vm_offset_t pa;
2947{
2948	register pv_entry_t pv;
2949	pv_table_t *ppv;
2950	int s;
2951
2952	if (!pmap_is_managed(pa))
2953		return FALSE;
2954
2955	s = splvm();
2956
2957	ppv = pa_to_pvh(pa);
2958	/*
2959	 * Check current mappings, returning immediately if found.
2960	 */
2961	for (pv = TAILQ_FIRST(&ppv->pv_list);
2962		pv;
2963		pv = TAILQ_NEXT(pv, pv_list)) {
2964		if (pv->pv_pmap == pmap) {
2965			splx(s);
2966			return TRUE;
2967		}
2968	}
2969	splx(s);
2970	return (FALSE);
2971}
2972
2973#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2974/*
2975 * Remove all pages from the specified address space;
2976 * this aids process exit speeds.  Also, this code
2977 * is special cased for the current process only, but
2978 * can have the more generic (and slightly slower)
2979 * mode enabled.  This is much faster than pmap_remove
2980 * in the case of running down an entire address space.
2981 */
2982void
2983pmap_remove_pages(pmap, sva, eva)
2984	pmap_t pmap;
2985	vm_offset_t sva, eva;
2986{
2987	unsigned *pte, tpte;
2988	pv_table_t *ppv;
2989	pv_entry_t pv, npv;
2990	int s;
2991
2992#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2993	if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) {
2994		printf("warning: pmap_remove_pages called with non-current pmap\n");
2995		return;
2996	}
2997#endif
2998
2999	s = splvm();
3000	for(pv = TAILQ_FIRST(&pmap->pm_pvlist);
3001		pv;
3002		pv = npv) {
3003
3004		if (pv->pv_va >= eva || pv->pv_va < sva) {
3005			npv = TAILQ_NEXT(pv, pv_plist);
3006			continue;
3007		}
3008
3009#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3010		pte = (unsigned *)vtopte(pv->pv_va);
3011#else
3012		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3013#endif
3014		tpte = *pte;
3015
3016/*
3017 * We cannot remove wired pages from a process' mapping at this time
3018 */
3019		if (tpte & PG_W) {
3020			npv = TAILQ_NEXT(pv, pv_plist);
3021			continue;
3022		}
3023		*pte = 0;
3024
3025		ppv = pa_to_pvh(tpte);
3026
3027		pv->pv_pmap->pm_stats.resident_count--;
3028
3029		/*
3030		 * Update the vm_page_t clean and reference bits.
3031		 */
3032		if (tpte & PG_M) {
3033			ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
3034		}
3035
3036
3037		npv = TAILQ_NEXT(pv, pv_plist);
3038		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
3039
3040		ppv->pv_list_count--;
3041		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
3042		if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
3043			ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
3044		}
3045
3046		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
3047		free_pv_entry(pv);
3048	}
3049	splx(s);
3050	invltlb();
3051}
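/*
 * Illustrative usage sketch for pmap_remove_pages() above.  The exit
 * path can drop every unwired user mapping of the current process in
 * one pass; the range shown is an assumption for illustration:
 *
 *	pmap_remove_pages(&curproc->p_vmspace->vm_pmap,
 *	    0, VM_MAXUSER_ADDRESS);
 */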
3052
3053/*
3054 * pmap_testbit tests bits in ptes.
3055 * Note that the testbit/changebit routines are inline,
3056 * so a lot of things compile-time evaluate.
3057 */
3058static boolean_t
3059pmap_testbit(pa, bit)
3060	register vm_offset_t pa;
3061	int bit;
3062{
3063	register pv_entry_t pv;
3064	pv_table_t *ppv;
3065	unsigned *pte;
3066	int s;
3067
3068	if (!pmap_is_managed(pa))
3069		return FALSE;
3070
3071	ppv = pa_to_pvh(pa);
3072	if (TAILQ_FIRST(&ppv->pv_list) == NULL)
3073		return FALSE;
3074
3075	s = splvm();
3076
3077	for (pv = TAILQ_FIRST(&ppv->pv_list);
3078		pv;
3079		pv = TAILQ_NEXT(pv, pv_list)) {
3080
3081		/*
3082		 * If the bit being tested is the modified or accessed bit,
3083		 * skip mappings whose modified state we do not track
3084		 * (such as those within the kernel's clean map).
3085		 */
3086		if (bit & (PG_A|PG_M)) {
3087			if (!pmap_track_modified(pv->pv_va))
3088				continue;
3089		}
3090
3091#if defined(PMAP_DIAGNOSTIC)
3092		if (!pv->pv_pmap) {
3093			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3094			continue;
3095		}
3096#endif
3097		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3098		if (*pte & bit) {
3099			splx(s);
3100			return TRUE;
3101		}
3102	}
3103	splx(s);
3104	return (FALSE);
3105}
3106
3107/*
3108 * this routine is used to modify bits in ptes
3109 */
3110static void
3111pmap_changebit(pa, bit, setem)
3112	vm_offset_t pa;
3113	int bit;
3114	boolean_t setem;
3115{
3116	register pv_entry_t pv;
3117	pv_table_t *ppv;
3118	register unsigned *pte;
3119	int changed;
3120	int s;
3121
3122	if (!pmap_is_managed(pa))
3123		return;
3124
3125	s = splvm();
3126	changed = 0;
3127	ppv = pa_to_pvh(pa);
3128
3129	/*
3130	 * Loop over all current mappings, setting/clearing as appropriate.
3131	 * If setting RO, do we need to clear the VAC?
3132	 */
3133	for (pv = TAILQ_FIRST(&ppv->pv_list);
3134		pv;
3135		pv = TAILQ_NEXT(pv, pv_list)) {
3136
3137		/*
3138		 * don't write protect pager mappings
3139		 */
3140		if (!setem && (bit == PG_RW)) {
3141			if (!pmap_track_modified(pv->pv_va))
3142				continue;
3143		}
3144
3145#if defined(PMAP_DIAGNOSTIC)
3146		if (!pv->pv_pmap) {
3147			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3148			continue;
3149		}
3150#endif
3151
3152		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3153
3154		if (setem) {
3155			*(int *)pte |= bit;
3156			changed = 1;
3157		} else {
3158			vm_offset_t pbits = *(vm_offset_t *)pte;
3159			if (pbits & bit) {
3160				changed = 1;
3161				if (bit == PG_RW) {
3162					if (pbits & PG_M) {
3163						ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
3164					}
3165					*(int *)pte = pbits & ~(PG_M|PG_RW);
3166				} else {
3167					*(int *)pte = pbits & ~bit;
3168				}
3169			}
3170		}
3171	}
3172	splx(s);
3173	if (changed)
3174		invltlb();
3175}
3176
3177/*
3178 *      pmap_page_protect:
3179 *
3180 *      Lower the permission for all mappings to a given page.
3181 */
3182void
3183pmap_page_protect(vm_offset_t phys, vm_prot_t prot)
3184{
3185	if ((prot & VM_PROT_WRITE) == 0) {
3186		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3187			pmap_changebit(phys, PG_RW, FALSE);
3188		} else {
3189			pmap_remove_all(phys);
3190		}
3191	}
3192}
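/*
 * Illustrative usage sketch for pmap_page_protect() above (the page "m"
 * is an assumption for illustration).  The pageout path revokes write
 * access to every mapping of a page before cleaning it, and removes the
 * page entirely with VM_PROT_NONE:
 *
 *	pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ);
 *	pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
 */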
3193
3194vm_offset_t
3195pmap_phys_address(ppn)
3196	int ppn;
3197{
3198	return (i386_ptob(ppn));
3199}
3200
3201/*
3202 *	pmap_ts_referenced:
3203 *
3204 *	Return the count of reference bits for a page, clearing all of them.
3205 *
3206 */
3207int
3208pmap_ts_referenced(vm_offset_t pa)
3209{
3210	register pv_entry_t pv;
3211	pv_table_t *ppv;
3212	unsigned *pte;
3213	int s;
3214	int rtval = 0;
3215
3216	if (!pmap_is_managed(pa))
3217		return FALSE;
3218
3219	s = splvm();
3220
3221	ppv = pa_to_pvh(pa);
3222
3223	if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
3224		splx(s);
3225		return 0;
3226	}
3227
3228	/*
3229	 * Loop over current mappings, counting and clearing the reference bits.
3230	 */
3231	for (pv = TAILQ_FIRST(&ppv->pv_list);
3232		pv;
3233		pv = TAILQ_NEXT(pv, pv_list)) {
3234
3235		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
3236		/*
3237		 * Skip mappings whose referenced state we do not track
3238		 * (such as those within the kernel's clean map); they are
3239		 * requeued without being counted.
3240		 */
3241		if (!pmap_track_modified(pv->pv_va)) {
3242			TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
3243			continue;
3244		}
3245
3246		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3247		if (pte == NULL) {
3248			TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
3249			continue;
3250		}
3251
3252		if (*pte & PG_A) {
3253			rtval++;
3254			*pte &= ~PG_A;
3255			if (rtval > 4) {
3256				TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
3257				break;
3258			}
3259		}
3260		TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
3261	}
3262
3263	splx(s);
3264	if (rtval) {
3265		invltlb();
3266	}
3267	return (rtval);
3268}
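/*
 * Illustrative usage sketch for pmap_ts_referenced() above.  A page
 * scanner can use the returned count (capped by the loop above) as an
 * activity estimate.  "m", "actcount", ACT_ADVANCE and act_count belong
 * to the page-queue code and are shown only as assumptions:
 *
 *	actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m));
 *	if (actcount)
 *		m->act_count += ACT_ADVANCE + actcount;
 */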
3269
3270/*
3271 *	pmap_is_modified:
3272 *
3273 *	Return whether or not the specified physical page was modified
3274 *	in any physical maps.
3275 */
3276boolean_t
3277pmap_is_modified(vm_offset_t pa)
3278{
3279	return pmap_testbit((pa), PG_M);
3280}
3281
3282/*
3283 *	Clear the modify bits on the specified physical page.
3284 */
3285void
3286pmap_clear_modify(vm_offset_t pa)
3287{
3288	pmap_changebit((pa), PG_M, FALSE);
3289}
3290
3291/*
3292 *	pmap_clear_reference:
3293 *
3294 *	Clear the reference bit on the specified physical page.
3295 */
3296void
3297pmap_clear_reference(vm_offset_t pa)
3298{
3299	pmap_changebit((pa), PG_A, FALSE);
3300}
3301
3302/*
3303 * Miscellaneous support routines follow
3304 */
3305
3306static void
3307i386_protection_init()
3308{
3309	register int *kp, prot;
3310
3311	kp = protection_codes;
3312	for (prot = 0; prot < 8; prot++) {
3313		switch (prot) {
3314		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3315			/*
3316			 * Read access is also 0. There isn't any execute bit,
3317			 * so just make it readable.
3318			 */
3319		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3320		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3321		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3322			*kp++ = 0;
3323			break;
3324		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3325		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3326		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3327		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3328			*kp++ = PG_RW;
3329			break;
3330		}
3331	}
3332}
3333
3334/*
3335 * Map a set of physical memory pages into the kernel virtual
3336 * address space. Return a pointer to where it is mapped. This
3337 * routine is intended to be used for mapping device memory,
3338 * NOT real memory.
3339 */
3340void *
3341pmap_mapdev(pa, size)
3342	vm_offset_t pa;
3343	vm_size_t size;
3344{
3345	vm_offset_t va, tmpva;
3346	unsigned *pte;
3347
3348	size = roundup(size, PAGE_SIZE);
3349
3350	va = kmem_alloc_pageable(kernel_map, size);
3351#if !defined(MAX_PERF)
3352	if (!va)
3353		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3354#endif
3355
3356	pa = pa & PG_FRAME;
3357	for (tmpva = va; size > 0;) {
3358		pte = (unsigned *)vtopte(tmpva);
3359		*pte = pa | PG_RW | PG_V | pgeflag;
3360		size -= PAGE_SIZE;
3361		tmpva += PAGE_SIZE;
3362		pa += PAGE_SIZE;
3363	}
3364	invltlb();
3365
3366	return ((void *) va);
3367}
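/*
 * Illustrative usage sketch for pmap_mapdev() above.  A driver maps a
 * physical register window into kernel virtual memory; the physical
 * address and size below are made up for illustration:
 *
 *	volatile u_int32_t *regs;
 *
 *	regs = (volatile u_int32_t *) pmap_mapdev(0xfe000000, 0x1000);
 *	regs[0] = 1;
 */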
3368
3369/*
3370 * perform the pmap work for mincore
3371 */
3372int
3373pmap_mincore(pmap, addr)
3374	pmap_t pmap;
3375	vm_offset_t addr;
3376{
3377
3378	unsigned *ptep, pte;
3379	vm_page_t m;
3380	int val = 0;
3381
3382	ptep = pmap_pte(pmap, addr);
3383	if (ptep == 0) {
3384		return 0;
3385	}
3386
3387	if ((pte = *ptep) != 0) {
3388		pv_table_t *ppv;
3389		vm_offset_t pa;
3390
3391		val = MINCORE_INCORE;
3392		if ((pte & PG_MANAGED) == 0)
3393			return val;
3394
3395		pa = pte & PG_FRAME;
3396
3397		ppv = pa_to_pvh((pa & PG_FRAME));
3398		m = ppv->pv_vm_page;
3399
3400		/*
3401		 * Modified by us
3402		 */
3403		if (pte & PG_M)
3404			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3405		/*
3406		 * Modified by someone
3407		 */
3408		else if (m->dirty || pmap_is_modified(pa))
3409			val |= MINCORE_MODIFIED_OTHER;
3410		/*
3411		 * Referenced by us
3412		 */
3413		if (pte & PG_A)
3414			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3415
3416		/*
3417		 * Referenced by someone
3418		 */
3419		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) {
3420			val |= MINCORE_REFERENCED_OTHER;
3421			m->flags |= PG_REFERENCED;
3422		}
3423	}
3424	return val;
3425}
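/*
 * Illustrative usage sketch for pmap_mincore() above.  The mincore(2)
 * path queries one page at a time; "addr" and "resident" are
 * assumptions for illustration:
 *
 *	val = pmap_mincore(&curproc->p_vmspace->vm_pmap, addr);
 *	resident = (val & MINCORE_INCORE) != 0;
 *
 * MINCORE_MODIFIED and MINCORE_REFERENCED mean the page was dirtied or
 * referenced through this pmap, while the _OTHER variants mean it was
 * dirtied or referenced by someone.
 */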
3426
3427void
3428pmap_activate(struct proc *p)
3429{
3430#if defined(SWTCH_OPTIM_STATS)
3431	tlb_flush_count++;
3432#endif
3433	load_cr3(p->p_addr->u_pcb.pcb_cr3 =
3434		vtophys(p->p_vmspace->vm_pmap.pm_pdir));
3435}
3436
3437vm_offset_t
3438pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) {
3439
3440	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3441		return addr;
3442	}
3443
3444	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3445	return addr;
3446}
3447
3448
3449#if defined(PMAP_DEBUG)
3450pmap_pid_dump(int pid) {
3451	pmap_t pmap;
3452	struct proc *p;
3453	int npte = 0;
3454	int index;
3455	for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) {
3456		if (p->p_pid != pid)
3457			continue;
3458
3459		if (p->p_vmspace) {
3460			int i,j;
3461			index = 0;
3462			pmap = &p->p_vmspace->vm_pmap;
3463			for(i=0;i<1024;i++) {
3464				pd_entry_t *pde;
3465				unsigned *pte;
3466				unsigned base = i << PDRSHIFT;
3467
3468				pde = &pmap->pm_pdir[i];
3469				if (pde && pmap_pde_v(pde)) {
3470					for(j=0;j<1024;j++) {
3471						unsigned va = base + (j << PAGE_SHIFT);
3472						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3473							if (index) {
3474								index = 0;
3475								printf("\n");
3476							}
3477							return npte;
3478						}
3479						pte = pmap_pte_quick( pmap, va);
3480						if (pte && pmap_pte_v(pte)) {
3481							vm_offset_t pa;
3482							vm_page_t m;
3483							pa = *(int *)pte;
3484							m = PHYS_TO_VM_PAGE((pa & PG_FRAME));
3485							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3486								va, pa, m->hold_count, m->wire_count, m->flags);
3487							npte++;
3488							index++;
3489							if (index >= 2) {
3490								index = 0;
3491								printf("\n");
3492							} else {
3493								printf(" ");
3494							}
3495						}
3496					}
3497				}
3498			}
3499		}
3500	}
3501	return npte;
3502}
3503#endif
3504
3505#if defined(DEBUG)
3506
3507static void	pads __P((pmap_t pm));
3508static void	pmap_pvdump __P((vm_offset_t pa));
3509
3510/* print address space of pmap*/
3511static void
3512pads(pm)
3513	pmap_t pm;
3514{
3515	unsigned va, i, j;
3516	unsigned *ptep;
3517
3518	if (pm == kernel_pmap)
3519		return;
3520	for (i = 0; i < 1024; i++)
3521		if (pm->pm_pdir[i])
3522			for (j = 0; j < 1024; j++) {
3523				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3524				if (pm == kernel_pmap && va < KERNBASE)
3525					continue;
3526				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3527					continue;
3528				ptep = pmap_pte_quick(pm, va);
3529				if (pmap_pte_v(ptep))
3530					printf("%x:%x ", va, *(int *) ptep);
3531			}
3532
3533}
3534
3535static void
3536pmap_pvdump(pa)
3537	vm_offset_t pa;
3538{
3539	pv_table_t *ppv;
3540	register pv_entry_t pv;
3541
3542	printf("pa %x", pa);
3543	ppv = pa_to_pvh(pa);
3544	for (pv = TAILQ_FIRST(&ppv->pv_list);
3545		pv;
3546		pv = TAILQ_NEXT(pv, pv_list)) {
3547#ifdef used_to_be
3548		printf(" -> pmap %p, va %x, flags %x",
3549		    (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
3550#endif
3551		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3552		pads(pv->pv_pmap);
3553	}
3554	printf(" ");
3555}
3556#endif
3557