pmap.c revision 33936
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 *	$Id: pmap.c,v 1.186 1998/02/12 22:00:01 bde Exp $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
64 *	this module may delay invalidate or reduced protection
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/proc.h>
76#include <sys/msgbuf.h>
77#include <sys/vmmeter.h>
78#include <sys/mman.h>
79
80#include <vm/vm.h>
81#include <vm/vm_param.h>
82#include <vm/vm_prot.h>
83#include <sys/lock.h>
84#include <vm/vm_kern.h>
85#include <vm/vm_page.h>
86#include <vm/vm_map.h>
87#include <vm/vm_object.h>
88#include <vm/vm_extern.h>
89#include <vm/vm_pageout.h>
90#include <vm/vm_pager.h>
91#include <vm/vm_zone.h>
92
93#include <sys/user.h>
94
95#include <machine/cputypes.h>
96#include <machine/md_var.h>
97#include <machine/specialreg.h>
98#if defined(SMP) || defined(APIC_IO)
99#include <machine/smp.h>
100#include <machine/apic.h>
101#endif /* SMP || APIC_IO */
102
/*
 * Compile-time configuration for the pmap module.
 */
#define PMAP_KEEP_PDIRS

/*
 * PMAP_SHPGPERPROC may be supplied by the build; it defaults to 200.
 * pmap_init2() multiplies it by maxproc when sizing pv_entry_max.
 */
#if !defined(PMAP_SHPGPERPROC)
#define PMAP_SHPGPERPROC 200
#endif

/* A DIAGNOSTIC kernel implies pmap diagnostics as well. */
#ifdef DIAGNOSTIC
#define PMAP_DIAGNOSTIC
#endif

/* Lower bound on the number of pv entries set aside by pmap_init(). */
#define MINPV 2048

/* Helper routines are only inlined when diagnostics are disabled. */
#ifdef PMAP_DIAGNOSTIC
#define PMAP_INLINE
#else
#define PMAP_INLINE __inline
#endif
119
/*
 * Get PDEs and PTEs for user/kernel address space.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * expression argument (for example "ptep + 1") binds as the caller
 * intended.
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define	pdir_pde(m, v)	((m)[(vm_offset_t)(v) >> PDRSHIFT])

/*
 * Bit tests on an entry; "pte" is a pointer to the entry, which is
 * read as an int.
 */
#define	pmap_pde_v(pte)		((*(int *)(pte) & PG_V) != 0)
#define	pmap_pte_w(pte)		((*(int *)(pte) & PG_W) != 0)
#define	pmap_pte_m(pte)		((*(int *)(pte) & PG_M) != 0)
#define	pmap_pte_u(pte)		((*(int *)(pte) & PG_A) != 0)
#define	pmap_pte_v(pte)		((*(int *)(pte) & PG_V) != 0)

/* Set or clear PG_W, and replace the PG_PROT bits, of the entry at "pte". */
#define	pmap_pte_set_w(pte, v)	\
	((v) ? (*(int *)(pte) |= PG_W) : (*(int *)(pte) &= ~PG_W))
#define	pmap_pte_set_prot(pte, v)	\
	((*(int *)(pte) &= ~PG_PROT), (*(int *)(pte) |= (v)))

/*
 * Given a map and a machine independent protection code,
 * convert to the machine dependent (i386) protection code by table
 * lookup.  The map argument is not used.
 */
#define	pte_prot(m, p)	(protection_codes[(p)])
static int protection_codes[8];

/* Physical address -> index into pv_table / pointer to its pv_table slot. */
#define	pa_index(pa)		atop((pa) - vm_first_phys)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
144
145static struct pmap kernel_pmap_store;
146pmap_t kernel_pmap;
147extern pd_entry_t my_idlePTD;
148
149vm_offset_t avail_start;	/* PA of first available physical page */
150vm_offset_t avail_end;		/* PA of last available physical page */
151vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
152vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
153static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
154static vm_offset_t vm_first_phys;
155static int pgeflag;		/* PG_G or-in */
156static int pseflag;		/* PG_PS or-in */
157static int pv_npg;
158
159static int nkpt;
160vm_offset_t kernel_vm_end;
161
162/*
163 * Data for the pv entry allocation mechanism
164 */
165static vm_zone_t pvzone;
166static struct vm_zone pvzone_store;
167static struct vm_object pvzone_obj;
168static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
169static int pmap_pagedaemon_waken = 0;
170static struct pv_entry *pvinit;
171
172/*
173 * All those kernel PT submaps that BSD is so fond of
174 */
175pt_entry_t *CMAP1 = 0;
176static pt_entry_t *CMAP2, *ptmmap;
177static pv_table_t *pv_table;
178caddr_t CADDR1 = 0, ptvmmap = 0;
179static caddr_t CADDR2;
180static pt_entry_t *msgbufmap;
181struct msgbuf *msgbufp=0;
182
183#ifdef SMP
184extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[];
185extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3;
186extern pd_entry_t *IdlePTDS[];
187extern pt_entry_t SMP_prvpt[];
188#endif
189
190static pt_entry_t *PMAP1 = 0;
191static unsigned *PADDR1 = 0;
192
193static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
194static unsigned * get_ptbase __P((pmap_t pmap));
195static pv_entry_t get_pv_entry __P((void));
196static void	i386_protection_init __P((void));
197static void	pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem));
198
199static PMAP_INLINE int	pmap_is_managed __P((vm_offset_t pa));
200static void	pmap_remove_all __P((vm_offset_t pa));
201static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
202				      vm_offset_t pa, vm_page_t mpte));
203static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq,
204					vm_offset_t sva));
205static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
206static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv,
207					vm_offset_t va));
208static boolean_t pmap_testbit __P((vm_offset_t pa, int bit));
209static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
210		vm_page_t mpte, vm_offset_t pa));
211
212static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
213
214static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
215static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
216static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
217static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
218static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
219static vm_offset_t pmap_kmem_choose(vm_offset_t addr) ;
220void pmap_collect(void);
221
222static unsigned pdir4mb;
223
224/*
225 *	Routine:	pmap_pte
226 *	Function:
227 *		Extract the page table entry associated
228 *		with the given map/virtual_address pair.
229 */
230
231PMAP_INLINE unsigned *
232pmap_pte(pmap, va)
233	register pmap_t pmap;
234	vm_offset_t va;
235{
236	unsigned *pdeaddr;
237
238	if (pmap) {
239		pdeaddr = (unsigned *) pmap_pde(pmap, va);
240		if (*pdeaddr & PG_PS)
241			return pdeaddr;
242		if (*pdeaddr) {
243			return get_ptbase(pmap) + i386_btop(va);
244		}
245	}
246	return (0);
247}
248
249/*
250 * Move the kernel virtual free pointer to the next
251 * 4MB.  This is used to help improve performance
252 * by using a large (4MB) page for much of the kernel
253 * (.text, .data, .bss)
254 */
255static vm_offset_t
256pmap_kmem_choose(vm_offset_t addr) {
257	vm_offset_t newaddr = addr;
258#ifndef DISABLE_PSE
259	if (cpu_feature & CPUID_PSE) {
260		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
261	}
262#endif
263	return newaddr;
264}
265
266/*
267 *	Bootstrap the system enough to run with virtual memory.
268 *
269 *	On the i386 this is called after mapping has already been enabled
270 *	and just syncs the pmap module with what has already been done.
271 *	[We can't call it easily with mapping off since the kernel is not
272 *	mapped with PA == VA, hence we would have to relocate every address
273 *	from the linked base (virtual) address "KERNBASE" to the actual
274 *	(physical) address starting relative to 0]
275 */
276void
277pmap_bootstrap(firstaddr, loadaddr)
278	vm_offset_t firstaddr;
279	vm_offset_t loadaddr;
280{
281	vm_offset_t va;
282	pt_entry_t *pte;
283	int i, j;
284
285	avail_start = firstaddr;
286
287	/*
288	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
289	 * large. It should instead be correctly calculated in locore.s and
290	 * not based on 'first' (which is a physical address, not a virtual
291	 * address, for the start of unused physical memory). The kernel
292	 * page tables are NOT double mapped and thus should not be included
293	 * in this calculation.
294	 */
295	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
296	virtual_avail = pmap_kmem_choose(virtual_avail);
297
298	virtual_end = VM_MAX_KERNEL_ADDRESS;
299
300	/*
301	 * Initialize protection array.
302	 */
303	i386_protection_init();
304
305	/*
306	 * The kernel's pmap is statically allocated so we don't have to use
307	 * pmap_create, which is unlikely to work correctly at this part of
308	 * the boot sequence (XXX and which no longer exists).
309	 */
310	kernel_pmap = &kernel_pmap_store;
311
312	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
313
314	kernel_pmap->pm_count = 1;
315	TAILQ_INIT(&kernel_pmap->pm_pvlist);
316	nkpt = NKPT;
317
318	/*
319	 * Reserve some special page table entries/VA space for temporary
320	 * mapping of pages.
321	 */
322#define	SYSMAP(c, p, v, n)	\
323	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
324
325	va = virtual_avail;
326	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
327
328	/*
329	 * CMAP1/CMAP2 are used for zeroing and copying pages.
330	 */
331	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
332	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
333
334	/*
335	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
336	 * XXX ptmmap is not used.
337	 */
338	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
339
340	/*
341	 * msgbufp is used to map the system message buffer.
342	 * XXX msgbufmap is not used.
343	 */
344	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
345	       atop(round_page(sizeof(struct msgbuf))))
346
347	/*
348	 * ptemap is used for pmap_pte_quick
349	 */
350	SYSMAP(unsigned *, PMAP1, PADDR1, 1);
351
352	virtual_avail = va;
353
354	*(int *) CMAP1 = *(int *) CMAP2 = 0;
355	*(int *) PTD = 0;
356
357
358	pgeflag = 0;
359#if !defined(SMP)
360	if (cpu_feature & CPUID_PGE) {
361		pgeflag = PG_G;
362	}
363#endif
364
365/*
366 * Initialize the 4MB page size flag
367 */
368	pseflag = 0;
369/*
370 * The 4MB page version of the initial
371 * kernel page mapping.
372 */
373	pdir4mb = 0;
374
375#if !defined(DISABLE_PSE)
376	if (cpu_feature & CPUID_PSE) {
377		unsigned ptditmp;
378		/*
379		 * Enable the PSE mode
380		 */
381		load_cr4(rcr4() | CR4_PSE);
382
383		/*
384		 * Note that we have enabled PSE mode
385		 */
386		pseflag = PG_PS;
387		ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE));
388		ptditmp &= ~(NBPDR - 1);
389		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
390		pdir4mb = ptditmp;
391		/*
392		 * We can do the mapping here for the single processor
393		 * case.  We simply ignore the old page table page from
394		 * now on.
395		 */
396#if !defined(SMP)
397		PTD[KPTDI] = (pd_entry_t) ptditmp;
398		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
399		invltlb();
400#endif
401	}
402#endif
403
404#ifdef SMP
405	if (cpu_apic_address == 0)
406		panic("pmap_bootstrap: no local apic!");
407
408	/* 0 = private page */
409	/* 1 = page table page */
410	/* 2 = local apic */
411	/* 16-31 = io apics */
412	SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)cpu_apic_address & PG_FRAME));
413
414	for (i = 0; i < mp_napics; i++) {
415		for (j = 0; j < 16; j++) {
416			/* same page frame as a previous IO apic? */
417			if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) ==
418			    ((u_long)io_apic_address[0] & PG_FRAME)) {
419				ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE];
420				break;
421			}
422			/* use this slot if available */
423			if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == 0) {
424				SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag |
425				    ((u_long)io_apic_address[i] & PG_FRAME));
426				ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE];
427				break;
428			}
429		}
430		if (j == 16)
431			panic("no space to map IO apic %d!", i);
432	}
433
434	/* BSP does this itself, AP's get it pre-set */
435	prv_CMAP1 = (pt_entry_t *)&SMP_prvpt[3 + UPAGES];
436	prv_CMAP2 = (pt_entry_t *)&SMP_prvpt[4 + UPAGES];
437	prv_CMAP3 = (pt_entry_t *)&SMP_prvpt[5 + UPAGES];
438#endif
439
440	invltlb();
441
442}
443
444/*
445 * Set 4mb pdir for mp startup, and global flags
446 */
447void
448pmap_set_opt(unsigned *pdir) {
449	int i;
450
451	if (pseflag && (cpu_feature & CPUID_PSE)) {
452		load_cr4(rcr4() | CR4_PSE);
453		if (pdir4mb) {
454			(unsigned) pdir[KPTDI] = pdir4mb;
455		}
456	}
457
458	if (pgeflag && (cpu_feature & CPUID_PGE)) {
459		load_cr4(rcr4() | CR4_PGE);
460		for(i = KPTDI; i < KPTDI + nkpt; i++) {
461			if (pdir[i]) {
462				pdir[i] |= PG_G;
463			}
464		}
465	}
466}
467
468/*
469 * Setup the PTD for the boot processor
470 */
471void
472pmap_set_opt_bsp(void)
473{
474	pmap_set_opt((unsigned *)kernel_pmap->pm_pdir);
475	pmap_set_opt((unsigned *)PTD);
476	invltlb();
477}
478
479/*
480 *	Initialize the pmap module.
481 *	Called by vm_init, to initialize any structures that the pmap
482 *	system needs to map virtual memory.
483 *	pmap_init has been enhanced to support in a fairly consistant
484 *	way, discontiguous physical memory.
485 */
486void
487pmap_init(phys_start, phys_end)
488	vm_offset_t phys_start, phys_end;
489{
490	vm_offset_t addr;
491	vm_size_t s;
492	int i;
493	int initial_pvs;
494
495	/*
496	 * calculate the number of pv_entries needed
497	 */
498	vm_first_phys = phys_avail[0];
499	for (i = 0; phys_avail[i + 1]; i += 2);
500	pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE;
501
502	/*
503	 * Allocate memory for random pmap data structures.  Includes the
504	 * pv_head_table.
505	 */
506	s = (vm_size_t) (sizeof(pv_table_t) * pv_npg);
507	s = round_page(s);
508
509	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
510	pv_table = (pv_table_t *) addr;
511	for(i = 0; i < pv_npg; i++) {
512		vm_offset_t pa;
513		TAILQ_INIT(&pv_table[i].pv_list);
514		pv_table[i].pv_list_count = 0;
515		pa = vm_first_phys + i * PAGE_SIZE;
516		pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa);
517	}
518
519	/*
520	 * init the pv free list
521	 */
522	initial_pvs = pv_npg;
523	if (initial_pvs < MINPV)
524		initial_pvs = MINPV;
525	pvzone = &pvzone_store;
526	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
527		initial_pvs * sizeof (struct pv_entry));
528	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg);
529
530	/*
531	 * Now it is safe to enable pv_table recording.
532	 */
533	pmap_initialized = TRUE;
534}
535
536/*
537 * Initialize the address space (zone) for the pv_entries.  Set a
538 * high water mark so that the system can recover from excessive
539 * numbers of pv entries.
540 */
541void
542pmap_init2() {
543	pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg;
544	pv_entry_high_water = 9 * (pv_entry_max / 10);
545	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
546}
547
548/*
549 *	Used to map a range of physical addresses into kernel
550 *	virtual address space.
551 *
552 *	For now, VM is already on, we only need to map the
553 *	specified memory.
554 */
555vm_offset_t
556pmap_map(virt, start, end, prot)
557	vm_offset_t virt;
558	vm_offset_t start;
559	vm_offset_t end;
560	int prot;
561{
562	while (start < end) {
563		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
564		virt += PAGE_SIZE;
565		start += PAGE_SIZE;
566	}
567	return (virt);
568}
569
570
571/***************************************************
572 * Low level helper routines.....
573 ***************************************************/
574
575#if defined(PMAP_DIAGNOSTIC)
576
577/*
578 * This code checks for non-writeable/modified pages.
579 * This should be an invalid condition.
580 */
581static int
582pmap_nw_modified(pt_entry_t ptea) {
583	int pte;
584
585	pte = (int) ptea;
586
587	if ((pte & (PG_M|PG_RW)) == PG_M)
588		return 1;
589	else
590		return 0;
591}
592#endif
593
594
595/*
596 * this routine defines the region(s) of memory that should
597 * not be tested for the modified bit.
598 */
599static PMAP_INLINE int
600pmap_track_modified( vm_offset_t va) {
601	if ((va < clean_sva) || (va >= clean_eva))
602		return 1;
603	else
604		return 0;
605}
606
607static PMAP_INLINE void
608invltlb_1pg( vm_offset_t va) {
609#if defined(I386_CPU)
610	if (cpu_class == CPUCLASS_386) {
611		invltlb();
612	} else
613#endif
614	{
615		invlpg(va);
616	}
617}
618
619static PMAP_INLINE void
620invltlb_2pg( vm_offset_t va1, vm_offset_t va2) {
621#if defined(I386_CPU)
622	if (cpu_class == CPUCLASS_386) {
623		invltlb();
624	} else
625#endif
626	{
627		invlpg(va1);
628		invlpg(va2);
629	}
630}
631
632static unsigned *
633get_ptbase(pmap)
634	pmap_t pmap;
635{
636	unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
637
638	/* are we current address space or kernel? */
639	if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
640		return (unsigned *) PTmap;
641	}
642	/* otherwise, we are alternate address space */
643	if (frame != (((unsigned) APTDpde) & PG_FRAME)) {
644		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
645		invltlb();
646	}
647	return (unsigned *) APTmap;
648}
649
650/*
651 * Super fast pmap_pte routine best used when scanning
652 * the pv lists.  This eliminates many coarse-grained
653 * invltlb calls.  Note that many of the pv list
654 * scans are across different pmaps.  It is very wasteful
655 * to do an entire invltlb for checking a single mapping.
656 */
657
658static unsigned *
659pmap_pte_quick(pmap, va)
660	register pmap_t pmap;
661	vm_offset_t va;
662{
663	unsigned pde, newpf;
664	if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) {
665		unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
666		unsigned index = i386_btop(va);
667		/* are we current address space or kernel? */
668		if ((pmap == kernel_pmap) ||
669			(frame == (((unsigned) PTDpde) & PG_FRAME))) {
670			return (unsigned *) PTmap + index;
671		}
672		newpf = pde & PG_FRAME;
673		if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) {
674			* (unsigned *) PMAP1 = newpf | PG_RW | PG_V;
675			invltlb_1pg((vm_offset_t) PADDR1);
676		}
677		return PADDR1 + ((unsigned) index & (NPTEPG - 1));
678	}
679	return (0);
680}
681
682/*
683 *	Routine:	pmap_extract
684 *	Function:
685 *		Extract the physical page address associated
686 *		with the given map/virtual_address pair.
687 */
688vm_offset_t
689pmap_extract(pmap, va)
690	register pmap_t pmap;
691	vm_offset_t va;
692{
693	vm_offset_t rtval;
694	vm_offset_t pdirindex;
695	pdirindex = va >> PDRSHIFT;
696	if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
697		unsigned *pte;
698		if ((rtval & PG_PS) != 0) {
699			rtval &= ~(NBPDR - 1);
700			rtval |= va & (NBPDR - 1);
701			return rtval;
702		}
703		pte = get_ptbase(pmap) + i386_btop(va);
704		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
705		return rtval;
706	}
707	return 0;
708
709}
710
711/*
712 * determine if a page is managed (memory vs. device)
713 */
714static PMAP_INLINE int
715pmap_is_managed(pa)
716	vm_offset_t pa;
717{
718	int i;
719
720	if (!pmap_initialized)
721		return 0;
722
723	for (i = 0; phys_avail[i + 1]; i += 2) {
724		if (pa < phys_avail[i + 1] && pa >= phys_avail[i])
725			return 1;
726	}
727	return 0;
728}
729
730
731/***************************************************
732 * Low level mapping routines.....
733 ***************************************************/
734
735/*
736 * Add a list of wired pages to the kva
737 * this routine is only used for temporary
738 * kernel mappings that do not need to have
739 * page modification or references recorded.
740 * Note that old mappings are simply written
741 * over.  The page *must* be wired.
742 */
743void
744pmap_qenter(va, m, count)
745	vm_offset_t va;
746	vm_page_t *m;
747	int count;
748{
749	int i;
750	register unsigned *pte;
751
752	for (i = 0; i < count; i++) {
753		vm_offset_t tva = va + i * PAGE_SIZE;
754		unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag;
755		unsigned opte;
756		pte = (unsigned *)vtopte(tva);
757		opte = *pte;
758		*pte = npte;
759		if (opte)
760			invltlb_1pg(tva);
761	}
762}
763
764/*
765 * this routine jerks page mappings from the
766 * kernel -- it is meant only for temporary mappings.
767 */
768void
769pmap_qremove(va, count)
770	vm_offset_t va;
771	int count;
772{
773	int i;
774	register unsigned *pte;
775
776	for (i = 0; i < count; i++) {
777		pte = (unsigned *)vtopte(va);
778		*pte = 0;
779		invltlb_1pg(va);
780		va += PAGE_SIZE;
781	}
782}
783
784/*
785 * add a wired page to the kva
786 * note that in order for the mapping to take effect -- you
787 * should do a invltlb after doing the pmap_kenter...
788 */
789PMAP_INLINE void
790pmap_kenter(va, pa)
791	vm_offset_t va;
792	register vm_offset_t pa;
793{
794	register unsigned *pte;
795	unsigned npte, opte;
796
797	npte = pa | PG_RW | PG_V | pgeflag;
798	pte = (unsigned *)vtopte(va);
799	opte = *pte;
800	*pte = npte;
801	if (opte)
802		invltlb_1pg(va);
803}
804
805/*
806 * remove a page from the kernel pagetables
807 */
808PMAP_INLINE void
809pmap_kremove(va)
810	vm_offset_t va;
811{
812	register unsigned *pte;
813
814	pte = (unsigned *)vtopte(va);
815	*pte = 0;
816	invltlb_1pg(va);
817}
818
819static vm_page_t
820pmap_page_lookup(object, pindex)
821	vm_object_t object;
822	vm_pindex_t pindex;
823{
824	vm_page_t m;
825retry:
826	m = vm_page_lookup(object, pindex);
827	if (m && vm_page_sleep(m, "pplookp", NULL))
828		goto retry;
829	return m;
830}
831
832/*
833 * Create the UPAGES for a new process.
834 * This routine directly affects the fork perf for a process.
835 */
836void
837pmap_new_proc(p)
838	struct proc *p;
839{
840	int i, updateneeded;
841	vm_object_t upobj;
842	vm_page_t m;
843	struct user *up;
844	unsigned *ptek, oldpte;
845
846	/*
847	 * allocate object for the upages
848	 */
849	if ((upobj = p->p_upages_obj) == NULL) {
850		upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES);
851		p->p_upages_obj = upobj;
852	}
853
854	/* get a kernel virtual address for the UPAGES for this proc */
855	if ((up = p->p_addr) == NULL) {
856		up = (struct user *) kmem_alloc_pageable(kernel_map,
857				UPAGES * PAGE_SIZE);
858#if !defined(MAX_PERF)
859		if (up == NULL)
860			panic("pmap_new_proc: u_map allocation failed");
861#endif
862		p->p_addr = up;
863	}
864
865	ptek = (unsigned *) vtopte((vm_offset_t) up);
866
867	updateneeded = 0;
868	for(i=0;i<UPAGES;i++) {
869		/*
870		 * Get a kernel stack page
871		 */
872		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
873
874		/*
875		 * Wire the page
876		 */
877		m->wire_count++;
878		cnt.v_wire_count++;
879
880		oldpte = *(ptek + i);
881		/*
882		 * Enter the page into the kernel address space.
883		 */
884		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
885		if (oldpte) {
886			if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) {
887				invlpg((vm_offset_t) up + i * PAGE_SIZE);
888			} else {
889				updateneeded = 1;
890			}
891		}
892
893		PAGE_WAKEUP(m);
894		m->flags &= ~PG_ZERO;
895		m->flags |= PG_MAPPED | PG_WRITEABLE;
896		m->valid = VM_PAGE_BITS_ALL;
897	}
898	if (updateneeded)
899		invltlb();
900}
901
902/*
903 * Dispose the UPAGES for a process that has exited.
904 * This routine directly impacts the exit perf of a process.
905 */
906void
907pmap_dispose_proc(p)
908	struct proc *p;
909{
910	int i;
911	vm_object_t upobj;
912	vm_page_t m;
913	unsigned *ptek, oldpte;
914
915	upobj = p->p_upages_obj;
916
917	ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr);
918	for(i=0;i<UPAGES;i++) {
919
920		if ((m = vm_page_lookup(upobj, i)) == NULL)
921			panic("pmap_dispose_proc: upage already missing???");
922
923		m->flags |= PG_BUSY;
924
925		oldpte = *(ptek + i);
926		*(ptek + i) = 0;
927		if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386))
928			invlpg((vm_offset_t) p->p_addr + i * PAGE_SIZE);
929		vm_page_unwire(m);
930		vm_page_free(m);
931	}
932
933	if (cpu_class <= CPUCLASS_386)
934		invltlb();
935}
936
937/*
938 * Allow the UPAGES for a process to be prejudicially paged out.
939 */
940void
941pmap_swapout_proc(p)
942	struct proc *p;
943{
944	int i;
945	vm_object_t upobj;
946	vm_page_t m;
947
948	upobj = p->p_upages_obj;
949	/*
950	 * let the upages be paged
951	 */
952	for(i=0;i<UPAGES;i++) {
953		if ((m = vm_page_lookup(upobj, i)) == NULL)
954			panic("pmap_swapout_proc: upage already missing???");
955		m->dirty = VM_PAGE_BITS_ALL;
956		vm_page_unwire(m);
957		vm_page_deactivate(m);
958		pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i);
959	}
960}
961
962/*
963 * Bring the UPAGES for a specified process back in.
964 */
965void
966pmap_swapin_proc(p)
967	struct proc *p;
968{
969	int i,rv;
970	vm_object_t upobj;
971	vm_page_t m;
972
973	upobj = p->p_upages_obj;
974	for(i=0;i<UPAGES;i++) {
975
976		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
977
978		pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE,
979			VM_PAGE_TO_PHYS(m));
980
981		if (m->valid != VM_PAGE_BITS_ALL) {
982			rv = vm_pager_get_pages(upobj, &m, 1, 0);
983#if !defined(MAX_PERF)
984			if (rv != VM_PAGER_OK)
985				panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid);
986#endif
987			m = vm_page_lookup(upobj, i);
988			m->valid = VM_PAGE_BITS_ALL;
989		}
990
991		vm_page_wire(m);
992		PAGE_WAKEUP(m);
993		m->flags |= PG_MAPPED | PG_WRITEABLE;
994	}
995}
996
997/***************************************************
998 * Page table page management routines.....
999 ***************************************************/
1000
1001/*
1002 * This routine unholds page table pages, and if the hold count
1003 * drops to zero, then it decrements the wire count.
1004 */
1005static int
1006_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
1007	int s;
1008
1009	while (vm_page_sleep(m, "pmuwpt", NULL));
1010
1011	if (m->hold_count == 0) {
1012		vm_offset_t pteva;
1013		/*
1014		 * unmap the page table page
1015		 */
1016		pmap->pm_pdir[m->pindex] = 0;
1017		--pmap->pm_stats.resident_count;
1018		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1019			(((unsigned) PTDpde) & PG_FRAME)) {
1020			/*
1021			 * Do a invltlb to make the invalidated mapping
1022			 * take effect immediately.
1023			 */
1024			pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex);
1025			invltlb_1pg(pteva);
1026		}
1027
1028		if (pmap->pm_ptphint == m)
1029			pmap->pm_ptphint = NULL;
1030
1031		/*
1032		 * If the page is finally unwired, simply free it.
1033		 */
1034		--m->wire_count;
1035		if (m->wire_count == 0) {
1036
1037			if (m->flags & PG_WANTED) {
1038				m->flags &= ~PG_WANTED;
1039				wakeup(m);
1040			}
1041
1042			m->flags |= PG_BUSY;
1043			vm_page_free_zero(m);
1044			--cnt.v_wire_count;
1045		}
1046		return 1;
1047	}
1048	return 0;
1049}
1050
1051__inline static int
1052pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
1053	vm_page_unhold(m);
1054	if (m->hold_count == 0)
1055		return _pmap_unwire_pte_hold(pmap, m);
1056	else
1057		return 0;
1058}
1059
1060/*
1061 * After removing a page table entry, this routine is used to
1062 * conditionally free the page, and manage the hold/wire counts.
1063 */
1064static int
1065pmap_unuse_pt(pmap, va, mpte)
1066	pmap_t pmap;
1067	vm_offset_t va;
1068	vm_page_t mpte;
1069{
1070	unsigned ptepindex;
1071	if (va >= UPT_MIN_ADDRESS)
1072		return 0;
1073
1074	if (mpte == NULL) {
1075		ptepindex = (va >> PDRSHIFT);
1076		if (pmap->pm_ptphint &&
1077			(pmap->pm_ptphint->pindex == ptepindex)) {
1078			mpte = pmap->pm_ptphint;
1079		} else {
1080			mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1081			pmap->pm_ptphint = mpte;
1082		}
1083	}
1084
1085	return pmap_unwire_pte_hold(pmap, mpte);
1086}
1087
1088#if !defined(SMP)
1089void
1090pmap_pinit0(pmap)
1091	struct pmap *pmap;
1092{
1093	pmap->pm_pdir =
1094		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1095	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1096	pmap->pm_flags = 0;
1097	pmap->pm_count = 1;
1098	pmap->pm_ptphint = NULL;
1099	TAILQ_INIT(&pmap->pm_pvlist);
1100	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1101}
1102#else
1103void
1104pmap_pinit0(pmap)
1105	struct pmap *pmap;
1106{
1107	pmap_pinit(pmap);
1108}
1109#endif
1110
1111/*
1112 * Initialize a preallocated and zeroed pmap structure,
1113 * such as one in a vmspace structure.
1114 */
1115void
1116pmap_pinit(pmap)
1117	register struct pmap *pmap;
1118{
1119	vm_page_t ptdpg;
1120
1121	/*
1122	 * No need to allocate page table space yet but we do need a valid
1123	 * page directory table.
1124	 */
1125	if (pmap->pm_pdir == NULL)
1126		pmap->pm_pdir =
1127			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1128
1129	/*
1130	 * allocate object for the ptes
1131	 */
1132	if (pmap->pm_pteobj == NULL)
1133		pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1);
1134
1135	/*
1136	 * allocate the page directory page
1137	 */
1138retry:
1139	ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI,
1140			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1141
1142	ptdpg->wire_count = 1;
1143	++cnt.v_wire_count;
1144
1145	ptdpg->flags &= ~(PG_MAPPED | PG_BUSY);	/* not mapped normally */
1146	ptdpg->valid = VM_PAGE_BITS_ALL;
1147
1148	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1149	if ((ptdpg->flags & PG_ZERO) == 0)
1150		bzero(pmap->pm_pdir, PAGE_SIZE);
1151
1152	/* wire in kernel global address entries */
1153	/* XXX copies current process, does not fill in MPPTDI */
1154	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1155
1156	/* install self-referential address mapping entry */
1157	*(unsigned *) (pmap->pm_pdir + PTDPTDI) =
1158		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW;
1159
1160	pmap->pm_flags = 0;
1161	pmap->pm_count = 1;
1162	pmap->pm_ptphint = NULL;
1163	TAILQ_INIT(&pmap->pm_pvlist);
1164	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1165}
1166
1167static int
1168pmap_release_free_page(pmap, p)
1169	struct pmap *pmap;
1170	vm_page_t p;
1171{
1172	int s;
1173	unsigned *pde = (unsigned *) pmap->pm_pdir;
1174	/*
1175	 * This code optimizes the case of freeing non-busy
1176	 * page-table pages.  Those pages are zero now, and
1177	 * might as well be placed directly into the zero queue.
1178	 */
1179	if (vm_page_sleep(p, "pmaprl", NULL))
1180		return 0;
1181
1182	p->flags |= PG_BUSY;
1183
1184	/*
1185	 * Remove the page table page from the processes address space.
1186	 */
1187	pde[p->pindex] = 0;
1188	pmap->pm_stats.resident_count--;
1189
1190#if !defined(MAX_PERF)
1191	if (p->hold_count)  {
1192		panic("pmap_release: freeing held page table page");
1193	}
1194#endif
1195	/*
1196	 * Page directory pages need to have the kernel
1197	 * stuff cleared, so they can go into the zero queue also.
1198	 */
1199	if (p->pindex == PTDPTDI) {
1200		bzero(pde + KPTDI, nkpt * PTESIZE);
1201#ifdef SMP
1202		pde[MPPTDI] = 0;
1203#endif
1204		pde[APTDPTDI] = 0;
1205		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1206	}
1207
1208	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1209		pmap->pm_ptphint = NULL;
1210
1211	vm_page_free_zero(p);
1212	return 1;
1213}
1214
1215/*
1216 * this routine is called if the page table page is not
1217 * mapped correctly.
1218 */
1219static vm_page_t
1220_pmap_allocpte(pmap, ptepindex)
1221	pmap_t	pmap;
1222	unsigned ptepindex;
1223{
1224	vm_offset_t pteva, ptepa;
1225	vm_page_t m;
1226
1227	/*
1228	 * Find or fabricate a new pagetable page
1229	 */
1230	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1231			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1232
1233	if (m->queue != PQ_NONE) {
1234		int s = splvm();
1235		vm_page_unqueue(m);
1236		splx(s);
1237	}
1238
1239	if (m->wire_count == 0)
1240		cnt.v_wire_count++;
1241	m->wire_count++;
1242
1243	/*
1244	 * Increment the hold count for the page table page
1245	 * (denoting a new mapping.)
1246	 */
1247	m->hold_count++;
1248
1249	/*
1250	 * Map the pagetable page into the process address space, if
1251	 * it isn't already there.
1252	 */
1253
1254	pmap->pm_stats.resident_count++;
1255
1256	ptepa = VM_PAGE_TO_PHYS(m);
1257	pmap->pm_pdir[ptepindex] =
1258		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A);
1259
1260	/*
1261	 * Set the page table hint
1262	 */
1263	pmap->pm_ptphint = m;
1264
1265	/*
1266	 * Try to use the new mapping, but if we cannot, then
1267	 * do it with the routine that maps the page explicitly.
1268	 */
1269	if ((m->flags & PG_ZERO) == 0) {
1270		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1271			(((unsigned) PTDpde) & PG_FRAME)) {
1272			pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex);
1273			bzero((caddr_t) pteva, PAGE_SIZE);
1274		} else {
1275			pmap_zero_page(ptepa);
1276		}
1277	}
1278
1279	m->valid = VM_PAGE_BITS_ALL;
1280	m->flags &= ~(PG_ZERO | PG_BUSY);
1281	m->flags |= PG_MAPPED;
1282
1283	return m;
1284}
1285
1286static vm_page_t
1287pmap_allocpte(pmap, va)
1288	pmap_t	pmap;
1289	vm_offset_t va;
1290{
1291	unsigned ptepindex;
1292	vm_offset_t ptepa;
1293	vm_page_t m;
1294
1295	/*
1296	 * Calculate pagetable page index
1297	 */
1298	ptepindex = va >> PDRSHIFT;
1299
1300	/*
1301	 * Get the page directory entry
1302	 */
1303	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1304
1305	/*
1306	 * This supports switching from a 4MB page to a
1307	 * normal 4K page.
1308	 */
1309	if (ptepa & PG_PS) {
1310		pmap->pm_pdir[ptepindex] = 0;
1311		ptepa = 0;
1312		invltlb();
1313	}
1314
1315	/*
1316	 * If the page table page is mapped, we just increment the
1317	 * hold count, and activate it.
1318	 */
1319	if (ptepa) {
1320		/*
1321		 * In order to get the page table page, try the
1322		 * hint first.
1323		 */
1324		if (pmap->pm_ptphint &&
1325			(pmap->pm_ptphint->pindex == ptepindex)) {
1326			m = pmap->pm_ptphint;
1327		} else {
1328			m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1329			pmap->pm_ptphint = m;
1330		}
1331		m->hold_count++;
1332		return m;
1333	}
1334	/*
1335	 * Here if the pte page isn't mapped, or if it has been deallocated.
1336	 */
1337	return _pmap_allocpte(pmap, ptepindex);
1338}
1339
1340
1341/***************************************************
1342* Pmap allocation/deallocation routines.
1343 ***************************************************/
1344
1345/*
1346 * Release any resources held by the given physical map.
1347 * Called when a pmap initialized by pmap_pinit is being released.
1348 * Should only be called if the map contains no valid mappings.
1349 */
1350void
1351pmap_release(pmap)
1352	register struct pmap *pmap;
1353{
1354	vm_page_t p,n,ptdpg;
1355	vm_object_t object = pmap->pm_pteobj;
1356	int curgeneration;
1357
1358#if defined(DIAGNOSTIC)
1359	if (object->ref_count != 1)
1360		panic("pmap_release: pteobj reference count != 1");
1361#endif
1362
1363	ptdpg = NULL;
1364retry:
1365	curgeneration = object->generation;
1366	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1367		n = TAILQ_NEXT(p, listq);
1368		if (p->pindex == PTDPTDI) {
1369			ptdpg = p;
1370			continue;
1371		}
1372		while (1) {
1373			if (!pmap_release_free_page(pmap, p) &&
1374				(object->generation != curgeneration))
1375				goto retry;
1376		}
1377	}
1378
1379	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1380		goto retry;
1381}
1382
1383/*
1384 * grow the number of kernel page table entries, if needed
1385 */
1386void
1387pmap_growkernel(vm_offset_t addr)
1388{
1389	struct proc *p;
1390	struct pmap *pmap;
1391	int s;
1392	vm_offset_t ptpkva, ptppaddr;
1393	vm_page_t nkpg;
1394#ifdef SMP
1395	int i;
1396#endif
1397	pd_entry_t newpdir;
1398	vm_pindex_t ptpidx;
1399
1400	s = splhigh();
1401	if (kernel_vm_end == 0) {
1402		kernel_vm_end = KERNBASE;
1403		nkpt = 0;
1404		while (pdir_pde(PTD, kernel_vm_end)) {
1405			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1406			nkpt++;
1407		}
1408	}
1409	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1410	while (kernel_vm_end < addr) {
1411		if (pdir_pde(PTD, kernel_vm_end)) {
1412			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1413			continue;
1414		}
1415		nkpt++;
1416		ptpkva = (vm_offset_t) vtopte(addr);
1417		ptpidx = (ptpkva >> PAGE_SHIFT);
1418		/*
1419		 * This index is bogus, but out of the way
1420		 */
1421		nkpg = vm_page_alloc(kernel_object, ptpidx, VM_ALLOC_SYSTEM);
1422#if !defined(MAX_PERF)
1423		if (!nkpg)
1424			panic("pmap_growkernel: no memory to grow kernel");
1425#endif
1426
1427		vm_page_wire(nkpg);
1428		vm_page_remove(nkpg);
1429		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1430		pmap_zero_page(ptppaddr);
1431		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW);
1432		pdir_pde(PTD, kernel_vm_end) = newpdir;
1433
1434#ifdef SMP
1435		for (i = 0; i < mp_ncpus; i++) {
1436			if (IdlePTDS[i])
1437				pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir;
1438		}
1439#endif
1440
1441		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1442			if (p->p_vmspace) {
1443				pmap = &p->p_vmspace->vm_pmap;
1444				*pmap_pde(pmap, kernel_vm_end) = newpdir;
1445			}
1446		}
1447		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1448		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1449	}
1450	splx(s);
1451}
1452
1453/*
1454 *	Retire the given physical map from service.
1455 *	Should only be called if the map contains
1456 *	no valid mappings.
1457 */
1458void
1459pmap_destroy(pmap)
1460	register pmap_t pmap;
1461{
1462	int count;
1463
1464	if (pmap == NULL)
1465		return;
1466
1467	count = --pmap->pm_count;
1468	if (count == 0) {
1469		pmap_release(pmap);
1470#if !defined(MAX_PERF)
1471		panic("destroying a pmap is not yet implemented");
1472#endif
1473	}
1474}
1475
1476/*
1477 *	Add a reference to the specified pmap.
1478 */
1479void
1480pmap_reference(pmap)
1481	pmap_t pmap;
1482{
1483	if (pmap != NULL) {
1484		pmap->pm_count++;
1485	}
1486}
1487
1488/***************************************************
1489* page management routines.
1490 ***************************************************/
1491
1492/*
1493 * free the pv_entry back to the free list
1494 */
1495static inline void
1496free_pv_entry(pv)
1497	pv_entry_t pv;
1498{
1499	pv_entry_count--;
1500	zfreei(pvzone, pv);
1501}
1502
1503/*
1504 * get a new pv_entry, allocating a block from the system
1505 * when needed.
1506 * the memory allocation is performed bypassing the malloc code
1507 * because of the possibility of allocations at interrupt time.
1508 */
1509static pv_entry_t
1510get_pv_entry(void)
1511{
1512	pv_entry_count++;
1513	if (pv_entry_high_water &&
1514		(pv_entry_count > pv_entry_high_water) &&
1515		(pmap_pagedaemon_waken == 0)) {
1516		pmap_pagedaemon_waken = 1;
1517		wakeup (&vm_pages_needed);
1518	}
1519	return zalloci(pvzone);
1520}
1521
1522/*
1523 * This routine is very drastic, but can save the system
1524 * in a pinch.
1525 */
1526void
1527pmap_collect() {
1528	pv_table_t *ppv;
1529	int i;
1530	vm_offset_t pa;
1531	vm_page_t m;
1532	static int warningdone=0;
1533
1534	if (pmap_pagedaemon_waken == 0)
1535		return;
1536
1537	if (warningdone < 5) {
1538		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1539		warningdone++;
1540	}
1541
1542	for(i = 0; i < pv_npg; i++) {
1543		if ((ppv = &pv_table[i]) == 0)
1544			continue;
1545		m = ppv->pv_vm_page;
1546		if ((pa = VM_PAGE_TO_PHYS(m)) == 0)
1547			continue;
1548		if (m->wire_count || m->hold_count || m->busy ||
1549			(m->flags & PG_BUSY))
1550			continue;
1551		pmap_remove_all(pa);
1552	}
1553	pmap_pagedaemon_waken = 0;
1554}
1555
1556
1557/*
1558 * If it is the first entry on the list, it is actually
1559 * in the header and we must copy the following entry up
1560 * to the header.  Otherwise we must search the list for
1561 * the entry.  In either case we free the now unused entry.
1562 */
1563
1564static int
1565pmap_remove_entry(pmap, ppv, va)
1566	struct pmap *pmap;
1567	pv_table_t *ppv;
1568	vm_offset_t va;
1569{
1570	pv_entry_t pv;
1571	int rtval;
1572	int s;
1573
1574	s = splvm();
1575	if (ppv->pv_list_count < pmap->pm_stats.resident_count) {
1576		for (pv = TAILQ_FIRST(&ppv->pv_list);
1577			pv;
1578			pv = TAILQ_NEXT(pv, pv_list)) {
1579			if (pmap == pv->pv_pmap && va == pv->pv_va)
1580				break;
1581		}
1582	} else {
1583		for (pv = TAILQ_FIRST(&pmap->pm_pvlist);
1584			pv;
1585			pv = TAILQ_NEXT(pv, pv_plist)) {
1586			if (va == pv->pv_va)
1587				break;
1588		}
1589	}
1590
1591	rtval = 0;
1592	if (pv) {
1593
1594		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1595		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
1596		ppv->pv_list_count--;
1597		if (TAILQ_FIRST(&ppv->pv_list) == NULL)
1598			ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
1599
1600		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1601		free_pv_entry(pv);
1602	}
1603
1604	splx(s);
1605	return rtval;
1606}
1607
1608/*
1609 * Create a pv entry for page at pa for
1610 * (pmap, va).
1611 */
1612static void
1613pmap_insert_entry(pmap, va, mpte, pa)
1614	pmap_t pmap;
1615	vm_offset_t va;
1616	vm_page_t mpte;
1617	vm_offset_t pa;
1618{
1619
1620	int s;
1621	pv_entry_t pv;
1622	pv_table_t *ppv;
1623
1624	s = splvm();
1625	pv = get_pv_entry();
1626	pv->pv_va = va;
1627	pv->pv_pmap = pmap;
1628	pv->pv_ptem = mpte;
1629
1630	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1631
1632	ppv = pa_to_pvh(pa);
1633	TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list);
1634	ppv->pv_list_count++;
1635
1636	splx(s);
1637}
1638
1639/*
1640 * pmap_remove_pte: do the things to unmap a page in a process
1641 */
1642static int
1643pmap_remove_pte(pmap, ptq, va)
1644	struct pmap *pmap;
1645	unsigned *ptq;
1646	vm_offset_t va;
1647{
1648	unsigned oldpte;
1649	pv_table_t *ppv;
1650
1651	oldpte = *ptq;
1652	*ptq = 0;
1653	if (oldpte & PG_W)
1654		pmap->pm_stats.wired_count -= 1;
1655	/*
1656	 * Machines that don't support invlpg, also don't support
1657	 * PG_G.
1658	 */
1659	if (oldpte & PG_G)
1660		invlpg(va);
1661	pmap->pm_stats.resident_count -= 1;
1662	if (oldpte & PG_MANAGED) {
1663		ppv = pa_to_pvh(oldpte);
1664		if (oldpte & PG_M) {
1665#if defined(PMAP_DIAGNOSTIC)
1666			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1667				printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte);
1668			}
1669#endif
1670			if (pmap_track_modified(va))
1671				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
1672		}
1673		if (oldpte & PG_A)
1674			ppv->pv_vm_page->flags |= PG_REFERENCED;
1675		return pmap_remove_entry(pmap, ppv, va);
1676	} else {
1677		return pmap_unuse_pt(pmap, va, NULL);
1678	}
1679
1680	return 0;
1681}
1682
1683/*
1684 * Remove a single page from a process address space
1685 */
1686static void
1687pmap_remove_page(pmap, va)
1688	struct pmap *pmap;
1689	register vm_offset_t va;
1690{
1691	register unsigned *ptq;
1692
1693	/*
1694	 * if there is no pte for this address, just skip it!!!
1695	 */
1696	if (*pmap_pde(pmap, va) == 0) {
1697		return;
1698	}
1699
1700	/*
1701	 * get a local va for mappings for this pmap.
1702	 */
1703	ptq = get_ptbase(pmap) + i386_btop(va);
1704	if (*ptq) {
1705		(void) pmap_remove_pte(pmap, ptq, va);
1706		invltlb_1pg(va);
1707	}
1708	return;
1709}
1710
1711/*
1712 *	Remove the given range of addresses from the specified map.
1713 *
1714 *	It is assumed that the start and end are properly
1715 *	rounded to the page size.
1716 */
1717void
1718pmap_remove(pmap, sva, eva)
1719	struct pmap *pmap;
1720	register vm_offset_t sva;
1721	register vm_offset_t eva;
1722{
1723	register unsigned *ptbase;
1724	vm_offset_t pdnxt;
1725	vm_offset_t ptpaddr;
1726	vm_offset_t sindex, eindex;
1727	int anyvalid;
1728
1729	if (pmap == NULL)
1730		return;
1731
1732	if (pmap->pm_stats.resident_count == 0)
1733		return;
1734
1735	/*
1736	 * special handling of removing one page.  a very
1737	 * common operation and easy to short circuit some
1738	 * code.
1739	 */
1740	if (((sva + PAGE_SIZE) == eva) &&
1741		(((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1742		pmap_remove_page(pmap, sva);
1743		return;
1744	}
1745
1746	anyvalid = 0;
1747
1748	/*
1749	 * Get a local virtual address for the mappings that are being
1750	 * worked with.
1751	 */
1752	ptbase = get_ptbase(pmap);
1753
1754	sindex = i386_btop(sva);
1755	eindex = i386_btop(eva);
1756
1757	for (; sindex < eindex; sindex = pdnxt) {
1758		unsigned pdirindex;
1759
1760		/*
1761		 * Calculate index for next page table.
1762		 */
1763		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1764		if (pmap->pm_stats.resident_count == 0)
1765			break;
1766
1767		pdirindex = sindex / NPDEPG;
1768		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1769			pmap->pm_pdir[pdirindex] = 0;
1770			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1771			anyvalid++;
1772			continue;
1773		}
1774
1775		/*
1776		 * Weed out invalid mappings. Note: we assume that the page
1777		 * directory table is always allocated, and in kernel virtual.
1778		 */
1779		if (ptpaddr == 0)
1780			continue;
1781
1782		/*
1783		 * Limit our scan to either the end of the va represented
1784		 * by the current page table page, or to the end of the
1785		 * range being removed.
1786		 */
1787		if (pdnxt > eindex) {
1788			pdnxt = eindex;
1789		}
1790
1791		for ( ;sindex != pdnxt; sindex++) {
1792			vm_offset_t va;
1793			if (ptbase[sindex] == 0) {
1794				continue;
1795			}
1796			va = i386_ptob(sindex);
1797
1798			anyvalid++;
1799			if (pmap_remove_pte(pmap,
1800				ptbase + sindex, va))
1801				break;
1802		}
1803	}
1804
1805	if (anyvalid) {
1806		invltlb();
1807	}
1808}
1809
1810/*
1811 *	Routine:	pmap_remove_all
1812 *	Function:
1813 *		Removes this physical page from
1814 *		all physical maps in which it resides.
1815 *		Reflects back modify bits to the pager.
1816 *
1817 *	Notes:
1818 *		Original versions of this routine were very
1819 *		inefficient because they iteratively called
1820 *		pmap_remove (slow...)
1821 */
1822
1823static void
1824pmap_remove_all(pa)
1825	vm_offset_t pa;
1826{
1827	register pv_entry_t pv;
1828	pv_table_t *ppv;
1829	register unsigned *pte, tpte;
1830	int nmodify;
1831	int update_needed;
1832	int s;
1833
1834	nmodify = 0;
1835	update_needed = 0;
1836#if defined(PMAP_DIAGNOSTIC)
1837	/*
1838	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1839	 * pages!
1840	 */
1841	if (!pmap_is_managed(pa)) {
1842		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa);
1843	}
1844#endif
1845
1846	s = splvm();
1847	ppv = pa_to_pvh(pa);
1848	while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) {
1849		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1850
1851		pv->pv_pmap->pm_stats.resident_count--;
1852
1853		tpte = *pte;
1854		*pte = 0;
1855		if (tpte & PG_W)
1856			pv->pv_pmap->pm_stats.wired_count--;
1857
1858		if (tpte & PG_A)
1859			ppv->pv_vm_page->flags |= PG_REFERENCED;
1860
1861		/*
1862		 * Update the vm_page_t clean and reference bits.
1863		 */
1864		if (tpte & PG_M) {
1865#if defined(PMAP_DIAGNOSTIC)
1866			if (pmap_nw_modified((pt_entry_t) tpte)) {
1867				printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte);
1868			}
1869#endif
1870			if (pmap_track_modified(pv->pv_va))
1871				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
1872		}
1873		if (!update_needed &&
1874			((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) ||
1875			(pv->pv_pmap == kernel_pmap))) {
1876			update_needed = 1;
1877		}
1878
1879		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1880		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
1881		ppv->pv_list_count--;
1882		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1883		free_pv_entry(pv);
1884	}
1885
1886	ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
1887
1888	if (update_needed)
1889		invltlb();
1890
1891	splx(s);
1892	return;
1893}
1894
1895/*
1896 *	Set the physical protection on the
1897 *	specified range of this map as requested.
1898 */
1899void
1900pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1901{
1902	register unsigned *ptbase;
1903	vm_offset_t pdnxt, ptpaddr;
1904	vm_pindex_t sindex, eindex;
1905	int anychanged;
1906
1907
1908	if (pmap == NULL)
1909		return;
1910
1911	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1912		pmap_remove(pmap, sva, eva);
1913		return;
1914	}
1915
1916	if (prot & VM_PROT_WRITE)
1917		return;
1918
1919	anychanged = 0;
1920
1921	ptbase = get_ptbase(pmap);
1922
1923	sindex = i386_btop(sva);
1924	eindex = i386_btop(eva);
1925
1926	for (; sindex < eindex; sindex = pdnxt) {
1927
1928		unsigned pdirindex;
1929
1930		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1931
1932		pdirindex = sindex / NPDEPG;
1933		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1934			(unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1935			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1936			anychanged++;
1937			continue;
1938		}
1939
1940		/*
1941		 * Weed out invalid mappings. Note: we assume that the page
1942		 * directory table is always allocated, and in kernel virtual.
1943		 */
1944		if (ptpaddr == 0)
1945			continue;
1946
1947		if (pdnxt > eindex) {
1948			pdnxt = eindex;
1949		}
1950
1951		for (; sindex != pdnxt; sindex++) {
1952
1953			unsigned pbits;
1954			pv_table_t *ppv;
1955
1956			pbits = ptbase[sindex];
1957
1958			if (pbits & PG_MANAGED) {
1959				ppv = NULL;
1960				if (pbits & PG_A) {
1961					ppv = pa_to_pvh(pbits);
1962					ppv->pv_vm_page->flags |= PG_REFERENCED;
1963					pbits &= ~PG_A;
1964				}
1965				if (pbits & PG_M) {
1966					if (pmap_track_modified(i386_ptob(sindex))) {
1967						if (ppv == NULL)
1968							ppv = pa_to_pvh(pbits);
1969						ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
1970						pbits &= ~PG_M;
1971					}
1972				}
1973			}
1974
1975			pbits &= ~PG_RW;
1976
1977			if (pbits != ptbase[sindex]) {
1978				ptbase[sindex] = pbits;
1979				anychanged = 1;
1980			}
1981		}
1982	}
1983	if (anychanged)
1984		invltlb();
1985}
1986
/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 *
 *	Three cases are distinguished below: the same physical page
 *	is already mapped at va (only protection/wiring change), a
 *	different page is mapped there (it is removed first), or
 *	nothing is mapped yet.  A NULL pmap is ignored.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot,
	   boolean_t wired)
{
	register unsigned *pte;
	vm_offset_t opa;
	vm_offset_t origpte, newpte;
	vm_page_t mpte;

	if (pmap == NULL)
		return;

	/* Work on the page containing va. */
	va &= PG_FRAME;
#ifdef PMAP_DIAGNOSTIC
	if (va > VM_MAX_KERNEL_ADDRESS)
		panic("pmap_enter: toobig");
	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
#endif

	mpte = NULL;
	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 * pmap_allocpte() also takes a hold reference on the page
	 * table page; it is given back below when no new mapping
	 * is created.
	 */
	if (va < UPT_MIN_ADDRESS) {
		mpte = pmap_allocpte(pmap, va);
	}
#if 0 && defined(PMAP_DIAGNOSTIC)
	else {
		vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va);
		if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) {
			panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n",
				pmap->pm_pdir[PTDPTDI], origpte, va);
		}
		if (smp_active) {
			pdeaddr = (vm_offset_t *) IdlePTDS[cpuid];
			if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) {
				if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr))
					printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr);
				printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr);
				panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n",
					pmap->pm_pdir[PTDPTDI], newpte, origpte, va);
			}
		}
	}
#endif

	pte = pmap_pte(pmap, va);

#if !defined(MAX_PERF)
	/*
	 * Page Directory table entry not valid, we need a new PT page
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n",
			pmap->pm_pdir[PTDPTDI], va);
	}
#endif

	/*
	 * Snapshot the current pte; opa is the physical page it
	 * refers to (0 when nothing is mapped).
	 */
	origpte = *(vm_offset_t *)pte;
	pa &= PG_FRAME;
	opa = origpte & PG_FRAME;

#if !defined(MAX_PERF)
	if (origpte & PG_PS)
		panic("pmap_enter: attempted pmap_enter on 4MB page");
#endif

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

#if defined(PMAP_DIAGNOSTIC)
		if (pmap_nw_modified((pt_entry_t) origpte)) {
			printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte);
		}
#endif

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->hold_count--;

		/*
		 * Fast path: write access is wanted and the mapping is
		 * valid, so at most PG_RW needs to be turned on.
		 * NOTE(review): this returns without rewriting the pte,
		 * so a wiring change counted above is not reflected in
		 * PG_W on this path -- confirm that this is intended.
		 */
		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
			if ((origpte & PG_RW) == 0) {
				*pte |= PG_RW;
				invltlb_1pg(va);
			}
			return;
		}

		/*
		 * We might be turning off write access to the page,
		 * so we go ahead and sense modify status.
		 */
		if (origpte & PG_MANAGED) {
			if ((origpte & PG_M) && pmap_track_modified(va)) {
				pv_table_t *ppv;
				ppv = pa_to_pvh(opa);
				ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
			}
			/* Carry the managed bit over into the new pte. */
			pa |= PG_MANAGED;
		}
		goto validate;
	}
	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		int err;
		err = pmap_remove_pte(pmap, pte, va);
#if !defined(MAX_PERF)
		if (err)
			panic("pmap_enter: pte vanished, va: 0x%x", va);
#endif
	}

	/*
	 * Enter on the PV list if part of our managed memory Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */
	if (pmap_is_managed(pa)) {
		pmap_insert_entry(pmap, va, mpte, pa);
		pa |= PG_MANAGED;
	}

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);

	if (wired)
		newpte |= PG_W;
	if (va < UPT_MIN_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	/*
	 * if the mapping or permission bits are different, we need
	 * to update the pte.
	 * The modified/accessed bits of the old pte are ignored in
	 * the comparison, and the TLB entry is only invalidated when
	 * there was a previous pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		*pte = newpte;
		if (origpte)
			invltlb_1pg(va);
	}
}
2170
2171/*
2172 * this code makes some *MAJOR* assumptions:
2173 * 1. Current pmap & pmap exists.
2174 * 2. Not wired.
2175 * 3. Read access.
2176 * 4. No page table pages.
2177 * 5. Tlbflush is deferred to calling procedure.
2178 * 6. Page IS managed.
2179 * but is *MUCH* faster than pmap_enter...
2180 */
2181
2182static vm_page_t
2183pmap_enter_quick(pmap, va, pa, mpte)
2184	register pmap_t pmap;
2185	vm_offset_t va;
2186	register vm_offset_t pa;
2187	vm_page_t mpte;
2188{
2189	register unsigned *pte;
2190
2191	/*
2192	 * In the case that a page table page is not
2193	 * resident, we are creating it here.
2194	 */
2195	if (va < UPT_MIN_ADDRESS) {
2196		unsigned ptepindex;
2197		vm_offset_t ptepa;
2198
2199		/*
2200		 * Calculate pagetable page index
2201		 */
2202		ptepindex = va >> PDRSHIFT;
2203		if (mpte && (mpte->pindex == ptepindex)) {
2204			mpte->hold_count++;
2205		} else {
2206retry:
2207			/*
2208			 * Get the page directory entry
2209			 */
2210			ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
2211
2212			/*
2213			 * If the page table page is mapped, we just increment
2214			 * the hold count, and activate it.
2215			 */
2216			if (ptepa) {
2217#if !defined(MAX_PERF)
2218				if (ptepa & PG_PS)
2219					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2220#endif
2221				if (pmap->pm_ptphint &&
2222					(pmap->pm_ptphint->pindex == ptepindex)) {
2223					mpte = pmap->pm_ptphint;
2224				} else {
2225					mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
2226					pmap->pm_ptphint = mpte;
2227				}
2228				if (mpte == NULL)
2229					goto retry;
2230				mpte->hold_count++;
2231			} else {
2232				mpte = _pmap_allocpte(pmap, ptepindex);
2233			}
2234		}
2235	} else {
2236		mpte = NULL;
2237	}
2238
2239	/*
2240	 * This call to vtopte makes the assumption that we are
2241	 * entering the page into the current pmap.  In order to support
2242	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2243	 * But that isn't as quick as vtopte.
2244	 */
2245	pte = (unsigned *)vtopte(va);
2246	if (*pte) {
2247		if (mpte)
2248			pmap_unwire_pte_hold(pmap, mpte);
2249		return 0;
2250	}
2251
2252	/*
2253	 * Enter on the PV list if part of our managed memory Note that we
2254	 * raise IPL while manipulating pv_table since pmap_enter can be
2255	 * called at interrupt time.
2256	 */
2257	pmap_insert_entry(pmap, va, mpte, pa);
2258
2259	/*
2260	 * Increment counters
2261	 */
2262	pmap->pm_stats.resident_count++;
2263
2264	/*
2265	 * Now validate mapping with RO protection
2266	 */
2267	*pte = pa | PG_V | PG_U | PG_MANAGED;
2268
2269	return mpte;
2270}
2271
2272#define MAX_INIT_PT (96)
2273/*
2274 * pmap_object_init_pt preloads the ptes for a given object
2275 * into the specified pmap.  This eliminates the blast of soft
2276 * faults on process startup and immediately after an mmap.
2277 */
2278void
2279pmap_object_init_pt(pmap, addr, object, pindex, size, limit)
2280	pmap_t pmap;
2281	vm_offset_t addr;
2282	vm_object_t object;
2283	vm_pindex_t pindex;
2284	vm_size_t size;
2285	int limit;
2286{
2287	vm_offset_t tmpidx;
2288	int psize;
2289	vm_page_t p, mpte;
2290	int objpgs;
2291
2292	if (!pmap)
2293		return;
2294
2295	/*
2296	 * This code maps large physical mmap regions into the
2297	 * processor address space.  Note that some shortcuts
2298	 * are taken, but the code works.
2299	 */
2300	if (pseflag &&
2301		(object->type == OBJT_DEVICE) &&
2302		((addr & (NBPDR - 1)) == 0) &&
2303		((size & (NBPDR - 1)) == 0) ) {
2304		int i;
2305		int s;
2306		vm_page_t m[1];
2307		unsigned int ptepindex;
2308		int npdes;
2309		vm_offset_t ptepa;
2310
2311		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2312			return;
2313
2314retry:
2315		p = vm_page_lookup(object, pindex);
2316		if (p && vm_page_sleep(p, "init4p", NULL))
2317			goto retry;
2318
2319		if (p == NULL) {
2320			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2321			if (p == NULL)
2322				return;
2323			m[0] = p;
2324
2325			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2326				vm_page_free(p);
2327				return;
2328			}
2329
2330			p = vm_page_lookup(object, pindex);
2331			PAGE_WAKEUP(p);
2332		}
2333
2334		ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p);
2335		if (ptepa & (NBPDR - 1)) {
2336			return;
2337		}
2338
2339		p->valid = VM_PAGE_BITS_ALL;
2340
2341		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2342		npdes = size >> PDRSHIFT;
2343		for(i=0;i<npdes;i++) {
2344			pmap->pm_pdir[ptepindex] =
2345				(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS);
2346			ptepa += NBPDR;
2347			ptepindex += 1;
2348		}
2349		p->flags |= PG_MAPPED;
2350		invltlb();
2351		return;
2352	}
2353
2354	psize = i386_btop(size);
2355
2356	if ((object->type != OBJT_VNODE) ||
2357		(limit && (psize > MAX_INIT_PT) &&
2358			(object->resident_page_count > MAX_INIT_PT))) {
2359		return;
2360	}
2361
2362	if (psize + pindex > object->size)
2363		psize = object->size - pindex;
2364
2365	mpte = NULL;
2366	/*
2367	 * if we are processing a major portion of the object, then scan the
2368	 * entire thing.
2369	 */
2370	if (psize > (object->size >> 2)) {
2371		objpgs = psize;
2372
2373		for (p = TAILQ_FIRST(&object->memq);
2374		    ((objpgs > 0) && (p != NULL));
2375		    p = TAILQ_NEXT(p, listq)) {
2376
2377			tmpidx = p->pindex;
2378			if (tmpidx < pindex) {
2379				continue;
2380			}
2381			tmpidx -= pindex;
2382			if (tmpidx >= psize) {
2383				continue;
2384			}
2385			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2386			    (p->busy == 0) &&
2387			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2388				if ((p->queue - p->pc) == PQ_CACHE)
2389					vm_page_deactivate(p);
2390				p->flags |= PG_BUSY;
2391				mpte = pmap_enter_quick(pmap,
2392					addr + i386_ptob(tmpidx),
2393					VM_PAGE_TO_PHYS(p), mpte);
2394				p->flags |= PG_MAPPED;
2395				PAGE_WAKEUP(p);
2396			}
2397			objpgs -= 1;
2398		}
2399	} else {
2400		/*
2401		 * else lookup the pages one-by-one.
2402		 */
2403		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2404			p = vm_page_lookup(object, tmpidx + pindex);
2405			if (p &&
2406			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2407			    (p->busy == 0) &&
2408			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2409				if ((p->queue - p->pc) == PQ_CACHE)
2410					vm_page_deactivate(p);
2411				p->flags |= PG_BUSY;
2412				mpte = pmap_enter_quick(pmap,
2413					addr + i386_ptob(tmpidx),
2414					VM_PAGE_TO_PHYS(p), mpte);
2415				p->flags |= PG_MAPPED;
2416				PAGE_WAKEUP(p);
2417			}
2418		}
2419	}
2420	return;
2421}
2422
/*
 * pmap_prefault provides a quick way of clustering
 * pagefaults into a process's address space.  It is a "cousin"
 * of pmap_object_init_pt, except it runs at page fault time instead
 * of mmap time.
 */
#define PFBAK 4
#define PFFOR 4
#define PAGEORDER_SIZE (PFBAK+PFFOR)

/*
 * Byte offsets, relative to the faulting address, of the pages that
 * pmap_prefault tries to map, nearest neighbours first.  pmap_prefault
 * indexes this table from 0 to PAGEORDER_SIZE - 1, so it must hold
 * exactly PAGEORDER_SIZE entries.
 */
static int pmap_prefault_pageorder[] = {
	-PAGE_SIZE, PAGE_SIZE,
	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
	-4 * PAGE_SIZE, 4 * PAGE_SIZE
};

/*
 * Compile-time check: the array type below gets a negative size (and
 * the build fails) if the table does not have PAGEORDER_SIZE entries,
 * e.g. because a comma between two initializers was dropped.
 */
typedef char pmap_prefault_pageorder_check[
    (sizeof(pmap_prefault_pageorder) / sizeof(pmap_prefault_pageorder[0]) ==
    PAGEORDER_SIZE) ? 1 : -1];
2439
2440void
2441pmap_prefault(pmap, addra, entry)
2442	pmap_t pmap;
2443	vm_offset_t addra;
2444	vm_map_entry_t entry;
2445{
2446	int i;
2447	vm_offset_t starta;
2448	vm_offset_t addr;
2449	vm_pindex_t pindex;
2450	vm_page_t m, mpte;
2451	vm_object_t object;
2452
2453	if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap))
2454		return;
2455
2456	object = entry->object.vm_object;
2457
2458	starta = addra - PFBAK * PAGE_SIZE;
2459	if (starta < entry->start) {
2460		starta = entry->start;
2461	} else if (starta > addra) {
2462		starta = 0;
2463	}
2464
2465	mpte = NULL;
2466	for (i = 0; i < PAGEORDER_SIZE; i++) {
2467		vm_object_t lobject;
2468		unsigned *pte;
2469
2470		addr = addra + pmap_prefault_pageorder[i];
2471		if (addr > addra + (PFFOR * PAGE_SIZE))
2472			addr = 0;
2473
2474		if (addr < starta || addr >= entry->end)
2475			continue;
2476
2477		if ((*pmap_pde(pmap, addr)) == NULL)
2478			continue;
2479
2480		pte = (unsigned *) vtopte(addr);
2481		if (*pte)
2482			continue;
2483
2484		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2485		lobject = object;
2486		for (m = vm_page_lookup(lobject, pindex);
2487		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2488		    lobject = lobject->backing_object) {
2489			if (lobject->backing_object_offset & PAGE_MASK)
2490				break;
2491			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2492			m = vm_page_lookup(lobject->backing_object, pindex);
2493		}
2494
2495		/*
2496		 * give-up when a page is not in memory
2497		 */
2498		if (m == NULL)
2499			break;
2500
2501		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2502		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2503
2504			if ((m->queue - m->pc) == PQ_CACHE) {
2505				vm_page_deactivate(m);
2506			}
2507			m->flags |= PG_BUSY;
2508			mpte = pmap_enter_quick(pmap, addr,
2509				VM_PAGE_TO_PHYS(m), mpte);
2510			m->flags |= PG_MAPPED;
2511			PAGE_WAKEUP(m);
2512		}
2513	}
2514}
2515
2516/*
2517 *	Routine:	pmap_change_wiring
2518 *	Function:	Change the wiring attribute for a map/virtual-address
2519 *			pair.
2520 *	In/out conditions:
2521 *			The mapping must already exist in the pmap.
2522 */
2523void
2524pmap_change_wiring(pmap, va, wired)
2525	register pmap_t pmap;
2526	vm_offset_t va;
2527	boolean_t wired;
2528{
2529	register unsigned *pte;
2530
2531	if (pmap == NULL)
2532		return;
2533
2534	pte = pmap_pte(pmap, va);
2535
2536	if (wired && !pmap_pte_w(pte))
2537		pmap->pm_stats.wired_count++;
2538	else if (!wired && pmap_pte_w(pte))
2539		pmap->pm_stats.wired_count--;
2540
2541	/*
2542	 * Wiring is not a hardware characteristic so there is no need to
2543	 * invalidate TLB.
2544	 */
2545	pmap_pte_set_w(pte, wired);
2546}
2547
2548
2549
2550/*
2551 *	Copy the range specified by src_addr/len
2552 *	from the source map to the range dst_addr/len
2553 *	in the destination map.
2554 *
2555 *	This routine is only advisory and need not do anything.
2556 */
2557
2558void
2559pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
2560	pmap_t dst_pmap, src_pmap;
2561	vm_offset_t dst_addr;
2562	vm_size_t len;
2563	vm_offset_t src_addr;
2564{
2565	vm_offset_t addr;
2566	vm_offset_t end_addr = src_addr + len;
2567	vm_offset_t pdnxt;
2568	unsigned src_frame, dst_frame;
2569
2570	if (dst_addr != src_addr)
2571		return;
2572
2573	src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2574	if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) {
2575		return;
2576	}
2577
2578	dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2579	if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) {
2580		APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
2581		invltlb();
2582	}
2583
2584	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
2585		unsigned *src_pte, *dst_pte;
2586		vm_page_t dstmpte, srcmpte;
2587		vm_offset_t srcptepaddr;
2588		unsigned ptepindex;
2589
2590#if !defined(MAX_PERF)
2591		if (addr >= UPT_MIN_ADDRESS)
2592			panic("pmap_copy: invalid to pmap_copy page tables\n");
2593#endif
2594
2595		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2596		ptepindex = addr >> PDRSHIFT;
2597
2598		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
2599		if (srcptepaddr == 0)
2600			continue;
2601
2602		if (srcptepaddr & PG_PS) {
2603			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2604				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
2605				dst_pmap->pm_stats.resident_count += NBPDR;
2606			}
2607			continue;
2608		}
2609
2610		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2611		if ((srcmpte == NULL) ||
2612			(srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2613			continue;
2614
2615		if (pdnxt > end_addr)
2616			pdnxt = end_addr;
2617
2618		src_pte = (unsigned *) vtopte(addr);
2619		dst_pte = (unsigned *) avtopte(addr);
2620		while (addr < pdnxt) {
2621			unsigned ptetemp;
2622			ptetemp = *src_pte;
2623			/*
2624			 * we only virtual copy managed pages
2625			 */
2626			if ((ptetemp & PG_MANAGED) != 0) {
2627				/*
2628				 * We have to check after allocpte for the
2629				 * pte still being around...  allocpte can
2630				 * block.
2631				 */
2632				dstmpte = pmap_allocpte(dst_pmap, addr);
2633				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2634					/*
2635					 * Clear the modified and
2636					 * accessed (referenced) bits
2637					 * during the copy.
2638					 */
2639					*dst_pte = ptetemp & ~(PG_M | PG_A);
2640					dst_pmap->pm_stats.resident_count++;
2641					pmap_insert_entry(dst_pmap, addr,
2642						dstmpte,
2643						(ptetemp & PG_FRAME));
2644	 			} else {
2645					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2646				}
2647				if (dstmpte->hold_count >= srcmpte->hold_count)
2648					break;
2649			}
2650			addr += PAGE_SIZE;
2651			src_pte++;
2652			dst_pte++;
2653		}
2654	}
2655}
2656
2657/*
2658 *	Routine:	pmap_kernel
2659 *	Function:
2660 *		Returns the physical map handle for the kernel.
2661 */
2662pmap_t
2663pmap_kernel()
2664{
2665	return (kernel_pmap);
2666}
2667
2668/*
2669 *	pmap_zero_page zeros the specified (machine independent)
2670 *	page by mapping the page into virtual memory and using
2671 *	bzero to clear its contents, one machine dependent page
2672 *	at a time.
2673 */
2674void
2675pmap_zero_page(phys)
2676	vm_offset_t phys;
2677{
2678#ifdef SMP
2679#if !defined(MAX_PERF)
2680	if (*(int *) prv_CMAP3)
2681		panic("pmap_zero_page: prv_CMAP3 busy");
2682#endif
2683
2684	*(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2685	invltlb_1pg((vm_offset_t) &prv_CPAGE3);
2686
2687	bzero(&prv_CPAGE3, PAGE_SIZE);
2688
2689	*(int *) prv_CMAP3 = 0;
2690	invltlb_1pg((vm_offset_t) &prv_CPAGE3);
2691#else
2692#if !defined(MAX_PERF)
2693	if (*(int *) CMAP2)
2694		panic("pmap_zero_page: CMAP busy");
2695#endif
2696
2697	*(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2698	bzero(CADDR2, PAGE_SIZE);
2699	*(int *) CMAP2 = 0;
2700	invltlb_1pg((vm_offset_t) CADDR2);
2701#endif
2702}
2703
2704/*
2705 *	pmap_copy_page copies the specified (machine independent)
2706 *	page by mapping the page into virtual memory and using
2707 *	bcopy to copy the page, one machine dependent page at a
2708 *	time.
2709 */
2710void
2711pmap_copy_page(src, dst)
2712	vm_offset_t src;
2713	vm_offset_t dst;
2714{
2715#ifdef SMP
2716#if !defined(MAX_PERF)
2717	if (*(int *) prv_CMAP1)
2718		panic("pmap_copy_page: prv_CMAP1 busy");
2719	if (*(int *) prv_CMAP2)
2720		panic("pmap_copy_page: prv_CMAP2 busy");
2721#endif
2722
2723	*(int *) prv_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2724	*(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2725
2726	invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2);
2727
2728	bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE);
2729
2730	*(int *) prv_CMAP1 = 0;
2731	*(int *) prv_CMAP2 = 0;
2732	invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2);
2733#else
2734#if !defined(MAX_PERF)
2735	if (*(int *) CMAP1 || *(int *) CMAP2)
2736		panic("pmap_copy_page: CMAP busy");
2737#endif
2738
2739	*(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2740	*(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2741
2742	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2743
2744	*(int *) CMAP1 = 0;
2745	*(int *) CMAP2 = 0;
2746	invltlb_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2);
2747#endif
2748}
2749
2750
2751/*
2752 *	Routine:	pmap_pageable
2753 *	Function:
2754 *		Make the specified pages (by pmap, offset)
2755 *		pageable (or not) as requested.
2756 *
2757 *		A page which is not pageable may not take
2758 *		a fault; therefore, its page table entry
2759 *		must remain valid for the duration.
2760 *
2761 *		This routine is merely advisory; pmap_enter
2762 *		will specify that these pages are to be wired
2763 *		down (or not) as appropriate.
2764 */
2765void
2766pmap_pageable(pmap, sva, eva, pageable)
2767	pmap_t pmap;
2768	vm_offset_t sva, eva;
2769	boolean_t pageable;
2770{
2771}
2772
2773/*
2774 * this routine returns true if a physical page resides
2775 * in the given pmap.
2776 */
2777boolean_t
2778pmap_page_exists(pmap, pa)
2779	pmap_t pmap;
2780	vm_offset_t pa;
2781{
2782	register pv_entry_t pv;
2783	pv_table_t *ppv;
2784	int s;
2785
2786	if (!pmap_is_managed(pa))
2787		return FALSE;
2788
2789	s = splvm();
2790
2791	ppv = pa_to_pvh(pa);
2792	/*
2793	 * Not found, check current mappings returning immediately if found.
2794	 */
2795	for (pv = TAILQ_FIRST(&ppv->pv_list);
2796		pv;
2797		pv = TAILQ_NEXT(pv, pv_list)) {
2798		if (pv->pv_pmap == pmap) {
2799			splx(s);
2800			return TRUE;
2801		}
2802	}
2803	splx(s);
2804	return (FALSE);
2805}
2806
2807#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2808/*
2809 * Remove all pages from specified address space
2810 * this aids process exit speeds.  Also, this code
2811 * is special cased for current process only, but
2812 * can have the more generic (and slightly slower)
2813 * mode enabled.  This is much faster than pmap_remove
2814 * in the case of running down an entire address space.
2815 */
2816void
2817pmap_remove_pages(pmap, sva, eva)
2818	pmap_t pmap;
2819	vm_offset_t sva, eva;
2820{
2821	unsigned *pte, tpte;
2822	pv_table_t *ppv;
2823	pv_entry_t pv, npv;
2824	int s;
2825
2826#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2827	if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) {
2828		printf("warning: pmap_remove_pages called with non-current pmap\n");
2829		return;
2830	}
2831#endif
2832
2833	s = splvm();
2834	for(pv = TAILQ_FIRST(&pmap->pm_pvlist);
2835		pv;
2836		pv = npv) {
2837
2838		if (pv->pv_va >= eva || pv->pv_va < sva) {
2839			npv = TAILQ_NEXT(pv, pv_plist);
2840			continue;
2841		}
2842
2843#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2844		pte = (unsigned *)vtopte(pv->pv_va);
2845#else
2846		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2847#endif
2848		tpte = *pte;
2849
2850/*
2851 * We cannot remove wired pages from a process' mapping at this time
2852 */
2853		if (tpte & PG_W) {
2854			npv = TAILQ_NEXT(pv, pv_plist);
2855			continue;
2856		}
2857		*pte = 0;
2858
2859		ppv = pa_to_pvh(tpte);
2860
2861		pv->pv_pmap->pm_stats.resident_count--;
2862
2863		/*
2864		 * Update the vm_page_t clean and reference bits.
2865		 */
2866		if (tpte & PG_M) {
2867			ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
2868		}
2869
2870
2871		npv = TAILQ_NEXT(pv, pv_plist);
2872		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2873
2874		ppv->pv_list_count--;
2875		TAILQ_REMOVE(&ppv->pv_list, pv, pv_list);
2876		if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
2877			ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE);
2878		}
2879
2880		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2881		free_pv_entry(pv);
2882	}
2883	splx(s);
2884	invltlb();
2885}
2886
2887/*
2888 * pmap_testbit tests bits in pte's
2889 * note that the testbit/changebit routines are inline,
2890 * and a lot of things compile-time evaluate.
2891 */
2892static boolean_t
2893pmap_testbit(pa, bit)
2894	register vm_offset_t pa;
2895	int bit;
2896{
2897	register pv_entry_t pv;
2898	pv_table_t *ppv;
2899	unsigned *pte;
2900	int s;
2901
2902	if (!pmap_is_managed(pa))
2903		return FALSE;
2904
2905	ppv = pa_to_pvh(pa);
2906	if (TAILQ_FIRST(&ppv->pv_list) == NULL)
2907		return FALSE;
2908
2909	s = splvm();
2910
2911	for (pv = TAILQ_FIRST(&ppv->pv_list);
2912		pv;
2913		pv = TAILQ_NEXT(pv, pv_list)) {
2914
2915		/*
2916		 * if the bit being tested is the modified bit, then
2917		 * mark clean_map and ptes as never
2918		 * modified.
2919		 */
2920		if (bit & (PG_A|PG_M)) {
2921			if (!pmap_track_modified(pv->pv_va))
2922				continue;
2923		}
2924
2925#if defined(PMAP_DIAGNOSTIC)
2926		if (!pv->pv_pmap) {
2927			printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
2928			continue;
2929		}
2930#endif
2931		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2932		if (*pte & bit) {
2933			splx(s);
2934			return TRUE;
2935		}
2936	}
2937	splx(s);
2938	return (FALSE);
2939}
2940
2941/*
2942 * this routine is used to modify bits in ptes
2943 */
2944static void
2945pmap_changebit(pa, bit, setem)
2946	vm_offset_t pa;
2947	int bit;
2948	boolean_t setem;
2949{
2950	register pv_entry_t pv;
2951	pv_table_t *ppv;
2952	register unsigned *pte;
2953	int changed;
2954	int s;
2955
2956	if (!pmap_is_managed(pa))
2957		return;
2958
2959	s = splvm();
2960	changed = 0;
2961	ppv = pa_to_pvh(pa);
2962
2963	/*
2964	 * Loop over all current mappings setting/clearing as appropos If
2965	 * setting RO do we need to clear the VAC?
2966	 */
2967	for (pv = TAILQ_FIRST(&ppv->pv_list);
2968		pv;
2969		pv = TAILQ_NEXT(pv, pv_list)) {
2970
2971		/*
2972		 * don't write protect pager mappings
2973		 */
2974		if (!setem && (bit == PG_RW)) {
2975			if (!pmap_track_modified(pv->pv_va))
2976				continue;
2977		}
2978
2979#if defined(PMAP_DIAGNOSTIC)
2980		if (!pv->pv_pmap) {
2981			printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
2982			continue;
2983		}
2984#endif
2985
2986		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2987
2988		if (setem) {
2989			*(int *)pte |= bit;
2990			changed = 1;
2991		} else {
2992			vm_offset_t pbits = *(vm_offset_t *)pte;
2993			if (pbits & bit) {
2994				changed = 1;
2995				if (bit == PG_RW) {
2996					if (pbits & PG_M) {
2997						ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL;
2998					}
2999					*(int *)pte = pbits & ~(PG_M|PG_RW);
3000				} else {
3001					*(int *)pte = pbits & ~bit;
3002				}
3003			}
3004		}
3005	}
3006	splx(s);
3007	if (changed)
3008		invltlb();
3009}
3010
3011/*
3012 *      pmap_page_protect:
3013 *
3014 *      Lower the permission for all mappings to a given page.
3015 */
3016void
3017pmap_page_protect(vm_offset_t phys, vm_prot_t prot)
3018{
3019	if ((prot & VM_PROT_WRITE) == 0) {
3020		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3021			pmap_changebit(phys, PG_RW, FALSE);
3022		} else {
3023			pmap_remove_all(phys);
3024		}
3025	}
3026}
3027
3028vm_offset_t
3029pmap_phys_address(ppn)
3030	int ppn;
3031{
3032	return (i386_ptob(ppn));
3033}
3034
3035/*
3036 *	pmap_ts_referenced:
3037 *
3038 *	Return the count of reference bits for a page, clearing all of them.
3039 *
3040 */
3041int
3042pmap_ts_referenced(vm_offset_t pa)
3043{
3044	register pv_entry_t pv;
3045	pv_table_t *ppv;
3046	unsigned *pte;
3047	int s;
3048	int rtval = 0;
3049
3050	if (!pmap_is_managed(pa))
3051		return FALSE;
3052
3053	s = splvm();
3054
3055	ppv = pa_to_pvh(pa);
3056
3057	if (TAILQ_FIRST(&ppv->pv_list) == NULL) {
3058		splx(s);
3059		return 0;
3060	}
3061
3062	/*
3063	 * Not found, check current mappings returning immediately if found.
3064	 */
3065	for (pv = TAILQ_FIRST(&ppv->pv_list);
3066		pv;
3067		pv = TAILQ_NEXT(pv, pv_list)) {
3068
3069		/*
3070		 * if the bit being tested is the modified bit, then
3071		 * mark clean_map and ptes as never
3072		 * modified.
3073		 */
3074		if (!pmap_track_modified(pv->pv_va))
3075			continue;
3076
3077		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3078		if (pte == NULL) {
3079			continue;
3080		}
3081
3082		if (*pte & PG_A) {
3083			rtval++;
3084			*pte &= ~PG_A;
3085			if (rtval > 16)
3086				break;
3087		}
3088	}
3089
3090	splx(s);
3091	if (rtval) {
3092		invltlb();
3093	}
3094	return (rtval);
3095}
3096
3097/*
3098 *	pmap_is_modified:
3099 *
3100 *	Return whether or not the specified physical page was modified
3101 *	in any physical maps.
3102 */
3103boolean_t
3104pmap_is_modified(vm_offset_t pa)
3105{
3106	return pmap_testbit((pa), PG_M);
3107}
3108
3109/*
3110 *	Clear the modify bits on the specified physical page.
3111 */
3112void
3113pmap_clear_modify(vm_offset_t pa)
3114{
3115	pmap_changebit((pa), PG_M, FALSE);
3116}
3117
3118/*
3119 *	pmap_clear_reference:
3120 *
3121 *	Clear the reference bit on the specified physical page.
3122 */
3123void
3124pmap_clear_reference(vm_offset_t pa)
3125{
3126	pmap_changebit((pa), PG_A, FALSE);
3127}
3128
3129/*
3130 * Miscellaneous support routines follow
3131 */
3132
3133static void
3134i386_protection_init()
3135{
3136	register int *kp, prot;
3137
3138	kp = protection_codes;
3139	for (prot = 0; prot < 8; prot++) {
3140		switch (prot) {
3141		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3142			/*
3143			 * Read access is also 0. There isn't any execute bit,
3144			 * so just make it readable.
3145			 */
3146		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3147		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3148		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3149			*kp++ = 0;
3150			break;
3151		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3152		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3153		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3154		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3155			*kp++ = PG_RW;
3156			break;
3157		}
3158	}
3159}
3160
3161/*
3162 * Map a set of physical memory pages into the kernel virtual
3163 * address space. Return a pointer to where it is mapped. This
3164 * routine is intended to be used for mapping device memory,
3165 * NOT real memory.
3166 */
3167void *
3168pmap_mapdev(pa, size)
3169	vm_offset_t pa;
3170	vm_size_t size;
3171{
3172	vm_offset_t va, tmpva;
3173	unsigned *pte;
3174
3175	size = roundup(size, PAGE_SIZE);
3176
3177	va = kmem_alloc_pageable(kernel_map, size);
3178#if !defined(MAX_PERF)
3179	if (!va)
3180		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3181#endif
3182
3183	pa = pa & PG_FRAME;
3184	for (tmpva = va; size > 0;) {
3185		pte = (unsigned *)vtopte(tmpva);
3186		*pte = pa | PG_RW | PG_V | pgeflag;
3187		size -= PAGE_SIZE;
3188		tmpva += PAGE_SIZE;
3189		pa += PAGE_SIZE;
3190	}
3191	invltlb();
3192
3193	return ((void *) va);
3194}
3195
3196/*
3197 * perform the pmap work for mincore
3198 */
3199int
3200pmap_mincore(pmap, addr)
3201	pmap_t pmap;
3202	vm_offset_t addr;
3203{
3204
3205	unsigned *ptep, pte;
3206	vm_page_t m;
3207	int val = 0;
3208
3209	ptep = pmap_pte(pmap, addr);
3210	if (ptep == 0) {
3211		return 0;
3212	}
3213
3214	if (pte = *ptep) {
3215		pv_table_t *ppv;
3216		vm_offset_t pa;
3217
3218		val = MINCORE_INCORE;
3219		if ((pte & PG_MANAGED) == 0)
3220			return val;
3221
3222		pa = pte & PG_FRAME;
3223
3224		ppv = pa_to_pvh((pa & PG_FRAME));
3225		m = ppv->pv_vm_page;
3226
3227		/*
3228		 * Modified by us
3229		 */
3230		if (pte & PG_M)
3231			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3232		/*
3233		 * Modified by someone
3234		 */
3235		else if (m->dirty || pmap_is_modified(pa))
3236			val |= MINCORE_MODIFIED_OTHER;
3237		/*
3238		 * Referenced by us
3239		 */
3240		if (pte & PG_A)
3241			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3242
3243		/*
3244		 * Referenced by someone
3245		 */
3246		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) {
3247			val |= MINCORE_REFERENCED_OTHER;
3248			m->flags |= PG_REFERENCED;
3249		}
3250	}
3251	return val;
3252}
3253
3254void
3255pmap_activate(struct proc *p)
3256{
3257#if defined(SWTCH_OPTIM_STATS)
3258	tlb_flush_count++;
3259#endif
3260	load_cr3(p->p_addr->u_pcb.pcb_cr3 =
3261		vtophys(p->p_vmspace->vm_pmap.pm_pdir));
3262}
3263
3264vm_offset_t
3265pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) {
3266
3267	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3268		return addr;
3269	}
3270
3271	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3272	return addr;
3273}
3274
3275
#if defined(PMAP_DEBUG)
/*
 *	pmap_pid_dump:
 *
 *	Debugging aid.  For each process on allproc whose pid equals
 *	`pid', print the valid page table entries of its pmap that lie
 *	below VM_MIN_KERNEL_ADDRESS, two per output line, together with
 *	the hold count, wire count and flags of the vm_page each entry
 *	points at.  Returns the number of entries printed; the function
 *	returns as soon as the walk reaches VM_MIN_KERNEL_ADDRESS.
 *
 *	The return type is spelled out as int (it used to rely on the
 *	implicit-int rule, which is no longer valid C).
 */
int
pmap_pid_dump(int pid) {
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) {
		int i,j;

		if (p->p_pid != pid)
			continue;
		if (!p->p_vmspace)
			continue;

		/* index counts the entries printed on the current line */
		index = 0;
		pmap = &p->p_vmspace->vm_pmap;
		for(i=0;i<1024;i++) {
			pd_entry_t *pde;
			unsigned *pte;
			unsigned base = i << PDRSHIFT;

			pde = &pmap->pm_pdir[i];
			if (!(pde && pmap_pde_v(pde)))
				continue;

			for(j=0;j<1024;j++) {
				unsigned va = base + (j << PAGE_SHIFT);
				vm_offset_t pa;
				vm_page_t m;

				if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
					/* finish a partially filled line */
					if (index) {
						index = 0;
						printf("\n");
					}
					return npte;
				}
				pte = pmap_pte_quick( pmap, va);
				if (!(pte && pmap_pte_v(pte)))
					continue;

				pa = *(int *)pte;
				m = PHYS_TO_VM_PAGE((pa & PG_FRAME));
				printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
					va, pa, m->hold_count, m->wire_count, m->flags);
				npte++;
				index++;
				if (index >= 2) {
					index = 0;
					printf("\n");
				} else {
					printf(" ");
				}
			}
		}
	}
	return npte;
}
#endif
3331
3332#if defined(DEBUG)
3333
3334static void	pads __P((pmap_t pm));
3335static void	pmap_pvdump __P((vm_offset_t pa));
3336
3337/* print address space of pmap*/
3338static void
3339pads(pm)
3340	pmap_t pm;
3341{
3342	unsigned va, i, j;
3343	unsigned *ptep;
3344
3345	if (pm == kernel_pmap)
3346		return;
3347	for (i = 0; i < 1024; i++)
3348		if (pm->pm_pdir[i])
3349			for (j = 0; j < 1024; j++) {
3350				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3351				if (pm == kernel_pmap && va < KERNBASE)
3352					continue;
3353				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3354					continue;
3355				ptep = pmap_pte_quick(pm, va);
3356				if (pmap_pte_v(ptep))
3357					printf("%x:%x ", va, *(int *) ptep);
3358			};
3359
3360}
3361
3362static void
3363pmap_pvdump(pa)
3364	vm_offset_t pa;
3365{
3366	pv_table_t *ppv;
3367	register pv_entry_t pv;
3368
3369	printf("pa %x", pa);
3370	ppv = pa_to_pvh(pa);
3371	for (pv = TAILQ_FIRST(&ppv->pv_list);
3372		pv;
3373		pv = TAILQ_NEXT(pv, pv_list)) {
3374#ifdef used_to_be
3375		printf(" -> pmap %x, va %x, flags %x",
3376		    pv->pv_pmap, pv->pv_va, pv->pv_flags);
3377#endif
3378		printf(" -> pmap %x, va %x",
3379		    pv->pv_pmap, pv->pv_va);
3380		pads(pv->pv_pmap);
3381	}
3382	printf(" ");
3383}
3384#endif
3385