1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 82310 2001-08-25 02:44:38Z julian $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
 64 *	this module may delay invalidation or protection-reduction
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
 68 *	and when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74#include "opt_upages.h"
75
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/mman.h>
81#include <sys/msgbuf.h>
82#include <sys/mutex.h>
83#include <sys/proc.h>
84#include <sys/sx.h>
85#include <sys/user.h>
86#include <sys/vmmeter.h>
87#include <sys/sysctl.h>
88
89#include <vm/vm.h>
90#include <vm/vm_param.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_page.h>
93#include <vm/vm_map.h>
94#include <vm/vm_object.h>
95#include <vm/vm_extern.h>
96#include <vm/vm_pageout.h>
97#include <vm/vm_pager.h>
98#include <vm/vm_zone.h>
99
100#include <machine/cputypes.h>
101#include <machine/md_var.h>
102#include <machine/specialreg.h>
103#if defined(SMP) || defined(APIC_IO)
104#include <machine/smp.h>
105#include <machine/apic.h>
106#include <machine/segments.h>
107#include <machine/tss.h>
108#include <machine/globaldata.h>
109#endif /* SMP || APIC_IO */
110
111#define PMAP_KEEP_PDIRS
112#ifndef PMAP_SHPGPERPROC
113#define PMAP_SHPGPERPROC 200
114#endif
115
116#if defined(DIAGNOSTIC)
117#define PMAP_DIAGNOSTIC
118#endif
119
120#define MINPV 2048
121
122#if !defined(PMAP_DIAGNOSTIC)
123#define PMAP_INLINE __inline
124#else
125#define PMAP_INLINE
126#endif
127
128/*
129 * Get PDEs and PTEs for user/kernel address space
130 */
131#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
132#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
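/*
 * Illustrative only: with the i386's 4MB page directory granularity
 * (PDRSHIFT == 22), pmap_pde() just indexes the page directory with
 * the top 10 bits of the address; e.g. va == 0xfe000000 gives index
 * 0xfe000000 >> 22 == 0x3f8 (1016).
 */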
133
134#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
135#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
136#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
137#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
138#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
139
140#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
141#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
142
143/*
 144 * Given a map and a machine-independent protection code,
 145 * convert to an i386 protection code.
146 */
147#define pte_prot(m, p)	(protection_codes[p])
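/*
 * protection_codes[] is indexed by a VM_PROT_{READ,WRITE,EXECUTE}
 * combination (hence 8 entries) and is filled in by
 * i386_protection_init() during bootstrap.
 */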
148static int protection_codes[8];
149
150static struct pmap kernel_pmap_store;
151pmap_t kernel_pmap;
152LIST_HEAD(pmaplist, pmap);
153struct pmaplist allpmaps;
154
155vm_offset_t avail_start;	/* PA of first available physical page */
156vm_offset_t avail_end;		/* PA of last available physical page */
157vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
158vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
159static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
160static int pgeflag;		/* PG_G or-in */
161static int pseflag;		/* PG_PS or-in */
162
163static vm_object_t kptobj;
164
165static int nkpt;
166vm_offset_t kernel_vm_end;
167
168/*
169 * Data for the pv entry allocation mechanism
170 */
171static vm_zone_t pvzone;
172static struct vm_zone pvzone_store;
173static struct vm_object pvzone_obj;
174static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
175static int pmap_pagedaemon_waken = 0;
176static struct pv_entry *pvinit;
177
178/*
179 * All those kernel PT submaps that BSD is so fond of
180 */
181pt_entry_t *CMAP1 = 0;
182static pt_entry_t *CMAP2, *ptmmap;
183caddr_t CADDR1 = 0, ptvmmap = 0;
184static caddr_t CADDR2;
185static pt_entry_t *msgbufmap;
186struct msgbuf *msgbufp=0;
187
188/*
189 * Crashdump maps.
190 */
191static pt_entry_t *pt_crashdumpmap;
192static caddr_t crashdumpmap;
193
194#ifdef SMP
195extern pt_entry_t *SMPpt;
196#endif
197static pt_entry_t *PMAP1 = 0;
198static unsigned *PADDR1 = 0;
199
200static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
201static unsigned * get_ptbase __P((pmap_t pmap));
202static pv_entry_t get_pv_entry __P((void));
203static void	i386_protection_init __P((void));
204static __inline void	pmap_changebit __P((vm_page_t m, int bit, boolean_t setem));
205
206static void	pmap_remove_all __P((vm_page_t m));
207static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
208				      vm_page_t m, vm_page_t mpte));
209static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq,
210					vm_offset_t sva));
211static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
212static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m,
213					vm_offset_t va));
214static boolean_t pmap_testbit __P((vm_page_t m, int bit));
215static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
216		vm_page_t mpte, vm_page_t m));
217
218static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
219
220static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
221static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
222static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
223static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
224static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
225static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
226
227static unsigned pdir4mb;
228
229/*
230 *	Routine:	pmap_pte
231 *	Function:
232 *		Extract the page table entry associated
233 *		with the given map/virtual_address pair.
234 */
235
236PMAP_INLINE unsigned *
237pmap_pte(pmap, va)
238	register pmap_t pmap;
239	vm_offset_t va;
240{
241	unsigned *pdeaddr;
242
243	if (pmap) {
244		pdeaddr = (unsigned *) pmap_pde(pmap, va);
245		if (*pdeaddr & PG_PS)
246			return pdeaddr;
247		if (*pdeaddr) {
248			return get_ptbase(pmap) + i386_btop(va);
249		}
250	}
251	return (0);
252}
253
254/*
255 * Move the kernel virtual free pointer to the next
256 * 4MB.  This is used to help improve performance
257 * by using a large (4MB) page for much of the kernel
258 * (.text, .data, .bss)
259 */
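/*
 * Illustrative only: with NBPDR == 4MB, an address such as 0xc0312345
 * rounds up to 0xc0400000, so the kernel's large-page region starts on
 * a 4MB boundary.
 */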
260static vm_offset_t
261pmap_kmem_choose(vm_offset_t addr)
262{
263	vm_offset_t newaddr = addr;
264#ifndef DISABLE_PSE
265	if (cpu_feature & CPUID_PSE) {
266		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
267	}
268#endif
269	return newaddr;
270}
271
272/*
273 *	Bootstrap the system enough to run with virtual memory.
274 *
275 *	On the i386 this is called after mapping has already been enabled
276 *	and just syncs the pmap module with what has already been done.
277 *	[We can't call it easily with mapping off since the kernel is not
278 *	mapped with PA == VA, hence we would have to relocate every address
279 *	from the linked base (virtual) address "KERNBASE" to the actual
280 *	(physical) address starting relative to 0]
281 */
282void
283pmap_bootstrap(firstaddr, loadaddr)
284	vm_offset_t firstaddr;
285	vm_offset_t loadaddr;
286{
287	vm_offset_t va;
288	pt_entry_t *pte;
289	int i;
290
291	avail_start = firstaddr;
292
293	/*
294	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
295	 * large. It should instead be correctly calculated in locore.s and
296	 * not based on 'first' (which is a physical address, not a virtual
297	 * address, for the start of unused physical memory). The kernel
298	 * page tables are NOT double mapped and thus should not be included
299	 * in this calculation.
300	 */
301	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
302	virtual_avail = pmap_kmem_choose(virtual_avail);
303
304	virtual_end = VM_MAX_KERNEL_ADDRESS;
305
306	/*
307	 * Initialize protection array.
308	 */
309	i386_protection_init();
310
311	/*
312	 * The kernel's pmap is statically allocated so we don't have to use
313	 * pmap_create, which is unlikely to work correctly at this part of
314	 * the boot sequence (XXX and which no longer exists).
315	 */
316	kernel_pmap = &kernel_pmap_store;
317
318	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
319	kernel_pmap->pm_count = 1;
320	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
321	TAILQ_INIT(&kernel_pmap->pm_pvlist);
322	LIST_INIT(&allpmaps);
323	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
324	nkpt = NKPT;
325
326	/*
327	 * Reserve some special page table entries/VA space for temporary
328	 * mapping of pages.
329	 */
330#define	SYSMAP(c, p, v, n)	\
331	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
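/*
 * For clarity, the first invocation below expands (roughly) to:
 *
 *	CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE;
 *	CMAP1 = pte;          pte += 1;
 *
 * i.e. each SYSMAP() reserves n pages of KVA and remembers the pte
 * slot(s) that back them.
 */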
332
333	va = virtual_avail;
334	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
335
336	/*
337	 * CMAP1/CMAP2 are used for zeroing and copying pages.
338	 */
339	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
340	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
341
342	/*
343	 * Crashdump maps.
344	 */
345	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
346
347	/*
348	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
349	 * XXX ptmmap is not used.
350	 */
351	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
352
353	/*
354	 * msgbufp is used to map the system message buffer.
355	 * XXX msgbufmap is not used.
356	 */
357	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
358	       atop(round_page(MSGBUF_SIZE)))
359
360	/*
361	 * ptemap is used for pmap_pte_quick
362	 */
363	SYSMAP(unsigned *, PMAP1, PADDR1, 1);
364
365	virtual_avail = va;
366
367	*(int *) CMAP1 = *(int *) CMAP2 = 0;
368	for (i = 0; i < NKPT; i++)
369		PTD[i] = 0;
370
371	pgeflag = 0;
372#if !defined(SMP)			/* XXX - see also mp_machdep.c */
373	if (cpu_feature & CPUID_PGE) {
374		pgeflag = PG_G;
375	}
376#endif
377
378	/*
379	 * Initialize the 4MB page size flag.
380	 */
381	pseflag = 0;
382	/*
383	 * The 4MB page version of the initial
384	 * kernel page mapping.
385	 */
386	pdir4mb = 0;
387
388#if !defined(DISABLE_PSE)
389	if (cpu_feature & CPUID_PSE) {
390		unsigned ptditmp;
391		/*
392		 * Note that we have enabled PSE mode
393		 */
394		pseflag = PG_PS;
395		ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE));
396		ptditmp &= ~(NBPDR - 1);
397		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
398		pdir4mb = ptditmp;
399
400#if !defined(SMP)
401		/*
402		 * Enable the PSE mode.
403		 */
404		load_cr4(rcr4() | CR4_PSE);
405
406		/*
407		 * We can do the mapping here for the single processor
408		 * case.  We simply ignore the old page table page from
409		 * now on.
410		 */
411		/*
412		 * For SMP, we still need 4K pages to bootstrap APs,
413		 * PSE will be enabled as soon as all APs are up.
414		 */
415		PTD[KPTDI] = (pd_entry_t) ptditmp;
416		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
417		invltlb();
418#endif
419	}
420#endif
421
422#ifdef SMP
423	if (cpu_apic_address == 0)
424		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
425
426	/* local apic is mapped on last page */
427	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
428	    (cpu_apic_address & PG_FRAME));
429#endif
430
431	invltlb();
432}
433
434#ifdef SMP
435/*
436 * Set 4mb pdir for mp startup
437 */
438void
439pmap_set_opt(void)
440{
441	if (pseflag && (cpu_feature & CPUID_PSE)) {
442		load_cr4(rcr4() | CR4_PSE);
443		if (pdir4mb && PCPU_GET(cpuid) == 0) {	/* only on BSP */
444			kernel_pmap->pm_pdir[KPTDI] =
445			    PTD[KPTDI] = (pd_entry_t)pdir4mb;
446			cpu_invltlb();
447		}
448	}
449}
450#endif
451
452/*
453 *	Initialize the pmap module.
 454 *	Called by vm_init to initialize any structures that the pmap
 455 *	system needs to map virtual memory.
 456 *	pmap_init has been enhanced to support discontiguous physical
 457 *	memory in a fairly consistent way.
458 */
459void
460pmap_init(phys_start, phys_end)
461	vm_offset_t phys_start, phys_end;
462{
463	int i;
464	int initial_pvs;
465
466	/*
467	 * object for kernel page table pages
468	 */
469	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
470
471	/*
 472	 * Initialize the pv list header and count embedded in each
 473	 * vm_page (the per-page pmap bookkeeping).
474	 */
475
476	for(i = 0; i < vm_page_array_size; i++) {
477		vm_page_t m;
478
479		m = &vm_page_array[i];
480		TAILQ_INIT(&m->md.pv_list);
481		m->md.pv_list_count = 0;
482	}
483
484	/*
485	 * init the pv free list
486	 */
487	initial_pvs = vm_page_array_size;
488	if (initial_pvs < MINPV)
489		initial_pvs = MINPV;
490	pvzone = &pvzone_store;
491	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
492		initial_pvs * sizeof (struct pv_entry));
493	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
494	    vm_page_array_size);
495
496	/*
 497	 * Now it is safe to enable pv list recording.
498	 */
499	pmap_initialized = TRUE;
500}
501
502/*
503 * Initialize the address space (zone) for the pv_entries.  Set a
504 * high water mark so that the system can recover from excessive
505 * numbers of pv entries.
506 */
507void
508pmap_init2()
509{
510	int shpgperproc = PMAP_SHPGPERPROC;
511
512	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
513	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
514	pv_entry_high_water = 9 * (pv_entry_max / 10);
515	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
516}
517
518
519/***************************************************
520 * Low level helper routines.....
521 ***************************************************/
522
523#if defined(PMAP_DIAGNOSTIC)
524
525/*
 526 * This code checks for pages that are modified but not writable.
 527 * Such a combination should never occur.
528 */
529static int
530pmap_nw_modified(pt_entry_t ptea)
531{
532	int pte;
533
534	pte = (int) ptea;
535
536	if ((pte & (PG_M|PG_RW)) == PG_M)
537		return 1;
538	else
539		return 0;
540}
541#endif
542
543
544/*
545 * this routine defines the region(s) of memory that should
546 * not be tested for the modified bit.
547 */
548static PMAP_INLINE int
549pmap_track_modified(vm_offset_t va)
550{
551	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
552		return 1;
553	else
554		return 0;
555}
556
557static PMAP_INLINE void
558invltlb_1pg(vm_offset_t va)
559{
560#ifdef I386_CPU
561	invltlb();
562#else
563	invlpg(va);
564#endif
565}
566
567static __inline void
568pmap_TLB_invalidate(pmap_t pmap, vm_offset_t va)
569{
570#if defined(SMP)
571	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
572		cpu_invlpg((void *)va);
573	if (pmap->pm_active & PCPU_GET(other_cpus))
574		smp_invltlb();
575#else
576	if (pmap->pm_active)
577		invltlb_1pg(va);
578#endif
579}
580
581static __inline void
582pmap_TLB_invalidate_all(pmap_t pmap)
583{
584#if defined(SMP)
585	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
586		cpu_invltlb();
587	if (pmap->pm_active & PCPU_GET(other_cpus))
588		smp_invltlb();
589#else
590	if (pmap->pm_active)
591		invltlb();
592#endif
593}
594
595/*
 596 * Return an address which is the base of the virtual mapping of
597 * all the PTEs for the given pmap. Note this doesn't say that
598 * all the PTEs will be present or that the pages there are valid.
599 * The PTEs are made available by the recursive mapping trick.
600 * It will map in the alternate PTE space if needed.
601 */
602static unsigned *
603get_ptbase(pmap)
604	pmap_t pmap;
605{
606	unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
607
608	/* are we current address space or kernel? */
609	if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
610		return (unsigned *) PTmap;
611	}
612	/* otherwise, we are alternate address space */
613	if (frame != (((unsigned) APTDpde) & PG_FRAME)) {
614		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
615#if defined(SMP)
616		/* The page directory is not shared between CPUs */
617		cpu_invltlb();
618#else
619		invltlb();
620#endif
621	}
622	return (unsigned *) APTmap;
623}
624
625/*
626 * Super fast pmap_pte routine best used when scanning
627 * the pv lists.  This eliminates many coarse-grained
628 * invltlb calls.  Note that many of the pv list
629 * scans are across different pmaps.  It is very wasteful
630 * to do an entire invltlb for checking a single mapping.
631 */
632
633static unsigned *
634pmap_pte_quick(pmap, va)
635	register pmap_t pmap;
636	vm_offset_t va;
637{
638	unsigned pde, newpf;
639	if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
640		unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
641		unsigned index = i386_btop(va);
642		/* are we current address space or kernel? */
643		if ((pmap == kernel_pmap) ||
644			(frame == (((unsigned) PTDpde) & PG_FRAME))) {
645			return (unsigned *) PTmap + index;
646		}
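		/*
		 * Some other address space: point the reserved PMAP1 pte
		 * at the page table page holding va's pte.  PADDR1 is the
		 * fixed KVA that PMAP1 maps, so the entry can be read
		 * through PADDR1 without switching to the alternate ptbase.
		 */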
647		newpf = pde & PG_FRAME;
648		if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) {
649			* (unsigned *) PMAP1 = newpf | PG_RW | PG_V;
650			invltlb_1pg((vm_offset_t) PADDR1);
651		}
652		return PADDR1 + ((unsigned) index & (NPTEPG - 1));
653	}
654	return (0);
655}
656
657/*
658 *	Routine:	pmap_extract
659 *	Function:
660 *		Extract the physical page address associated
661 *		with the given map/virtual_address pair.
662 */
663vm_offset_t
664pmap_extract(pmap, va)
665	register pmap_t pmap;
666	vm_offset_t va;
667{
668	vm_offset_t rtval;
669	vm_offset_t pdirindex;
670	pdirindex = va >> PDRSHIFT;
671	if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
672		unsigned *pte;
673		if ((rtval & PG_PS) != 0) {
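			/*
			 * 4MB page: the pde itself holds the frame, so
			 * combine the 4MB-aligned frame with the low 22
			 * bits of va.
			 */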
674			rtval &= ~(NBPDR - 1);
675			rtval |= va & (NBPDR - 1);
676			return rtval;
677		}
678		pte = get_ptbase(pmap) + i386_btop(va);
679		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
680		return rtval;
681	}
682	return 0;
683
684}
685
686/***************************************************
687 * Low level mapping routines.....
688 ***************************************************/
689
690/*
 691 * Add a wired page to the kva.
 692 * Note that in order for the mapping to take effect, you
 693 * should do an invltlb after the pmap_kenter().
694 */
695PMAP_INLINE void
696pmap_kenter(va, pa)
697	vm_offset_t va;
698	register vm_offset_t pa;
699{
700	register unsigned *pte;
701	unsigned npte, opte;
702
703	npte = pa | PG_RW | PG_V | pgeflag;
704	pte = (unsigned *)vtopte(va);
705	opte = *pte;
706	*pte = npte;
707	/*if (opte)*/
708		invltlb_1pg(va);	/* XXX what about SMP? */
709}
710
711/*
712 * remove a page from the kernel pagetables
713 */
714PMAP_INLINE void
715pmap_kremove(va)
716	vm_offset_t va;
717{
718	register unsigned *pte;
719
720	pte = (unsigned *)vtopte(va);
721	*pte = 0;
722	invltlb_1pg(va);	/* XXX what about SMP? */
723}
724
725/*
726 *	Used to map a range of physical addresses into kernel
727 *	virtual address space.
728 *
729 *	The value passed in '*virt' is a suggested virtual address for
730 *	the mapping. Architectures which can support a direct-mapped
731 *	physical to virtual region can return the appropriate address
732 *	within that region, leaving '*virt' unchanged. Other
733 *	architectures should map the pages starting at '*virt' and
734 *	update '*virt' with the first usable address after the mapped
735 *	region.
736 */
737vm_offset_t
738pmap_map(virt, start, end, prot)
739	vm_offset_t *virt;
740	vm_offset_t start;
741	vm_offset_t end;
742	int prot;
743{
744	vm_offset_t sva = *virt;
745	vm_offset_t va = sva;
746	while (start < end) {
747		pmap_kenter(va, start);
748		va += PAGE_SIZE;
749		start += PAGE_SIZE;
750	}
751	*virt = va;
752	return (sva);
753}
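/*
 * Illustrative use only (the names are hypothetical): a caller needing
 * a physically contiguous range mapped during bootstrap might do
 *
 *	vm_offset_t va = virtual_avail;
 *	buf = (caddr_t)pmap_map(&va, buf_pa, buf_pa + buf_size,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	virtual_avail = va;
 *
 * leaving virtual_avail advanced past the new mapping.
 */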
754
755
756/*
 757 * Add a list of wired pages to the kva.
 758 * This routine is only used for temporary
759 * kernel mappings that do not need to have
760 * page modification or references recorded.
761 * Note that old mappings are simply written
762 * over.  The page *must* be wired.
763 */
764void
765pmap_qenter(va, m, count)
766	vm_offset_t va;
767	vm_page_t *m;
768	int count;
769{
770	int i;
771
772	for (i = 0; i < count; i++) {
773		vm_offset_t tva = va + i * PAGE_SIZE;
774		pmap_kenter(tva, VM_PAGE_TO_PHYS(m[i]));
775	}
776}
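/*
 * Illustrative pairing only ("kva", "pages" and "npages" are
 * hypothetical):
 *
 *	pmap_qenter(kva, pages, npages);
 *	... touch the pages through kva ...
 *	pmap_qremove(kva, npages);
 */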
777
778/*
779 * this routine jerks page mappings from the
780 * kernel -- it is meant only for temporary mappings.
781 */
782void
783pmap_qremove(va, count)
784	vm_offset_t va;
785	int count;
786{
787	vm_offset_t end_va;
788
789	end_va = va + count*PAGE_SIZE;
790
791	while (va < end_va) {
792		unsigned *pte;
793
794		pte = (unsigned *)vtopte(va);
795		*pte = 0;
796#ifdef SMP
797		cpu_invlpg((void *)va);
798#else
799		invltlb_1pg(va);
800#endif
801		va += PAGE_SIZE;
802	}
803#ifdef SMP
804	smp_invltlb();
805#endif
806}
807
808static vm_page_t
809pmap_page_lookup(object, pindex)
810	vm_object_t object;
811	vm_pindex_t pindex;
812{
813	vm_page_t m;
814retry:
815	m = vm_page_lookup(object, pindex);
816	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
817		goto retry;
818	return m;
819}
820
821/*
822 * Create the UPAGES for a new process.
 823 * This routine directly affects fork performance for a process.
824 */
825void
826pmap_new_proc(p)
827	struct proc *p;
828{
829#ifdef I386_CPU
830	int updateneeded;
831#endif
832	int i;
833	vm_object_t upobj;
834	vm_page_t m;
835	struct user *up;
836	unsigned *ptek, oldpte;
837
838	/*
839	 * allocate object for the upages
840	 */
841	if ((upobj = p->p_upages_obj) == NULL) {
842		upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES);
843		p->p_upages_obj = upobj;
844	}
845
846	/* get a kernel virtual address for the UPAGES for this proc */
847	if ((up = p->p_addr) == NULL) {
848		up = (struct user *) kmem_alloc_nofault(kernel_map,
849				UPAGES * PAGE_SIZE);
850		if (up == NULL)
851			panic("pmap_new_proc: u_map allocation failed");
852		p->p_addr = up;
853	}
854
855	ptek = (unsigned *) vtopte((vm_offset_t) up);
856
857#ifdef I386_CPU
858	updateneeded = 0;
859#endif
860	for(i=0;i<UPAGES;i++) {
861		/*
862		 * Get a kernel stack page
863		 */
864		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
865
866		/*
867		 * Wire the page
868		 */
869		m->wire_count++;
870		cnt.v_wire_count++;
871
872		oldpte = *(ptek + i);
873		/*
874		 * Enter the page into the kernel address space.
875		 */
876		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
877		if (oldpte) {
878#ifdef I386_CPU
879			updateneeded = 1;
880#else
881			invlpg((vm_offset_t) up + i * PAGE_SIZE);
882#endif
883		}
884
885		vm_page_wakeup(m);
886		vm_page_flag_clear(m, PG_ZERO);
887		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
888		m->valid = VM_PAGE_BITS_ALL;
889	}
890#ifdef I386_CPU
891	if (updateneeded)
892		invltlb();
893#endif
894}
895
896/*
 897 * Dispose of the UPAGES for a process that has exited.
 898 * This routine directly impacts exit performance for a process.
899 */
900void
901pmap_dispose_proc(p)
902	struct proc *p;
903{
904	int i;
905	vm_object_t upobj;
906	vm_page_t m;
907	unsigned *ptek, oldpte;
908
909	upobj = p->p_upages_obj;
910
911	ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr);
912	for(i=0;i<UPAGES;i++) {
913
914		if ((m = vm_page_lookup(upobj, i)) == NULL)
915			panic("pmap_dispose_proc: upage already missing???");
916
917		vm_page_busy(m);
918
919		oldpte = *(ptek + i);
920		*(ptek + i) = 0;
921#ifndef I386_CPU
922		invlpg((vm_offset_t) p->p_addr + i * PAGE_SIZE);
923#endif
924		vm_page_unwire(m, 0);
925		vm_page_free(m);
926	}
927#ifdef I386_CPU
928	invltlb();
929#endif
930}
931
932/*
933 * Allow the UPAGES for a process to be prejudicially paged out.
934 */
935void
936pmap_swapout_proc(p)
937	struct proc *p;
938{
939	int i;
940	vm_object_t upobj;
941	vm_page_t m;
942
943	upobj = p->p_upages_obj;
944	/*
945	 * let the upages be paged
946	 */
947	for(i=0;i<UPAGES;i++) {
948		if ((m = vm_page_lookup(upobj, i)) == NULL)
949			panic("pmap_swapout_proc: upage already missing???");
950		vm_page_dirty(m);
951		vm_page_unwire(m, 0);
952		pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i);
953	}
954}
955
956/*
957 * Bring the UPAGES for a specified process back in.
958 */
959void
960pmap_swapin_proc(p)
961	struct proc *p;
962{
963	int i,rv;
964	vm_object_t upobj;
965	vm_page_t m;
966
967	upobj = p->p_upages_obj;
968	for(i=0;i<UPAGES;i++) {
969
970		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
971
972		pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE,
973			VM_PAGE_TO_PHYS(m));
974
975		if (m->valid != VM_PAGE_BITS_ALL) {
976			rv = vm_pager_get_pages(upobj, &m, 1, 0);
977			if (rv != VM_PAGER_OK)
978				panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid);
979			m = vm_page_lookup(upobj, i);
980			m->valid = VM_PAGE_BITS_ALL;
981		}
982
983		vm_page_wire(m);
984		vm_page_wakeup(m);
985		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
986	}
987}
988
989/***************************************************
990 * Page table page management routines.....
991 ***************************************************/
992
993/*
994 * This routine unholds page table pages, and if the hold count
995 * drops to zero, then it decrements the wire count.
996 */
997static int
998_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) {
999
1000	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1001		;
1002
1003	if (m->hold_count == 0) {
1004		vm_offset_t pteva;
1005		/*
1006		 * unmap the page table page
1007		 */
1008		pmap->pm_pdir[m->pindex] = 0;
1009		--pmap->pm_stats.resident_count;
1010		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1011			(((unsigned) PTDpde) & PG_FRAME)) {
1012			/*
 1013			 * Do an invltlb to make the invalidated mapping
1014			 * take effect immediately.
1015			 */
1016			pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex);
1017			pmap_TLB_invalidate(pmap, pteva);
1018		}
1019
1020		if (pmap->pm_ptphint == m)
1021			pmap->pm_ptphint = NULL;
1022
1023		/*
1024		 * If the page is finally unwired, simply free it.
1025		 */
1026		--m->wire_count;
1027		if (m->wire_count == 0) {
1028
1029			vm_page_flash(m);
1030			vm_page_busy(m);
1031			vm_page_free_zero(m);
1032			--cnt.v_wire_count;
1033		}
1034		return 1;
1035	}
1036	return 0;
1037}
1038
1039static PMAP_INLINE int
1040pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1041{
1042	vm_page_unhold(m);
1043	if (m->hold_count == 0)
1044		return _pmap_unwire_pte_hold(pmap, m);
1045	else
1046		return 0;
1047}
1048
1049/*
1050 * After removing a page table entry, this routine is used to
1051 * conditionally free the page, and manage the hold/wire counts.
1052 */
1053static int
1054pmap_unuse_pt(pmap, va, mpte)
1055	pmap_t pmap;
1056	vm_offset_t va;
1057	vm_page_t mpte;
1058{
1059	unsigned ptepindex;
1060	if (va >= UPT_MIN_ADDRESS)
1061		return 0;
1062
1063	if (mpte == NULL) {
1064		ptepindex = (va >> PDRSHIFT);
1065		if (pmap->pm_ptphint &&
1066			(pmap->pm_ptphint->pindex == ptepindex)) {
1067			mpte = pmap->pm_ptphint;
1068		} else {
1069			mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1070			pmap->pm_ptphint = mpte;
1071		}
1072	}
1073
1074	return pmap_unwire_pte_hold(pmap, mpte);
1075}
1076
1077void
1078pmap_pinit0(pmap)
1079	struct pmap *pmap;
1080{
1081	pmap->pm_pdir =
1082		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1083	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1084	pmap->pm_count = 1;
1085	pmap->pm_active = 0;
1086	pmap->pm_ptphint = NULL;
1087	TAILQ_INIT(&pmap->pm_pvlist);
1088	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1089	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1090}
1091
1092/*
1093 * Initialize a preallocated and zeroed pmap structure,
1094 * such as one in a vmspace structure.
1095 */
1096void
1097pmap_pinit(pmap)
1098	register struct pmap *pmap;
1099{
1100	vm_page_t ptdpg;
1101
1102	/*
1103	 * No need to allocate page table space yet but we do need a valid
1104	 * page directory table.
1105	 */
1106	if (pmap->pm_pdir == NULL)
1107		pmap->pm_pdir =
1108			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1109
1110	/*
1111	 * allocate object for the ptes
1112	 */
1113	if (pmap->pm_pteobj == NULL)
1114		pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1);
1115
1116	/*
1117	 * allocate the page directory page
1118	 */
1119	ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI,
1120			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1121
1122	ptdpg->wire_count = 1;
1123	++cnt.v_wire_count;
1124
1125
1126	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
1127	ptdpg->valid = VM_PAGE_BITS_ALL;
1128
1129	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1130	if ((ptdpg->flags & PG_ZERO) == 0)
1131		bzero(pmap->pm_pdir, PAGE_SIZE);
1132
1133	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1134	/* Wire in kernel global address entries. */
1135	/* XXX copies current process, does not fill in MPPTDI */
1136	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1137#ifdef SMP
1138	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1139#endif
1140
 1141	/* install the self-referential address mapping entry (recursive PT map) */
1142	*(unsigned *) (pmap->pm_pdir + PTDPTDI) =
1143		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1144
1145	pmap->pm_count = 1;
1146	pmap->pm_active = 0;
1147	pmap->pm_ptphint = NULL;
1148	TAILQ_INIT(&pmap->pm_pvlist);
1149	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1150}
1151
1152/*
1153 * Wire in kernel global address entries.  To avoid a race condition
1154 * between pmap initialization and pmap_growkernel, this procedure
1155 * should be called after the vmspace is attached to the process
1156 * but before this pmap is activated.
1157 */
1158void
1159pmap_pinit2(pmap)
1160	struct pmap *pmap;
1161{
1162	/* XXX: Remove this stub when no longer called */
1163}
1164
1165static int
1166pmap_release_free_page(pmap, p)
1167	struct pmap *pmap;
1168	vm_page_t p;
1169{
1170	unsigned *pde = (unsigned *) pmap->pm_pdir;
1171	/*
1172	 * This code optimizes the case of freeing non-busy
1173	 * page-table pages.  Those pages are zero now, and
1174	 * might as well be placed directly into the zero queue.
1175	 */
1176	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1177		return 0;
1178
1179	vm_page_busy(p);
1180
1181	/*
 1182	 * Remove the page table page from the process's address space.
1183	 */
1184	pde[p->pindex] = 0;
1185	pmap->pm_stats.resident_count--;
1186
1187	if (p->hold_count)  {
1188		panic("pmap_release: freeing held page table page");
1189	}
1190	/*
1191	 * Page directory pages need to have the kernel
1192	 * stuff cleared, so they can go into the zero queue also.
1193	 */
1194	if (p->pindex == PTDPTDI) {
1195		bzero(pde + KPTDI, nkpt * PTESIZE);
1196#ifdef SMP
1197		pde[MPPTDI] = 0;
1198#endif
1199		pde[APTDPTDI] = 0;
1200		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1201	}
1202
1203	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1204		pmap->pm_ptphint = NULL;
1205
1206	p->wire_count--;
1207	cnt.v_wire_count--;
1208	vm_page_free_zero(p);
1209	return 1;
1210}
1211
1212/*
1213 * this routine is called if the page table page is not
1214 * mapped correctly.
1215 */
1216static vm_page_t
1217_pmap_allocpte(pmap, ptepindex)
1218	pmap_t	pmap;
1219	unsigned ptepindex;
1220{
1221	vm_offset_t pteva, ptepa;
1222	vm_page_t m;
1223
1224	/*
1225	 * Find or fabricate a new pagetable page
1226	 */
1227	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1228			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1229
1230	KASSERT(m->queue == PQ_NONE,
1231		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1232
1233	if (m->wire_count == 0)
1234		cnt.v_wire_count++;
1235	m->wire_count++;
1236
1237	/*
1238	 * Increment the hold count for the page table page
1239	 * (denoting a new mapping.)
1240	 */
1241	m->hold_count++;
1242
1243	/*
1244	 * Map the pagetable page into the process address space, if
1245	 * it isn't already there.
1246	 */
1247
1248	pmap->pm_stats.resident_count++;
1249
1250	ptepa = VM_PAGE_TO_PHYS(m);
1251	pmap->pm_pdir[ptepindex] =
1252		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1253
1254	/*
1255	 * Set the page table hint
1256	 */
1257	pmap->pm_ptphint = m;
1258
1259	/*
1260	 * Try to use the new mapping, but if we cannot, then
1261	 * do it with the routine that maps the page explicitly.
1262	 */
1263	if ((m->flags & PG_ZERO) == 0) {
1264		if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
1265			(((unsigned) PTDpde) & PG_FRAME)) {
1266			pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex);
1267			bzero((caddr_t) pteva, PAGE_SIZE);
1268		} else {
1269			pmap_zero_page(ptepa);
1270		}
1271	}
1272
1273	m->valid = VM_PAGE_BITS_ALL;
1274	vm_page_flag_clear(m, PG_ZERO);
1275	vm_page_flag_set(m, PG_MAPPED);
1276	vm_page_wakeup(m);
1277
1278	return m;
1279}
1280
1281static vm_page_t
1282pmap_allocpte(pmap, va)
1283	pmap_t	pmap;
1284	vm_offset_t va;
1285{
1286	unsigned ptepindex;
1287	vm_offset_t ptepa;
1288	vm_page_t m;
1289
1290	/*
1291	 * Calculate pagetable page index
1292	 */
1293	ptepindex = va >> PDRSHIFT;
1294
1295	/*
1296	 * Get the page directory entry
1297	 */
1298	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1299
1300	/*
1301	 * This supports switching from a 4MB page to a
1302	 * normal 4K page.
1303	 */
1304	if (ptepa & PG_PS) {
1305		pmap->pm_pdir[ptepindex] = 0;
1306		ptepa = 0;
1307		invltlb();
1308	}
1309
1310	/*
1311	 * If the page table page is mapped, we just increment the
1312	 * hold count, and activate it.
1313	 */
1314	if (ptepa) {
1315		/*
1316		 * In order to get the page table page, try the
1317		 * hint first.
1318		 */
1319		if (pmap->pm_ptphint &&
1320			(pmap->pm_ptphint->pindex == ptepindex)) {
1321			m = pmap->pm_ptphint;
1322		} else {
1323			m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1324			pmap->pm_ptphint = m;
1325		}
1326		m->hold_count++;
1327		return m;
1328	}
1329	/*
1330	 * Here if the pte page isn't mapped, or if it has been deallocated.
1331	 */
1332	return _pmap_allocpte(pmap, ptepindex);
1333}
1334
1335
1336/***************************************************
 1337 * Pmap allocation/deallocation routines.
1338 ***************************************************/
1339
1340/*
1341 * Release any resources held by the given physical map.
1342 * Called when a pmap initialized by pmap_pinit is being released.
1343 * Should only be called if the map contains no valid mappings.
1344 */
1345void
1346pmap_release(pmap)
1347	register struct pmap *pmap;
1348{
1349	vm_page_t p,n,ptdpg;
1350	vm_object_t object = pmap->pm_pteobj;
1351	int curgeneration;
1352
1353#if defined(DIAGNOSTIC)
1354	if (object->ref_count != 1)
1355		panic("pmap_release: pteobj reference count != 1");
1356#endif
1357
1358	ptdpg = NULL;
1359	LIST_REMOVE(pmap, pm_list);
1360retry:
1361	curgeneration = object->generation;
1362	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1363		n = TAILQ_NEXT(p, listq);
1364		if (p->pindex == PTDPTDI) {
1365			ptdpg = p;
1366			continue;
1367		}
1368		while (1) {
1369			if (!pmap_release_free_page(pmap, p) &&
1370				(object->generation != curgeneration))
1371				goto retry;
1372		}
1373	}
1374
1375	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1376		goto retry;
1377}
1378
1379static int
1380kvm_size(SYSCTL_HANDLER_ARGS)
1381{
1382	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1383
 1384	return sysctl_handle_long(oidp, &ksize, 0, req);
1385}
1386SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1387    0, 0, kvm_size, "IU", "Size of KVM");
1388
1389static int
1390kvm_free(SYSCTL_HANDLER_ARGS)
1391{
1392	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1393
 1394	return sysctl_handle_long(oidp, &kfree, 0, req);
1395}
1396SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1397    0, 0, kvm_free, "IU", "Amount of KVM free");
1398
1399/*
1400 * grow the number of kernel page table entries, if needed
1401 */
1402void
1403pmap_growkernel(vm_offset_t addr)
1404{
1405	struct pmap *pmap;
1406	int s;
1407	vm_offset_t ptppaddr;
1408	vm_page_t nkpg;
1409	pd_entry_t newpdir;
1410
1411	s = splhigh();
1412	if (kernel_vm_end == 0) {
1413		kernel_vm_end = KERNBASE;
1414		nkpt = 0;
1415		while (pdir_pde(PTD, kernel_vm_end)) {
1416			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1417			nkpt++;
1418		}
1419	}
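	/*
	 * Advance the request to the next 4MB boundary, i.e. one page
	 * table's worth of KVA (PAGE_SIZE * NPTEPG).
	 */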
1420	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1421	while (kernel_vm_end < addr) {
1422		if (pdir_pde(PTD, kernel_vm_end)) {
1423			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1424			continue;
1425		}
1426
1427		/*
1428		 * This index is bogus, but out of the way
1429		 */
1430		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1431		if (!nkpg)
1432			panic("pmap_growkernel: no memory to grow kernel");
1433
1434		nkpt++;
1435
1436		vm_page_wire(nkpg);
1437		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1438		pmap_zero_page(ptppaddr);
1439		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1440		pdir_pde(PTD, kernel_vm_end) = newpdir;
1441
1442		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1443			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1444		}
1445		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1446	}
1447	splx(s);
1448}
1449
1450/*
1451 *	Retire the given physical map from service.
1452 *	Should only be called if the map contains
1453 *	no valid mappings.
1454 */
1455void
1456pmap_destroy(pmap)
1457	register pmap_t pmap;
1458{
1459	int count;
1460
1461	if (pmap == NULL)
1462		return;
1463
1464	count = --pmap->pm_count;
1465	if (count == 0) {
1466		pmap_release(pmap);
1467		panic("destroying a pmap is not yet implemented");
1468	}
1469}
1470
1471/*
1472 *	Add a reference to the specified pmap.
1473 */
1474void
1475pmap_reference(pmap)
1476	pmap_t pmap;
1477{
1478	if (pmap != NULL) {
1479		pmap->pm_count++;
1480	}
1481}
1482
1483/***************************************************
 1484 * Page management routines.
1485 ***************************************************/
1486
1487/*
1488 * free the pv_entry back to the free list
1489 */
1490static PMAP_INLINE void
1491free_pv_entry(pv)
1492	pv_entry_t pv;
1493{
1494	pv_entry_count--;
1495	zfree(pvzone, pv);
1496}
1497
1498/*
 1499 * Get a new pv_entry, allocating a block from the system
 1500 * when needed.
 1501 * The allocation bypasses the malloc code because of the
 1502 * possibility of allocations at interrupt time.
1503 */
1504static pv_entry_t
1505get_pv_entry(void)
1506{
1507	pv_entry_count++;
1508	if (pv_entry_high_water &&
1509		(pv_entry_count > pv_entry_high_water) &&
1510		(pmap_pagedaemon_waken == 0)) {
1511		pmap_pagedaemon_waken = 1;
1512		wakeup (&vm_pages_needed);
1513	}
1514	return zalloc(pvzone);
1515}
1516
1517/*
1518 * This routine is very drastic, but can save the system
1519 * in a pinch.
1520 */
1521void
1522pmap_collect()
1523{
1524	int i;
1525	vm_page_t m;
1526	static int warningdone=0;
1527
1528	if (pmap_pagedaemon_waken == 0)
1529		return;
1530
1531	if (warningdone < 5) {
1532		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1533		warningdone++;
1534	}
1535
1536	for(i = 0; i < vm_page_array_size; i++) {
1537		m = &vm_page_array[i];
1538		if (m->wire_count || m->hold_count || m->busy ||
1539		    (m->flags & PG_BUSY))
1540			continue;
1541		pmap_remove_all(m);
1542	}
1543	pmap_pagedaemon_waken = 0;
1544}
1545
1546
1547/*
 1548 * Remove the pv entry for (pmap, va).  Search whichever of the
 1549 * page's pv list and the pmap's pv list is expected to be
 1550 * shorter, unlink the entry from both lists, and free the now
 1551 * unused entry.
1552 */
1553
1554static int
1555pmap_remove_entry(pmap, m, va)
1556	struct pmap *pmap;
1557	vm_page_t m;
1558	vm_offset_t va;
1559{
1560	pv_entry_t pv;
1561	int rtval;
1562	int s;
1563
1564	s = splvm();
1565	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1566		for (pv = TAILQ_FIRST(&m->md.pv_list);
1567			pv;
1568			pv = TAILQ_NEXT(pv, pv_list)) {
1569			if (pmap == pv->pv_pmap && va == pv->pv_va)
1570				break;
1571		}
1572	} else {
1573		for (pv = TAILQ_FIRST(&pmap->pm_pvlist);
1574			pv;
1575			pv = TAILQ_NEXT(pv, pv_plist)) {
1576			if (va == pv->pv_va)
1577				break;
1578		}
1579	}
1580
1581	rtval = 0;
1582	if (pv) {
1583
1584		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1585		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1586		m->md.pv_list_count--;
1587		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1588			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1589
1590		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1591		free_pv_entry(pv);
1592	}
1593
1594	splx(s);
1595	return rtval;
1596}
1597
1598/*
1599 * Create a pv entry for page at pa for
1600 * (pmap, va).
1601 */
1602static void
1603pmap_insert_entry(pmap, va, mpte, m)
1604	pmap_t pmap;
1605	vm_offset_t va;
1606	vm_page_t mpte;
1607	vm_page_t m;
1608{
1609
1610	int s;
1611	pv_entry_t pv;
1612
1613	s = splvm();
1614	pv = get_pv_entry();
1615	pv->pv_va = va;
1616	pv->pv_pmap = pmap;
1617	pv->pv_ptem = mpte;
1618
1619	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1620	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1621	m->md.pv_list_count++;
1622
1623	splx(s);
1624}
1625
1626/*
 1627 * pmap_remove_pte: do the work needed to unmap a page in a process
1628 */
1629static int
1630pmap_remove_pte(pmap, ptq, va)
1631	struct pmap *pmap;
1632	unsigned *ptq;
1633	vm_offset_t va;
1634{
1635	unsigned oldpte;
1636	vm_page_t m;
1637
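	/*
	 * Clear the pte atomically so that any PG_M/PG_A bits the
	 * hardware sets between our read and the clear are not lost.
	 */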
1638	oldpte = atomic_readandclear_int(ptq);
1639	if (oldpte & PG_W)
1640		pmap->pm_stats.wired_count -= 1;
1641	/*
 1642	 * Machines that don't support invlpg also don't support
1643	 * PG_G.
1644	 */
1645	if (oldpte & PG_G)
1646		invlpg(va);
1647	pmap->pm_stats.resident_count -= 1;
1648	if (oldpte & PG_MANAGED) {
1649		m = PHYS_TO_VM_PAGE(oldpte);
1650		if (oldpte & PG_M) {
1651#if defined(PMAP_DIAGNOSTIC)
1652			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1653				printf(
1654	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1655				    va, oldpte);
1656			}
1657#endif
1658			if (pmap_track_modified(va))
1659				vm_page_dirty(m);
1660		}
1661		if (oldpte & PG_A)
1662			vm_page_flag_set(m, PG_REFERENCED);
1663		return pmap_remove_entry(pmap, m, va);
1664	} else {
1665		return pmap_unuse_pt(pmap, va, NULL);
1666	}
1667
1668	return 0;
1669}
1670
1671/*
 1672 * Remove a single page from a process's address space
1673 */
1674static void
1675pmap_remove_page(pmap, va)
1676	struct pmap *pmap;
1677	register vm_offset_t va;
1678{
1679	register unsigned *ptq;
1680
1681	/*
1682	 * if there is no pte for this address, just skip it!!!
1683	 */
1684	if (*pmap_pde(pmap, va) == 0) {
1685		return;
1686	}
1687
1688	/*
1689	 * get a local va for mappings for this pmap.
1690	 */
1691	ptq = get_ptbase(pmap) + i386_btop(va);
1692	if (*ptq) {
1693		(void) pmap_remove_pte(pmap, ptq, va);
1694		pmap_TLB_invalidate(pmap, va);
1695	}
1696	return;
1697}
1698
1699/*
1700 *	Remove the given range of addresses from the specified map.
1701 *
1702 *	It is assumed that the start and end are properly
1703 *	rounded to the page size.
1704 */
1705void
1706pmap_remove(pmap, sva, eva)
1707	struct pmap *pmap;
1708	register vm_offset_t sva;
1709	register vm_offset_t eva;
1710{
1711	register unsigned *ptbase;
1712	vm_offset_t pdnxt;
1713	vm_offset_t ptpaddr;
1714	vm_offset_t sindex, eindex;
1715	int anyvalid;
1716
1717	if (pmap == NULL)
1718		return;
1719
1720	if (pmap->pm_stats.resident_count == 0)
1721		return;
1722
1723	/*
 1724	 * Special handling for removing a single page: this is a very
 1725	 * common operation, so it is worth short-circuiting some of
 1726	 * the code.
1727	 */
1728	if (((sva + PAGE_SIZE) == eva) &&
1729		(((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1730		pmap_remove_page(pmap, sva);
1731		return;
1732	}
1733
1734	anyvalid = 0;
1735
1736	/*
1737	 * Get a local virtual address for the mappings that are being
1738	 * worked with.
1739	 */
1740	ptbase = get_ptbase(pmap);
1741
1742	sindex = i386_btop(sva);
1743	eindex = i386_btop(eva);
1744
1745	for (; sindex < eindex; sindex = pdnxt) {
1746		unsigned pdirindex;
1747
1748		/*
1749		 * Calculate index for next page table.
1750		 */
1751		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1752		if (pmap->pm_stats.resident_count == 0)
1753			break;
1754
1755		pdirindex = sindex / NPDEPG;
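		/*
		 * A 4MB (PG_PS) mapping has no page table page; remove it
		 * by clearing the pde itself.
		 */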
1756		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1757			pmap->pm_pdir[pdirindex] = 0;
1758			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1759			anyvalid++;
1760			continue;
1761		}
1762
1763		/*
1764		 * Weed out invalid mappings. Note: we assume that the page
1765		 * directory table is always allocated, and in kernel virtual.
1766		 */
1767		if (ptpaddr == 0)
1768			continue;
1769
1770		/*
1771		 * Limit our scan to either the end of the va represented
1772		 * by the current page table page, or to the end of the
1773		 * range being removed.
1774		 */
1775		if (pdnxt > eindex) {
1776			pdnxt = eindex;
1777		}
1778
1779		for ( ;sindex != pdnxt; sindex++) {
1780			vm_offset_t va;
1781			if (ptbase[sindex] == 0) {
1782				continue;
1783			}
1784			va = i386_ptob(sindex);
1785
1786			anyvalid++;
1787			if (pmap_remove_pte(pmap,
1788				ptbase + sindex, va))
1789				break;
1790		}
1791	}
1792
1793	if (anyvalid)
1794		pmap_TLB_invalidate_all(pmap);
1795}
1796
1797/*
1798 *	Routine:	pmap_remove_all
1799 *	Function:
1800 *		Removes this physical page from
1801 *		all physical maps in which it resides.
1802 *		Reflects back modify bits to the pager.
1803 *
1804 *	Notes:
1805 *		Original versions of this routine were very
1806 *		inefficient because they iteratively called
1807 *		pmap_remove (slow...)
1808 */
1809
1810static void
1811pmap_remove_all(m)
1812	vm_page_t m;
1813{
1814	register pv_entry_t pv;
1815	register unsigned *pte, tpte;
1816	int s;
1817
1818#if defined(PMAP_DIAGNOSTIC)
1819	/*
1820	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1821	 * pages!
1822	 */
1823	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1824		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
1825	}
1826#endif
1827
1828	s = splvm();
1829	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1830		pv->pv_pmap->pm_stats.resident_count--;
1831
1832		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1833
1834		tpte = atomic_readandclear_int(pte);
1835		if (tpte & PG_W)
1836			pv->pv_pmap->pm_stats.wired_count--;
1837
1838		if (tpte & PG_A)
1839			vm_page_flag_set(m, PG_REFERENCED);
1840
1841		/*
1842		 * Update the vm_page_t clean and reference bits.
1843		 */
1844		if (tpte & PG_M) {
1845#if defined(PMAP_DIAGNOSTIC)
1846			if (pmap_nw_modified((pt_entry_t) tpte)) {
1847				printf(
1848	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1849				    pv->pv_va, tpte);
1850			}
1851#endif
1852			if (pmap_track_modified(pv->pv_va))
1853				vm_page_dirty(m);
1854		}
1855		pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
1856
1857		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1858		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1859		m->md.pv_list_count--;
1860		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1861		free_pv_entry(pv);
1862	}
1863
1864	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1865
1866	splx(s);
1867}
1868
1869/*
1870 *	Set the physical protection on the
1871 *	specified range of this map as requested.
1872 */
1873void
1874pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1875{
1876	register unsigned *ptbase;
1877	vm_offset_t pdnxt, ptpaddr;
1878	vm_pindex_t sindex, eindex;
1879	int anychanged;
1880
1881	if (pmap == NULL)
1882		return;
1883
1884	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1885		pmap_remove(pmap, sva, eva);
1886		return;
1887	}
1888
1889	if (prot & VM_PROT_WRITE)
1890		return;
1891
1892	anychanged = 0;
1893
1894	ptbase = get_ptbase(pmap);
1895
1896	sindex = i386_btop(sva);
1897	eindex = i386_btop(eva);
1898
1899	for (; sindex < eindex; sindex = pdnxt) {
1900
1901		unsigned pdirindex;
1902
1903		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1904
1905		pdirindex = sindex / NPDEPG;
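		/*
		 * A 4MB (PG_PS) mapping is write-protected by clearing
		 * PG_RW (and the accumulated PG_M) in the pde itself.
		 */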
1906		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1907			(unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1908			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1909			anychanged++;
1910			continue;
1911		}
1912
1913		/*
1914		 * Weed out invalid mappings. Note: we assume that the page
1915		 * directory table is always allocated, and in kernel virtual.
1916		 */
1917		if (ptpaddr == 0)
1918			continue;
1919
1920		if (pdnxt > eindex) {
1921			pdnxt = eindex;
1922		}
1923
1924		for (; sindex != pdnxt; sindex++) {
1925
1926			unsigned pbits;
1927			vm_page_t m;
1928
1929			pbits = ptbase[sindex];
1930
1931			if (pbits & PG_MANAGED) {
1932				m = NULL;
1933				if (pbits & PG_A) {
1934					m = PHYS_TO_VM_PAGE(pbits);
1935					vm_page_flag_set(m, PG_REFERENCED);
1936					pbits &= ~PG_A;
1937				}
1938				if (pbits & PG_M) {
1939					if (pmap_track_modified(i386_ptob(sindex))) {
1940						if (m == NULL)
1941							m = PHYS_TO_VM_PAGE(pbits);
1942						vm_page_dirty(m);
1943						pbits &= ~PG_M;
1944					}
1945				}
1946			}
1947
1948			pbits &= ~PG_RW;
1949
1950			if (pbits != ptbase[sindex]) {
1951				ptbase[sindex] = pbits;
1952				anychanged = 1;
1953			}
1954		}
1955	}
1956	if (anychanged)
1957		pmap_TLB_invalidate_all(pmap);
1958}
1959
1960/*
1961 *	Insert the given physical page (p) at
1962 *	the specified virtual address (v) in the
1963 *	target physical map with the protection requested.
1964 *
1965 *	If specified, the page will be wired down, meaning
1966 *	that the related pte can not be reclaimed.
1967 *
1968 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1969 *	or lose information.  That is, this routine must actually
1970 *	insert this page into the given map NOW.
1971 */
1972void
1973pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1974	   boolean_t wired)
1975{
1976	vm_offset_t pa;
1977	register unsigned *pte;
1978	vm_offset_t opa;
1979	vm_offset_t origpte, newpte;
1980	vm_page_t mpte;
1981
1982	if (pmap == NULL)
1983		return;
1984
1985	va &= PG_FRAME;
1986#ifdef PMAP_DIAGNOSTIC
1987	if (va > VM_MAX_KERNEL_ADDRESS)
1988		panic("pmap_enter: toobig");
1989	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1990		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1991#endif
1992
1993	mpte = NULL;
1994	/*
1995	 * In the case that a page table page is not
1996	 * resident, we are creating it here.
1997	 */
1998	if (va < UPT_MIN_ADDRESS) {
1999		mpte = pmap_allocpte(pmap, va);
2000	}
2001#if 0 && defined(PMAP_DIAGNOSTIC)
2002	else {
2003		vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va);
2004		if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) {
2005			panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n",
2006				pmap->pm_pdir[PTDPTDI], origpte, va);
2007		}
2008		if (smp_active) {
2009			pdeaddr = (vm_offset_t *) IdlePTDS[PCPU_GET(cpuid)];
2010			if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) {
2011				if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr))
2012					printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr);
2013				printf("cpuid: %d, pdeaddr: 0x%x\n", PCPU_GET(cpuid), pdeaddr);
2014				panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n",
2015					pmap->pm_pdir[PTDPTDI], newpte, origpte, va);
2016			}
2017		}
2018	}
2019#endif
2020
2021	pte = pmap_pte(pmap, va);
2022
2023	/*
2024	 * Page Directory table entry not valid, we need a new PT page
2025	 */
2026	if (pte == NULL) {
2027		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2028			(void *)pmap->pm_pdir[PTDPTDI], va);
2029	}
2030
2031	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2032	origpte = *(vm_offset_t *)pte;
2033	opa = origpte & PG_FRAME;
2034
2035	if (origpte & PG_PS)
2036		panic("pmap_enter: attempted pmap_enter on 4MB page");
2037
2038	/*
2039	 * Mapping has not changed, must be protection or wiring change.
2040	 */
2041	if (origpte && (opa == pa)) {
2042		/*
2043		 * Wiring change, just update stats. We don't worry about
2044		 * wiring PT pages as they remain resident as long as there
2045		 * are valid mappings in them. Hence, if a user page is wired,
2046		 * the PT page will be also.
2047		 */
2048		if (wired && ((origpte & PG_W) == 0))
2049			pmap->pm_stats.wired_count++;
2050		else if (!wired && (origpte & PG_W))
2051			pmap->pm_stats.wired_count--;
2052
2053#if defined(PMAP_DIAGNOSTIC)
2054		if (pmap_nw_modified((pt_entry_t) origpte)) {
2055			printf(
2056	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2057			    va, origpte);
2058		}
2059#endif
2060
2061		/*
2062		 * Remove extra pte reference
2063		 */
2064		if (mpte)
2065			mpte->hold_count--;
2066
2067		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2068			if ((origpte & PG_RW) == 0) {
2069				*pte |= PG_RW;
2070#ifdef SMP
2071				cpu_invlpg((void *)va);
2072				if (pmap->pm_active & PCPU_GET(other_cpus))
2073					smp_invltlb();
2074#else
2075				invltlb_1pg(va);
2076#endif
2077			}
2078			return;
2079		}
2080
2081		/*
2082		 * We might be turning off write access to the page,
2083		 * so we go ahead and sense modify status.
2084		 */
2085		if (origpte & PG_MANAGED) {
2086			if ((origpte & PG_M) && pmap_track_modified(va)) {
2087				vm_page_t om;
2088				om = PHYS_TO_VM_PAGE(opa);
2089				vm_page_dirty(om);
2090			}
2091			pa |= PG_MANAGED;
2092		}
2093		goto validate;
2094	}
2095	/*
2096	 * Mapping has changed, invalidate old range and fall through to
2097	 * handle validating new mapping.
2098	 */
2099	if (opa) {
2100		int err;
2101		err = pmap_remove_pte(pmap, pte, va);
2102		if (err)
2103			panic("pmap_enter: pte vanished, va: 0x%x", va);
2104	}
2105
2106	/*
2107	 * Enter on the PV list if part of our managed memory. Note that we
2108	 * raise IPL while manipulating pv_table since pmap_enter can be
2109	 * called at interrupt time.
2110	 */
2111	if (pmap_initialized &&
2112	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2113		pmap_insert_entry(pmap, va, mpte, m);
2114		pa |= PG_MANAGED;
2115	}
2116
2117	/*
2118	 * Increment counters
2119	 */
2120	pmap->pm_stats.resident_count++;
2121	if (wired)
2122		pmap->pm_stats.wired_count++;
2123
2124validate:
2125	/*
2126	 * Now validate mapping with desired protection/wiring.
2127	 */
2128	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2129
2130	if (wired)
2131		newpte |= PG_W;
2132	if (va < UPT_MIN_ADDRESS)
2133		newpte |= PG_U;
2134	if (pmap == kernel_pmap)
2135		newpte |= pgeflag;
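	/*
	 * PG_W is a software-only "wired" bit, PG_U makes the mapping
	 * user-accessible for user addresses, and pgeflag is PG_G
	 * (global) on processors that support it, so kernel mappings
	 * are not flushed from the TLB on a CR3 reload.
	 */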
2136
2137	/*
2138	 * if the mapping or permission bits are different, we need
2139	 * to update the pte.
2140	 */
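	/*
	 * PG_M and PG_A are maintained by the hardware, so they are
	 * masked out of the comparison; a pte that differs only in
	 * those bits does not need to be rewritten or have its TLB
	 * entry invalidated.
	 */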
2141	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2142		*pte = newpte | PG_A;
2143		/*if (origpte)*/ {
2144#ifdef SMP
2145			cpu_invlpg((void *)va);
2146			if (pmap->pm_active & PCPU_GET(other_cpus))
2147				smp_invltlb();
2148#else
2149			invltlb_1pg(va);
2150#endif
2151		}
2152	}
2153}
2154
2155/*
2156 * This code makes some *MAJOR* assumptions:
2157 * 1. The pmap is the current pmap and it exists.
2158 * 2. The mapping is not wired.
2159 * 3. Read-only access.
2160 * 4. No page table pages.
2161 * 5. The TLB flush is deferred to the calling procedure.
2162 * 6. The page IS managed.
2163 * It is *MUCH* faster than pmap_enter...
2164 */
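/*
 * pmap_enter_quick is used by pmap_object_init_pt and pmap_prefault
 * below to establish speculative, read-only mappings cheaply.  The
 * page table page that was used is returned so that callers looping
 * over several pages can pass it back in and avoid repeated lookups.
 */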
2165
2166static vm_page_t
2167pmap_enter_quick(pmap, va, m, mpte)
2168	register pmap_t pmap;
2169	vm_offset_t va;
2170	vm_page_t m;
2171	vm_page_t mpte;
2172{
2173	unsigned *pte;
2174	vm_offset_t pa;
2175
2176	/*
2177	 * In the case that a page table page is not
2178	 * resident, we are creating it here.
2179	 */
2180	if (va < UPT_MIN_ADDRESS) {
2181		unsigned ptepindex;
2182		vm_offset_t ptepa;
2183
2184		/*
2185		 * Calculate pagetable page index
2186		 */
2187		ptepindex = va >> PDRSHIFT;
2188		if (mpte && (mpte->pindex == ptepindex)) {
2189			mpte->hold_count++;
2190		} else {
2191retry:
2192			/*
2193			 * Get the page directory entry
2194			 */
2195			ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
2196
2197			/*
2198			 * If the page table page is mapped, we just increment
2199			 * the hold count, and activate it.
2200			 */
2201			if (ptepa) {
2202				if (ptepa & PG_PS)
2203					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2204				if (pmap->pm_ptphint &&
2205					(pmap->pm_ptphint->pindex == ptepindex)) {
2206					mpte = pmap->pm_ptphint;
2207				} else {
2208					mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
2209					pmap->pm_ptphint = mpte;
2210				}
2211				if (mpte == NULL)
2212					goto retry;
2213				mpte->hold_count++;
2214			} else {
2215				mpte = _pmap_allocpte(pmap, ptepindex);
2216			}
2217		}
2218	} else {
2219		mpte = NULL;
2220	}
2221
2222	/*
2223	 * This call to vtopte makes the assumption that we are
2224	 * entering the page into the current pmap.  In order to support
2225	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2226	 * But that isn't as quick as vtopte.
2227	 */
2228	pte = (unsigned *)vtopte(va);
2229	if (*pte) {
2230		if (mpte)
2231			pmap_unwire_pte_hold(pmap, mpte);
2232		return 0;
2233	}
2234
2235	/*
2236	 * Enter on the PV list if part of our managed memory. Note that we
2237	 * raise IPL while manipulating pv_table since pmap_enter can be
2238	 * called at interrupt time.
2239	 */
2240	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2241		pmap_insert_entry(pmap, va, mpte, m);
2242
2243	/*
2244	 * Increment counters
2245	 */
2246	pmap->pm_stats.resident_count++;
2247
2248	pa = VM_PAGE_TO_PHYS(m);
2249
2250	/*
2251	 * Now validate mapping with RO protection
2252	 */
2253	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2254		*pte = pa | PG_V | PG_U;
2255	else
2256		*pte = pa | PG_V | PG_U | PG_MANAGED;
2257
2258	return mpte;
2259}
2260
2261/*
2262 * Make a temporary mapping for a physical address.  This is only intended
2263 * to be used for panic dumps.
2264 */
2265void *
2266pmap_kenter_temporary(vm_offset_t pa, int i)
2267{
2268	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2269	return ((void *)crashdumpmap);
2270}
2271
2272#define MAX_INIT_PT (96)
2273/*
2274 * pmap_object_init_pt preloads the ptes for a given object
2275 * into the specified pmap.  This eliminates the blast of soft
2276 * faults on process startup and immediately after an mmap.
2277 */
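/*
 * Two cases are handled below: suitably aligned device objects may be
 * mapped directly with 4MB (PG_PS) page directory entries, while vnode
 * objects are preloaded one page at a time with pmap_enter_quick(),
 * subject to the MAX_INIT_PT limit when a limit is requested.
 */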
2278void
2279pmap_object_init_pt(pmap, addr, object, pindex, size, limit)
2280	pmap_t pmap;
2281	vm_offset_t addr;
2282	vm_object_t object;
2283	vm_pindex_t pindex;
2284	vm_size_t size;
2285	int limit;
2286{
2287	vm_offset_t tmpidx;
2288	int psize;
2289	vm_page_t p, mpte;
2290	int objpgs;
2291
2292	if (pmap == NULL || object == NULL)
2293		return;
2294
2295	/*
2296	 * This code maps large physical mmap regions into the
2297	 * processor address space.  Note that some shortcuts
2298	 * are taken, but the code works.
2299	 */
2300	if (pseflag &&
2301		(object->type == OBJT_DEVICE) &&
2302		((addr & (NBPDR - 1)) == 0) &&
2303		((size & (NBPDR - 1)) == 0) ) {
2304		int i;
2305		vm_page_t m[1];
2306		unsigned int ptepindex;
2307		int npdes;
2308		vm_offset_t ptepa;
2309
2310		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2311			return;
2312
2313retry:
2314		p = vm_page_lookup(object, pindex);
2315		if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
2316			goto retry;
2317
2318		if (p == NULL) {
2319			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2320			if (p == NULL)
2321				return;
2322			m[0] = p;
2323
2324			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2325				vm_page_free(p);
2326				return;
2327			}
2328
2329			p = vm_page_lookup(object, pindex);
2330			vm_page_wakeup(p);
2331		}
2332
2333		ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p);
2334		if (ptepa & (NBPDR - 1)) {
2335			return;
2336		}
2337
2338		p->valid = VM_PAGE_BITS_ALL;
2339
2340		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2341		npdes = size >> PDRSHIFT;
2342		for (i = 0; i < npdes; i++) {
2343			pmap->pm_pdir[ptepindex] =
2344				(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS);
2345			ptepa += NBPDR;
2346			ptepindex += 1;
2347		}
2348		vm_page_flag_set(p, PG_MAPPED);
2349		invltlb();
2350		return;
2351	}
2352
2353	psize = i386_btop(size);
2354
2355	if ((object->type != OBJT_VNODE) ||
2356		(limit && (psize > MAX_INIT_PT) &&
2357			(object->resident_page_count > MAX_INIT_PT))) {
2358		return;
2359	}
2360
2361	if (psize + pindex > object->size) {
2362		if (object->size < pindex)
2363			return;
2364		psize = object->size - pindex;
2365	}
2366
2367	mpte = NULL;
2368	/*
2369	 * if we are processing a major portion of the object, then scan the
2370	 * entire thing.
2371	 */
2372	if (psize > (object->resident_page_count >> 2)) {
2373		objpgs = psize;
2374
2375		for (p = TAILQ_FIRST(&object->memq);
2376		    ((objpgs > 0) && (p != NULL));
2377		    p = TAILQ_NEXT(p, listq)) {
2378
2379			tmpidx = p->pindex;
2380			if (tmpidx < pindex) {
2381				continue;
2382			}
2383			tmpidx -= pindex;
2384			if (tmpidx >= psize) {
2385				continue;
2386			}
2387			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2388				(p->busy == 0) &&
2389			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2390				if ((p->queue - p->pc) == PQ_CACHE)
2391					vm_page_deactivate(p);
2392				vm_page_busy(p);
2393				mpte = pmap_enter_quick(pmap,
2394					addr + i386_ptob(tmpidx), p, mpte);
2395				vm_page_flag_set(p, PG_MAPPED);
2396				vm_page_wakeup(p);
2397			}
2398			objpgs -= 1;
2399		}
2400	} else {
2401		/*
2402		 * else lookup the pages one-by-one.
2403		 */
2404		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2405			p = vm_page_lookup(object, tmpidx + pindex);
2406			if (p &&
2407			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2408				(p->busy == 0) &&
2409			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2410				if ((p->queue - p->pc) == PQ_CACHE)
2411					vm_page_deactivate(p);
2412				vm_page_busy(p);
2413				mpte = pmap_enter_quick(pmap,
2414					addr + i386_ptob(tmpidx), p, mpte);
2415				vm_page_flag_set(p, PG_MAPPED);
2416				vm_page_wakeup(p);
2417			}
2418		}
2419	}
2420	return;
2421}
2422
2423/*
2424 * pmap_prefault provides a quick way of clustering
2425 * page faults into a process's address space.  It is a "cousin"
2426 * of pmap_object_init_pt, except it runs at page fault time instead
2427 * of mmap time.
2428 */
2429#define PFBAK 4
2430#define PFFOR 4
2431#define PAGEORDER_SIZE (PFBAK+PFFOR)
2432
2433static int pmap_prefault_pageorder[] = {
2434	-PAGE_SIZE, PAGE_SIZE,
2435	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2436	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2437	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2438};
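/*
 * This ordering makes pmap_prefault probe the pages nearest the
 * faulting address first: one page before and after it, then two,
 * three and four pages away, giving up as soon as a page is found
 * to be non-resident.
 */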
2439
2440void
2441pmap_prefault(pmap, addra, entry)
2442	pmap_t pmap;
2443	vm_offset_t addra;
2444	vm_map_entry_t entry;
2445{
2446	int i;
2447	vm_offset_t starta;
2448	vm_offset_t addr;
2449	vm_pindex_t pindex;
2450	vm_page_t m, mpte;
2451	vm_object_t object;
2452
2453	if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace)))
2454		return;
2455
2456	object = entry->object.vm_object;
2457
2458	starta = addra - PFBAK * PAGE_SIZE;
2459	if (starta < entry->start) {
2460		starta = entry->start;
2461	} else if (starta > addra) {
2462		starta = 0;
2463	}
2464
2465	mpte = NULL;
2466	for (i = 0; i < PAGEORDER_SIZE; i++) {
2467		vm_object_t lobject;
2468		unsigned *pte;
2469
2470		addr = addra + pmap_prefault_pageorder[i];
2471		if (addr > addra + (PFFOR * PAGE_SIZE))
2472			addr = 0;
2473
2474		if (addr < starta || addr >= entry->end)
2475			continue;
2476
2477		if ((*pmap_pde(pmap, addr)) == NULL)
2478			continue;
2479
2480		pte = (unsigned *) vtopte(addr);
2481		if (*pte)
2482			continue;
2483
2484		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2485		lobject = object;
2486		for (m = vm_page_lookup(lobject, pindex);
2487		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2488		    lobject = lobject->backing_object) {
2489			if (lobject->backing_object_offset & PAGE_MASK)
2490				break;
2491			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2492			m = vm_page_lookup(lobject->backing_object, pindex);
2493		}
2494
2495		/*
2496		 * give up when a page is not in memory
2497		 */
2498		if (m == NULL)
2499			break;
2500
2501		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2502			(m->busy == 0) &&
2503		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2504
2505			if ((m->queue - m->pc) == PQ_CACHE) {
2506				vm_page_deactivate(m);
2507			}
2508			vm_page_busy(m);
2509			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2510			vm_page_flag_set(m, PG_MAPPED);
2511			vm_page_wakeup(m);
2512		}
2513	}
2514}
2515
2516/*
2517 *	Routine:	pmap_change_wiring
2518 *	Function:	Change the wiring attribute for a map/virtual-address
2519 *			pair.
2520 *	In/out conditions:
2521 *			The mapping must already exist in the pmap.
2522 */
2523void
2524pmap_change_wiring(pmap, va, wired)
2525	register pmap_t pmap;
2526	vm_offset_t va;
2527	boolean_t wired;
2528{
2529	register unsigned *pte;
2530
2531	if (pmap == NULL)
2532		return;
2533
2534	pte = pmap_pte(pmap, va);
2535
2536	if (wired && !pmap_pte_w(pte))
2537		pmap->pm_stats.wired_count++;
2538	else if (!wired && pmap_pte_w(pte))
2539		pmap->pm_stats.wired_count--;
2540
2541	/*
2542	 * Wiring is not a hardware characteristic so there is no need to
2543	 * invalidate TLB.
2544	 */
2545	pmap_pte_set_w(pte, wired);
2546}
2547
2548
2549
2550/*
2551 *	Copy the range specified by src_addr/len
2552 *	from the source map to the range dst_addr/len
2553 *	in the destination map.
2554 *
2555 *	This routine is only advisory and need not do anything.
2556 */
2557
2558void
2559pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
2560	pmap_t dst_pmap, src_pmap;
2561	vm_offset_t dst_addr;
2562	vm_size_t len;
2563	vm_offset_t src_addr;
2564{
2565	vm_offset_t addr;
2566	vm_offset_t end_addr = src_addr + len;
2567	vm_offset_t pdnxt;
2568	unsigned src_frame, dst_frame;
2569	vm_page_t m;
2570
2571	if (dst_addr != src_addr)
2572		return;
2573
2574	src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2575	if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) {
2576		return;
2577	}
2578
2579	dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
2580	if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) {
2581		APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
2582#if defined(SMP)
2583		/* The page directory is not shared between CPUs */
2584		cpu_invltlb();
2585#else
2586		invltlb();
2587#endif
2588	}
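	/*
	 * At this point the destination pmap's page tables are reachable
	 * through the alternate recursive mapping: its page directory is
	 * installed in the alternate page table directory slot (APTDpde),
	 * so dst_pte obtained via avtopte() below can be read and written
	 * directly.
	 */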
2589
2590	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2591		unsigned *src_pte, *dst_pte;
2592		vm_page_t dstmpte, srcmpte;
2593		vm_offset_t srcptepaddr;
2594		unsigned ptepindex;
2595
2596		if (addr >= UPT_MIN_ADDRESS)
2597			panic("pmap_copy: invalid to pmap_copy page tables\n");
2598
2599		/*
2600		 * Don't let optional prefaulting of pages make us go
2601		 * way below the low water mark of free pages or way
2602		 * above the high water mark of used pv entries.
2603		 */
2604		if (cnt.v_free_count < cnt.v_free_reserved ||
2605		    pv_entry_count > pv_entry_high_water)
2606			break;
2607
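		/*
		 * pdnxt is the start of the next 4MB region, i.e. the first
		 * address covered by the next page directory entry.
		 */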
2608		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2609		ptepindex = addr >> PDRSHIFT;
2610
2611		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
2612		if (srcptepaddr == 0)
2613			continue;
2614
2615		if (srcptepaddr & PG_PS) {
2616			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2617				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
2618				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2619			}
2620			continue;
2621		}
2622
2623		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2624		if ((srcmpte == NULL) ||
2625			(srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2626			continue;
2627
2628		if (pdnxt > end_addr)
2629			pdnxt = end_addr;
2630
2631		src_pte = (unsigned *) vtopte(addr);
2632		dst_pte = (unsigned *) avtopte(addr);
2633		while (addr < pdnxt) {
2634			unsigned ptetemp;
2635			ptetemp = *src_pte;
2636			/*
2637			 * we only virtual copy managed pages
2638			 */
2639			if ((ptetemp & PG_MANAGED) != 0) {
2640				/*
2641				 * We have to check after allocpte for the
2642				 * pte still being around...  allocpte can
2643				 * block.
2644				 */
2645				dstmpte = pmap_allocpte(dst_pmap, addr);
2646				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2647					/*
2648					 * Clear the modified and
2649					 * accessed (referenced) bits
2650					 * during the copy.
2651					 */
2652					m = PHYS_TO_VM_PAGE(ptetemp);
2653					*dst_pte = ptetemp & ~(PG_M | PG_A);
2654					dst_pmap->pm_stats.resident_count++;
2655					pmap_insert_entry(dst_pmap, addr,
2656						dstmpte, m);
2657				} else {
2658					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2659				}
2660				if (dstmpte->hold_count >= srcmpte->hold_count)
2661					break;
2662			}
2663			addr += PAGE_SIZE;
2664			src_pte++;
2665			dst_pte++;
2666		}
2667	}
2668}
2669
2670/*
2671 *	Routine:	pmap_kernel
2672 *	Function:
2673 *		Returns the physical map handle for the kernel.
2674 */
2675pmap_t
2676pmap_kernel()
2677{
2678	return (kernel_pmap);
2679}
2680
2681/*
2682 *	pmap_zero_page zeros the specified hardware page by mapping
2683 *	the page into KVM and using bzero to clear its contents.
2684 */
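/*
 * CMAP2/CADDR2 are a pte/va pair reserved at bootstrap time as a
 * scratch kernel mapping: the physical page is temporarily mapped at
 * CADDR2, cleared, and the mapping is torn down again.
 */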
2685void
2686pmap_zero_page(phys)
2687	vm_offset_t phys;
2688{
2689
2690	if (*(int *) CMAP2)
2691		panic("pmap_zero_page: CMAP2 busy");
2692
2693	*(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2694	invltlb_1pg((vm_offset_t)CADDR2);
2695
2696#if defined(I686_CPU)
2697	if (cpu_class == CPUCLASS_686)
2698		i686_pagezero(CADDR2);
2699	else
2700#endif
2701		bzero(CADDR2, PAGE_SIZE);
2702	*(int *) CMAP2 = 0;
2703}
2704
2705/*
2706 *	pmap_zero_page_area zeros the specified hardware page by mapping
2707 *	the page into KVM and using bzero to clear its contents.
2708 *
2709 *	off and size may not cover an area beyond a single hardware page.
2710 */
2711void
2712pmap_zero_page_area(phys, off, size)
2713	vm_offset_t phys;
2714	int off;
2715	int size;
2716{
2717
2718	if (*(int *) CMAP2)
2719		panic("pmap_zero_page_area: CMAP2 busy");
2720
2721	*(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2722	invltlb_1pg((vm_offset_t)CADDR2);
2723
2724#if defined(I686_CPU)
2725	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2726		i686_pagezero(CADDR2);
2727	else
2728#endif
2729		bzero((char *)CADDR2 + off, size);
2730	*(int *) CMAP2 = 0;
2731}
2732
2733/*
2734 *	pmap_copy_page copies the specified (machine independent)
2735 *	page by mapping the page into virtual memory and using
2736 *	bcopy to copy the page, one machine dependent page at a
2737 *	time.
2738 */
2739void
2740pmap_copy_page(src, dst)
2741	vm_offset_t src;
2742	vm_offset_t dst;
2743{
2744
2745	if (*(int *) CMAP1)
2746		panic("pmap_copy_page: CMAP1 busy");
2747	if (*(int *) CMAP2)
2748		panic("pmap_copy_page: CMAP2 busy");
2749
2750	*(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2751	*(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2752#ifdef I386_CPU
2753	invltlb();
2754#else
2755	invlpg((u_int)CADDR1);
2756	invlpg((u_int)CADDR2);
2757#endif
2758
2759	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2760
2761	*(int *) CMAP1 = 0;
2762	*(int *) CMAP2 = 0;
2763}
2764
2765
2766/*
2767 *	Routine:	pmap_pageable
2768 *	Function:
2769 *		Make the specified pages (by pmap, offset)
2770 *		pageable (or not) as requested.
2771 *
2772 *		A page which is not pageable may not take
2773 *		a fault; therefore, its page table entry
2774 *		must remain valid for the duration.
2775 *
2776 *		This routine is merely advisory; pmap_enter
2777 *		will specify that these pages are to be wired
2778 *		down (or not) as appropriate.
2779 */
2780void
2781pmap_pageable(pmap, sva, eva, pageable)
2782	pmap_t pmap;
2783	vm_offset_t sva, eva;
2784	boolean_t pageable;
2785{
2786}
2787
2788/*
2789 * this routine returns true if a physical page resides
2790 * in the given pmap.
2791 */
2792boolean_t
2793pmap_page_exists(pmap, m)
2794	pmap_t pmap;
2795	vm_page_t m;
2796{
2797	register pv_entry_t pv;
2798	int s;
2799
2800	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2801		return FALSE;
2802
2803	s = splvm();
2804
2805	/*
2806	 * Check current mappings, returning immediately if one is found.
2807	 */
2808	for (pv = TAILQ_FIRST(&m->md.pv_list);
2809		pv;
2810		pv = TAILQ_NEXT(pv, pv_list)) {
2811		if (pv->pv_pmap == pmap) {
2812			splx(s);
2813			return TRUE;
2814		}
2815	}
2816	splx(s);
2817	return (FALSE);
2818}
2819
2820#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2821/*
2822 * Remove all pages from the specified address space;
2823 * this aids process exit speeds.  Also, this code
2824 * is special-cased for the current process only, but
2825 * can have the more generic (and slightly slower)
2826 * mode enabled.  This is much faster than pmap_remove
2827 * in the case of running down an entire address space.
2828 */
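/*
 * Note: with PMAP_REMOVE_PAGES_CURPROC_ONLY defined above, the pte is
 * located with vtopte(), which is only valid for the current pmap;
 * hence the warning below when called with a non-current pmap.
 */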
2829void
2830pmap_remove_pages(pmap, sva, eva)
2831	pmap_t pmap;
2832	vm_offset_t sva, eva;
2833{
2834	unsigned *pte, tpte;
2835	pv_entry_t pv, npv;
2836	int s;
2837	vm_page_t m;
2838
2839#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2840	if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) {
2841		printf("warning: pmap_remove_pages called with non-current pmap\n");
2842		return;
2843	}
2844#endif
2845
2846	s = splvm();
2847	for (pv = TAILQ_FIRST(&pmap->pm_pvlist);
2848		pv;
2849		pv = npv) {
2850
2851		if (pv->pv_va >= eva || pv->pv_va < sva) {
2852			npv = TAILQ_NEXT(pv, pv_plist);
2853			continue;
2854		}
2855
2856#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2857		pte = (unsigned *)vtopte(pv->pv_va);
2858#else
2859		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2860#endif
2861		tpte = *pte;
2862
2863		/*
2864		 * We cannot remove wired pages from a process' mapping at this time.
2865		 */
2866		if (tpte & PG_W) {
2867			npv = TAILQ_NEXT(pv, pv_plist);
2868			continue;
2869		}
2870		*pte = 0;
2871
2872		m = PHYS_TO_VM_PAGE(tpte);
2873
2874		KASSERT(m < &vm_page_array[vm_page_array_size],
2875			("pmap_remove_pages: bad tpte %x", tpte));
2876
2877		pv->pv_pmap->pm_stats.resident_count--;
2878
2879		/*
2880		 * Update the vm_page_t clean and reference bits.
2881		 */
2882		if (tpte & PG_M) {
2883			vm_page_dirty(m);
2884		}
2885
2886
2887		npv = TAILQ_NEXT(pv, pv_plist);
2888		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2889
2890		m->md.pv_list_count--;
2891		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2892		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2893			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2894		}
2895
2896		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2897		free_pv_entry(pv);
2898	}
2899	splx(s);
2900	pmap_TLB_invalidate_all(pmap);
2901}
2902
2903/*
2904 * pmap_testbit tests bits in ptes.
2905 * Note that the testbit/changebit routines are inline,
2906 * and a lot of things compile-time evaluate.
2907 */
2908static boolean_t
2909pmap_testbit(m, bit)
2910	vm_page_t m;
2911	int bit;
2912{
2913	pv_entry_t pv;
2914	unsigned *pte;
2915	int s;
2916
2917	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2918		return FALSE;
2919
2920	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
2921		return FALSE;
2922
2923	s = splvm();
2924
2925	for (pv = TAILQ_FIRST(&m->md.pv_list);
2926		pv;
2927		pv = TAILQ_NEXT(pv, pv_list)) {
2928
2929		/*
2930		 * if the bit being tested is the modified or accessed bit,
2931		 * then skip addresses (such as those in the clean submap)
2932		 * whose modified state we do not track.
2933		 */
2934		if (bit & (PG_A|PG_M)) {
2935			if (!pmap_track_modified(pv->pv_va))
2936				continue;
2937		}
2938
2939#if defined(PMAP_DIAGNOSTIC)
2940		if (!pv->pv_pmap) {
2941			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2942			continue;
2943		}
2944#endif
2945		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2946		if (*pte & bit) {
2947			splx(s);
2948			return TRUE;
2949		}
2950	}
2951	splx(s);
2952	return (FALSE);
2953}
2954
2955/*
2956 * this routine is used to modify bits in ptes
2957 */
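/*
 * When setting, the bit is simply ORed into each pte.  When clearing
 * PG_RW, any PG_M state is first transferred to the vm_page's dirty
 * bits so that the modification is not lost.
 */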
2958static __inline void
2959pmap_changebit(m, bit, setem)
2960	vm_page_t m;
2961	int bit;
2962	boolean_t setem;
2963{
2964	register pv_entry_t pv;
2965	register unsigned *pte;
2966	int s;
2967
2968	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2969		return;
2970
2971	s = splvm();
2972
2973	/*
2974	 * Loop over all current mappings, setting/clearing as appropriate.
2975	 * If setting RO, do we need to clear the VAC?
2976	 */
2977	for (pv = TAILQ_FIRST(&m->md.pv_list);
2978		pv;
2979		pv = TAILQ_NEXT(pv, pv_list)) {
2980
2981		/*
2982		 * don't write protect pager mappings
2983		 */
2984		if (!setem && (bit == PG_RW)) {
2985			if (!pmap_track_modified(pv->pv_va))
2986				continue;
2987		}
2988
2989#if defined(PMAP_DIAGNOSTIC)
2990		if (!pv->pv_pmap) {
2991			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2992			continue;
2993		}
2994#endif
2995
2996		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2997
2998		if (setem) {
2999			*(int *)pte |= bit;
3000			pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
3001		} else {
3002			vm_offset_t pbits = *(vm_offset_t *)pte;
3003			if (pbits & bit) {
3004				if (bit == PG_RW) {
3005					if (pbits & PG_M) {
3006						vm_page_dirty(m);
3007					}
3008					*(int *)pte = pbits & ~(PG_M|PG_RW);
3009				} else {
3010					*(int *)pte = pbits & ~bit;
3011				}
3012				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
3013			}
3014		}
3015	}
3016	splx(s);
3017}
3018
3019/*
3020 *      pmap_page_protect:
3021 *
3022 *      Lower the permission for all mappings to a given page.
3023 */
3024void
3025pmap_page_protect(vm_page_t m, vm_prot_t prot)
3026{
3027	if ((prot & VM_PROT_WRITE) == 0) {
3028		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3029			pmap_changebit(m, PG_RW, FALSE);
3030		} else {
3031			pmap_remove_all(m);
3032		}
3033	}
3034}
3035
3036vm_offset_t
3037pmap_phys_address(ppn)
3038	int ppn;
3039{
3040	return (i386_ptob(ppn));
3041}
3042
3043/*
3044 *	pmap_ts_referenced:
3045 *
3046 *	Return the count of reference bits for a page, clearing all of them.
3047 */
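/*
 * The pv list is rotated (each entry is moved to the tail as it is
 * visited) so that successive calls start with different mappings,
 * and the scan is cut off once more than four referenced mappings
 * have been found, to bound the work done per call.
 */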
3048int
3049pmap_ts_referenced(vm_page_t m)
3050{
3051	register pv_entry_t pv, pvf, pvn;
3052	unsigned *pte;
3053	int s;
3054	int rtval = 0;
3055
3056	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3057		return (rtval);
3058
3059	s = splvm();
3060
3061	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3062
3063		pvf = pv;
3064
3065		do {
3066			pvn = TAILQ_NEXT(pv, pv_list);
3067
3068			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3069
3070			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3071
3072			if (!pmap_track_modified(pv->pv_va))
3073				continue;
3074
3075			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3076
3077			if (pte && (*pte & PG_A)) {
3078				*pte &= ~PG_A;
3079
3080				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
3081
3082				rtval++;
3083				if (rtval > 4) {
3084					break;
3085				}
3086			}
3087		} while ((pv = pvn) != NULL && pv != pvf);
3088	}
3089	splx(s);
3090
3091	return (rtval);
3092}
3093
3094/*
3095 *	pmap_is_modified:
3096 *
3097 *	Return whether or not the specified physical page was modified
3098 *	in any physical maps.
3099 */
3100boolean_t
3101pmap_is_modified(vm_page_t m)
3102{
3103	return pmap_testbit(m, PG_M);
3104}
3105
3106/*
3107 *	Clear the modify bits on the specified physical page.
3108 */
3109void
3110pmap_clear_modify(vm_page_t m)
3111{
3112	pmap_changebit(m, PG_M, FALSE);
3113}
3114
3115/*
3116 *	pmap_clear_reference:
3117 *
3118 *	Clear the reference bit on the specified physical page.
3119 */
3120void
3121pmap_clear_reference(vm_page_t m)
3122{
3123	pmap_changebit(m, PG_A, FALSE);
3124}
3125
3126/*
3127 * Miscellaneous support routines follow
3128 */
3129
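/*
 * Build the protection_codes[] table used by pte_prot(): since the
 * i386 has no separate execute permission, any readable or executable
 * protection maps to a read-only pte (0) and anything writable maps
 * to PG_RW.
 */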
3130static void
3131i386_protection_init()
3132{
3133	register int *kp, prot;
3134
3135	kp = protection_codes;
3136	for (prot = 0; prot < 8; prot++) {
3137		switch (prot) {
3138		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3139			/*
3140			 * Read access is also 0. There isn't any execute bit,
3141			 * so just make it readable.
3142			 */
3143		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3144		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3145		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3146			*kp++ = 0;
3147			break;
3148		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3149		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3150		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3151		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3152			*kp++ = PG_RW;
3153			break;
3154		}
3155	}
3156}
3157
3158/*
3159 * Map a set of physical memory pages into the kernel virtual
3160 * address space. Return a pointer to where it is mapped. This
3161 * routine is intended to be used for mapping device memory,
3162 * NOT real memory.
3163 */
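/*
 * A typical (hypothetical) use, mapping and later unmapping a device
 * register window:
 *
 *	regs = pmap_mapdev(pa, size);
 *	... access the device through regs ...
 *	pmap_unmapdev((vm_offset_t)regs, size);
 */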
3164void *
3165pmap_mapdev(pa, size)
3166	vm_offset_t pa;
3167	vm_size_t size;
3168{
3169	vm_offset_t va, tmpva, offset;
3170	unsigned *pte;
3171
3172	offset = pa & PAGE_MASK;
3173	size = roundup(offset + size, PAGE_SIZE);
3174
3175	GIANT_REQUIRED;
3176
3177	va = kmem_alloc_pageable(kernel_map, size);
3178	if (!va)
3179		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3180
3181	pa = pa & PG_FRAME;
3182	for (tmpva = va; size > 0;) {
3183		pte = (unsigned *)vtopte(tmpva);
3184		*pte = pa | PG_RW | PG_V | pgeflag;
3185		size -= PAGE_SIZE;
3186		tmpva += PAGE_SIZE;
3187		pa += PAGE_SIZE;
3188	}
3189	invltlb();
3190
3191	return ((void *)(va + offset));
3192}
3193
3194void
3195pmap_unmapdev(va, size)
3196	vm_offset_t va;
3197	vm_size_t size;
3198{
3199	vm_offset_t base, offset;
3200
3201	base = va & PG_FRAME;
3202	offset = va & PAGE_MASK;
3203	size = roundup(offset + size, PAGE_SIZE);
3204	kmem_free(kernel_map, base, size);
3205}
3206
3207/*
3208 * perform the pmap work for mincore
3209 */
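/*
 * The return value is a mask of MINCORE_* flags: MINCORE_INCORE when
 * a valid pte exists, plus MODIFIED and REFERENCED information taken
 * from the pte's PG_M/PG_A bits or, for the *_OTHER variants, from
 * the vm_page's dirty and PG_REFERENCED state.
 */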
3210int
3211pmap_mincore(pmap, addr)
3212	pmap_t pmap;
3213	vm_offset_t addr;
3214{
3215
3216	unsigned *ptep, pte;
3217	vm_page_t m;
3218	int val = 0;
3219
3220	ptep = pmap_pte(pmap, addr);
3221	if (ptep == 0) {
3222		return 0;
3223	}
3224
3225	if ((pte = *ptep) != 0) {
3226		vm_offset_t pa;
3227
3228		val = MINCORE_INCORE;
3229		if ((pte & PG_MANAGED) == 0)
3230			return val;
3231
3232		pa = pte & PG_FRAME;
3233
3234		m = PHYS_TO_VM_PAGE(pa);
3235
3236		/*
3237		 * Modified by us
3238		 */
3239		if (pte & PG_M)
3240			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3241		/*
3242		 * Modified by someone
3243		 */
3244		else if (m->dirty || pmap_is_modified(m))
3245			val |= MINCORE_MODIFIED_OTHER;
3246		/*
3247		 * Referenced by us
3248		 */
3249		if (pte & PG_A)
3250			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3251
3252		/*
3253		 * Referenced by someone
3254		 */
3255		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3256			val |= MINCORE_REFERENCED_OTHER;
3257			vm_page_flag_set(m, PG_REFERENCED);
3258		}
3259	}
3260	return val;
3261}
3262
3263void
3264pmap_activate(struct proc *p)
3265{
3266	pmap_t	pmap;
3267
3268	pmap = vmspace_pmap(p->p_vmspace);
3269#if defined(SMP)
3270	pmap->pm_active |= 1 << PCPU_GET(cpuid);
3271#else
3272	pmap->pm_active |= 1;
3273#endif
3274#if defined(SWTCH_OPTIM_STATS)
3275	tlb_flush_count++;
3276#endif
3277	load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(pmap->pm_pdir));
3278}
3279
3280vm_offset_t
3281pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3282{
3283
3284	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3285		return addr;
3286	}
3287
3288	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3289	return addr;
3290}
3291
3292
3293#if defined(PMAP_DEBUG)
3294pmap_pid_dump(int pid)
3295{
3296	pmap_t pmap;
3297	struct proc *p;
3298	int npte = 0;
3299	int index;
3300
3301	sx_slock(&allproc_lock);
3302	LIST_FOREACH(p, &allproc, p_list) {
3303		if (p->p_pid != pid)
3304			continue;
3305
3306		if (p->p_vmspace) {
3307			int i,j;
3308			index = 0;
3309			pmap = vmspace_pmap(p->p_vmspace);
3310			for (i = 0; i < 1024; i++) {
3311				pd_entry_t *pde;
3312				unsigned *pte;
3313				unsigned base = i << PDRSHIFT;
3314
3315				pde = &pmap->pm_pdir[i];
3316				if (pde && pmap_pde_v(pde)) {
3317					for (j = 0; j < 1024; j++) {
3318						unsigned va = base + (j << PAGE_SHIFT);
3319						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3320							if (index) {
3321								index = 0;
3322								printf("\n");
3323							}
3324							sx_sunlock(&allproc_lock);
3325							return npte;
3326						}
3327						pte = pmap_pte_quick( pmap, va);
3328						if (pte && pmap_pte_v(pte)) {
3329							vm_offset_t pa;
3330							vm_page_t m;
3331							pa = *(int *)pte;
3332							m = PHYS_TO_VM_PAGE(pa);
3333							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3334								va, pa, m->hold_count, m->wire_count, m->flags);
3335							npte++;
3336							index++;
3337							if (index >= 2) {
3338								index = 0;
3339								printf("\n");
3340							} else {
3341								printf(" ");
3342							}
3343						}
3344					}
3345				}
3346			}
3347		}
3348	}
3349	sx_sunlock(&allproc_lock);
3350	return npte;
3351}
3352#endif
3353
3354#if defined(DEBUG)
3355
3356static void	pads __P((pmap_t pm));
3357void		pmap_pvdump __P((vm_offset_t pa));
3358
3359/* print address space of pmap */
3360static void
3361pads(pm)
3362	pmap_t pm;
3363{
3364	unsigned va, i, j;
3365	unsigned *ptep;
3366
3367	if (pm == kernel_pmap)
3368		return;
3369	for (i = 0; i < 1024; i++)
3370		if (pm->pm_pdir[i])
3371			for (j = 0; j < 1024; j++) {
3372				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3373				if (pm == kernel_pmap && va < KERNBASE)
3374					continue;
3375				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3376					continue;
3377				ptep = pmap_pte_quick(pm, va);
3378				if (pmap_pte_v(ptep))
3379					printf("%x:%x ", va, *(int *) ptep);
3380			}
3381
3382}
3383
3384void
3385pmap_pvdump(pa)
3386	vm_offset_t pa;
3387{
3388	register pv_entry_t pv;
3389	vm_page_t m;
3390
3391	printf("pa %x", pa);
3392	m = PHYS_TO_VM_PAGE(pa);
3393	for (pv = TAILQ_FIRST(&m->md.pv_list);
3394		pv;
3395		pv = TAILQ_NEXT(pv, pv_list)) {
3396#ifdef used_to_be
3397		printf(" -> pmap %p, va %x, flags %x",
3398		    (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
3399#endif
3400		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3401		pads(pv->pv_pmap);
3402	}
3403	printf(" ");
3404}
3405#endif
3406