1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 88742 2001-12-31 20:02:46Z dillon $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
64 *	this module may delay invalidate or reduced protection
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74#include "opt_kstack_pages.h"
75
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/mman.h>
81#include <sys/msgbuf.h>
82#include <sys/mutex.h>
83#include <sys/proc.h>
84#include <sys/sx.h>
85#include <sys/user.h>
86#include <sys/vmmeter.h>
87#include <sys/sysctl.h>
88
89#include <vm/vm.h>
90#include <vm/vm_param.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_page.h>
93#include <vm/vm_map.h>
94#include <vm/vm_object.h>
95#include <vm/vm_extern.h>
96#include <vm/vm_pageout.h>
97#include <vm/vm_pager.h>
98#include <vm/vm_zone.h>
99
100#include <machine/cputypes.h>
101#include <machine/md_var.h>
102#include <machine/specialreg.h>
103#if defined(SMP) || defined(APIC_IO)
104#include <machine/smp.h>
105#include <machine/apic.h>
106#include <machine/segments.h>
107#include <machine/tss.h>
108#endif /* SMP || APIC_IO */
109
110#define PMAP_KEEP_PDIRS
111#ifndef PMAP_SHPGPERPROC
112#define PMAP_SHPGPERPROC 200
113#endif
114
115#if defined(DIAGNOSTIC)
116#define PMAP_DIAGNOSTIC
117#endif
118
119#define MINPV 2048
120
121#if !defined(PMAP_DIAGNOSTIC)
122#define PMAP_INLINE __inline
123#else
124#define PMAP_INLINE
125#endif
126
127/*
128 * Get PDEs and PTEs for user/kernel address space
129 */
130#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
131#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
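/*
 * For reference: a 32-bit i386 virtual address decomposes into a
 * 10-bit page directory index (bits 31-22, hence PDRSHIFT == 22),
 * a 10-bit page table index (bits 21-12) and a 12-bit byte offset.
 * The macros above use the first of these fields to select a PDE.
 */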
132
133#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
134#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
135#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
136#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
137#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
138
139#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
140#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
141
142/*
143 * Given a map and a machine independent protection code,
144 * convert to a vax protection code.
145 */
146#define pte_prot(m, p)	(protection_codes[p])
147static int protection_codes[8];
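/*
 * protection_codes[] is indexed by the three VM_PROT_{READ,WRITE,EXECUTE}
 * bits and is filled in by i386_protection_init().  Since the i386 MMU
 * cannot distinguish read from execute access, only the write permission
 * is expected to be reflected in the resulting PG_* bits.
 */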
148
149static struct pmap kernel_pmap_store;
150pmap_t kernel_pmap;
151LIST_HEAD(pmaplist, pmap);
152struct pmaplist allpmaps;
153
154vm_offset_t avail_start;	/* PA of first available physical page */
155vm_offset_t avail_end;		/* PA of last available physical page */
156vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
157vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
158static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
159static int pgeflag;		/* PG_G or-in */
160static int pseflag;		/* PG_PS or-in */
161
162static vm_object_t kptobj;
163
164static int nkpt;
165vm_offset_t kernel_vm_end;
166
167/*
168 * Data for the pv entry allocation mechanism
169 */
170static vm_zone_t pvzone;
171static struct vm_zone pvzone_store;
172static struct vm_object pvzone_obj;
173static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
174static int pmap_pagedaemon_waken = 0;
175static struct pv_entry *pvinit;
176
177/*
178 * All those kernel PT submaps that BSD is so fond of
179 */
180pt_entry_t *CMAP1 = 0;
181static pt_entry_t *CMAP2, *ptmmap;
182caddr_t CADDR1 = 0, ptvmmap = 0;
183static caddr_t CADDR2;
184static pt_entry_t *msgbufmap;
185struct msgbuf *msgbufp = 0;
186
187/*
188 * Crashdump maps.
189 */
190static pt_entry_t *pt_crashdumpmap;
191static caddr_t crashdumpmap;
192
193#ifdef SMP
194extern pt_entry_t *SMPpt;
195#endif
196static pt_entry_t *PMAP1 = 0;
197static pt_entry_t *PADDR1 = 0;
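/*
 * PMAP1 is a reserved kernel PTE and PADDR1 the one-page window it maps;
 * pmap_pte_quick() points PMAP1 at the page table page of interest so
 * that a single PTE of a non-current pmap can be inspected without
 * switching to the alternate (APTmap) address space.
 */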
198
199static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
200static unsigned * get_ptbase __P((pmap_t pmap));
201static pv_entry_t get_pv_entry __P((void));
202static void	i386_protection_init __P((void));
203static __inline void	pmap_changebit __P((vm_page_t m, int bit, boolean_t setem));
204
205static void	pmap_remove_all __P((vm_page_t m));
206static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
207				      vm_page_t m, vm_page_t mpte));
208static int pmap_remove_pte __P((pmap_t pmap, unsigned *ptq, vm_offset_t sva));
209static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
210static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m,
211					vm_offset_t va));
212static boolean_t pmap_testbit __P((vm_page_t m, int bit));
213static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
214		vm_page_t mpte, vm_page_t m));
215
216static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
217
218static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
219static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
220static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
221static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
222static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
223static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
224
225static unsigned pdir4mb;
226
227/*
228 *	Routine:	pmap_pte
229 *	Function:
230 *		Extract the page table entry associated
231 *		with the given map/virtual_address pair.
232 */
233
234PMAP_INLINE unsigned *
235pmap_pte(pmap, va)
236	register pmap_t pmap;
237	vm_offset_t va;
238{
239	pd_entry_t *pdeaddr;
240
241	if (pmap) {
242		pdeaddr = pmap_pde(pmap, va);
243		if (*pdeaddr & PG_PS)
244			return pdeaddr;
245		if (*pdeaddr) {
246			return get_ptbase(pmap) + i386_btop(va);
247		}
248	}
249	return (0);
250}
251
252/*
253 * Move the kernel virtual free pointer to the next
254 * 4MB.  This is used to help improve performance
255 * by using a large (4MB) page for much of the kernel
256 * (.text, .data, .bss)
257 */
258static vm_offset_t
259pmap_kmem_choose(vm_offset_t addr)
260{
261	vm_offset_t newaddr = addr;
262#ifndef DISABLE_PSE
263	if (cpu_feature & CPUID_PSE) {
264		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
265	}
266#endif
267	return newaddr;
268}
269
270/*
271 *	Bootstrap the system enough to run with virtual memory.
272 *
273 *	On the i386 this is called after mapping has already been enabled
274 *	and just syncs the pmap module with what has already been done.
275 *	[We can't call it easily with mapping off since the kernel is not
276 *	mapped with PA == VA, hence we would have to relocate every address
277 *	from the linked base (virtual) address "KERNBASE" to the actual
278 *	(physical) address starting relative to 0]
279 */
280void
281pmap_bootstrap(firstaddr, loadaddr)
282	vm_offset_t firstaddr;
283	vm_offset_t loadaddr;
284{
285	vm_offset_t va;
286	pt_entry_t *pte;
287	int i;
288
289	avail_start = firstaddr;
290
291	/*
292	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
293	 * large. It should instead be correctly calculated in locore.s and
294	 * not based on 'first' (which is a physical address, not a virtual
295	 * address, for the start of unused physical memory). The kernel
296	 * page tables are NOT double mapped and thus should not be included
297	 * in this calculation.
298	 */
299	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
300	virtual_avail = pmap_kmem_choose(virtual_avail);
301
302	virtual_end = VM_MAX_KERNEL_ADDRESS;
303
304	/*
305	 * Initialize protection array.
306	 */
307	i386_protection_init();
308
309	/*
310	 * The kernel's pmap is statically allocated so we don't have to use
311	 * pmap_create, which is unlikely to work correctly at this part of
312	 * the boot sequence (XXX and which no longer exists).
313	 */
314	kernel_pmap = &kernel_pmap_store;
315
316	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
317	kernel_pmap->pm_count = 1;
318	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
319	TAILQ_INIT(&kernel_pmap->pm_pvlist);
320	LIST_INIT(&allpmaps);
321	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
322	nkpt = NKPT;
323
324	/*
325	 * Reserve some special page table entries/VA space for temporary
326	 * mapping of pages.
327	 */
328#define	SYSMAP(c, p, v, n)	\
329	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
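/*
 * SYSMAP(c, p, v, n) carves 'n' pages of KVA and 'n' PTEs out of the
 * running 'va'/'pte' cursors: 'v' receives the starting VA (cast to
 * type 'c') and 'p' receives a pointer to the first reserved PTE.
 */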
330
331	va = virtual_avail;
332	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
333
334	/*
335	 * CMAP1/CMAP2 are used for zeroing and copying pages.
336	 */
337	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
338	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
339
340	/*
341	 * Crashdump maps.
342	 */
343	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
344
345	/*
346	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
347	 * XXX ptmmap is not used.
348	 */
349	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
350
351	/*
352	 * msgbufp is used to map the system message buffer.
353	 * XXX msgbufmap is not used.
354	 */
355	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
356	       atop(round_page(MSGBUF_SIZE)))
357
358	/*
359	 * ptemap is used for pmap_pte_quick
360	 */
361	SYSMAP(unsigned *, PMAP1, PADDR1, 1);
362
363	virtual_avail = va;
364
365	*CMAP1 = *CMAP2 = 0;
366	for (i = 0; i < NKPT; i++)
367		PTD[i] = 0;
368
369	pgeflag = 0;
370#if !defined(SMP)			/* XXX - see also mp_machdep.c */
371	if (cpu_feature & CPUID_PGE) {
372		pgeflag = PG_G;
373	}
374#endif
375
376/*
377 * Initialize the 4MB page size flag
378 */
379	pseflag = 0;
380/*
381 * The 4MB page version of the initial
382 * kernel page mapping.
383 */
384	pdir4mb = 0;
385
386#if !defined(DISABLE_PSE)
387	if (cpu_feature & CPUID_PSE) {
388		unsigned ptditmp;
389		/*
390		 * Note that we have enabled PSE mode
391		 */
392		pseflag = PG_PS;
393		ptditmp = *(PTmap + i386_btop(KERNBASE));
394		ptditmp &= ~(NBPDR - 1);
395		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
396		pdir4mb = ptditmp;
397
398#if !defined(SMP)
399		/*
400		 * Enable the PSE mode.
401		 */
402		load_cr4(rcr4() | CR4_PSE);
403
404		/*
405		 * We can do the mapping here for the single processor
406		 * case.  We simply ignore the old page table page from
407		 * now on.
408		 */
409		/*
410		 * For SMP, we still need 4K pages to bootstrap APs,
411		 * PSE will be enabled as soon as all APs are up.
412		 */
413		PTD[KPTDI] = (pd_entry_t) ptditmp;
414		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
415		invltlb();
416#endif
417	}
418#endif
419
420#ifdef SMP
421	if (cpu_apic_address == 0)
422		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
423
424	/* local apic is mapped on last page */
425	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
426	    (cpu_apic_address & PG_FRAME));
427#endif
428
429	invltlb();
430}
431
432#ifdef SMP
433/*
434 * Set the 4MB page directory entry used for MP (AP) startup
435 */
436void
437pmap_set_opt(void)
438{
439	if (pseflag && (cpu_feature & CPUID_PSE)) {
440		load_cr4(rcr4() | CR4_PSE);
441		if (pdir4mb && PCPU_GET(cpuid) == 0) {	/* only on BSP */
442			kernel_pmap->pm_pdir[KPTDI] =
443			    PTD[KPTDI] = (pd_entry_t)pdir4mb;
444			cpu_invltlb();
445		}
446	}
447}
448#endif
449
450/*
451 *	Initialize the pmap module.
452 *	Called by vm_init, to initialize any structures that the pmap
453 *	system needs to map virtual memory.
454 *	pmap_init has been enhanced to support discontiguous physical
455 *	memory in a fairly consistent way.
456 */
457void
458pmap_init(phys_start, phys_end)
459	vm_offset_t phys_start, phys_end;
460{
461	int i;
462	int initial_pvs;
463
464	/*
465	 * object for kernel page table pages
466	 */
467	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
468
469	/*
470	 * Allocate memory for random pmap data structures.  Includes the
471	 * pv_head_table.
472	 */
473
474	for(i = 0; i < vm_page_array_size; i++) {
475		vm_page_t m;
476
477		m = &vm_page_array[i];
478		TAILQ_INIT(&m->md.pv_list);
479		m->md.pv_list_count = 0;
480	}
481
482	/*
483	 * init the pv free list
484	 */
485	initial_pvs = vm_page_array_size;
486	if (initial_pvs < MINPV)
487		initial_pvs = MINPV;
488	pvzone = &pvzone_store;
489	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
490		initial_pvs * sizeof (struct pv_entry));
491	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
492	    vm_page_array_size);
493
494	/*
495	 * Now it is safe to enable pv_table recording.
496	 */
497	pmap_initialized = TRUE;
498}
499
500/*
501 * Initialize the address space (zone) for the pv_entries.  Set a
502 * high water mark so that the system can recover from excessive
503 * numbers of pv entries.
504 */
505void
506pmap_init2()
507{
508	int shpgperproc = PMAP_SHPGPERPROC;
509
510	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
511	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
512	pv_entry_high_water = 9 * (pv_entry_max / 10);
513	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
514}
515
516
517/***************************************************
518 * Low level helper routines.....
519 ***************************************************/
520
521#if defined(PMAP_DIAGNOSTIC)
522
523/*
524 * This code checks for non-writable, modified pages;
525 * that combination should never occur.
526 */
527static int
528pmap_nw_modified(pt_entry_t ptea)
529{
530	int pte;
531
532	pte = (int) ptea;
533
534	if ((pte & (PG_M|PG_RW)) == PG_M)
535		return 1;
536	else
537		return 0;
538}
539#endif
540
541
542/*
543 * This routine decides whether the modified bit should be tracked for
544 * a given address; the range [kmi.clean_sva, kmi.clean_eva) is excluded.
545 */
546static PMAP_INLINE int
547pmap_track_modified(vm_offset_t va)
548{
549	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
550		return 1;
551	else
552		return 0;
553}
554
555static PMAP_INLINE void
556invltlb_1pg(vm_offset_t va)
557{
558#ifdef I386_CPU
559	invltlb();
560#else
561	invlpg(va);
562#endif
563}
564
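/*
 * pm_active is a bitmask of the CPUs on which this pmap is active.
 * The local CPU's TLB entry is flushed directly; if any other CPU
 * also has the pmap active, a global shootdown via smp_invltlb()
 * is issued as well.
 */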
565static __inline void
566pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
567{
568#if defined(SMP)
569	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
570		cpu_invlpg((void *)va);
571	if (pmap->pm_active & PCPU_GET(other_cpus))
572		smp_invltlb();
573#else
574	if (pmap->pm_active)
575		invltlb_1pg(va);
576#endif
577}
578
579static __inline void
580pmap_invalidate_all(pmap_t pmap)
581{
582#if defined(SMP)
583	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
584		cpu_invltlb();
585	if (pmap->pm_active & PCPU_GET(other_cpus))
586		smp_invltlb();
587#else
588	if (pmap->pm_active)
589		invltlb();
590#endif
591}
592
593/*
594 * Return an address which is the base of the Virtual mapping of
595 * all the PTEs for the given pmap. Note this doesn't say that
596 * all the PTEs will be present or that the pages there are valid.
597 * The PTEs are made available by the recursive mapping trick.
598 * It will map in the alternate PTE space if needed.
599 */
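/*
 * The recursive mapping: PDE slot PTDPTDI points at the page directory
 * page itself, so the 4MB window at PTmap exposes every PTE of the
 * current address space; slot APTDPTDI provides the equivalent APTmap
 * window onto an alternate pmap's page tables.
 */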
600static pt_entry_t *
601get_ptbase(pmap)
602	pmap_t pmap;
603{
604	unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
605
606	/* are we current address space or kernel? */
607	if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
608		return PTmap;
609	}
610	/* otherwise, we are alternate address space */
611	if (frame != (((unsigned) APTDpde) & PG_FRAME)) {
612		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
613#if defined(SMP)
614		/* The page directory is not shared between CPUs */
615		cpu_invltlb();
616#else
617		invltlb();
618#endif
619	}
620	return APTmap;
621}
622
623/*
624 * Super fast pmap_pte routine best used when scanning
625 * the pv lists.  This eliminates many coarse-grained
626 * invltlb calls.  Note that many of the pv list
627 * scans are across different pmaps.  It is very wasteful
628 * to do an entire invltlb for checking a single mapping.
629 */
630
631static pt_entry_t *
632pmap_pte_quick(pmap, va)
633	register pmap_t pmap;
634	vm_offset_t va;
635{
636	pd_entry_t pde, newpf;
637	if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
638		pd_entry_t frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
639		unsigned index = i386_btop(va);
640		/* are we current address space or kernel? */
641		if ((pmap == kernel_pmap) ||
642			(frame == (((unsigned) PTDpde) & PG_FRAME))) {
643			return PTmap + index;
644		}
645		newpf = pde & PG_FRAME;
646		if (((*PMAP1) & PG_FRAME) != newpf) {
647			*PMAP1 = newpf | PG_RW | PG_V;
648			invltlb_1pg((vm_offset_t) PADDR1);
649		}
650		return PADDR1 + (index & (NPTEPG - 1));
651	}
652	return (0);
653}
654
655/*
656 *	Routine:	pmap_extract
657 *	Function:
658 *		Extract the physical page address associated
659 *		with the given map/virtual_address pair.
660 */
661vm_offset_t
662pmap_extract(pmap, va)
663	register pmap_t pmap;
664	vm_offset_t va;
665{
666	vm_offset_t rtval;	/* XXX FIXME */
667	vm_offset_t pdirindex;
668	pdirindex = va >> PDRSHIFT;
669	if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
670		pt_entry_t *pte;
671		if ((rtval & PG_PS) != 0) {
672			rtval &= ~(NBPDR - 1);
673			rtval |= va & (NBPDR - 1);
674			return rtval;
675		}
676		pte = get_ptbase(pmap) + i386_btop(va);
677		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
678		return rtval;
679	}
680	return 0;
681
682}
683
684/***************************************************
685 * Low level mapping routines.....
686 ***************************************************/
687
688/*
689 * Add a wired page to the kva.  Note that in order for the
690 * mapping to take effect, you should do an invltlb after
691 * doing the pmap_kenter.
692 */
693PMAP_INLINE void
694pmap_kenter(vm_offset_t va, vm_offset_t pa)
695{
696	pt_entry_t *pte;
697	pt_entry_t npte, opte;
698
699	npte = pa | PG_RW | PG_V | pgeflag;
700	pte = vtopte(va);
701	opte = *pte;
702	*pte = npte;
703#ifdef SMP
704	invlpg(va);
705#else
706	invltlb_1pg(va);
707#endif
708}
709
710/*
711 * remove a page from the kernel pagetables
712 */
713PMAP_INLINE void
714pmap_kremove(vm_offset_t va)
715{
716	register pt_entry_t *pte;
717
718	pte = vtopte(va);
719	*pte = 0;
720#ifdef SMP
721	invlpg(va);
722#else
723	invltlb_1pg(va);
724#endif
725}
726
727/*
728 *	Used to map a range of physical addresses into kernel
729 *	virtual address space.
730 *
731 *	The value passed in '*virt' is a suggested virtual address for
732 *	the mapping. Architectures which can support a direct-mapped
733 *	physical to virtual region can return the appropriate address
734 *	within that region, leaving '*virt' unchanged. Other
735 *	architectures should map the pages starting at '*virt' and
736 *	update '*virt' with the first usable address after the mapped
737 *	region.
738 */
739vm_offset_t
740pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
741{
742	vm_offset_t sva = *virt;
743	vm_offset_t va = sva;
744	while (start < end) {
745		pmap_kenter(va, start);
746		va += PAGE_SIZE;
747		start += PAGE_SIZE;
748	}
749	*virt = va;
750	return (sva);
751}
752
753
754/*
755 * Add a list of wired pages to the kva.
756 * This routine is only used for temporary
757 * kernel mappings that do not need to have
758 * page modification or references recorded.
759 * Note that old mappings are simply written
760 * over.  The pages *must* be wired.
761 */
762void
763pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
764{
765	vm_offset_t end_va;
766
767	end_va = va + count * PAGE_SIZE;
768
769	while (va < end_va) {
770		pt_entry_t *pte;
771
772		pte = vtopte(va);
773		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
774#ifdef SMP
775		cpu_invlpg((void *)va);
776#else
777		invltlb_1pg(va);
778#endif
779		va += PAGE_SIZE;
780		m++;
781	}
782#ifdef SMP
783	smp_invltlb();
784#endif
785}
786
787/*
788 * this routine jerks page mappings from the
789 * kernel -- it is meant only for temporary mappings.
790 */
791void
792pmap_qremove(vm_offset_t va, int count)
793{
794	vm_offset_t end_va;
795
796	end_va = va + count*PAGE_SIZE;
797
798	while (va < end_va) {
799		pt_entry_t *pte;
800
801		pte = vtopte(va);
802		*pte = 0;
803#ifdef SMP
804		cpu_invlpg((void *)va);
805#else
806		invltlb_1pg(va);
807#endif
808		va += PAGE_SIZE;
809	}
810#ifdef SMP
811	smp_invltlb();
812#endif
813}
814
815static vm_page_t
816pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
817{
818	vm_page_t m;
819retry:
820	m = vm_page_lookup(object, pindex);
821	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
822		goto retry;
823	return m;
824}
825
826/*
827 * Create the U-area for a new process.
828 * This routine directly affects the fork perf for a process.
829 */
830void
831pmap_new_proc(struct proc *p)
832{
833#ifdef I386_CPU
834	int updateneeded = 0;
835#endif
836	int i;
837	vm_object_t upobj;
838	vm_offset_t up;
839	vm_page_t m;
840	pt_entry_t *ptek, oldpte;
841
842	/*
843	 * allocate object for the upages
844	 */
845	upobj = p->p_upages_obj;
846	if (upobj == NULL) {
847		upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
848		p->p_upages_obj = upobj;
849	}
850
851	/* get a kernel virtual address for the U area for this thread */
852	up = (vm_offset_t)p->p_uarea;
853	if (up == 0) {
854		up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
855		if (up == 0)
856			panic("pmap_new_proc: upage allocation failed");
857		p->p_uarea = (struct user *)up;
858	}
859
860	ptek = vtopte(up);
861
862	for (i = 0; i < UAREA_PAGES; i++) {
863		/*
864		 * Get a kernel stack page
865		 */
866		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
867
868		/*
869		 * Wire the page
870		 */
871		m->wire_count++;
872		cnt.v_wire_count++;
873
874		oldpte = *(ptek + i);
875		/*
876		 * Enter the page into the kernel address space.
877		 */
878		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
879		if (oldpte) {
880#ifdef I386_CPU
881			updateneeded = 1;
882#else
883			invlpg(up + i * PAGE_SIZE);
884#endif
885		}
886
887		vm_page_wakeup(m);
888		vm_page_flag_clear(m, PG_ZERO);
889		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
890		m->valid = VM_PAGE_BITS_ALL;
891	}
892#ifdef I386_CPU
893	if (updateneeded)
894		invltlb();
895#endif
896}
897
898/*
899 * Dispose of the U-area for a process that has exited.
900 * This routine directly impacts the exit perf of a process.
901 */
902void
903pmap_dispose_proc(p)
904	struct proc *p;
905{
906	int i;
907	vm_object_t upobj;
908	vm_offset_t up;
909	vm_page_t m;
910	pt_entry_t *ptek, oldpte;
911
912	upobj = p->p_upages_obj;
913	up = (vm_offset_t)p->p_uarea;
914	ptek = vtopte(up);
915	for (i = 0; i < UAREA_PAGES; i++) {
916		m = vm_page_lookup(upobj, i);
917		if (m == NULL)
918			panic("pmap_dispose_proc: upage already missing?");
919		vm_page_busy(m);
920		oldpte = *(ptek + i);
921		*(ptek + i) = 0;
922#ifndef I386_CPU
923		invlpg(up + i * PAGE_SIZE);
924#endif
925		vm_page_unwire(m, 0);
926		vm_page_free(m);
927	}
928#ifdef I386_CPU
929	invltlb();
930#endif
931}
932
933/*
934 * Allow the U-area for a process to be prejudicially paged out.
935 */
936void
937pmap_swapout_proc(p)
938	struct proc *p;
939{
940	int i;
941	vm_object_t upobj;
942	vm_offset_t up;
943	vm_page_t m;
944
945	upobj = p->p_upages_obj;
946	up = (vm_offset_t)p->p_uarea;
947	for (i = 0; i < UAREA_PAGES; i++) {
948		m = vm_page_lookup(upobj, i);
949		if (m == NULL)
950			panic("pmap_swapout_proc: upage already missing?");
951		vm_page_dirty(m);
952		vm_page_unwire(m, 0);
953		pmap_kremove(up + i * PAGE_SIZE);
954	}
955}
956
957/*
958 * Bring the U-Area for a specified process back in.
959 */
960void
961pmap_swapin_proc(p)
962	struct proc *p;
963{
964	int i, rv;
965	vm_object_t upobj;
966	vm_offset_t up;
967	vm_page_t m;
968
969	upobj = p->p_upages_obj;
970	up = (vm_offset_t)p->p_uarea;
971	for (i = 0; i < UAREA_PAGES; i++) {
972		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
973		pmap_kenter(up + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
974		if (m->valid != VM_PAGE_BITS_ALL) {
975			rv = vm_pager_get_pages(upobj, &m, 1, 0);
976			if (rv != VM_PAGER_OK)
977				panic("pmap_swapin_proc: cannot get upage for proc: %d\n", p->p_pid);
978			m = vm_page_lookup(upobj, i);
979			m->valid = VM_PAGE_BITS_ALL;
980		}
981		vm_page_wire(m);
982		vm_page_wakeup(m);
983		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
984	}
985}
986
987/*
988 * Create the kernel stack (including pcb for i386) for a new thread.
989 * This routine directly affects the fork perf for a process and
990 * the creation performance for a thread.
991 */
992void
993pmap_new_thread(struct thread *td)
994{
995#ifdef I386_CPU
996	int updateneeded = 0;
997#endif
998	int i;
999	vm_object_t ksobj;
1000	vm_page_t m;
1001	vm_offset_t ks;
1002	pt_entry_t *ptek, oldpte;
1003
1004	/*
1005	 * allocate object for the kstack
1006	 */
1007	ksobj = td->td_kstack_obj;
1008	if (ksobj == NULL) {
1009		ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
1010		td->td_kstack_obj = ksobj;
1011	}
1012
1013#ifdef KSTACK_GUARD
1014	/* get a kernel virtual address for the kstack for this thread */
1015	ks = td->td_kstack;
1016	if (ks == 0) {
1017		ks = kmem_alloc_nofault(kernel_map,
1018		    (KSTACK_PAGES + 1) * PAGE_SIZE);
1019		if (ks == 0)
1020			panic("pmap_new_thread: kstack allocation failed");
1021		ks += PAGE_SIZE;
1022		td->td_kstack = ks;
1023	}
1024
1025	ptek = vtopte(ks - PAGE_SIZE);
1026	oldpte = *ptek;
1027	*ptek = 0;
1028	if (oldpte) {
1029#ifdef I386_CPU
1030		updateneeded = 1;
1031#else
1032		invlpg(ks - PAGE_SIZE);
1033#endif
1034	}
1035	ptek++;
1036#else
1037	/* get a kernel virtual address for the kstack for this thread */
1038	ks = td->td_kstack;
1039	if (ks == 0) {
1040		ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
1041		if (ks == 0)
1042			panic("pmap_new_thread: kstack allocation failed");
1043		td->td_kstack = ks;
1044	}
1045	ptek = vtopte(ks);
1046#endif
1047	for (i = 0; i < KSTACK_PAGES; i++) {
1048		/*
1049		 * Get a kernel stack page
1050		 */
1051		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1052
1053		/*
1054		 * Wire the page
1055		 */
1056		m->wire_count++;
1057		cnt.v_wire_count++;
1058
1059		oldpte = *(ptek + i);
1060		/*
1061		 * Enter the page into the kernel address space.
1062		 */
1063		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
1064		if (oldpte) {
1065#ifdef I386_CPU
1066			updateneeded = 1;
1067#else
1068			invlpg(ks + i * PAGE_SIZE);
1069#endif
1070		}
1071
1072		vm_page_wakeup(m);
1073		vm_page_flag_clear(m, PG_ZERO);
1074		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1075		m->valid = VM_PAGE_BITS_ALL;
1076	}
1077#ifdef I386_CPU
1078	if (updateneeded)
1079		invltlb();
1080#endif
1081}
1082
1083/*
1084 * Dispose of the kernel stack for a thread that has exited.
1085 * This routine directly impacts the exit perf of a process and thread.
1086 */
1087void
1088pmap_dispose_thread(td)
1089	struct thread *td;
1090{
1091	int i;
1092	vm_object_t ksobj;
1093	vm_offset_t ks;
1094	vm_page_t m;
1095	pt_entry_t *ptek, oldpte;
1096
1097	ksobj = td->td_kstack_obj;
1098	ks = td->td_kstack;
1099	ptek = vtopte(ks);
1100	for (i = 0; i < KSTACK_PAGES; i++) {
1101		m = vm_page_lookup(ksobj, i);
1102		if (m == NULL)
1103			panic("pmap_dispose_thread: kstack already missing?");
1104		vm_page_busy(m);
1105		oldpte = *(ptek + i);
1106		*(ptek + i) = 0;
1107#ifndef I386_CPU
1108		invlpg(ks + i * PAGE_SIZE);
1109#endif
1110		vm_page_unwire(m, 0);
1111		vm_page_free(m);
1112	}
1113#ifdef I386_CPU
1114	invltlb();
1115#endif
1116}
1117
1118/*
1119 * Allow the Kernel stack for a thread to be prejudicially paged out.
1120 */
1121void
1122pmap_swapout_thread(td)
1123	struct thread *td;
1124{
1125	int i;
1126	vm_object_t ksobj;
1127	vm_offset_t ks;
1128	vm_page_t m;
1129
1130	ksobj = td->td_kstack_obj;
1131	ks = td->td_kstack;
1132	for (i = 0; i < KSTACK_PAGES; i++) {
1133		m = vm_page_lookup(ksobj, i);
1134		if (m == NULL)
1135			panic("pmap_swapout_thread: kstack already missing?");
1136		vm_page_dirty(m);
1137		vm_page_unwire(m, 0);
1138		pmap_kremove(ks + i * PAGE_SIZE);
1139	}
1140}
1141
1142/*
1143 * Bring the kernel stack for a specified thread back in.
1144 */
1145void
1146pmap_swapin_thread(td)
1147	struct thread *td;
1148{
1149	int i, rv;
1150	vm_object_t ksobj;
1151	vm_offset_t ks;
1152	vm_page_t m;
1153
1154	ksobj = td->td_kstack_obj;
1155	ks = td->td_kstack;
1156	for (i = 0; i < KSTACK_PAGES; i++) {
1157		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1158		pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
1159		if (m->valid != VM_PAGE_BITS_ALL) {
1160			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1161			if (rv != VM_PAGER_OK)
1162				panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1163			m = vm_page_lookup(ksobj, i);
1164			m->valid = VM_PAGE_BITS_ALL;
1165		}
1166		vm_page_wire(m);
1167		vm_page_wakeup(m);
1168		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1169	}
1170}
1171
1172/***************************************************
1173 * Page table page management routines.....
1174 ***************************************************/
1175
1176/*
1177 * This routine unholds page table pages, and if the hold count
1178 * drops to zero, then it decrements the wire count.
1179 */
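/*
 * For page table pages, hold_count tracks the mappings that currently
 * reference the page and wire_count keeps it resident; once both reach
 * zero the page can be unmapped from the page directory and freed.
 */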
1180static int
1181_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1182{
1183
1184	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1185		;
1186
1187	if (m->hold_count == 0) {
1188		vm_offset_t pteva;
1189		/*
1190		 * unmap the page table page
1191		 */
1192		pmap->pm_pdir[m->pindex] = 0;
1193		--pmap->pm_stats.resident_count;
1194		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1195		    (PTDpde & PG_FRAME)) {
1196			/*
1197			 * Do a invltlb to make the invalidated mapping
1198			 * take effect immediately.
1199			 */
1200			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1201			pmap_invalidate_page(pmap, pteva);
1202		}
1203
1204		if (pmap->pm_ptphint == m)
1205			pmap->pm_ptphint = NULL;
1206
1207		/*
1208		 * If the page is finally unwired, simply free it.
1209		 */
1210		--m->wire_count;
1211		if (m->wire_count == 0) {
1212
1213			vm_page_flash(m);
1214			vm_page_busy(m);
1215			vm_page_free_zero(m);
1216			--cnt.v_wire_count;
1217		}
1218		return 1;
1219	}
1220	return 0;
1221}
1222
1223static PMAP_INLINE int
1224pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1225{
1226	vm_page_unhold(m);
1227	if (m->hold_count == 0)
1228		return _pmap_unwire_pte_hold(pmap, m);
1229	else
1230		return 0;
1231}
1232
1233/*
1234 * After removing a page table entry, this routine is used to
1235 * conditionally free the page, and manage the hold/wire counts.
1236 */
1237static int
1238pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1239{
1240	unsigned ptepindex;
1241	if (va >= VM_MAXUSER_ADDRESS)
1242		return 0;
1243
1244	if (mpte == NULL) {
1245		ptepindex = (va >> PDRSHIFT);
1246		if (pmap->pm_ptphint &&
1247			(pmap->pm_ptphint->pindex == ptepindex)) {
1248			mpte = pmap->pm_ptphint;
1249		} else {
1250			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1251			pmap->pm_ptphint = mpte;
1252		}
1253	}
1254
1255	return pmap_unwire_pte_hold(pmap, mpte);
1256}
1257
1258void
1259pmap_pinit0(pmap)
1260	struct pmap *pmap;
1261{
1262	pmap->pm_pdir =
1263		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1264	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1265	pmap->pm_count = 1;
1266	pmap->pm_ptphint = NULL;
1267	pmap->pm_active = 0;
1268	TAILQ_INIT(&pmap->pm_pvlist);
1269	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1270	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1271}
1272
1273/*
1274 * Initialize a preallocated and zeroed pmap structure,
1275 * such as one in a vmspace structure.
1276 */
1277void
1278pmap_pinit(pmap)
1279	register struct pmap *pmap;
1280{
1281	vm_page_t ptdpg;
1282
1283	/*
1284	 * No need to allocate page table space yet but we do need a valid
1285	 * page directory table.
1286	 */
1287	if (pmap->pm_pdir == NULL)
1288		pmap->pm_pdir =
1289			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1290
1291	/*
1292	 * allocate object for the ptes
1293	 */
1294	if (pmap->pm_pteobj == NULL)
1295		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1296
1297	/*
1298	 * allocate the page directory page
1299	 */
1300	ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
1301			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1302
1303	ptdpg->wire_count = 1;
1304	++cnt.v_wire_count;
1305
1306
1307	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
1308	ptdpg->valid = VM_PAGE_BITS_ALL;
1309
1310	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1311	if ((ptdpg->flags & PG_ZERO) == 0)
1312		bzero(pmap->pm_pdir, PAGE_SIZE);
1313
1314	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1315	/* Wire in kernel global address entries. */
1316	/* XXX copies current process, does not fill in MPPTDI */
1317	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1318#ifdef SMP
1319	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1320#endif
1321
1322	/* install self-referential address mapping entry */
1323	pmap->pm_pdir[PTDPTDI] =
1324		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1325
1326	pmap->pm_count = 1;
1327	pmap->pm_active = 0;
1328	pmap->pm_ptphint = NULL;
1329	TAILQ_INIT(&pmap->pm_pvlist);
1330	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1331}
1332
1333/*
1334 * Wire in kernel global address entries.  To avoid a race condition
1335 * between pmap initialization and pmap_growkernel, this procedure
1336 * should be called after the vmspace is attached to the process
1337 * but before this pmap is activated.
1338 */
1339void
1340pmap_pinit2(pmap)
1341	struct pmap *pmap;
1342{
1343	/* XXX: Remove this stub when no longer called */
1344}
1345
1346static int
1347pmap_release_free_page(pmap_t pmap, vm_page_t p)
1348{
1349	pd_entry_t *pde = pmap->pm_pdir;
1350	/*
1351	 * This code optimizes the case of freeing non-busy
1352	 * page-table pages.  Those pages are zero now, and
1353	 * might as well be placed directly into the zero queue.
1354	 */
1355	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1356		return 0;
1357
1358	vm_page_busy(p);
1359
1360	/*
1361	 * Remove the page table page from the process's address space.
1362	 */
1363	pde[p->pindex] = 0;
1364	pmap->pm_stats.resident_count--;
1365
1366	if (p->hold_count)  {
1367		panic("pmap_release: freeing held page table page");
1368	}
1369	/*
1370	 * Page directory pages need to have the kernel
1371	 * stuff cleared, so they can go into the zero queue also.
1372	 */
1373	if (p->pindex == PTDPTDI) {
1374		bzero(pde + KPTDI, nkpt * PTESIZE);
1375#ifdef SMP
1376		pde[MPPTDI] = 0;
1377#endif
1378		pde[APTDPTDI] = 0;
1379		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1380	}
1381
1382	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1383		pmap->pm_ptphint = NULL;
1384
1385	p->wire_count--;
1386	cnt.v_wire_count--;
1387	vm_page_free_zero(p);
1388	return 1;
1389}
1390
1391/*
1392 * this routine is called if the page table page is not
1393 * mapped correctly.
1394 */
1395static vm_page_t
1396_pmap_allocpte(pmap, ptepindex)
1397	pmap_t	pmap;
1398	unsigned ptepindex;
1399{
1400	vm_offset_t pteva, ptepa;	/* XXXPA */
1401	vm_page_t m;
1402
1403	/*
1404	 * Find or fabricate a new pagetable page
1405	 */
1406	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1407			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1408
1409	KASSERT(m->queue == PQ_NONE,
1410		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1411
1412	if (m->wire_count == 0)
1413		cnt.v_wire_count++;
1414	m->wire_count++;
1415
1416	/*
1417	 * Increment the hold count for the page table page
1418	 * (denoting a new mapping.)
1419	 */
1420	m->hold_count++;
1421
1422	/*
1423	 * Map the pagetable page into the process address space, if
1424	 * it isn't already there.
1425	 */
1426
1427	pmap->pm_stats.resident_count++;
1428
1429	ptepa = VM_PAGE_TO_PHYS(m);
1430	pmap->pm_pdir[ptepindex] =
1431		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1432
1433	/*
1434	 * Set the page table hint
1435	 */
1436	pmap->pm_ptphint = m;
1437
1438	/*
1439	 * Try to use the new mapping, but if we cannot, then
1440	 * do it with the routine that maps the page explicitly.
1441	 */
1442	if ((m->flags & PG_ZERO) == 0) {
1443		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1444		    (PTDpde & PG_FRAME)) {
1445			pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex);
1446			bzero((caddr_t) pteva, PAGE_SIZE);
1447		} else {
1448			pmap_zero_page(ptepa);
1449		}
1450	}
1451
1452	m->valid = VM_PAGE_BITS_ALL;
1453	vm_page_flag_clear(m, PG_ZERO);
1454	vm_page_flag_set(m, PG_MAPPED);
1455	vm_page_wakeup(m);
1456
1457	return m;
1458}
1459
1460static vm_page_t
1461pmap_allocpte(pmap_t pmap, vm_offset_t va)
1462{
1463	unsigned ptepindex;
1464	pd_entry_t ptepa;
1465	vm_page_t m;
1466
1467	/*
1468	 * Calculate pagetable page index
1469	 */
1470	ptepindex = va >> PDRSHIFT;
1471
1472	/*
1473	 * Get the page directory entry
1474	 */
1475	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1476
1477	/*
1478	 * This supports switching from a 4MB page to a
1479	 * normal 4K page.
1480	 */
1481	if (ptepa & PG_PS) {
1482		pmap->pm_pdir[ptepindex] = 0;
1483		ptepa = 0;
1484		invltlb();
1485	}
1486
1487	/*
1488	 * If the page table page is mapped, we just increment the
1489	 * hold count, and activate it.
1490	 */
1491	if (ptepa) {
1492		/*
1493		 * In order to get the page table page, try the
1494		 * hint first.
1495		 */
1496		if (pmap->pm_ptphint &&
1497			(pmap->pm_ptphint->pindex == ptepindex)) {
1498			m = pmap->pm_ptphint;
1499		} else {
1500			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1501			pmap->pm_ptphint = m;
1502		}
1503		m->hold_count++;
1504		return m;
1505	}
1506	/*
1507	 * Here if the pte page isn't mapped, or if it has been deallocated.
1508	 */
1509	return _pmap_allocpte(pmap, ptepindex);
1510}
1511
1512
1513/***************************************************
1514 * Pmap allocation/deallocation routines.
1515 ***************************************************/
1516
1517/*
1518 * Release any resources held by the given physical map.
1519 * Called when a pmap initialized by pmap_pinit is being released.
1520 * Should only be called if the map contains no valid mappings.
1521 */
1522void
1523pmap_release(pmap_t pmap)
1524{
1525	vm_page_t p,n,ptdpg;
1526	vm_object_t object = pmap->pm_pteobj;
1527	int curgeneration;
1528
1529#if defined(DIAGNOSTIC)
1530	if (object->ref_count != 1)
1531		panic("pmap_release: pteobj reference count != 1");
1532#endif
1533
1534	ptdpg = NULL;
1535	LIST_REMOVE(pmap, pm_list);
1536retry:
1537	curgeneration = object->generation;
1538	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1539		n = TAILQ_NEXT(p, listq);
1540		if (p->pindex == PTDPTDI) {
1541			ptdpg = p;
1542			continue;
1543		}
1544		while (1) {
1545			if (!pmap_release_free_page(pmap, p) &&
1546				(object->generation != curgeneration))
1547				goto retry;
1548		}
1549	}
1550
1551	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1552		goto retry;
1553}
1554
1555static int
1556kvm_size(SYSCTL_HANDLER_ARGS)
1557{
1558	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1559
1560        return sysctl_handle_long(oidp, &ksize, 0, req);
1561}
1562SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1563    0, 0, kvm_size, "IU", "Size of KVM");
1564
1565static int
1566kvm_free(SYSCTL_HANDLER_ARGS)
1567{
1568	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1569
1570        return sysctl_handle_long(oidp, &kfree, 0, req);
1571}
1572SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1573    0, 0, kvm_free, "IU", "Amount of KVM free");
1574
1575/*
1576 * grow the number of kernel page table entries, if needed
1577 */
1578void
1579pmap_growkernel(vm_offset_t addr)
1580{
1581	struct pmap *pmap;
1582	int s;
1583	vm_offset_t ptppaddr;
1584	vm_page_t nkpg;
1585	pd_entry_t newpdir;
1586
1587	s = splhigh();
1588	if (kernel_vm_end == 0) {
1589		kernel_vm_end = KERNBASE;
1590		nkpt = 0;
1591		while (pdir_pde(PTD, kernel_vm_end)) {
1592			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1593			nkpt++;
1594		}
1595	}
1596	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1597	while (kernel_vm_end < addr) {
1598		if (pdir_pde(PTD, kernel_vm_end)) {
1599			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1600			continue;
1601		}
1602
1603		/*
1604		 * This index is bogus, but out of the way
1605		 */
1606		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1607		if (!nkpg)
1608			panic("pmap_growkernel: no memory to grow kernel");
1609
1610		nkpt++;
1611
1612		vm_page_wire(nkpg);
1613		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1614		pmap_zero_page(ptppaddr);
1615		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1616		pdir_pde(PTD, kernel_vm_end) = newpdir;
1617
1618		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1619			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1620		}
1621		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1622	}
1623	splx(s);
1624}
1625
1626/*
1627 *	Retire the given physical map from service.
1628 *	Should only be called if the map contains
1629 *	no valid mappings.
1630 */
1631void
1632pmap_destroy(pmap_t pmap)
1633{
1634	int count;
1635
1636	if (pmap == NULL)
1637		return;
1638
1639	count = --pmap->pm_count;
1640	if (count == 0) {
1641		pmap_release(pmap);
1642		panic("destroying a pmap is not yet implemented");
1643	}
1644}
1645
1646/*
1647 *	Add a reference to the specified pmap.
1648 */
1649void
1650pmap_reference(pmap_t pmap)
1651{
1652	if (pmap != NULL) {
1653		pmap->pm_count++;
1654	}
1655}
1656
1657/***************************************************
1658 * Page management routines.
1659 ***************************************************/
1660
1661/*
1662 * free the pv_entry back to the free list
1663 */
1664static PMAP_INLINE void
1665free_pv_entry(pv_entry_t pv)
1666{
1667	pv_entry_count--;
1668	zfree(pvzone, pv);
1669}
1670
1671/*
1672 * Get a new pv_entry, allocating a block from the system
1673 * when needed.
1674 * The memory allocation is performed bypassing the malloc code
1675 * because of the possibility of allocations at interrupt time.
1676 */
1677static pv_entry_t
1678get_pv_entry(void)
1679{
1680	pv_entry_count++;
1681	if (pv_entry_high_water &&
1682		(pv_entry_count > pv_entry_high_water) &&
1683		(pmap_pagedaemon_waken == 0)) {
1684		pmap_pagedaemon_waken = 1;
1685		wakeup (&vm_pages_needed);
1686	}
1687	return zalloc(pvzone);
1688}
1689
1690/*
1691 * This routine is very drastic, but can save the system
1692 * in a pinch.
1693 */
1694void
1695pmap_collect()
1696{
1697	int i;
1698	vm_page_t m;
1699	static int warningdone = 0;
1700
1701	if (pmap_pagedaemon_waken == 0)
1702		return;
1703
1704	if (warningdone < 5) {
1705		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1706		warningdone++;
1707	}
1708
1709	for(i = 0; i < vm_page_array_size; i++) {
1710		m = &vm_page_array[i];
1711		if (m->wire_count || m->hold_count || m->busy ||
1712		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
1713			continue;
1714		pmap_remove_all(m);
1715	}
1716	pmap_pagedaemon_waken = 0;
1717}
1718
1719
1720/*
1721 * If it is the first entry on the list, it is actually
1722 * in the header and we must copy the following entry up
1723 * to the header.  Otherwise we must search the list for
1724 * the entry.  In either case we free the now unused entry.
1725 */
1726
1727static int
1728pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1729{
1730	pv_entry_t pv;
1731	int rtval;
1732	int s;
1733
1734	s = splvm();
1735	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1736		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1737			if (pmap == pv->pv_pmap && va == pv->pv_va)
1738				break;
1739		}
1740	} else {
1741		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1742			if (va == pv->pv_va)
1743				break;
1744		}
1745	}
1746
1747	rtval = 0;
1748	if (pv) {
1749		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1750		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1751		m->md.pv_list_count--;
1752		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1753			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1754
1755		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1756		free_pv_entry(pv);
1757	}
1758
1759	splx(s);
1760	return rtval;
1761}
1762
1763/*
1764 * Create a pv entry for page at pa for
1765 * (pmap, va).
1766 */
1767static void
1768pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1769{
1770
1771	int s;
1772	pv_entry_t pv;
1773
1774	s = splvm();
1775	pv = get_pv_entry();
1776	pv->pv_va = va;
1777	pv->pv_pmap = pmap;
1778	pv->pv_ptem = mpte;
1779
1780	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1781	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1782	m->md.pv_list_count++;
1783
1784	splx(s);
1785}
1786
1787/*
1788 * pmap_remove_pte: unmap a single page from a process's address space
1789 */
1790static int
1791pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1792{
1793	pt_entry_t oldpte;
1794	vm_page_t m;
1795
1796	oldpte = atomic_readandclear_int(ptq);
1797	if (oldpte & PG_W)
1798		pmap->pm_stats.wired_count -= 1;
1799	/*
1800	 * Machines that don't support invlpg also don't support
1801	 * PG_G.
1802	 */
1803	if (oldpte & PG_G)
1804		invlpg(va);
1805	pmap->pm_stats.resident_count -= 1;
1806	if (oldpte & PG_MANAGED) {
1807		m = PHYS_TO_VM_PAGE(oldpte);
1808		if (oldpte & PG_M) {
1809#if defined(PMAP_DIAGNOSTIC)
1810			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1811				printf(
1812	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1813				    va, oldpte);
1814			}
1815#endif
1816			if (pmap_track_modified(va))
1817				vm_page_dirty(m);
1818		}
1819		if (oldpte & PG_A)
1820			vm_page_flag_set(m, PG_REFERENCED);
1821		return pmap_remove_entry(pmap, m, va);
1822	} else {
1823		return pmap_unuse_pt(pmap, va, NULL);
1824	}
1825
1826	return 0;
1827}
1828
1829/*
1830 * Remove a single page from a process address space
1831 */
1832static void
1833pmap_remove_page(pmap_t pmap, vm_offset_t va)
1834{
1835	register pt_entry_t *ptq;
1836
1837	/*
1838	 * if there is no pte for this address, just skip it!!!
1839	 */
1840	if (*pmap_pde(pmap, va) == 0) {
1841		return;
1842	}
1843
1844	/*
1845	 * get a local va for mappings for this pmap.
1846	 */
1847	ptq = get_ptbase(pmap) + i386_btop(va);
1848	if (*ptq) {
1849		(void) pmap_remove_pte(pmap, ptq, va);
1850		pmap_invalidate_page(pmap, va);
1851	}
1852	return;
1853}
1854
1855/*
1856 *	Remove the given range of addresses from the specified map.
1857 *
1858 *	It is assumed that the start and end are properly
1859 *	rounded to the page size.
1860 */
1861void
1862pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1863{
1864	register pt_entry_t *ptbase;
1865	vm_offset_t pdnxt;
1866	pd_entry_t ptpaddr;
1867	vm_offset_t sindex, eindex;
1868	int anyvalid;
1869
1870	if (pmap == NULL)
1871		return;
1872
1873	if (pmap->pm_stats.resident_count == 0)
1874		return;
1875
1876	/*
1877	 * Special handling for removing a single page: a very
1878	 * common operation for which it is easy to short-circuit
1879	 * some code.
1880	 */
1881	if ((sva + PAGE_SIZE == eva) &&
1882	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1883		pmap_remove_page(pmap, sva);
1884		return;
1885	}
1886
1887	anyvalid = 0;
1888
1889	/*
1890	 * Get a local virtual address for the mappings that are being
1891	 * worked with.
1892	 */
1893	ptbase = get_ptbase(pmap);
1894
1895	sindex = i386_btop(sva);
1896	eindex = i386_btop(eva);
1897
1898	for (; sindex < eindex; sindex = pdnxt) {
1899		unsigned pdirindex;
1900
1901		/*
1902		 * Calculate index for next page table.
1903		 */
1904		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1905		if (pmap->pm_stats.resident_count == 0)
1906			break;
1907
1908		pdirindex = sindex / NPDEPG;
1909		ptpaddr = pmap->pm_pdir[pdirindex];
1910		if ((ptpaddr & PG_PS) != 0) {
1911			pmap->pm_pdir[pdirindex] = 0;
1912			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1913			anyvalid++;
1914			continue;
1915		}
1916
1917		/*
1918		 * Weed out invalid mappings. Note: we assume that the page
1919		 * directory table is always allocated, and in kernel virtual.
1920		 */
1921		if (ptpaddr == 0)
1922			continue;
1923
1924		/*
1925		 * Limit our scan to either the end of the va represented
1926		 * by the current page table page, or to the end of the
1927		 * range being removed.
1928		 */
1929		if (pdnxt > eindex) {
1930			pdnxt = eindex;
1931		}
1932
1933		for (; sindex != pdnxt; sindex++) {
1934			vm_offset_t va;
1935			if (ptbase[sindex] == 0) {
1936				continue;
1937			}
1938			va = i386_ptob(sindex);
1939
1940			anyvalid++;
1941			if (pmap_remove_pte(pmap,
1942				ptbase + sindex, va))
1943				break;
1944		}
1945	}
1946
1947	if (anyvalid)
1948		pmap_invalidate_all(pmap);
1949}
1950
1951/*
1952 *	Routine:	pmap_remove_all
1953 *	Function:
1954 *		Removes this physical page from
1955 *		all physical maps in which it resides.
1956 *		Reflects back modify bits to the pager.
1957 *
1958 *	Notes:
1959 *		Original versions of this routine were very
1960 *		inefficient because they iteratively called
1961 *		pmap_remove (slow...)
1962 */
1963
1964static void
1965pmap_remove_all(vm_page_t m)
1966{
1967	register pv_entry_t pv;
1968	pt_entry_t *pte, tpte;
1969	int s;
1970
1971#if defined(PMAP_DIAGNOSTIC)
1972	/*
1973	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1974	 * pages!
1975	 */
1976	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1977		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
1978	}
1979#endif
1980
1981	s = splvm();
1982	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1983		pv->pv_pmap->pm_stats.resident_count--;
1984
1985		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1986
1987		tpte = atomic_readandclear_int(pte);
1988		if (tpte & PG_W)
1989			pv->pv_pmap->pm_stats.wired_count--;
1990
1991		if (tpte & PG_A)
1992			vm_page_flag_set(m, PG_REFERENCED);
1993
1994		/*
1995		 * Update the vm_page_t clean and reference bits.
1996		 */
1997		if (tpte & PG_M) {
1998#if defined(PMAP_DIAGNOSTIC)
1999			if (pmap_nw_modified((pt_entry_t) tpte)) {
2000				printf(
2001	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
2002				    pv->pv_va, tpte);
2003			}
2004#endif
2005			if (pmap_track_modified(pv->pv_va))
2006				vm_page_dirty(m);
2007		}
2008		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2009
2010		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2011		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2012		m->md.pv_list_count--;
2013		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2014		free_pv_entry(pv);
2015	}
2016
2017	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2018
2019	splx(s);
2020}
2021
2022/*
2023 *	Set the physical protection on the
2024 *	specified range of this map as requested.
2025 */
2026void
2027pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2028{
2029	register pt_entry_t *ptbase;
2030	vm_offset_t pdnxt;
2031	pd_entry_t ptpaddr;
2032	vm_pindex_t sindex, eindex;
2033	int anychanged;
2034
2035	if (pmap == NULL)
2036		return;
2037
2038	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2039		pmap_remove(pmap, sva, eva);
2040		return;
2041	}
2042
2043	if (prot & VM_PROT_WRITE)
2044		return;
2045
2046	anychanged = 0;
2047
2048	ptbase = get_ptbase(pmap);
2049
2050	sindex = i386_btop(sva);
2051	eindex = i386_btop(eva);
2052
2053	for (; sindex < eindex; sindex = pdnxt) {
2054
2055		unsigned pdirindex;
2056
2057		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2058
2059		pdirindex = sindex / NPDEPG;
2060		ptpaddr = pmap->pm_pdir[pdirindex];
2061		if ((ptpaddr & PG_PS) != 0) {
2062			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2063			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2064			anychanged++;
2065			continue;
2066		}
2067
2068		/*
2069		 * Weed out invalid mappings. Note: we assume that the page
2070		 * directory table is always allocated, and in kernel virtual.
2071		 */
2072		if (ptpaddr == 0)
2073			continue;
2074
2075		if (pdnxt > eindex) {
2076			pdnxt = eindex;
2077		}
2078
2079		for (; sindex != pdnxt; sindex++) {
2080
2081			pt_entry_t pbits;
2082			vm_page_t m;
2083
2084			pbits = ptbase[sindex];
2085
2086			if (pbits & PG_MANAGED) {
2087				m = NULL;
2088				if (pbits & PG_A) {
2089					m = PHYS_TO_VM_PAGE(pbits);
2090					vm_page_flag_set(m, PG_REFERENCED);
2091					pbits &= ~PG_A;
2092				}
2093				if (pbits & PG_M) {
2094					if (pmap_track_modified(i386_ptob(sindex))) {
2095						if (m == NULL)
2096							m = PHYS_TO_VM_PAGE(pbits);
2097						vm_page_dirty(m);
2098						pbits &= ~PG_M;
2099					}
2100				}
2101			}
2102
2103			pbits &= ~PG_RW;
2104
2105			if (pbits != ptbase[sindex]) {
2106				ptbase[sindex] = pbits;
2107				anychanged = 1;
2108			}
2109		}
2110	}
2111	if (anychanged)
2112		pmap_invalidate_all(pmap);
2113}
2114
2115/*
2116 *	Insert the given physical page (p) at
2117 *	the specified virtual address (v) in the
2118 *	target physical map with the protection requested.
2119 *
2120 *	If specified, the page will be wired down, meaning
2121 *	that the related pte can not be reclaimed.
2122 *
2123 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2124 *	or lose information.  That is, this routine must actually
2125 *	insert this page into the given map NOW.
2126 */
2127void
2128pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2129	   boolean_t wired)
2130{
2131	vm_offset_t pa;
2132	register pt_entry_t *pte;
2133	vm_offset_t opa;
2134	pt_entry_t origpte, newpte;
2135	vm_page_t mpte;
2136
2137	if (pmap == NULL)
2138		return;
2139
2140	va &= PG_FRAME;
2141#ifdef PMAP_DIAGNOSTIC
2142	if (va > VM_MAX_KERNEL_ADDRESS)
2143		panic("pmap_enter: toobig");
2144	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2145		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2146#endif
2147
2148	mpte = NULL;
2149	/*
2150	 * In the case that a page table page is not
2151	 * resident, we are creating it here.
2152	 */
2153	if (va < VM_MAXUSER_ADDRESS) {
2154		mpte = pmap_allocpte(pmap, va);
2155	}
2156#if 0 && defined(PMAP_DIAGNOSTIC)
2157	else {
2158		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2159		if (((origpte = *pdeaddr) & PG_V) == 0) {
2160			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2161				pmap->pm_pdir[PTDPTDI], origpte, va);
2162		}
2163	}
2164#endif
2165
2166	pte = pmap_pte(pmap, va);
2167
2168	/*
2169	 * Page Directory table entry not valid, we need a new PT page
2170	 */
2171	if (pte == NULL) {
2172		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2173			(void *)pmap->pm_pdir[PTDPTDI], va);
2174	}
2175
2176	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2177	origpte = *(vm_offset_t *)pte;
2178	opa = origpte & PG_FRAME;
2179
2180	if (origpte & PG_PS)
2181		panic("pmap_enter: attempted pmap_enter on 4MB page");
2182
2183	/*
2184	 * Mapping has not changed, must be protection or wiring change.
2185	 */
2186	if (origpte && (opa == pa)) {
2187		/*
2188		 * Wiring change, just update stats. We don't worry about
2189		 * wiring PT pages as they remain resident as long as there
2190		 * are valid mappings in them. Hence, if a user page is wired,
2191		 * the PT page will be also.
2192		 */
2193		if (wired && ((origpte & PG_W) == 0))
2194			pmap->pm_stats.wired_count++;
2195		else if (!wired && (origpte & PG_W))
2196			pmap->pm_stats.wired_count--;
2197
2198#if defined(PMAP_DIAGNOSTIC)
2199		if (pmap_nw_modified((pt_entry_t) origpte)) {
2200			printf(
2201	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2202			    va, origpte);
2203		}
2204#endif
2205
2206		/*
2207		 * Remove extra pte reference
2208		 */
2209		if (mpte)
2210			mpte->hold_count--;
2211
2212		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2213			if ((origpte & PG_RW) == 0) {
2214				*pte |= PG_RW;
2215#ifdef SMP
2216				cpu_invlpg((void *)va);
2217				if (pmap->pm_active & PCPU_GET(other_cpus))
2218					smp_invltlb();
2219#else
2220				invltlb_1pg(va);
2221#endif
2222			}
2223			return;
2224		}
2225
2226		/*
2227		 * We might be turning off write access to the page,
2228		 * so we go ahead and sense modify status.
2229		 */
2230		if (origpte & PG_MANAGED) {
2231			if ((origpte & PG_M) && pmap_track_modified(va)) {
2232				vm_page_t om;
2233				om = PHYS_TO_VM_PAGE(opa);
2234				vm_page_dirty(om);
2235			}
2236			pa |= PG_MANAGED;
2237		}
2238		goto validate;
2239	}
2240	/*
2241	 * Mapping has changed, invalidate old range and fall through to
2242	 * handle validating new mapping.
2243	 */
2244	if (opa) {
2245		int err;
2246		err = pmap_remove_pte(pmap, pte, va);
2247		if (err)
2248			panic("pmap_enter: pte vanished, va: 0x%x", va);
2249	}
2250
2251	/*
2252	 * Enter on the PV list if part of our managed memory. Note that we
2253	 * raise IPL while manipulating pv_table since pmap_enter can be
2254	 * called at interrupt time.
2255	 */
2256	if (pmap_initialized &&
2257	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2258		pmap_insert_entry(pmap, va, mpte, m);
2259		pa |= PG_MANAGED;
2260	}
2261
2262	/*
2263	 * Increment counters
2264	 */
2265	pmap->pm_stats.resident_count++;
2266	if (wired)
2267		pmap->pm_stats.wired_count++;
2268
2269validate:
2270	/*
2271	 * Now validate mapping with desired protection/wiring.
2272	 */
2273	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2274
2275	if (wired)
2276		newpte |= PG_W;
2277	if (va < VM_MAXUSER_ADDRESS)
2278		newpte |= PG_U;
2279	if (pmap == kernel_pmap)
2280		newpte |= pgeflag;
2281
2282	/*
2283	 * if the mapping or permission bits are different, we need
2284	 * to update the pte.
2285	 */
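	/*
	 * PG_M and PG_A are masked out of the comparison; the MMU may set
	 * them at any time.
	 */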
2286	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2287		*pte = newpte | PG_A;
2288		/*if (origpte)*/ {
2289#ifdef SMP
2290			cpu_invlpg((void *)va);
2291			if (pmap->pm_active & PCPU_GET(other_cpus))
2292				smp_invltlb();
2293#else
2294			invltlb_1pg(va);
2295#endif
2296		}
2297	}
2298}
2299
2300/*
2301 * this code makes some *MAJOR* assumptions:
2302 * 1. Current pmap & pmap exist.
2303 * 2. Not wired.
2304 * 3. Read access.
2305 * 4. No page table pages.
2306 * 5. Tlbflush is deferred to calling procedure.
2307 * 6. Page IS managed.
2308 * but is *MUCH* faster than pmap_enter...
2309 */
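/*
 * Illustrative use: pmap_object_init_pt() and pmap_prefault() below call
 * this to pre-enter read-only mappings, feeding the returned page table
 * page back in as the mpte hint on the next call.
 */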
2310
2311static vm_page_t
2312pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2313{
2314	pt_entry_t *pte;
2315	vm_offset_t pa;
2316
2317	/*
2318	 * In the case that a page table page is not
2319	 * resident, we are creating it here.
2320	 */
2321	if (va < VM_MAXUSER_ADDRESS) {
2322		unsigned ptepindex;
2323		pd_entry_t ptepa;
2324
2325		/*
2326		 * Calculate pagetable page index
2327		 */
2328		ptepindex = va >> PDRSHIFT;
2329		if (mpte && (mpte->pindex == ptepindex)) {
2330			mpte->hold_count++;
2331		} else {
2332retry:
2333			/*
2334			 * Get the page directory entry
2335			 */
2336			ptepa = pmap->pm_pdir[ptepindex];
2337
2338			/*
2339			 * If the page table page is mapped, we just increment
2340			 * the hold count, and activate it.
2341			 */
2342			if (ptepa) {
2343				if (ptepa & PG_PS)
2344					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2345				if (pmap->pm_ptphint &&
2346					(pmap->pm_ptphint->pindex == ptepindex)) {
2347					mpte = pmap->pm_ptphint;
2348				} else {
2349					mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2350					pmap->pm_ptphint = mpte;
2351				}
2352				if (mpte == NULL)
2353					goto retry;
2354				mpte->hold_count++;
2355			} else {
2356				mpte = _pmap_allocpte(pmap, ptepindex);
2357			}
2358		}
2359	} else {
2360		mpte = NULL;
2361	}
2362
2363	/*
2364	 * This call to vtopte makes the assumption that we are
2365	 * entering the page into the current pmap.  In order to support
2366	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2367	 * But that isn't as quick as vtopte.
2368	 */
2369	pte = vtopte(va);
2370	if (*pte) {
2371		if (mpte)
2372			pmap_unwire_pte_hold(pmap, mpte);
2373		return 0;
2374	}
2375
2376	/*
2377	 * Enter on the PV list if part of our managed memory. Note that we
2378	 * raise IPL while manipulating pv_table since pmap_enter can be
2379	 * called at interrupt time.
2380	 */
2381	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2382		pmap_insert_entry(pmap, va, mpte, m);
2383
2384	/*
2385	 * Increment counters
2386	 */
2387	pmap->pm_stats.resident_count++;
2388
2389	pa = VM_PAGE_TO_PHYS(m);
2390
2391	/*
2392	 * Now validate mapping with RO protection
2393	 */
2394	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2395		*pte = pa | PG_V | PG_U;
2396	else
2397		*pte = pa | PG_V | PG_U | PG_MANAGED;
2398
2399	return mpte;
2400}
2401
2402/*
2403 * Make a temporary mapping for a physical address.  This is only intended
2404 * to be used for panic dumps.
2405 */
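/*
 * The i'th call maps its page at crashdumpmap + i * PAGE_SIZE; the base
 * of the crashdumpmap window is returned, so callers index into it.
 */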
2406void *
2407pmap_kenter_temporary(vm_offset_t pa, int i)
2408{
2409	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2410	return ((void *)crashdumpmap);
2411}
2412
2413#define MAX_INIT_PT (96)
2414/*
2415 * pmap_object_init_pt preloads the ptes for a given object
2416 * into the specified pmap.  This eliminates the blast of soft
2417 * faults on process startup and immediately after an mmap.
2418 */
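/*
 * Two strategies are used below: suitably aligned OBJT_DEVICE objects are
 * mapped with 4MB (PG_PS) page directory entries when the processor
 * supports them, while vnode objects are pre-entered one page at a time
 * with pmap_enter_quick(), bounded by MAX_INIT_PT when
 * MAP_PREFAULT_PARTIAL is set.
 */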
2419void
2420pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2421		    vm_object_t object, vm_pindex_t pindex,
2422		    vm_size_t size, int limit)
2423{
2424	vm_offset_t tmpidx;
2425	int psize;
2426	vm_page_t p, mpte;
2427	int objpgs;
2428
2429	if (pmap == NULL || object == NULL)
2430		return;
2431
2432	/*
2433	 * This code maps large physical mmap regions into the
2434	 * processor address space.  Note that some shortcuts
2435	 * are taken, but the code works.
2436	 */
2437	if (pseflag && (object->type == OBJT_DEVICE) &&
2438	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2439		int i;
2440		vm_page_t m[1];
2441		unsigned int ptepindex;
2442		int npdes;
2443		pd_entry_t ptepa;
2444
2445		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2446			return;
2447
2448retry:
2449		p = vm_page_lookup(object, pindex);
2450		if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
2451			goto retry;
2452
2453		if (p == NULL) {
2454			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2455			if (p == NULL)
2456				return;
2457			m[0] = p;
2458
2459			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2460				vm_page_free(p);
2461				return;
2462			}
2463
2464			p = vm_page_lookup(object, pindex);
2465			vm_page_wakeup(p);
2466		}
2467
2468		ptepa = VM_PAGE_TO_PHYS(p);
2469		if (ptepa & (NBPDR - 1)) {
2470			return;
2471		}
2472
2473		p->valid = VM_PAGE_BITS_ALL;
2474
2475		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2476		npdes = size >> PDRSHIFT;
2477		for(i = 0; i < npdes; i++) {
2478			pmap->pm_pdir[ptepindex] =
2479			    ptepa | PG_U | PG_RW | PG_V | PG_PS;
2480			ptepa += NBPDR;
2481			ptepindex += 1;
2482		}
2483		vm_page_flag_set(p, PG_MAPPED);
2484		invltlb();
2485		return;
2486	}
2487
2488	psize = i386_btop(size);
2489
2490	if ((object->type != OBJT_VNODE) ||
2491		((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2492			(object->resident_page_count > MAX_INIT_PT))) {
2493		return;
2494	}
2495
2496	if (psize + pindex > object->size) {
2497		if (object->size < pindex)
2498			return;
2499		psize = object->size - pindex;
2500	}
2501
2502	mpte = NULL;
2503	/*
2504	 * if we are processing a major portion of the object, then scan the
2505	 * entire thing.
2506	 */
2507	if (psize > (object->resident_page_count >> 2)) {
2508		objpgs = psize;
2509
2510		for (p = TAILQ_FIRST(&object->memq);
2511		    ((objpgs > 0) && (p != NULL));
2512		    p = TAILQ_NEXT(p, listq)) {
2513
2514			tmpidx = p->pindex;
2515			if (tmpidx < pindex) {
2516				continue;
2517			}
2518			tmpidx -= pindex;
2519			if (tmpidx >= psize) {
2520				continue;
2521			}
2522			/*
2523			 * don't allow an madvise to blow away our really
2524			 * free pages by allocating pv entries.
2525			 */
2526			if ((limit & MAP_PREFAULT_MADVISE) &&
2527			    cnt.v_free_count < cnt.v_free_reserved) {
2528				break;
2529			}
2530			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2531				(p->busy == 0) &&
2532			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2533				if ((p->queue - p->pc) == PQ_CACHE)
2534					vm_page_deactivate(p);
2535				vm_page_busy(p);
2536				mpte = pmap_enter_quick(pmap,
2537					addr + i386_ptob(tmpidx), p, mpte);
2538				vm_page_flag_set(p, PG_MAPPED);
2539				vm_page_wakeup(p);
2540			}
2541			objpgs -= 1;
2542		}
2543	} else {
2544		/*
2545		 * else lookup the pages one-by-one.
2546		 */
2547		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2548			/*
2549			 * don't allow an madvise to blow away our really
2550			 * free pages by allocating pv entries.
2551			 */
2552			if ((limit & MAP_PREFAULT_MADVISE) &&
2553			    cnt.v_free_count < cnt.v_free_reserved) {
2554				break;
2555			}
2556			p = vm_page_lookup(object, tmpidx + pindex);
2557			if (p &&
2558			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2559				(p->busy == 0) &&
2560			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2561				if ((p->queue - p->pc) == PQ_CACHE)
2562					vm_page_deactivate(p);
2563				vm_page_busy(p);
2564				mpte = pmap_enter_quick(pmap,
2565					addr + i386_ptob(tmpidx), p, mpte);
2566				vm_page_flag_set(p, PG_MAPPED);
2567				vm_page_wakeup(p);
2568			}
2569		}
2570	}
2571	return;
2572}
2573
2574/*
2575 * pmap_prefault provides a quick way of clustering
2576 * page faults into a process's address space.  It is a "cousin"
2577 * of pmap_object_init_pt, except it runs at page fault time instead
2578 * of mmap time.
2579 */
2580#define PFBAK 4
2581#define PFFOR 4
2582#define PAGEORDER_SIZE (PFBAK+PFFOR)
2583
2584static int pmap_prefault_pageorder[] = {
2585	-PAGE_SIZE, PAGE_SIZE,
2586	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2587	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2588	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2589};
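/*
 * The order above visits the pages nearest the faulting address first,
 * alternating behind and ahead of it, out to PFBAK/PFFOR pages.
 */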
2590
2591void
2592pmap_prefault(pmap, addra, entry)
2593	pmap_t pmap;
2594	vm_offset_t addra;
2595	vm_map_entry_t entry;
2596{
2597	int i;
2598	vm_offset_t starta;
2599	vm_offset_t addr;
2600	vm_pindex_t pindex;
2601	vm_page_t m, mpte;
2602	vm_object_t object;
2603
2604	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2605		return;
2606
2607	object = entry->object.vm_object;
2608
2609	starta = addra - PFBAK * PAGE_SIZE;
2610	if (starta < entry->start) {
2611		starta = entry->start;
2612	} else if (starta > addra) {
2613		starta = 0;
2614	}
2615
2616	mpte = NULL;
2617	for (i = 0; i < PAGEORDER_SIZE; i++) {
2618		vm_object_t lobject;
2619		pt_entry_t *pte;
2620
2621		addr = addra + pmap_prefault_pageorder[i];
2622		if (addr > addra + (PFFOR * PAGE_SIZE))
2623			addr = 0;
2624
2625		if (addr < starta || addr >= entry->end)
2626			continue;
2627
2628		if ((*pmap_pde(pmap, addr)) == 0)
2629			continue;
2630
2631		pte = vtopte(addr);
2632		if (*pte)
2633			continue;
2634
2635		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2636		lobject = object;
2637		for (m = vm_page_lookup(lobject, pindex);
2638		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2639		    lobject = lobject->backing_object) {
2640			if (lobject->backing_object_offset & PAGE_MASK)
2641				break;
2642			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2643			m = vm_page_lookup(lobject->backing_object, pindex);
2644		}
2645
2646		/*
2647		 * give up when a page is not in memory
2648		 */
2649		if (m == NULL)
2650			break;
2651
2652		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2653			(m->busy == 0) &&
2654		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2655
2656			if ((m->queue - m->pc) == PQ_CACHE) {
2657				vm_page_deactivate(m);
2658			}
2659			vm_page_busy(m);
2660			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2661			vm_page_flag_set(m, PG_MAPPED);
2662			vm_page_wakeup(m);
2663		}
2664	}
2665}
2666
2667/*
2668 *	Routine:	pmap_change_wiring
2669 *	Function:	Change the wiring attribute for a map/virtual-address
2670 *			pair.
2671 *	In/out conditions:
2672 *			The mapping must already exist in the pmap.
2673 */
2674void
2675pmap_change_wiring(pmap, va, wired)
2676	register pmap_t pmap;
2677	vm_offset_t va;
2678	boolean_t wired;
2679{
2680	register pt_entry_t *pte;
2681
2682	if (pmap == NULL)
2683		return;
2684
2685	pte = pmap_pte(pmap, va);
2686
2687	if (wired && !pmap_pte_w(pte))
2688		pmap->pm_stats.wired_count++;
2689	else if (!wired && pmap_pte_w(pte))
2690		pmap->pm_stats.wired_count--;
2691
2692	/*
2693	 * Wiring is not a hardware characteristic so there is no need to
2694	 * invalidate the TLB.
2695	 */
2696	pmap_pte_set_w(pte, wired);
2697}
2698
2699
2700
2701/*
2702 *	Copy the range specified by src_addr/len
2703 *	from the source map to the range dst_addr/len
2704 *	in the destination map.
2705 *
2706 *	This routine is only advisory and need not do anything.
2707 */
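/*
 * This implementation requires dst_addr == src_addr and the source pmap
 * to be the one currently loaded; the destination page tables are reached
 * through the alternate page table map (APTDpde).  Only managed 4K
 * mappings are copied (with PG_M and PG_A cleared); 4MB PDEs are simply
 * duplicated into the destination page directory.
 */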
2708
2709void
2710pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2711	  vm_offset_t src_addr)
2712{
2713	vm_offset_t addr;
2714	vm_offset_t end_addr = src_addr + len;
2715	vm_offset_t pdnxt;
2716	pd_entry_t src_frame, dst_frame;
2717	vm_page_t m;
2718	pd_entry_t saved_pde;
2719
2720	if (dst_addr != src_addr)
2721		return;
2722
2723	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2724	if (src_frame != (PTDpde & PG_FRAME))
2725		return;
2726
2727	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2728	if (dst_frame != (APTDpde & PG_FRAME)) {
2729		APTDpde = dst_frame | PG_RW | PG_V;
2730#if defined(SMP)
2731		/* The page directory is not shared between CPUs */
2732		cpu_invltlb();
2733#else
2734		invltlb();
2735#endif
2736	}
2737 	saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
2738	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
2739		pt_entry_t *src_pte, *dst_pte;
2740		vm_page_t dstmpte, srcmpte;
2741		pd_entry_t srcptepaddr;
2742		unsigned ptepindex;
2743
2744		if (addr >= UPT_MIN_ADDRESS)
2745			panic("pmap_copy: invalid to pmap_copy page tables\n");
2746
2747		/*
2748		 * Don't let optional prefaulting of pages make us go
2749		 * way below the low water mark of free pages or way
2750		 * above high water mark of used pv entries.
2751		 * above the high water mark of used pv entries.
2752		if (cnt.v_free_count < cnt.v_free_reserved ||
2753		    pv_entry_count > pv_entry_high_water)
2754			break;
2755
2756		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2757		ptepindex = addr >> PDRSHIFT;
2758
2759		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2760		if (srcptepaddr == 0)
2761			continue;
2762
2763		if (srcptepaddr & PG_PS) {
2764			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2765				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2766				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2767			}
2768			continue;
2769		}
2770
2771		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2772		if ((srcmpte == NULL) ||
2773		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2774			continue;
2775
2776		if (pdnxt > end_addr)
2777			pdnxt = end_addr;
2778
2779		src_pte = vtopte(addr);
2780		dst_pte = avtopte(addr);
2781		while (addr < pdnxt) {
2782			pt_entry_t ptetemp;
2783			ptetemp = *src_pte;
2784			/*
2785			 * we only virtual copy managed pages
2786			 */
2787			if ((ptetemp & PG_MANAGED) != 0) {
2788				/*
2789				 * We have to check after allocpte for the
2790				 * pte still being around...  allocpte can
2791				 * block.
2792				 */
2793				dstmpte = pmap_allocpte(dst_pmap, addr);
2794				if ((APTDpde & PG_FRAME) !=
2795				    (saved_pde & PG_FRAME)) {
2796					APTDpde = saved_pde;
2797					printf("IT HAPPENED!");
2798#if defined(SMP)
2799					cpu_invltlb();
2800#else
2801					invltlb();
2802#endif
2803				}
2804				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2805					/*
2806					 * Clear the modified and
2807					 * accessed (referenced) bits
2808					 * during the copy.
2809					 */
2810					m = PHYS_TO_VM_PAGE(ptetemp);
2811					*dst_pte = ptetemp & ~(PG_M | PG_A);
2812					dst_pmap->pm_stats.resident_count++;
2813					pmap_insert_entry(dst_pmap, addr,
2814						dstmpte, m);
2815	 			} else {
2816					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2817				}
2818				if (dstmpte->hold_count >= srcmpte->hold_count)
2819					break;
2820			}
2821			addr += PAGE_SIZE;
2822			src_pte++;
2823			dst_pte++;
2824		}
2825	}
2826}
2827
2828/*
2829 *	Routine:	pmap_kernel
2830 *	Function:
2831 *		Returns the physical map handle for the kernel.
2832 */
2833pmap_t
2834pmap_kernel()
2835{
2836	return (kernel_pmap);
2837}
2838
2839/*
2840 *	pmap_zero_page zeros the specified hardware page by mapping
2841 *	the page into KVM and using bzero to clear its contents.
2842 */
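/*
 * CMAP2/CADDR2 are a reserved kernel pte and virtual address used as the
 * temporary mapping window; the pte is cleared again once the page has
 * been zeroed.
 */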
2843void
2844pmap_zero_page(vm_offset_t phys)
2845{
2846
2847	if (*CMAP2)
2848		panic("pmap_zero_page: CMAP2 busy");
2849
2850	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2851	invltlb_1pg((vm_offset_t)CADDR2);
2852
2853#if defined(I686_CPU)
2854	if (cpu_class == CPUCLASS_686)
2855		i686_pagezero(CADDR2);
2856	else
2857#endif
2858		bzero(CADDR2, PAGE_SIZE);
2859	*CMAP2 = 0;
2860}
2861
2862/*
2863 *	pmap_zero_page_area zeros the specified hardware page by mapping
2864 *	the page into KVM and using bzero to clear its contents.
2865 *
2866 *	off and size may not cover an area beyond a single hardware page.
2867 */
2868void
2869pmap_zero_page_area(vm_offset_t phys, int off, int size)
2870{
2871
2872	if (*CMAP2)
2873		panic("pmap_zero_page_area: CMAP2 busy");
2874
2875	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2876	invltlb_1pg((vm_offset_t)CADDR2);
2877
2878#if defined(I686_CPU)
2879	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2880		i686_pagezero(CADDR2);
2881	else
2882#endif
2883		bzero((char *)CADDR2 + off, size);
2884	*CMAP2 = 0;
2885}
2886
2887/*
2888 *	pmap_copy_page copies the specified (machine independent)
2889 *	page by mapping the page into virtual memory and using
2890 *	bcopy to copy the page, one machine dependent page at a
2891 *	time.
2892 */
2893void
2894pmap_copy_page(vm_offset_t src, vm_offset_t dst)
2895{
2896
2897	if (*CMAP1)
2898		panic("pmap_copy_page: CMAP1 busy");
2899	if (*CMAP2)
2900		panic("pmap_copy_page: CMAP2 busy");
2901
2902	*CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2903	*CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2904#ifdef I386_CPU
2905	invltlb();
2906#else
2907	invlpg((u_int)CADDR1);
2908	invlpg((u_int)CADDR2);
2909#endif
2910
2911	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2912
2913	*CMAP1 = 0;
2914	*CMAP2 = 0;
2915}
2916
2917
2918/*
2919 *	Routine:	pmap_pageable
2920 *	Function:
2921 *		Make the specified pages (by pmap, offset)
2922 *		pageable (or not) as requested.
2923 *
2924 *		A page which is not pageable may not take
2925 *		a fault; therefore, its page table entry
2926 *		must remain valid for the duration.
2927 *
2928 *		This routine is merely advisory; pmap_enter
2929 *		will specify that these pages are to be wired
2930 *		down (or not) as appropriate.
2931 */
2932void
2933pmap_pageable(pmap, sva, eva, pageable)
2934	pmap_t pmap;
2935	vm_offset_t sva, eva;
2936	boolean_t pageable;
2937{
2938}
2939
2940/*
2941 * this routine returns true if a physical page resides
2942 * in the given pmap.
2943 */
2944boolean_t
2945pmap_page_exists(pmap, m)
2946	pmap_t pmap;
2947	vm_page_t m;
2948{
2949	register pv_entry_t pv;
2950	int s;
2951
2952	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2953		return FALSE;
2954
2955	s = splvm();
2956
2957	/*
2958	 * Check current mappings, returning immediately if one is found.
2959	 */
2960	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2961		if (pv->pv_pmap == pmap) {
2962			splx(s);
2963			return TRUE;
2964		}
2965	}
2966	splx(s);
2967	return (FALSE);
2968}
2969
2970#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2971/*
2972 * Remove all pages from the specified address space;
2973 * this aids process exit speeds.  Also, this code
2974 * is special cased for current process only, but
2975 * can have the more generic (and slightly slower)
2976 * mode enabled.  This is much faster than pmap_remove
2977 * in the case of running down an entire address space.
2978 */
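/*
 * The walk below is over the pmap's pm_pvlist rather than the virtual
 * address range, so only ptes that actually exist are visited; wired
 * mappings are skipped and a single pmap_invalidate_all() is issued at
 * the end.
 */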
2979void
2980pmap_remove_pages(pmap, sva, eva)
2981	pmap_t pmap;
2982	vm_offset_t sva, eva;
2983{
2984	pt_entry_t *pte, tpte;
2985	vm_page_t m;
2986	pv_entry_t pv, npv;
2987	int s;
2988
2989#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2990	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2991		printf("warning: pmap_remove_pages called with non-current pmap\n");
2992		return;
2993	}
2994#endif
2995
2996	s = splvm();
2997	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2998
2999		if (pv->pv_va >= eva || pv->pv_va < sva) {
3000			npv = TAILQ_NEXT(pv, pv_plist);
3001			continue;
3002		}
3003
3004#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3005		pte = vtopte(pv->pv_va);
3006#else
3007		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3008#endif
3009		tpte = *pte;
3010
3011		if (tpte == 0) {
3012			printf("TPTE at %p  IS ZERO @ VA %08x\n",
3013							pte, pv->pv_va);
3014			panic("bad pte");
3015		}
3016
3017/*
3018 * We cannot remove wired pages from a process' mapping at this time
3019 */
3020		if (tpte & PG_W) {
3021			npv = TAILQ_NEXT(pv, pv_plist);
3022			continue;
3023		}
3024
3025		m = PHYS_TO_VM_PAGE(tpte);
3026		KASSERT(m->phys_addr == (tpte & PG_FRAME),
3027		    ("vm_page_t %p phys_addr mismatch %08x %08x",
3028		    m, m->phys_addr, tpte));
3029
3030		KASSERT(m < &vm_page_array[vm_page_array_size],
3031			("pmap_remove_pages: bad tpte %x", tpte));
3032
3033		pv->pv_pmap->pm_stats.resident_count--;
3034
3035		*pte = 0;
3036
3037		/*
3038		 * Update the vm_page_t clean and reference bits.
3039		 */
3040		if (tpte & PG_M) {
3041			vm_page_dirty(m);
3042		}
3043
3044		npv = TAILQ_NEXT(pv, pv_plist);
3045		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
3046
3047		m->md.pv_list_count--;
3048		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3049		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
3050			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3051		}
3052
3053		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
3054		free_pv_entry(pv);
3055	}
3056	splx(s);
3057	pmap_invalidate_all(pmap);
3058}
3059
3060/*
3061 * pmap_testbit tests bits in pte's.  Note that the testbit/changebit
3062 * routines are inline, and a lot of things are evaluated at
3063 * compile time.
3064 */
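/*
 * Both helpers iterate the page's pv_list and locate each mapping's pte
 * with pmap_pte_quick().  Mappings rejected by pmap_track_modified() are
 * skipped when testing PG_A/PG_M or when write-protecting.
 */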
3065static boolean_t
3066pmap_testbit(m, bit)
3067	vm_page_t m;
3068	int bit;
3069{
3070	pv_entry_t pv;
3071	pt_entry_t *pte;
3072	int s;
3073
3074	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3075		return FALSE;
3076
3077	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3078		return FALSE;
3079
3080	s = splvm();
3081
3082	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3083		/*
3084		 * if the bit being tested is the modified or accessed bit,
3085		 * skip mappings whose modified state is not tracked
3086		 * (e.g. pager mappings).
3087		 */
3088		if (bit & (PG_A|PG_M)) {
3089			if (!pmap_track_modified(pv->pv_va))
3090				continue;
3091		}
3092
3093#if defined(PMAP_DIAGNOSTIC)
3094		if (!pv->pv_pmap) {
3095			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3096			continue;
3097		}
3098#endif
3099		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3100		if (*pte & bit) {
3101			splx(s);
3102			return TRUE;
3103		}
3104	}
3105	splx(s);
3106	return (FALSE);
3107}
3108
3109/*
3110 * this routine is used to modify bits in ptes
3111 */
3112static __inline void
3113pmap_changebit(vm_page_t m, int bit, boolean_t setem)
3114{
3115	register pv_entry_t pv;
3116	register pt_entry_t *pte;
3117	int s;
3118
3119	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3120		return;
3121
3122	s = splvm();
3123
3124	/*
3125	 * Loop over all current mappings, setting/clearing as appropriate.
3126	 * If setting RO, do we need to clear the VAC?
3127	 */
3128	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3129		/*
3130		 * don't write protect pager mappings
3131		 */
3132		if (!setem && (bit == PG_RW)) {
3133			if (!pmap_track_modified(pv->pv_va))
3134				continue;
3135		}
3136
3137#if defined(PMAP_DIAGNOSTIC)
3138		if (!pv->pv_pmap) {
3139			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3140			continue;
3141		}
3142#endif
3143
3144		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3145
3146		if (setem) {
3147			*pte |= bit;
3148			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3149		} else {
3150			pt_entry_t pbits = *pte;
3151			if (pbits & bit) {
3152				if (bit == PG_RW) {
3153					if (pbits & PG_M) {
3154						vm_page_dirty(m);
3155					}
3156					*pte = pbits & ~(PG_M|PG_RW);
3157				} else {
3158					*pte = pbits & ~bit;
3159				}
3160				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3161			}
3162		}
3163	}
3164	splx(s);
3165}
3166
3167/*
3168 *      pmap_page_protect:
3169 *
3170 *      Lower the permission for all mappings to a given page.
3171 */
3172void
3173pmap_page_protect(vm_page_t m, vm_prot_t prot)
3174{
3175	if ((prot & VM_PROT_WRITE) == 0) {
3176		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3177			pmap_changebit(m, PG_RW, FALSE);
3178		} else {
3179			pmap_remove_all(m);
3180		}
3181	}
3182}
3183
3184vm_offset_t
3185pmap_phys_address(ppn)
3186	int ppn;
3187{
3188	return (i386_ptob(ppn));
3189}
3190
3191/*
3192 *	pmap_ts_referenced:
3193 *
3194 *	Return the count of reference bits for a page, clearing all of them.
3195 */
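/*
 * The pv list is rotated as it is scanned so that successive calls sample
 * different mappings first, and the scan stops after five referenced
 * mappings have been counted.
 */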
3196int
3197pmap_ts_referenced(vm_page_t m)
3198{
3199	register pv_entry_t pv, pvf, pvn;
3200	pt_entry_t *pte;
3201	int s;
3202	int rtval = 0;
3203
3204	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3205		return (rtval);
3206
3207	s = splvm();
3208
3209	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3210
3211		pvf = pv;
3212
3213		do {
3214			pvn = TAILQ_NEXT(pv, pv_list);
3215
3216			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3217
3218			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3219
3220			if (!pmap_track_modified(pv->pv_va))
3221				continue;
3222
3223			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3224
3225			if (pte && (*pte & PG_A)) {
3226				*pte &= ~PG_A;
3227
3228				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3229
3230				rtval++;
3231				if (rtval > 4) {
3232					break;
3233				}
3234			}
3235		} while ((pv = pvn) != NULL && pv != pvf);
3236	}
3237	splx(s);
3238
3239	return (rtval);
3240}
3241
3242/*
3243 *	pmap_is_modified:
3244 *
3245 *	Return whether or not the specified physical page was modified
3246 *	in any physical maps.
3247 */
3248boolean_t
3249pmap_is_modified(vm_page_t m)
3250{
3251	return pmap_testbit(m, PG_M);
3252}
3253
3254/*
3255 *	Clear the modify bits on the specified physical page.
3256 */
3257void
3258pmap_clear_modify(vm_page_t m)
3259{
3260	pmap_changebit(m, PG_M, FALSE);
3261}
3262
3263/*
3264 *	pmap_clear_reference:
3265 *
3266 *	Clear the reference bit on the specified physical page.
3267 */
3268void
3269pmap_clear_reference(vm_page_t m)
3270{
3271	pmap_changebit(m, PG_A, FALSE);
3272}
3273
3274/*
3275 * Miscellaneous support routines follow
3276 */
3277
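/*
 * i386_protection_init builds protection_codes[], indexed by the eight
 * VM_PROT_{READ,WRITE,EXECUTE} combinations.  The i386 has no execute
 * bit, so read-only combinations map to 0 and writable ones to PG_RW.
 */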
3278static void
3279i386_protection_init()
3280{
3281	register int *kp, prot;
3282
3283	kp = protection_codes;
3284	for (prot = 0; prot < 8; prot++) {
3285		switch (prot) {
3286		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3287			/*
3288			 * Read access is also 0. There isn't any execute bit,
3289			 * so just make it readable.
3290			 */
3291		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3292		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3293		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3294			*kp++ = 0;
3295			break;
3296		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3297		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3298		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3299		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3300			*kp++ = PG_RW;
3301			break;
3302		}
3303	}
3304}
3305
3306/*
3307 * Map a set of physical memory pages into the kernel virtual
3308 * address space. Return a pointer to where it is mapped. This
3309 * routine is intended to be used for mapping device memory,
3310 * NOT real memory.
3311 */
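/*
 * Illustrative use (dev_pa and dev_size are hypothetical):
 *
 *	void *regs = pmap_mapdev(dev_pa, dev_size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, dev_size);
 */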
3312void *
3313pmap_mapdev(pa, size)
3314	vm_offset_t pa;
3315	vm_size_t size;
3316{
3317	vm_offset_t va, tmpva, offset;
3318	pt_entry_t *pte;
3319
3320	offset = pa & PAGE_MASK;
3321	size = roundup(offset + size, PAGE_SIZE);
3322
3323	GIANT_REQUIRED;
3324
3325	va = kmem_alloc_pageable(kernel_map, size);
3326	if (!va)
3327		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3328
3329	pa = pa & PG_FRAME;
3330	for (tmpva = va; size > 0;) {
3331		pte = vtopte(tmpva);
3332		*pte = pa | PG_RW | PG_V | pgeflag;
3333		size -= PAGE_SIZE;
3334		tmpva += PAGE_SIZE;
3335		pa += PAGE_SIZE;
3336	}
3337	invltlb();
3338
3339	return ((void *)(va + offset));
3340}
3341
3342void
3343pmap_unmapdev(va, size)
3344	vm_offset_t va;
3345	vm_size_t size;
3346{
3347	vm_offset_t base, offset;
3348
3349	base = va & PG_FRAME;
3350	offset = va & PAGE_MASK;
3351	size = roundup(offset + size, PAGE_SIZE);
3352	kmem_free(kernel_map, base, size);
3353}
3354
3355/*
3356 * perform the pmap work for mincore
3357 */
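/*
 * Returns a mask of MINCORE_INCORE, MINCORE_MODIFIED(_OTHER) and
 * MINCORE_REFERENCED(_OTHER) bits for the page mapped at addr, or 0 if
 * no page is resident there.
 */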
3358int
3359pmap_mincore(pmap, addr)
3360	pmap_t pmap;
3361	vm_offset_t addr;
3362{
3363	pt_entry_t *ptep, pte;
3364	vm_page_t m;
3365	int val = 0;
3366
3367	ptep = pmap_pte(pmap, addr);
3368	if (ptep == 0) {
3369		return 0;
3370	}
3371
3372	if ((pte = *ptep) != 0) {
3373		vm_offset_t pa;
3374
3375		val = MINCORE_INCORE;
3376		if ((pte & PG_MANAGED) == 0)
3377			return val;
3378
3379		pa = pte & PG_FRAME;
3380
3381		m = PHYS_TO_VM_PAGE(pa);
3382
3383		/*
3384		 * Modified by us
3385		 */
3386		if (pte & PG_M)
3387			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3388		/*
3389		 * Modified by someone
3390		 */
3391		else if (m->dirty || pmap_is_modified(m))
3392			val |= MINCORE_MODIFIED_OTHER;
3393		/*
3394		 * Referenced by us
3395		 */
3396		if (pte & PG_A)
3397			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3398
3399		/*
3400		 * Referenced by someone
3401		 */
3402		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3403			val |= MINCORE_REFERENCED_OTHER;
3404			vm_page_flag_set(m, PG_REFERENCED);
3405		}
3406	}
3407	return val;
3408}
3409
3410void
3411pmap_activate(struct thread *td)
3412{
3413	struct proc *p = td->td_proc;
3414	pmap_t	pmap;
3415	u_int32_t  cr3;
3416
3417	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3418#if defined(SMP)
3419	pmap->pm_active |= 1 << PCPU_GET(cpuid);
3420#else
3421	pmap->pm_active |= 1;
3422#endif
3423#if defined(SWTCH_OPTIM_STATS)
3424	tlb_flush_count++;
3425#endif
3426	cr3 = vtophys(pmap->pm_pdir);
3427	/* XXXKSE this is wrong.
3428	 * pmap_activate is for the current thread on the current cpu
3429	 */
3430	if (p->p_flag & P_KSES) {
3431		/* Make sure all other cr3 entries are updated. */
3432		/* what if they are running?  XXXKSE (maybe abort them) */
3433		FOREACH_THREAD_IN_PROC(p, td) {
3434			td->td_pcb->pcb_cr3 = cr3;
3435		}
3436	} else {
3437		td->td_pcb->pcb_cr3 = cr3;
3438	}
3439	load_cr3(cr3);
3440}
3441
3442vm_offset_t
3443pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3444{
3445
3446	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3447		return addr;
3448	}
3449
3450	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3451	return addr;
3452}
3453
3454
3455#if defined(PMAP_DEBUG)
3456pmap_pid_dump(int pid)
3457{
3458	pmap_t pmap;
3459	struct proc *p;
3460	int npte = 0;
3461	int index;
3462
3463	sx_slock(&allproc_lock);
3464	LIST_FOREACH(p, &allproc, p_list) {
3465		if (p->p_pid != pid)
3466			continue;
3467
3468		if (p->p_vmspace) {
3469			int i,j;
3470			index = 0;
3471			pmap = vmspace_pmap(p->p_vmspace);
3472			for (i = 0; i < NPDEPG; i++) {
3473				pd_entry_t *pde;
3474				pt_entry_t *pte;
3475				vm_offset_t base = i << PDRSHIFT;
3476
3477				pde = &pmap->pm_pdir[i];
3478				if (pde && pmap_pde_v(pde)) {
3479					for (j = 0; j < NPTEPG; j++) {
3480						vm_offset_t va = base + (j << PAGE_SHIFT);
3481						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3482							if (index) {
3483								index = 0;
3484								printf("\n");
3485							}
3486							sx_sunlock(&allproc_lock);
3487							return npte;
3488						}
3489						pte = pmap_pte_quick(pmap, va);
3490						if (pte && pmap_pte_v(pte)) {
3491							pt_entry_t pa;
3492							vm_page_t m;
3493							pa = *pte;
3494							m = PHYS_TO_VM_PAGE(pa);
3495							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3496								va, pa, m->hold_count, m->wire_count, m->flags);
3497							npte++;
3498							index++;
3499							if (index >= 2) {
3500								index = 0;
3501								printf("\n");
3502							} else {
3503								printf(" ");
3504							}
3505						}
3506					}
3507				}
3508			}
3509		}
3510	}
3511	sx_sunlock(&allproc_lock);
3512	return npte;
3513}
3514#endif
3515
3516#if defined(DEBUG)
3517
3518static void	pads __P((pmap_t pm));
3519void		pmap_pvdump __P((vm_offset_t pa));
3520
3521/* print address space of pmap */
3522static void
3523pads(pm)
3524	pmap_t pm;
3525{
3526	int i, j;
3527	vm_offset_t va;
3528	pt_entry_t *ptep;
3529
3530	if (pm == kernel_pmap)
3531		return;
3532	for (i = 0; i < NPDEPG; i++)
3533		if (pm->pm_pdir[i])
3534			for (j = 0; j < NPTEPG; j++) {
3535				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3536				if (pm == kernel_pmap && va < KERNBASE)
3537					continue;
3538				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3539					continue;
3540				ptep = pmap_pte_quick(pm, va);
3541				if (pmap_pte_v(ptep))
3542					printf("%x:%x ", va, *ptep);
3543			}
3544
3545}
3546
3547void
3548pmap_pvdump(pa)
3549	vm_offset_t pa;
3550{
3551	pv_entry_t pv;
3552	vm_page_t m;
3553
3554	printf("pa %x", pa);
3555	m = PHYS_TO_VM_PAGE(pa);
3556	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3557		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3558		pads(pv->pv_pmap);
3559	}
3560	printf(" ");
3561}
3562#endif
3563