pmap.c revision 86486
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 86486 2001-11-17 01:56:04Z peter $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
64 *	this module may delay invalidate or reduced protection
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74#include "opt_kstack_pages.h"
75
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/mman.h>
81#include <sys/msgbuf.h>
82#include <sys/mutex.h>
83#include <sys/proc.h>
84#include <sys/sx.h>
85#include <sys/user.h>
86#include <sys/vmmeter.h>
87#include <sys/sysctl.h>
88
89#include <vm/vm.h>
90#include <vm/vm_param.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_page.h>
93#include <vm/vm_map.h>
94#include <vm/vm_object.h>
95#include <vm/vm_extern.h>
96#include <vm/vm_pageout.h>
97#include <vm/vm_pager.h>
98#include <vm/vm_zone.h>
99
100#include <machine/cputypes.h>
101#include <machine/md_var.h>
102#include <machine/specialreg.h>
103#if defined(SMP) || defined(APIC_IO)
104#include <machine/smp.h>
105#include <machine/apic.h>
106#include <machine/segments.h>
107#include <machine/tss.h>
108#include <machine/globaldata.h>
109#endif /* SMP || APIC_IO */
110
111#define PMAP_KEEP_PDIRS
112#ifndef PMAP_SHPGPERPROC
113#define PMAP_SHPGPERPROC 200
114#endif
115
116#if defined(DIAGNOSTIC)
117#define PMAP_DIAGNOSTIC
118#endif
119
120#define MINPV 2048
121
122#if !defined(PMAP_DIAGNOSTIC)
123#define PMAP_INLINE __inline
124#else
125#define PMAP_INLINE
126#endif
127
/*
 * Get PDEs and PTEs for user/kernel address space.
 *
 * pmap_pde(m, v) yields a pointer to the page directory entry of pmap m
 * selected by virtual address v; pdir_pde(m, v) indexes a bare page
 * directory array m and yields the entry itself.
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define	pdir_pde(m, v)	((m)[(vm_offset_t)(v) >> PDRSHIFT])

/*
 * Bit tests on the entry that pte points to.  Every macro argument is
 * parenthesized so that an expression argument (e.g. "base + i") is
 * cast and dereferenced as a whole.
 */
#define	pmap_pde_v(pte)		((*(int *)(pte) & PG_V) != 0)
#define	pmap_pte_w(pte)		((*(int *)(pte) & PG_W) != 0)
#define	pmap_pte_m(pte)		((*(int *)(pte) & PG_M) != 0)
#define	pmap_pte_u(pte)		((*(int *)(pte) & PG_A) != 0)
#define	pmap_pte_v(pte)		((*(int *)(pte) & PG_V) != 0)

/*
 * In-place updates of the entry that pte points to: set or clear PG_W
 * according to v, and replace the PG_PROT bits with v.
 */
#define	pmap_pte_set_w(pte, v) \
	((v) ? (*(int *)(pte) |= PG_W) : (*(int *)(pte) &= ~PG_W))
#define	pmap_pte_set_prot(pte, v) \
	((*(int *)(pte) &= ~PG_PROT), (*(int *)(pte) |= (v)))
142
143/*
144 * Given a map and a machine independent protection code,
145 * convert to an i386 protection code.
146 */
147#define pte_prot(m, p)	(protection_codes[p])
148static int protection_codes[8];
149
150static struct pmap kernel_pmap_store;
151pmap_t kernel_pmap;
152LIST_HEAD(pmaplist, pmap);
153struct pmaplist allpmaps;
154
155vm_offset_t avail_start;	/* PA of first available physical page */
156vm_offset_t avail_end;		/* PA of last available physical page */
157vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
158vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
159static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
160static int pgeflag;		/* PG_G or-in */
161static int pseflag;		/* PG_PS or-in */
162
163static vm_object_t kptobj;
164
165static int nkpt;
166vm_offset_t kernel_vm_end;
167
168/*
169 * Data for the pv entry allocation mechanism
170 */
171static vm_zone_t pvzone;
172static struct vm_zone pvzone_store;
173static struct vm_object pvzone_obj;
174static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
175static int pmap_pagedaemon_waken = 0;
176static struct pv_entry *pvinit;
177
178/*
179 * All those kernel PT submaps that BSD is so fond of
180 */
181pt_entry_t *CMAP1 = 0;
182static pt_entry_t *CMAP2, *ptmmap;
183caddr_t CADDR1 = 0, ptvmmap = 0;
184static caddr_t CADDR2;
185static pt_entry_t *msgbufmap;
186struct msgbuf *msgbufp = 0;
187
188/*
189 * Crashdump maps.
190 */
191static pt_entry_t *pt_crashdumpmap;
192static caddr_t crashdumpmap;
193
194#ifdef SMP
195extern pt_entry_t *SMPpt;
196#endif
197static pt_entry_t *PMAP1 = 0;
198static pt_entry_t *PADDR1 = 0;
199
200static PMAP_INLINE void	free_pv_entry __P((pv_entry_t pv));
201static unsigned * get_ptbase __P((pmap_t pmap));
202static pv_entry_t get_pv_entry __P((void));
203static void	i386_protection_init __P((void));
204static __inline void	pmap_changebit __P((vm_page_t m, int bit, boolean_t setem));
205
206static void	pmap_remove_all __P((vm_page_t m));
207static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
208				      vm_page_t m, vm_page_t mpte));
209static int pmap_remove_pte __P((pmap_t pmap, unsigned *ptq, vm_offset_t sva));
210static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va));
211static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m,
212					vm_offset_t va));
213static boolean_t pmap_testbit __P((vm_page_t m, int bit));
214static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va,
215		vm_page_t mpte, vm_page_t m));
216
217static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va));
218
219static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p));
220static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex));
221static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va));
222static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex));
223static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t));
224static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
225
226static unsigned pdir4mb;
227
228/*
229 *	Routine:	pmap_pte
230 *	Function:
231 *		Extract the page table entry associated
232 *		with the given map/virtual_address pair.
233 */
234
235PMAP_INLINE unsigned *
236pmap_pte(pmap, va)
237	register pmap_t pmap;
238	vm_offset_t va;
239{
240	pd_entry_t *pdeaddr;
241
242	if (pmap) {
243		pdeaddr = pmap_pde(pmap, va);
244		if (*pdeaddr & PG_PS)
245			return pdeaddr;
246		if (*pdeaddr) {
247			return get_ptbase(pmap) + i386_btop(va);
248		}
249	}
250	return (0);
251}
252
253/*
254 * Move the kernel virtual free pointer to the next
255 * 4MB.  This is used to help improve performance
256 * by using a large (4MB) page for much of the kernel
257 * (.text, .data, .bss)
258 */
259static vm_offset_t
260pmap_kmem_choose(vm_offset_t addr)
261{
262	vm_offset_t newaddr = addr;
263#ifndef DISABLE_PSE
264	if (cpu_feature & CPUID_PSE) {
265		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
266	}
267#endif
268	return newaddr;
269}
270
271/*
272 *	Bootstrap the system enough to run with virtual memory.
273 *
274 *	On the i386 this is called after mapping has already been enabled
275 *	and just syncs the pmap module with what has already been done.
276 *	[We can't call it easily with mapping off since the kernel is not
277 *	mapped with PA == VA, hence we would have to relocate every address
278 *	from the linked base (virtual) address "KERNBASE" to the actual
279 *	(physical) address starting relative to 0]
280 */
281void
282pmap_bootstrap(firstaddr, loadaddr)
283	vm_offset_t firstaddr;
284	vm_offset_t loadaddr;
285{
286	vm_offset_t va;
287	pt_entry_t *pte;
288	int i;
289
290	avail_start = firstaddr;
291
292	/*
293	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
294	 * large. It should instead be correctly calculated in locore.s and
295	 * not based on 'first' (which is a physical address, not a virtual
296	 * address, for the start of unused physical memory). The kernel
297	 * page tables are NOT double mapped and thus should not be included
298	 * in this calculation.
299	 */
300	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
301	virtual_avail = pmap_kmem_choose(virtual_avail);
302
303	virtual_end = VM_MAX_KERNEL_ADDRESS;
304
305	/*
306	 * Initialize protection array.
307	 */
308	i386_protection_init();
309
310	/*
311	 * The kernel's pmap is statically allocated so we don't have to use
312	 * pmap_create, which is unlikely to work correctly at this part of
313	 * the boot sequence (XXX and which no longer exists).
314	 */
315	kernel_pmap = &kernel_pmap_store;
316
317	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
318	kernel_pmap->pm_count = 1;
319	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
320	TAILQ_INIT(&kernel_pmap->pm_pvlist);
321	LIST_INIT(&allpmaps);
322	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
323	nkpt = NKPT;
324
325	/*
326	 * Reserve some special page table entries/VA space for temporary
327	 * mapping of pages.
328	 */
329#define	SYSMAP(c, p, v, n)	\
330	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
331
332	va = virtual_avail;
333	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
334
335	/*
336	 * CMAP1/CMAP2 are used for zeroing and copying pages.
337	 */
338	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
339	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
340
341	/*
342	 * Crashdump maps.
343	 */
344	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
345
346	/*
347	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
348	 * XXX ptmmap is not used.
349	 */
350	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
351
352	/*
353	 * msgbufp is used to map the system message buffer.
354	 * XXX msgbufmap is not used.
355	 */
356	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
357	       atop(round_page(MSGBUF_SIZE)))
358
359	/*
360	 * ptemap is used for pmap_pte_quick
361	 */
362	SYSMAP(unsigned *, PMAP1, PADDR1, 1);
363
364	virtual_avail = va;
365
366	*CMAP1 = *CMAP2 = 0;
367	for (i = 0; i < NKPT; i++)
368		PTD[i] = 0;
369
370	pgeflag = 0;
371#if !defined(SMP)			/* XXX - see also mp_machdep.c */
372	if (cpu_feature & CPUID_PGE) {
373		pgeflag = PG_G;
374	}
375#endif
376
377/*
378 * Initialize the 4MB page size flag
379 */
380	pseflag = 0;
381/*
382 * The 4MB page version of the initial
383 * kernel page mapping.
384 */
385	pdir4mb = 0;
386
387#if !defined(DISABLE_PSE)
388	if (cpu_feature & CPUID_PSE) {
389		unsigned ptditmp;
390		/*
391		 * Note that we have enabled PSE mode
392		 */
393		pseflag = PG_PS;
394		ptditmp = *(PTmap + i386_btop(KERNBASE));
395		ptditmp &= ~(NBPDR - 1);
396		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
397		pdir4mb = ptditmp;
398
399#if !defined(SMP)
400		/*
401		 * Enable the PSE mode.
402		 */
403		load_cr4(rcr4() | CR4_PSE);
404
405		/*
406		 * We can do the mapping here for the single processor
407		 * case.  We simply ignore the old page table page from
408		 * now on.
409		 */
410		/*
411		 * For SMP, we still need 4K pages to bootstrap APs,
412		 * PSE will be enabled as soon as all APs are up.
413		 */
414		PTD[KPTDI] = (pd_entry_t) ptditmp;
415		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
416		invltlb();
417#endif
418	}
419#endif
420
421#ifdef SMP
422	if (cpu_apic_address == 0)
423		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
424
425	/* local apic is mapped on last page */
426	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
427	    (cpu_apic_address & PG_FRAME));
428#endif
429
430	invltlb();
431}
432
#ifdef SMP
/*
 * Set 4mb pdir for mp startup.
 *
 * Turns on CR4_PSE on the calling CPU when PSE was selected at bootstrap
 * and the CPU reports CPUID_PSE.  On the CPU whose cpuid is 0 it also
 * installs the saved 4MB directory entry (pdir4mb), if there is one,
 * and flushes the local TLB.
 */
void
pmap_set_opt(void)
{
	if (!pseflag || !(cpu_feature & CPUID_PSE))
		return;

	load_cr4(rcr4() | CR4_PSE);

	if (!pdir4mb || PCPU_GET(cpuid) != 0)	/* only on BSP */
		return;

	PTD[KPTDI] = (pd_entry_t)pdir4mb;
	kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t)pdir4mb;
	cpu_invltlb();
}
#endif
450
451/*
452 *	Initialize the pmap module.
453 *	Called by vm_init, to initialize any structures that the pmap
454 *	system needs to map virtual memory.
455 *	pmap_init has been enhanced to support in a fairly consistant
456 *	way, discontiguous physical memory.
457 */
458void
459pmap_init(phys_start, phys_end)
460	vm_offset_t phys_start, phys_end;
461{
462	int i;
463	int initial_pvs;
464
465	/*
466	 * object for kernel page table pages
467	 */
468	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
469
470	/*
471	 * Allocate memory for random pmap data structures.  Includes the
472	 * pv_head_table.
473	 */
474
475	for(i = 0; i < vm_page_array_size; i++) {
476		vm_page_t m;
477
478		m = &vm_page_array[i];
479		TAILQ_INIT(&m->md.pv_list);
480		m->md.pv_list_count = 0;
481	}
482
483	/*
484	 * init the pv free list
485	 */
486	initial_pvs = vm_page_array_size;
487	if (initial_pvs < MINPV)
488		initial_pvs = MINPV;
489	pvzone = &pvzone_store;
490	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
491		initial_pvs * sizeof (struct pv_entry));
492	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
493	    vm_page_array_size);
494
495	/*
496	 * Now it is safe to enable pv_table recording.
497	 */
498	pmap_initialized = TRUE;
499}
500
501/*
502 * Initialize the address space (zone) for the pv_entries.  Set a
503 * high water mark so that the system can recover from excessive
504 * numbers of pv entries.
505 */
506void
507pmap_init2()
508{
509	int shpgperproc = PMAP_SHPGPERPROC;
510
511	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
512	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
513	pv_entry_high_water = 9 * (pv_entry_max / 10);
514	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
515}
516
517
518/***************************************************
519 * Low level helper routines.....
520 ***************************************************/
521
#if defined(PMAP_DIAGNOSTIC)

/*
 * This code checks for non-writeable/modified pages.
 * This should be an invalid condition.
 *
 * Returns 1 when the entry has PG_M set while PG_RW is clear,
 * 0 otherwise.
 */
static int
pmap_nw_modified(pt_entry_t ptea)
{
	int bits = (int) ptea;

	return ((bits & (PG_M|PG_RW)) == PG_M) ? 1 : 0;
}
#endif
541
542
543/*
544 * this routine defines the region(s) of memory that should
545 * not be tested for the modified bit.
546 */
547static PMAP_INLINE int
548pmap_track_modified(vm_offset_t va)
549{
550	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
551		return 1;
552	else
553		return 0;
554}
555
556static PMAP_INLINE void
557invltlb_1pg(vm_offset_t va)
558{
559#ifdef I386_CPU
560	invltlb();
561#else
562	invlpg(va);
563#endif
564}
565
566static __inline void
567pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
568{
569#if defined(SMP)
570	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
571		cpu_invlpg((void *)va);
572	if (pmap->pm_active & PCPU_GET(other_cpus))
573		smp_invltlb();
574#else
575	if (pmap->pm_active)
576		invltlb_1pg(va);
577#endif
578}
579
580static __inline void
581pmap_invalidate_all(pmap_t pmap)
582{
583#if defined(SMP)
584	if (pmap->pm_active & (1 << PCPU_GET(cpuid)))
585		cpu_invltlb();
586	if (pmap->pm_active & PCPU_GET(other_cpus))
587		smp_invltlb();
588#else
589	if (pmap->pm_active)
590		invltlb();
591#endif
592}
593
594/*
595 * Return an address which is the base of the Virtual mapping of
596 * all the PTEs for the given pmap. Note this doesn't say that
597 * all the PTEs will be present or that the pages there are valid.
598 * The PTEs are made available by the recursive mapping trick.
599 * It will map in the alternate PTE space if needed.
600 */
601static pt_entry_t *
602get_ptbase(pmap)
603	pmap_t pmap;
604{
605	unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
606
607	/* are we current address space or kernel? */
608	if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
609		return PTmap;
610	}
611	/* otherwise, we are alternate address space */
612	if (frame != (((unsigned) APTDpde) & PG_FRAME)) {
613		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
614#if defined(SMP)
615		/* The page directory is not shared between CPUs */
616		cpu_invltlb();
617#else
618		invltlb();
619#endif
620	}
621	return APTmap;
622}
623
624/*
625 * Super fast pmap_pte routine best used when scanning
626 * the pv lists.  This eliminates many coarse-grained
627 * invltlb calls.  Note that many of the pv list
628 * scans are across different pmaps.  It is very wasteful
629 * to do an entire invltlb for checking a single mapping.
630 */
631
632static pt_entry_t *
633pmap_pte_quick(pmap, va)
634	register pmap_t pmap;
635	vm_offset_t va;
636{
637	pd_entry_t pde, newpf;
638	if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
639		pd_entry_t frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
640		unsigned index = i386_btop(va);
641		/* are we current address space or kernel? */
642		if ((pmap == kernel_pmap) ||
643			(frame == (((unsigned) PTDpde) & PG_FRAME))) {
644			return PTmap + index;
645		}
646		newpf = pde & PG_FRAME;
647		if ( ((*PMAP1) & PG_FRAME) != newpf) {
648			*PMAP1 = newpf | PG_RW | PG_V;
649			invltlb_1pg((vm_offset_t) PADDR1);
650		}
651		return PADDR1 + (index & (NPTEPG - 1));
652	}
653	return (0);
654}
655
656/*
657 *	Routine:	pmap_extract
658 *	Function:
659 *		Extract the physical page address associated
660 *		with the given map/virtual_address pair.
661 */
662vm_offset_t
663pmap_extract(pmap, va)
664	register pmap_t pmap;
665	vm_offset_t va;
666{
667	vm_offset_t rtval;	/* XXX FIXME */
668	vm_offset_t pdirindex;
669	pdirindex = va >> PDRSHIFT;
670	if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
671		pt_entry_t *pte;
672		if ((rtval & PG_PS) != 0) {
673			rtval &= ~(NBPDR - 1);
674			rtval |= va & (NBPDR - 1);
675			return rtval;
676		}
677		pte = get_ptbase(pmap) + i386_btop(va);
678		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
679		return rtval;
680	}
681	return 0;
682
683}
684
685/***************************************************
686 * Low level mapping routines.....
687 ***************************************************/
688
689/*
690 * add a wired page to the kva
691 * note that in order for the mapping to take effect -- you
692 * should do a invltlb after doing the pmap_kenter...
693 */
694PMAP_INLINE void
695pmap_kenter(vm_offset_t va, vm_offset_t pa)
696{
697	pt_entry_t *pte;
698	pt_entry_t npte, opte;
699
700	npte = pa | PG_RW | PG_V | pgeflag;
701	pte = vtopte(va);
702	opte = *pte;
703	*pte = npte;
704	/*if (opte)*/
705		invltlb_1pg(va);	/* XXX what about SMP? */
706}
707
708/*
709 * remove a page from the kernel pagetables
710 */
711PMAP_INLINE void
712pmap_kremove(vm_offset_t va)
713{
714	register pt_entry_t *pte;
715
716	pte = vtopte(va);
717	*pte = 0;
718	invltlb_1pg(va);	/* XXX what about SMP? */
719}
720
721/*
722 *	Used to map a range of physical addresses into kernel
723 *	virtual address space.
724 *
725 *	The value passed in '*virt' is a suggested virtual address for
726 *	the mapping. Architectures which can support a direct-mapped
727 *	physical to virtual region can return the appropriate address
728 *	within that region, leaving '*virt' unchanged. Other
729 *	architectures should map the pages starting at '*virt' and
730 *	update '*virt' with the first usable address after the mapped
731 *	region.
732 */
733vm_offset_t
734pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
735{
736	vm_offset_t sva = *virt;
737	vm_offset_t va = sva;
738	while (start < end) {
739		pmap_kenter(va, start);
740		va += PAGE_SIZE;
741		start += PAGE_SIZE;
742	}
743	*virt = va;
744	return (sva);
745}
746
747
748/*
749 * Add a list of wired pages to the kva
750 * this routine is only used for temporary
751 * kernel mappings that do not need to have
752 * page modification or references recorded.
753 * Note that old mappings are simply written
754 * over.  The page *must* be wired.
755 */
756void
757pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
758{
759	vm_offset_t end_va;
760
761	end_va = va + count * PAGE_SIZE;
762
763	while (va < end_va) {
764		pt_entry_t *pte;
765
766		pte = vtopte(va);
767		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
768#ifdef SMP
769		cpu_invlpg((void *)va);
770#else
771		invltlb_1pg(va);
772#endif
773		va += PAGE_SIZE;
774		m++;
775	}
776#ifdef SMP
777	smp_invltlb();
778#endif
779}
780
781/*
782 * this routine jerks page mappings from the
783 * kernel -- it is meant only for temporary mappings.
784 */
785void
786pmap_qremove(vm_offset_t va, int count)
787{
788	vm_offset_t end_va;
789
790	end_va = va + count*PAGE_SIZE;
791
792	while (va < end_va) {
793		pt_entry_t *pte;
794
795		pte = vtopte(va);
796		*pte = 0;
797#ifdef SMP
798		cpu_invlpg((void *)va);
799#else
800		invltlb_1pg(va);
801#endif
802		va += PAGE_SIZE;
803	}
804#ifdef SMP
805	smp_invltlb();
806#endif
807}
808
809static vm_page_t
810pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
811{
812	vm_page_t m;
813retry:
814	m = vm_page_lookup(object, pindex);
815	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
816		goto retry;
817	return m;
818}
819
820/*
821 * Create the Uarea stack for a new process.
822 * This routine directly affects the fork perf for a process.
823 */
824void
825pmap_new_proc(struct proc *p)
826{
827#ifdef I386_CPU
828	int updateneeded = 0;
829#endif
830	int i;
831	vm_object_t upobj;
832	vm_offset_t up;
833	vm_page_t m;
834	pt_entry_t *ptek, oldpte;
835
836	/*
837	 * allocate object for the upages
838	 */
839	upobj = p->p_upages_obj;
840	if (upobj == NULL) {
841		upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
842		p->p_upages_obj = upobj;
843	}
844
845	/* get a kernel virtual address for the U area for this thread */
846	up = (vm_offset_t)p->p_uarea;
847	if (up == 0) {
848		up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
849		if (up == 0)
850			panic("pmap_new_proc: upage allocation failed");
851		p->p_uarea = (struct user *)up;
852	}
853
854	ptek = vtopte(up);
855
856	for (i = 0; i < UAREA_PAGES; i++) {
857		/*
858		 * Get a kernel stack page
859		 */
860		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
861
862		/*
863		 * Wire the page
864		 */
865		m->wire_count++;
866		cnt.v_wire_count++;
867
868		oldpte = *(ptek + i);
869		/*
870		 * Enter the page into the kernel address space.
871		 */
872		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
873		if (oldpte) {
874#ifdef I386_CPU
875			updateneeded = 1;
876#else
877			invlpg(up + i * PAGE_SIZE);
878#endif
879		}
880
881		vm_page_wakeup(m);
882		vm_page_flag_clear(m, PG_ZERO);
883		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
884		m->valid = VM_PAGE_BITS_ALL;
885	}
886#ifdef I386_CPU
887	if (updateneeded)
888		invltlb();
889#endif
890}
891
892/*
893 * Dispose the U-Area for a process that has exited.
894 * This routine directly impacts the exit perf of a process.
895 */
896void
897pmap_dispose_proc(p)
898	struct proc *p;
899{
900	int i;
901	vm_object_t upobj;
902	vm_offset_t up;
903	vm_page_t m;
904	pt_entry_t *ptek, oldpte;
905
906	upobj = p->p_upages_obj;
907	up = (vm_offset_t)p->p_uarea;
908	ptek = vtopte(up);
909	for (i = 0; i < UAREA_PAGES; i++) {
910		m = vm_page_lookup(upobj, i);
911		if (m == NULL)
912			panic("pmap_dispose_proc: upage already missing?");
913		vm_page_busy(m);
914		oldpte = *(ptek + i);
915		*(ptek + i) = 0;
916#ifndef I386_CPU
917		invlpg(up + i * PAGE_SIZE);
918#endif
919		vm_page_unwire(m, 0);
920		vm_page_free(m);
921	}
922#ifdef I386_CPU
923	invltlb();
924#endif
925}
926
927/*
928 * Allow the U_AREA for a process to be prejudicially paged out.
929 */
930void
931pmap_swapout_proc(p)
932	struct proc *p;
933{
934	int i;
935	vm_object_t upobj;
936	vm_offset_t up;
937	vm_page_t m;
938
939	upobj = p->p_upages_obj;
940	up = (vm_offset_t)p->p_uarea;
941	for (i = 0; i < UAREA_PAGES; i++) {
942		m = vm_page_lookup(upobj, i);
943		if (m == NULL)
944			panic("pmap_swapout_proc: upage already missing?");
945		vm_page_dirty(m);
946		vm_page_unwire(m, 0);
947		pmap_kremove(up + i * PAGE_SIZE);
948	}
949}
950
951/*
952 * Bring the U-Area for a specified process back in.
953 */
954void
955pmap_swapin_proc(p)
956	struct proc *p;
957{
958	int i, rv;
959	vm_object_t upobj;
960	vm_offset_t up;
961	vm_page_t m;
962
963	upobj = p->p_upages_obj;
964	up = (vm_offset_t)p->p_uarea;
965	for (i = 0; i < UAREA_PAGES; i++) {
966		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
967		pmap_kenter(up + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
968		if (m->valid != VM_PAGE_BITS_ALL) {
969			rv = vm_pager_get_pages(upobj, &m, 1, 0);
970			if (rv != VM_PAGER_OK)
971				panic("pmap_swapin_proc: cannot get upage for proc: %d\n", p->p_pid);
972			m = vm_page_lookup(upobj, i);
973			m->valid = VM_PAGE_BITS_ALL;
974		}
975		vm_page_wire(m);
976		vm_page_wakeup(m);
977		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
978	}
979}
980
981/*
982 * Create the kernel stack (including pcb for i386) for a new thread.
983 * This routine directly affects the fork perf for a process and
984 * create performance for a thread.
985 */
986void
987pmap_new_thread(struct thread *td)
988{
989#ifdef I386_CPU
990	int updateneeded = 0;
991#endif
992	int i;
993	vm_object_t ksobj;
994	vm_page_t m;
995	vm_offset_t ks;
996	pt_entry_t *ptek, oldpte;
997
998	/*
999	 * allocate object for the kstack
1000	 */
1001	ksobj = td->td_kstack_obj;
1002	if (ksobj == NULL) {
1003		ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
1004		td->td_kstack_obj = ksobj;
1005	}
1006
1007#ifdef KSTACK_GUARD
1008	/* get a kernel virtual address for the kstack for this thread */
1009	ks = td->td_kstack;
1010	if (ks == 0) {
1011		ks = kmem_alloc_nofault(kernel_map,
1012		    (KSTACK_PAGES + 1) * PAGE_SIZE);
1013		if (ks == 0)
1014			panic("pmap_new_thread: kstack allocation failed");
1015		ks += PAGE_SIZE;
1016		td->td_kstack = ks;
1017	}
1018
1019	ptek = vtopte(ks - PAGE_SIZE);
1020	oldpte = *ptek;
1021	*ptek = 0;
1022	if (oldpte) {
1023#ifdef I386_CPU
1024		updateneeded = 1;
1025#else
1026		invlpg(ks - PAGE_SIZE);
1027#endif
1028	}
1029	ptek++;
1030#else
1031	/* get a kernel virtual address for the kstack for this thread */
1032	ks = td->td_kstack;
1033	if (ks == 0) {
1034		ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
1035		if (ks == 0)
1036			panic("pmap_new_thread: kstack allocation failed");
1037		td->td_kstack = ks;
1038	}
1039	ptek = vtopte(ks);
1040#endif
1041	for (i = 0; i < KSTACK_PAGES; i++) {
1042		/*
1043		 * Get a kernel stack page
1044		 */
1045		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1046
1047		/*
1048		 * Wire the page
1049		 */
1050		m->wire_count++;
1051		cnt.v_wire_count++;
1052
1053		oldpte = *(ptek + i);
1054		/*
1055		 * Enter the page into the kernel address space.
1056		 */
1057		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
1058		if (oldpte) {
1059#ifdef I386_CPU
1060			updateneeded = 1;
1061#else
1062			invlpg(ks + i * PAGE_SIZE);
1063#endif
1064		}
1065
1066		vm_page_wakeup(m);
1067		vm_page_flag_clear(m, PG_ZERO);
1068		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1069		m->valid = VM_PAGE_BITS_ALL;
1070	}
1071#ifdef I386_CPU
1072	if (updateneeded)
1073		invltlb();
1074#endif
1075}
1076
1077/*
1078 * Dispose the kernel stack for a thread that has exited.
1079 * This routine directly impacts the exit perf of a process and thread.
1080 */
1081void
1082pmap_dispose_thread(td)
1083	struct thread *td;
1084{
1085	int i;
1086	vm_object_t ksobj;
1087	vm_offset_t ks;
1088	vm_page_t m;
1089	pt_entry_t *ptek, oldpte;
1090
1091	ksobj = td->td_kstack_obj;
1092	ks = td->td_kstack;
1093	ptek = vtopte(ks);
1094	for (i = 0; i < KSTACK_PAGES; i++) {
1095		m = vm_page_lookup(ksobj, i);
1096		if (m == NULL)
1097			panic("pmap_dispose_thread: kstack already missing?");
1098		vm_page_busy(m);
1099		oldpte = *(ptek + i);
1100		*(ptek + i) = 0;
1101#ifndef I386_CPU
1102		invlpg(ks + i * PAGE_SIZE);
1103#endif
1104		vm_page_unwire(m, 0);
1105		vm_page_free(m);
1106	}
1107#ifdef I386_CPU
1108	invltlb();
1109#endif
1110}
1111
1112/*
1113 * Allow the Kernel stack for a thread to be prejudicially paged out.
1114 */
1115void
1116pmap_swapout_thread(td)
1117	struct thread *td;
1118{
1119	int i;
1120	vm_object_t ksobj;
1121	vm_offset_t ks;
1122	vm_page_t m;
1123
1124	ksobj = td->td_kstack_obj;
1125	ks = td->td_kstack;
1126	for (i = 0; i < KSTACK_PAGES; i++) {
1127		m = vm_page_lookup(ksobj, i);
1128		if (m == NULL)
1129			panic("pmap_swapout_thread: kstack already missing?");
1130		vm_page_dirty(m);
1131		vm_page_unwire(m, 0);
1132		pmap_kremove(ks + i * PAGE_SIZE);
1133	}
1134}
1135
1136/*
1137 * Bring the kernel stack for a specified thread back in.
1138 */
1139void
1140pmap_swapin_thread(td)
1141	struct thread *td;
1142{
1143	int i, rv;
1144	vm_object_t ksobj;
1145	vm_offset_t ks;
1146	vm_page_t m;
1147
1148	ksobj = td->td_kstack_obj;
1149	ks = td->td_kstack;
1150	for (i = 0; i < KSTACK_PAGES; i++) {
1151		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1152		pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
1153		if (m->valid != VM_PAGE_BITS_ALL) {
1154			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1155			if (rv != VM_PAGER_OK)
1156				panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1157			m = vm_page_lookup(ksobj, i);
1158			m->valid = VM_PAGE_BITS_ALL;
1159		}
1160		vm_page_wire(m);
1161		vm_page_wakeup(m);
1162		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1163	}
1164}
1165
1166/***************************************************
1167 * Page table page management routines.....
1168 ***************************************************/
1169
1170/*
1171 * This routine unholds page table pages, and if the hold count
1172 * drops to zero, then it decrements the wire count.
1173 */
1174static int
1175_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1176{
1177
1178	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1179		;
1180
1181	if (m->hold_count == 0) {
1182		vm_offset_t pteva;
1183		/*
1184		 * unmap the page table page
1185		 */
1186		pmap->pm_pdir[m->pindex] = 0;
1187		--pmap->pm_stats.resident_count;
1188		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1189		    (PTDpde & PG_FRAME)) {
1190			/*
1191			 * Do a invltlb to make the invalidated mapping
1192			 * take effect immediately.
1193			 */
1194			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1195			pmap_invalidate_page(pmap, pteva);
1196		}
1197
1198		if (pmap->pm_ptphint == m)
1199			pmap->pm_ptphint = NULL;
1200
1201		/*
1202		 * If the page is finally unwired, simply free it.
1203		 */
1204		--m->wire_count;
1205		if (m->wire_count == 0) {
1206
1207			vm_page_flash(m);
1208			vm_page_busy(m);
1209			vm_page_free_zero(m);
1210			--cnt.v_wire_count;
1211		}
1212		return 1;
1213	}
1214	return 0;
1215}
1216
1217static PMAP_INLINE int
1218pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1219{
1220	vm_page_unhold(m);
1221	if (m->hold_count == 0)
1222		return _pmap_unwire_pte_hold(pmap, m);
1223	else
1224		return 0;
1225}
1226
1227/*
1228 * After removing a page table entry, this routine is used to
1229 * conditionally free the page, and manage the hold/wire counts.
1230 */
1231static int
1232pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1233{
1234	unsigned ptepindex;
1235	if (va >= VM_MAXUSER_ADDRESS)
1236		return 0;
1237
1238	if (mpte == NULL) {
1239		ptepindex = (va >> PDRSHIFT);
1240		if (pmap->pm_ptphint &&
1241			(pmap->pm_ptphint->pindex == ptepindex)) {
1242			mpte = pmap->pm_ptphint;
1243		} else {
1244			mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1245			pmap->pm_ptphint = mpte;
1246		}
1247	}
1248
1249	return pmap_unwire_pte_hold(pmap, mpte);
1250}
1251
1252void
1253pmap_pinit0(pmap)
1254	struct pmap *pmap;
1255{
1256	pmap->pm_pdir =
1257		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1258	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1259	pmap->pm_count = 1;
1260	pmap->pm_ptphint = NULL;
1261	pmap->pm_active = 0;
1262	TAILQ_INIT(&pmap->pm_pvlist);
1263	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1264	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1265}
1266
1267/*
1268 * Initialize a preallocated and zeroed pmap structure,
1269 * such as one in a vmspace structure.
1270 */
1271void
1272pmap_pinit(pmap)
1273	register struct pmap *pmap;
1274{
1275	vm_page_t ptdpg;
1276
1277	/*
1278	 * No need to allocate page table space yet but we do need a valid
1279	 * page directory table.
1280	 */
1281	if (pmap->pm_pdir == NULL)
1282		pmap->pm_pdir =
1283			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1284
1285	/*
1286	 * allocate object for the ptes
1287	 */
1288	if (pmap->pm_pteobj == NULL)
1289		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1290
1291	/*
1292	 * allocate the page directory page
1293	 */
1294	ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI,
1295			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1296
1297	ptdpg->wire_count = 1;
1298	++cnt.v_wire_count;
1299
1300
1301	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
1302	ptdpg->valid = VM_PAGE_BITS_ALL;
1303
1304	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1305	if ((ptdpg->flags & PG_ZERO) == 0)
1306		bzero(pmap->pm_pdir, PAGE_SIZE);
1307
1308	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1309	/* Wire in kernel global address entries. */
1310	/* XXX copies current process, does not fill in MPPTDI */
1311	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1312#ifdef SMP
1313	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1314#endif
1315
1316	/* install self-referential address mapping entry */
1317	pmap->pm_pdir[PTDPTDI] =
1318		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1319
1320	pmap->pm_count = 1;
1321	pmap->pm_active = 0;
1322	pmap->pm_ptphint = NULL;
1323	TAILQ_INIT(&pmap->pm_pvlist);
1324	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1325}
1326
/*
 * Second-stage pmap initialization hook.  This is an empty stub:
 * the kernel global address entries are copied in by pmap_pinit().
 */
void
pmap_pinit2(pmap)
	struct pmap *pmap;
{
	/* XXX: Remove this stub when no longer called */
}
1339
1340static int
1341pmap_release_free_page(pmap_t pmap, vm_page_t p)
1342{
1343	pd_entry_t *pde = pmap->pm_pdir;
1344	/*
1345	 * This code optimizes the case of freeing non-busy
1346	 * page-table pages.  Those pages are zero now, and
1347	 * might as well be placed directly into the zero queue.
1348	 */
1349	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1350		return 0;
1351
1352	vm_page_busy(p);
1353
1354	/*
1355	 * Remove the page table page from the processes address space.
1356	 */
1357	pde[p->pindex] = 0;
1358	pmap->pm_stats.resident_count--;
1359
1360	if (p->hold_count)  {
1361		panic("pmap_release: freeing held page table page");
1362	}
1363	/*
1364	 * Page directory pages need to have the kernel
1365	 * stuff cleared, so they can go into the zero queue also.
1366	 */
1367	if (p->pindex == PTDPTDI) {
1368		bzero(pde + KPTDI, nkpt * PTESIZE);
1369#ifdef SMP
1370		pde[MPPTDI] = 0;
1371#endif
1372		pde[APTDPTDI] = 0;
1373		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1374	}
1375
1376	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1377		pmap->pm_ptphint = NULL;
1378
1379	p->wire_count--;
1380	cnt.v_wire_count--;
1381	vm_page_free_zero(p);
1382	return 1;
1383}
1384
1385/*
1386 * this routine is called if the page table page is not
1387 * mapped correctly.
1388 */
1389static vm_page_t
1390_pmap_allocpte(pmap, ptepindex)
1391	pmap_t	pmap;
1392	unsigned ptepindex;
1393{
1394	vm_offset_t pteva, ptepa;	/* XXXPA */
1395	vm_page_t m;
1396
1397	/*
1398	 * Find or fabricate a new pagetable page
1399	 */
1400	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1401			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1402
1403	KASSERT(m->queue == PQ_NONE,
1404		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1405
1406	if (m->wire_count == 0)
1407		cnt.v_wire_count++;
1408	m->wire_count++;
1409
1410	/*
1411	 * Increment the hold count for the page table page
1412	 * (denoting a new mapping.)
1413	 */
1414	m->hold_count++;
1415
1416	/*
1417	 * Map the pagetable page into the process address space, if
1418	 * it isn't already there.
1419	 */
1420
1421	pmap->pm_stats.resident_count++;
1422
1423	ptepa = VM_PAGE_TO_PHYS(m);
1424	pmap->pm_pdir[ptepindex] =
1425		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1426
1427	/*
1428	 * Set the page table hint
1429	 */
1430	pmap->pm_ptphint = m;
1431
1432	/*
1433	 * Try to use the new mapping, but if we cannot, then
1434	 * do it with the routine that maps the page explicitly.
1435	 */
1436	if ((m->flags & PG_ZERO) == 0) {
1437		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1438		    (PTDpde & PG_FRAME)) {
1439			pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex);
1440			bzero((caddr_t) pteva, PAGE_SIZE);
1441		} else {
1442			pmap_zero_page(ptepa);
1443		}
1444	}
1445
1446	m->valid = VM_PAGE_BITS_ALL;
1447	vm_page_flag_clear(m, PG_ZERO);
1448	vm_page_flag_set(m, PG_MAPPED);
1449	vm_page_wakeup(m);
1450
1451	return m;
1452}
1453
1454static vm_page_t
1455pmap_allocpte(pmap_t pmap, vm_offset_t va)
1456{
1457	unsigned ptepindex;
1458	pd_entry_t ptepa;
1459	vm_page_t m;
1460
1461	/*
1462	 * Calculate pagetable page index
1463	 */
1464	ptepindex = va >> PDRSHIFT;
1465
1466	/*
1467	 * Get the page directory entry
1468	 */
1469	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1470
1471	/*
1472	 * This supports switching from a 4MB page to a
1473	 * normal 4K page.
1474	 */
1475	if (ptepa & PG_PS) {
1476		pmap->pm_pdir[ptepindex] = 0;
1477		ptepa = 0;
1478		invltlb();
1479	}
1480
1481	/*
1482	 * If the page table page is mapped, we just increment the
1483	 * hold count, and activate it.
1484	 */
1485	if (ptepa) {
1486		/*
1487		 * In order to get the page table page, try the
1488		 * hint first.
1489		 */
1490		if (pmap->pm_ptphint &&
1491			(pmap->pm_ptphint->pindex == ptepindex)) {
1492			m = pmap->pm_ptphint;
1493		} else {
1494			m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1495			pmap->pm_ptphint = m;
1496		}
1497		m->hold_count++;
1498		return m;
1499	}
1500	/*
1501	 * Here if the pte page isn't mapped, or if it has been deallocated.
1502	 */
1503	return _pmap_allocpte(pmap, ptepindex);
1504}
1505
1506
1507/***************************************************
1508* Pmap allocation/deallocation routines.
1509 ***************************************************/
1510
1511/*
1512 * Release any resources held by the given physical map.
1513 * Called when a pmap initialized by pmap_pinit is being released.
1514 * Should only be called if the map contains no valid mappings.
1515 */
1516void
1517pmap_release(pmap_t pmap)
1518{
1519	vm_page_t p,n,ptdpg;
1520	vm_object_t object = pmap->pm_pteobj;
1521	int curgeneration;
1522
1523#if defined(DIAGNOSTIC)
1524	if (object->ref_count != 1)
1525		panic("pmap_release: pteobj reference count != 1");
1526#endif
1527
1528	ptdpg = NULL;
1529	LIST_REMOVE(pmap, pm_list);
1530retry:
1531	curgeneration = object->generation;
1532	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1533		n = TAILQ_NEXT(p, listq);
1534		if (p->pindex == PTDPTDI) {
1535			ptdpg = p;
1536			continue;
1537		}
1538		while (1) {
1539			if (!pmap_release_free_page(pmap, p) &&
1540				(object->generation != curgeneration))
1541				goto retry;
1542		}
1543	}
1544
1545	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1546		goto retry;
1547}
1548
1549static int
1550kvm_size(SYSCTL_HANDLER_ARGS)
1551{
1552	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1553
1554        return sysctl_handle_long(oidp, &ksize, 0, req);
1555}
1556SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1557    0, 0, kvm_size, "IU", "Size of KVM");
1558
1559static int
1560kvm_free(SYSCTL_HANDLER_ARGS)
1561{
1562	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1563
1564        return sysctl_handle_long(oidp, &kfree, 0, req);
1565}
1566SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1567    0, 0, kvm_free, "IU", "Amount of KVM free");
1568
1569/*
1570 * grow the number of kernel page table entries, if needed
1571 */
1572void
1573pmap_growkernel(vm_offset_t addr)
1574{
1575	struct pmap *pmap;
1576	int s;
1577	vm_offset_t ptppaddr;
1578	vm_page_t nkpg;
1579	pd_entry_t newpdir;
1580
1581	s = splhigh();
1582	if (kernel_vm_end == 0) {
1583		kernel_vm_end = KERNBASE;
1584		nkpt = 0;
1585		while (pdir_pde(PTD, kernel_vm_end)) {
1586			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1587			nkpt++;
1588		}
1589	}
1590	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1591	while (kernel_vm_end < addr) {
1592		if (pdir_pde(PTD, kernel_vm_end)) {
1593			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1594			continue;
1595		}
1596
1597		/*
1598		 * This index is bogus, but out of the way
1599		 */
1600		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1601		if (!nkpg)
1602			panic("pmap_growkernel: no memory to grow kernel");
1603
1604		nkpt++;
1605
1606		vm_page_wire(nkpg);
1607		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1608		pmap_zero_page(ptppaddr);
1609		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1610		pdir_pde(PTD, kernel_vm_end) = newpdir;
1611
1612		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1613			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1614		}
1615		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1616	}
1617	splx(s);
1618}
1619
1620/*
1621 *	Retire the given physical map from service.
1622 *	Should only be called if the map contains
1623 *	no valid mappings.
1624 */
1625void
1626pmap_destroy(pmap_t pmap)
1627{
1628	int count;
1629
1630	if (pmap == NULL)
1631		return;
1632
1633	count = --pmap->pm_count;
1634	if (count == 0) {
1635		pmap_release(pmap);
1636		panic("destroying a pmap is not yet implemented");
1637	}
1638}
1639
1640/*
1641 *	Add a reference to the specified pmap.
1642 */
1643void
1644pmap_reference(pmap_t pmap)
1645{
1646	if (pmap != NULL) {
1647		pmap->pm_count++;
1648	}
1649}
1650
1651/***************************************************
1652* page management routines.
1653 ***************************************************/
1654
1655/*
1656 * free the pv_entry back to the free list
1657 */
1658static PMAP_INLINE void
1659free_pv_entry(pv_entry_t pv)
1660{
1661	pv_entry_count--;
1662	zfree(pvzone, pv);
1663}
1664
1665/*
1666 * get a new pv_entry, allocating a block from the system
1667 * when needed.
1668 * the memory allocation is performed bypassing the malloc code
1669 * because of the possibility of allocations at interrupt time.
1670 */
1671static pv_entry_t
1672get_pv_entry(void)
1673{
1674	pv_entry_count++;
1675	if (pv_entry_high_water &&
1676		(pv_entry_count > pv_entry_high_water) &&
1677		(pmap_pagedaemon_waken == 0)) {
1678		pmap_pagedaemon_waken = 1;
1679		wakeup (&vm_pages_needed);
1680	}
1681	return zalloc(pvzone);
1682}
1683
1684/*
1685 * This routine is very drastic, but can save the system
1686 * in a pinch.
1687 */
1688void
1689pmap_collect()
1690{
1691	int i;
1692	vm_page_t m;
1693	static int warningdone = 0;
1694
1695	if (pmap_pagedaemon_waken == 0)
1696		return;
1697
1698	if (warningdone < 5) {
1699		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1700		warningdone++;
1701	}
1702
1703	for(i = 0; i < vm_page_array_size; i++) {
1704		m = &vm_page_array[i];
1705		if (m->wire_count || m->hold_count || m->busy ||
1706		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
1707			continue;
1708		pmap_remove_all(m);
1709	}
1710	pmap_pagedaemon_waken = 0;
1711}
1712
1713
1714/*
1715 * If it is the first entry on the list, it is actually
1716 * in the header and we must copy the following entry up
1717 * to the header.  Otherwise we must search the list for
1718 * the entry.  In either case we free the now unused entry.
1719 */
1720
1721static int
1722pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1723{
1724	pv_entry_t pv;
1725	int rtval;
1726	int s;
1727
1728	s = splvm();
1729	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1730		for (pv = TAILQ_FIRST(&m->md.pv_list);
1731			pv;
1732			pv = TAILQ_NEXT(pv, pv_list)) {
1733			if (pmap == pv->pv_pmap && va == pv->pv_va)
1734				break;
1735		}
1736	} else {
1737		for (pv = TAILQ_FIRST(&pmap->pm_pvlist);
1738			pv;
1739			pv = TAILQ_NEXT(pv, pv_plist)) {
1740			if (va == pv->pv_va)
1741				break;
1742		}
1743	}
1744
1745	rtval = 0;
1746	if (pv) {
1747		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1748		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1749		m->md.pv_list_count--;
1750		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1751			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1752
1753		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1754		free_pv_entry(pv);
1755	}
1756
1757	splx(s);
1758	return rtval;
1759}
1760
1761/*
1762 * Create a pv entry for page at pa for
1763 * (pmap, va).
1764 */
1765static void
1766pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1767{
1768
1769	int s;
1770	pv_entry_t pv;
1771
1772	s = splvm();
1773	pv = get_pv_entry();
1774	pv->pv_va = va;
1775	pv->pv_pmap = pmap;
1776	pv->pv_ptem = mpte;
1777
1778	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1779	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1780	m->md.pv_list_count++;
1781
1782	splx(s);
1783}
1784
1785/*
1786 * pmap_remove_pte: do the things to unmap a page in a process
1787 */
1788static int
1789pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1790{
1791	pt_entry_t oldpte;
1792	vm_page_t m;
1793
1794	oldpte = atomic_readandclear_int(ptq);
1795	if (oldpte & PG_W)
1796		pmap->pm_stats.wired_count -= 1;
1797	/*
1798	 * Machines that don't support invlpg, also don't support
1799	 * PG_G.
1800	 */
1801	if (oldpte & PG_G)
1802		invlpg(va);
1803	pmap->pm_stats.resident_count -= 1;
1804	if (oldpte & PG_MANAGED) {
1805		m = PHYS_TO_VM_PAGE(oldpte);
1806		if (oldpte & PG_M) {
1807#if defined(PMAP_DIAGNOSTIC)
1808			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1809				printf(
1810	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1811				    va, oldpte);
1812			}
1813#endif
1814			if (pmap_track_modified(va))
1815				vm_page_dirty(m);
1816		}
1817		if (oldpte & PG_A)
1818			vm_page_flag_set(m, PG_REFERENCED);
1819		return pmap_remove_entry(pmap, m, va);
1820	} else {
1821		return pmap_unuse_pt(pmap, va, NULL);
1822	}
1823
1824	return 0;
1825}
1826
1827/*
1828 * Remove a single page from a process address space
1829 */
1830static void
1831pmap_remove_page(pmap_t pmap, vm_offset_t va)
1832{
1833	register pt_entry_t *ptq;
1834
1835	/*
1836	 * if there is no pte for this address, just skip it!!!
1837	 */
1838	if (*pmap_pde(pmap, va) == 0) {
1839		return;
1840	}
1841
1842	/*
1843	 * get a local va for mappings for this pmap.
1844	 */
1845	ptq = get_ptbase(pmap) + i386_btop(va);
1846	if (*ptq) {
1847		(void) pmap_remove_pte(pmap, ptq, va);
1848		pmap_invalidate_page(pmap, va);
1849	}
1850	return;
1851}
1852
1853/*
1854 *	Remove the given range of addresses from the specified map.
1855 *
1856 *	It is assumed that the start and end are properly
1857 *	rounded to the page size.
1858 */
1859void
1860pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1861{
1862	register pt_entry_t *ptbase;
1863	vm_offset_t pdnxt;
1864	pd_entry_t ptpaddr;
1865	vm_offset_t sindex, eindex;
1866	int anyvalid;
1867
1868	if (pmap == NULL)
1869		return;
1870
1871	if (pmap->pm_stats.resident_count == 0)
1872		return;
1873
1874	/*
1875	 * special handling of removing one page.  a very
1876	 * common operation and easy to short circuit some
1877	 * code.
1878	 */
1879	if ((sva + PAGE_SIZE == eva) &&
1880	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1881		pmap_remove_page(pmap, sva);
1882		return;
1883	}
1884
1885	anyvalid = 0;
1886
1887	/*
1888	 * Get a local virtual address for the mappings that are being
1889	 * worked with.
1890	 */
1891	ptbase = get_ptbase(pmap);
1892
1893	sindex = i386_btop(sva);
1894	eindex = i386_btop(eva);
1895
1896	for (; sindex < eindex; sindex = pdnxt) {
1897		unsigned pdirindex;
1898
1899		/*
1900		 * Calculate index for next page table.
1901		 */
1902		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1903		if (pmap->pm_stats.resident_count == 0)
1904			break;
1905
1906		pdirindex = sindex / NPDEPG;
1907		ptpaddr = pmap->pm_pdir[pdirindex];
1908		if ((ptpaddr & PG_PS) != 0) {
1909			pmap->pm_pdir[pdirindex] = 0;
1910			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1911			anyvalid++;
1912			continue;
1913		}
1914
1915		/*
1916		 * Weed out invalid mappings. Note: we assume that the page
1917		 * directory table is always allocated, and in kernel virtual.
1918		 */
1919		if (ptpaddr == 0)
1920			continue;
1921
1922		/*
1923		 * Limit our scan to either the end of the va represented
1924		 * by the current page table page, or to the end of the
1925		 * range being removed.
1926		 */
1927		if (pdnxt > eindex) {
1928			pdnxt = eindex;
1929		}
1930
1931		for ( ;sindex != pdnxt; sindex++) {
1932			vm_offset_t va;
1933			if (ptbase[sindex] == 0) {
1934				continue;
1935			}
1936			va = i386_ptob(sindex);
1937
1938			anyvalid++;
1939			if (pmap_remove_pte(pmap,
1940				ptbase + sindex, va))
1941				break;
1942		}
1943	}
1944
1945	if (anyvalid)
1946		pmap_invalidate_all(pmap);
1947}
1948
1949/*
1950 *	Routine:	pmap_remove_all
1951 *	Function:
1952 *		Removes this physical page from
1953 *		all physical maps in which it resides.
1954 *		Reflects back modify bits to the pager.
1955 *
1956 *	Notes:
1957 *		Original versions of this routine were very
1958 *		inefficient because they iteratively called
1959 *		pmap_remove (slow...)
1960 */
1961
1962static void
1963pmap_remove_all(vm_page_t m)
1964{
1965	register pv_entry_t pv;
1966	pt_entry_t *pte, tpte;
1967	int s;
1968
1969#if defined(PMAP_DIAGNOSTIC)
1970	/*
1971	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1972	 * pages!
1973	 */
1974	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1975		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
1976	}
1977#endif
1978
1979	s = splvm();
1980	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1981		pv->pv_pmap->pm_stats.resident_count--;
1982
1983		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1984
1985		tpte = atomic_readandclear_int(pte);
1986		if (tpte & PG_W)
1987			pv->pv_pmap->pm_stats.wired_count--;
1988
1989		if (tpte & PG_A)
1990			vm_page_flag_set(m, PG_REFERENCED);
1991
1992		/*
1993		 * Update the vm_page_t clean and reference bits.
1994		 */
1995		if (tpte & PG_M) {
1996#if defined(PMAP_DIAGNOSTIC)
1997			if (pmap_nw_modified((pt_entry_t) tpte)) {
1998				printf(
1999	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
2000				    pv->pv_va, tpte);
2001			}
2002#endif
2003			if (pmap_track_modified(pv->pv_va))
2004				vm_page_dirty(m);
2005		}
2006		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2007
2008		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2009		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2010		m->md.pv_list_count--;
2011		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2012		free_pv_entry(pv);
2013	}
2014
2015	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2016
2017	splx(s);
2018}
2019
2020/*
2021 *	Set the physical protection on the
2022 *	specified range of this map as requested.
2023 */
2024void
2025pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2026{
2027	register pt_entry_t *ptbase;
2028	vm_offset_t pdnxt;
2029	pd_entry_t ptpaddr;
2030	vm_pindex_t sindex, eindex;
2031	int anychanged;
2032
2033	if (pmap == NULL)
2034		return;
2035
2036	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2037		pmap_remove(pmap, sva, eva);
2038		return;
2039	}
2040
2041	if (prot & VM_PROT_WRITE)
2042		return;
2043
2044	anychanged = 0;
2045
2046	ptbase = get_ptbase(pmap);
2047
2048	sindex = i386_btop(sva);
2049	eindex = i386_btop(eva);
2050
2051	for (; sindex < eindex; sindex = pdnxt) {
2052
2053		unsigned pdirindex;
2054
2055		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2056
2057		pdirindex = sindex / NPDEPG;
2058		ptpaddr = pmap->pm_pdir[pdirindex];
2059		if ((ptpaddr & PG_PS) != 0) {
2060			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2061			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2062			anychanged++;
2063			continue;
2064		}
2065
2066		/*
2067		 * Weed out invalid mappings. Note: we assume that the page
2068		 * directory table is always allocated, and in kernel virtual.
2069		 */
2070		if (ptpaddr == 0)
2071			continue;
2072
2073		if (pdnxt > eindex) {
2074			pdnxt = eindex;
2075		}
2076
2077		for (; sindex != pdnxt; sindex++) {
2078
2079			pt_entry_t pbits;
2080			vm_page_t m;
2081
2082			pbits = ptbase[sindex];
2083
2084			if (pbits & PG_MANAGED) {
2085				m = NULL;
2086				if (pbits & PG_A) {
2087					m = PHYS_TO_VM_PAGE(pbits);
2088					vm_page_flag_set(m, PG_REFERENCED);
2089					pbits &= ~PG_A;
2090				}
2091				if (pbits & PG_M) {
2092					if (pmap_track_modified(i386_ptob(sindex))) {
2093						if (m == NULL)
2094							m = PHYS_TO_VM_PAGE(pbits);
2095						vm_page_dirty(m);
2096						pbits &= ~PG_M;
2097					}
2098				}
2099			}
2100
2101			pbits &= ~PG_RW;
2102
2103			if (pbits != ptbase[sindex]) {
2104				ptbase[sindex] = pbits;
2105				anychanged = 1;
2106			}
2107		}
2108	}
2109	if (anychanged)
2110		pmap_invalidate_all(pmap);
2111}
2112
2113/*
2114 *	Insert the given physical page (p) at
2115 *	the specified virtual address (v) in the
2116 *	target physical map with the protection requested.
2117 *
2118 *	If specified, the page will be wired down, meaning
2119 *	that the related pte can not be reclaimed.
2120 *
2121 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2122 *	or lose information.  That is, this routine must actually
2123 *	insert this page into the given map NOW.
2124 */
2125void
2126pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2127	   boolean_t wired)
2128{
2129	vm_offset_t pa;
2130	register pt_entry_t *pte;
2131	vm_offset_t opa;
2132	pt_entry_t origpte, newpte;
2133	vm_page_t mpte;
2134
2135	if (pmap == NULL)
2136		return;
2137
2138	va &= PG_FRAME;
2139#ifdef PMAP_DIAGNOSTIC
2140	if (va > VM_MAX_KERNEL_ADDRESS)
2141		panic("pmap_enter: toobig");
2142	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2143		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2144#endif
2145
2146	mpte = NULL;
2147	/*
2148	 * In the case that a page table page is not
2149	 * resident, we are creating it here.
2150	 */
2151	if (va < VM_MAXUSER_ADDRESS) {
2152		mpte = pmap_allocpte(pmap, va);
2153	}
2154#if 0 && defined(PMAP_DIAGNOSTIC)
2155	else {
2156		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2157		if (((origpte = *pdeaddr) & PG_V) == 0) {
2158			panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n",
2159				pmap->pm_pdir[PTDPTDI], origpte, va);
2160		}
2161		if (smp_active) {
2162			pdeaddr = (vm_offset_t *) IdlePTDS[PCPU_GET(cpuid)];
2163			if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) {
2164				if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr))
2165					printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr);
2166				printf("cpuid: %d, pdeaddr: 0x%x\n", PCPU_GET(cpuid), pdeaddr);
2167				panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n",
2168					pmap->pm_pdir[PTDPTDI], newpte, origpte, va);
2169			}
2170		}
2171	}
2172#endif
2173
2174	pte = pmap_pte(pmap, va);
2175
2176	/*
2177	 * Page Directory table entry not valid, we need a new PT page
2178	 */
2179	if (pte == NULL) {
2180		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2181			(void *)pmap->pm_pdir[PTDPTDI], va);
2182	}
2183
2184	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2185	origpte = *(vm_offset_t *)pte;
2186	opa = origpte & PG_FRAME;
2187
2188	if (origpte & PG_PS)
2189		panic("pmap_enter: attempted pmap_enter on 4MB page");
2190
2191	/*
2192	 * Mapping has not changed, must be protection or wiring change.
2193	 */
2194	if (origpte && (opa == pa)) {
2195		/*
2196		 * Wiring change, just update stats. We don't worry about
2197		 * wiring PT pages as they remain resident as long as there
2198		 * are valid mappings in them. Hence, if a user page is wired,
2199		 * the PT page will be also.
2200		 */
2201		if (wired && ((origpte & PG_W) == 0))
2202			pmap->pm_stats.wired_count++;
2203		else if (!wired && (origpte & PG_W))
2204			pmap->pm_stats.wired_count--;
2205
2206#if defined(PMAP_DIAGNOSTIC)
2207		if (pmap_nw_modified((pt_entry_t) origpte)) {
2208			printf(
2209	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2210			    va, origpte);
2211		}
2212#endif
2213
2214		/*
2215		 * Remove extra pte reference
2216		 */
2217		if (mpte)
2218			mpte->hold_count--;
2219
2220		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2221			if ((origpte & PG_RW) == 0) {
2222				*pte |= PG_RW;
2223#ifdef SMP
2224				cpu_invlpg((void *)va);
2225				if (pmap->pm_active & PCPU_GET(other_cpus))
2226					smp_invltlb();
2227#else
2228				invltlb_1pg(va);
2229#endif
2230			}
2231			return;
2232		}
2233
2234		/*
2235		 * We might be turning off write access to the page,
2236		 * so we go ahead and sense modify status.
2237		 */
2238		if (origpte & PG_MANAGED) {
2239			if ((origpte & PG_M) && pmap_track_modified(va)) {
2240				vm_page_t om;
2241				om = PHYS_TO_VM_PAGE(opa);
2242				vm_page_dirty(om);
2243			}
2244			pa |= PG_MANAGED;
2245		}
2246		goto validate;
2247	}
2248	/*
2249	 * Mapping has changed, invalidate old range and fall through to
2250	 * handle validating new mapping.
2251	 */
2252	if (opa) {
2253		int err;
2254		err = pmap_remove_pte(pmap, pte, va);
2255		if (err)
2256			panic("pmap_enter: pte vanished, va: 0x%x", va);
2257	}
2258
2259	/*
2260	 * Enter on the PV list if part of our managed memory. Note that we
2261	 * raise IPL while manipulating pv_table since pmap_enter can be
2262	 * called at interrupt time.
2263	 */
2264	if (pmap_initialized &&
2265	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2266		pmap_insert_entry(pmap, va, mpte, m);
2267		pa |= PG_MANAGED;
2268	}
2269
2270	/*
2271	 * Increment counters
2272	 */
2273	pmap->pm_stats.resident_count++;
2274	if (wired)
2275		pmap->pm_stats.wired_count++;
2276
2277validate:
2278	/*
2279	 * Now validate mapping with desired protection/wiring.
2280	 */
2281	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2282
2283	if (wired)
2284		newpte |= PG_W;
2285	if (va < VM_MAXUSER_ADDRESS)
2286		newpte |= PG_U;
2287	if (pmap == kernel_pmap)
2288		newpte |= pgeflag;
2289
2290	/*
2291	 * if the mapping or permission bits are different, we need
2292	 * to update the pte.
2293	 */
2294	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2295		*pte = newpte | PG_A;
2296		/*if (origpte)*/ {
2297#ifdef SMP
2298			cpu_invlpg((void *)va);
2299			if (pmap->pm_active & PCPU_GET(other_cpus))
2300				smp_invltlb();
2301#else
2302			invltlb_1pg(va);
2303#endif
2304		}
2305	}
2306}
2307
2308/*
2309 * this code makes some *MAJOR* assumptions:
2310 * 1. Current pmap & pmap exists.
2311 * 2. Not wired.
2312 * 3. Read access.
2313 * 4. No page table pages.
2314 * 5. Tlbflush is deferred to calling procedure.
2315 * 6. Page IS managed.
2316 * but is *MUCH* faster than pmap_enter...
2317 */
2318
2319static vm_page_t
2320pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2321{
2322	pt_entry_t *pte;
2323	vm_offset_t pa;
2324
2325	/*
2326	 * In the case that a page table page is not
2327	 * resident, we are creating it here.
2328	 */
2329	if (va < VM_MAXUSER_ADDRESS) {
2330		unsigned ptepindex;
2331		pd_entry_t ptepa;
2332
2333		/*
2334		 * Calculate pagetable page index
2335		 */
2336		ptepindex = va >> PDRSHIFT;
2337		if (mpte && (mpte->pindex == ptepindex)) {
2338			mpte->hold_count++;
2339		} else {
2340retry:
2341			/*
2342			 * Get the page directory entry
2343			 */
2344			ptepa = pmap->pm_pdir[ptepindex];
2345
2346			/*
2347			 * If the page table page is mapped, we just increment
2348			 * the hold count, and activate it.
2349			 */
2350			if (ptepa) {
2351				if (ptepa & PG_PS)
2352					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2353				if (pmap->pm_ptphint &&
2354					(pmap->pm_ptphint->pindex == ptepindex)) {
2355					mpte = pmap->pm_ptphint;
2356				} else {
2357					mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
2358					pmap->pm_ptphint = mpte;
2359				}
2360				if (mpte == NULL)
2361					goto retry;
2362				mpte->hold_count++;
2363			} else {
2364				mpte = _pmap_allocpte(pmap, ptepindex);
2365			}
2366		}
2367	} else {
2368		mpte = NULL;
2369	}
2370
2371	/*
2372	 * This call to vtopte makes the assumption that we are
2373	 * entering the page into the current pmap.  In order to support
2374	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2375	 * But that isn't as quick as vtopte.
2376	 */
2377	pte = vtopte(va);
2378	if (*pte) {
2379		if (mpte)
2380			pmap_unwire_pte_hold(pmap, mpte);
2381		return 0;
2382	}
2383
2384	/*
2385	 * Enter on the PV list if part of our managed memory. Note that we
2386	 * raise IPL while manipulating pv_table since pmap_enter can be
2387	 * called at interrupt time.
2388	 */
2389	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2390		pmap_insert_entry(pmap, va, mpte, m);
2391
2392	/*
2393	 * Increment counters
2394	 */
2395	pmap->pm_stats.resident_count++;
2396
2397	pa = VM_PAGE_TO_PHYS(m);
2398
2399	/*
2400	 * Now validate mapping with RO protection
2401	 */
2402	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2403		*pte = pa | PG_V | PG_U;
2404	else
2405		*pte = pa | PG_V | PG_U | PG_MANAGED;
2406
2407	return mpte;
2408}
2409
2410/*
2411 * Make a temporary mapping for a physical address.  This is only intended
2412 * to be used for panic dumps.
2413 */
2414void *
2415pmap_kenter_temporary(vm_offset_t pa, int i)
2416{
2417	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2418	return ((void *)crashdumpmap);
2419}
2420
2421#define MAX_INIT_PT (96)
2422/*
2423 * pmap_object_init_pt preloads the ptes for a given object
2424 * into the specified pmap.  This eliminates the blast of soft
2425 * faults on process startup and immediately after an mmap.
2426 */
2427void
2428pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2429		    vm_object_t object, vm_pindex_t pindex,
2430		    vm_size_t size, int limit)
2431{
2432	vm_offset_t tmpidx;
2433	int psize;
2434	vm_page_t p, mpte;
2435	int objpgs;
2436
2437	if (pmap == NULL || object == NULL)
2438		return;
2439
2440	/*
2441	 * This code maps large physical mmap regions into the
2442	 * processor address space.  Note that some shortcuts
2443	 * are taken, but the code works.
2444	 */
2445	if (pseflag && (object->type == OBJT_DEVICE) &&
2446	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2447		int i;
2448		vm_page_t m[1];
2449		unsigned int ptepindex;
2450		int npdes;
2451		pd_entry_t ptepa;
2452
2453		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2454			return;
2455
2456retry:
2457		p = vm_page_lookup(object, pindex);
2458		if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
2459			goto retry;
2460
2461		if (p == NULL) {
2462			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2463			if (p == NULL)
2464				return;
2465			m[0] = p;
2466
2467			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2468				vm_page_free(p);
2469				return;
2470			}
2471
2472			p = vm_page_lookup(object, pindex);
2473			vm_page_wakeup(p);
2474		}
2475
2476		ptepa = VM_PAGE_TO_PHYS(p);
2477		if (ptepa & (NBPDR - 1)) {
2478			return;
2479		}
2480
2481		p->valid = VM_PAGE_BITS_ALL;
2482
2483		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2484		npdes = size >> PDRSHIFT;
2485		for(i = 0; i < npdes; i++) {
2486			pmap->pm_pdir[ptepindex] =
2487			    ptepa | PG_U | PG_RW | PG_V | PG_PS;
2488			ptepa += NBPDR;
2489			ptepindex += 1;
2490		}
2491		vm_page_flag_set(p, PG_MAPPED);
2492		invltlb();
2493		return;
2494	}
2495
2496	psize = i386_btop(size);
2497
2498	if ((object->type != OBJT_VNODE) ||
2499		((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2500			(object->resident_page_count > MAX_INIT_PT))) {
2501		return;
2502	}
2503
2504	if (psize + pindex > object->size) {
2505		if (object->size < pindex)
2506			return;
2507		psize = object->size - pindex;
2508	}
2509
2510	mpte = NULL;
2511	/*
2512	 * if we are processing a major portion of the object, then scan the
2513	 * entire thing.
2514	 */
2515	if (psize > (object->resident_page_count >> 2)) {
2516		objpgs = psize;
2517
2518		for (p = TAILQ_FIRST(&object->memq);
2519		    ((objpgs > 0) && (p != NULL));
2520		    p = TAILQ_NEXT(p, listq)) {
2521
2522			tmpidx = p->pindex;
2523			if (tmpidx < pindex) {
2524				continue;
2525			}
2526			tmpidx -= pindex;
2527			if (tmpidx >= psize) {
2528				continue;
2529			}
2530			/*
2531			 * don't allow an madvise to blow away our really
2532			 * free pages allocating pv entries.
2533			 */
2534			if ((limit & MAP_PREFAULT_MADVISE) &&
2535			    cnt.v_free_count < cnt.v_free_reserved) {
2536				break;
2537			}
2538			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2539				(p->busy == 0) &&
2540			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2541				if ((p->queue - p->pc) == PQ_CACHE)
2542					vm_page_deactivate(p);
2543				vm_page_busy(p);
2544				mpte = pmap_enter_quick(pmap,
2545					addr + i386_ptob(tmpidx), p, mpte);
2546				vm_page_flag_set(p, PG_MAPPED);
2547				vm_page_wakeup(p);
2548			}
2549			objpgs -= 1;
2550		}
2551	} else {
2552		/*
2553		 * else lookup the pages one-by-one.
2554		 */
2555		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2556			/*
2557			 * don't allow an madvise to blow away our really
2558			 * free pages allocating pv entries.
2559			 */
2560			if ((limit & MAP_PREFAULT_MADVISE) &&
2561			    cnt.v_free_count < cnt.v_free_reserved) {
2562				break;
2563			}
2564			p = vm_page_lookup(object, tmpidx + pindex);
2565			if (p &&
2566			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2567				(p->busy == 0) &&
2568			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2569				if ((p->queue - p->pc) == PQ_CACHE)
2570					vm_page_deactivate(p);
2571				vm_page_busy(p);
2572				mpte = pmap_enter_quick(pmap,
2573					addr + i386_ptob(tmpidx), p, mpte);
2574				vm_page_flag_set(p, PG_MAPPED);
2575				vm_page_wakeup(p);
2576			}
2577		}
2578	}
2579	return;
2580}
2581
2582/*
2583 * pmap_prefault provides a quick way of clustering
2584 * pagefaults into a processes address space.  It is a "cousin"
2585 * of pmap_object_init_pt, except it runs at page fault time instead
2586 * of mmap time.
2587 */
2588#define PFBAK 4
2589#define PFFOR 4
2590#define PAGEORDER_SIZE (PFBAK+PFFOR)
2591
2592static int pmap_prefault_pageorder[] = {
2593	-PAGE_SIZE, PAGE_SIZE,
2594	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2595	-3 * PAGE_SIZE, 3 * PAGE_SIZE
2596	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2597};
2598
2599void
2600pmap_prefault(pmap, addra, entry)
2601	pmap_t pmap;
2602	vm_offset_t addra;
2603	vm_map_entry_t entry;
2604{
2605	int i;
2606	vm_offset_t starta;
2607	vm_offset_t addr;
2608	vm_pindex_t pindex;
2609	vm_page_t m, mpte;
2610	vm_object_t object;
2611
2612	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2613		return;
2614
2615	object = entry->object.vm_object;
2616
2617	starta = addra - PFBAK * PAGE_SIZE;
2618	if (starta < entry->start) {
2619		starta = entry->start;
2620	} else if (starta > addra) {
2621		starta = 0;
2622	}
2623
2624	mpte = NULL;
2625	for (i = 0; i < PAGEORDER_SIZE; i++) {
2626		vm_object_t lobject;
2627		pt_entry_t *pte;
2628
2629		addr = addra + pmap_prefault_pageorder[i];
2630		if (addr > addra + (PFFOR * PAGE_SIZE))
2631			addr = 0;
2632
2633		if (addr < starta || addr >= entry->end)
2634			continue;
2635
2636		if ((*pmap_pde(pmap, addr)) == NULL)
2637			continue;
2638
2639		pte = vtopte(addr);
2640		if (*pte)
2641			continue;
2642
2643		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2644		lobject = object;
2645		for (m = vm_page_lookup(lobject, pindex);
2646		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2647		    lobject = lobject->backing_object) {
2648			if (lobject->backing_object_offset & PAGE_MASK)
2649				break;
2650			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2651			m = vm_page_lookup(lobject->backing_object, pindex);
2652		}
2653
2654		/*
2655		 * give-up when a page is not in memory
2656		 */
2657		if (m == NULL)
2658			break;
2659
2660		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2661			(m->busy == 0) &&
2662		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2663
2664			if ((m->queue - m->pc) == PQ_CACHE) {
2665				vm_page_deactivate(m);
2666			}
2667			vm_page_busy(m);
2668			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2669			vm_page_flag_set(m, PG_MAPPED);
2670			vm_page_wakeup(m);
2671		}
2672	}
2673}
2674
2675/*
2676 *	Routine:	pmap_change_wiring
2677 *	Function:	Change the wiring attribute for a map/virtual-address
2678 *			pair.
2679 *	In/out conditions:
2680 *			The mapping must already exist in the pmap.
2681 */
2682void
2683pmap_change_wiring(pmap, va, wired)
2684	register pmap_t pmap;
2685	vm_offset_t va;
2686	boolean_t wired;
2687{
2688	register pt_entry_t *pte;
2689
2690	if (pmap == NULL)
2691		return;
2692
2693	pte = pmap_pte(pmap, va);
2694
2695	if (wired && !pmap_pte_w(pte))
2696		pmap->pm_stats.wired_count++;
2697	else if (!wired && pmap_pte_w(pte))
2698		pmap->pm_stats.wired_count--;
2699
2700	/*
2701	 * Wiring is not a hardware characteristic so there is no need to
2702	 * invalidate TLB.
2703	 */
2704	pmap_pte_set_w(pte, wired);
2705}
2706
2707
2708
2709/*
2710 *	Copy the range specified by src_addr/len
2711 *	from the source map to the range dst_addr/len
2712 *	in the destination map.
2713 *
2714 *	This routine is only advisory and need not do anything.
2715 */
2716
2717void
2718pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2719	  vm_offset_t src_addr)
2720{
2721	vm_offset_t addr;
2722	vm_offset_t end_addr = src_addr + len;
2723	vm_offset_t pdnxt;
2724	pd_entry_t src_frame, dst_frame;
2725	vm_page_t m;
2726	pd_entry_t saved_pde;
2727
2728	if (dst_addr != src_addr)
2729		return;
2730
2731	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2732	if (src_frame != (PTDpde & PG_FRAME))
2733		return;
2734
2735	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2736	if (dst_frame != (APTDpde & PG_FRAME)) {
2737		APTDpde = dst_frame | PG_RW | PG_V;
2738#if defined(SMP)
2739		/* The page directory is not shared between CPUs */
2740		cpu_invltlb();
2741#else
2742		invltlb();
2743#endif
2744	}
2745 	saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
2746	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
2747		pt_entry_t *src_pte, *dst_pte;
2748		vm_page_t dstmpte, srcmpte;
2749		pd_entry_t srcptepaddr;
2750		unsigned ptepindex;
2751
2752		if (addr >= UPT_MIN_ADDRESS)
2753			panic("pmap_copy: invalid to pmap_copy page tables\n");
2754
2755		/*
2756		 * Don't let optional prefaulting of pages make us go
2757		 * way below the low water mark of free pages or way
2758		 * above high water mark of used pv entries.
2759		 */
2760		if (cnt.v_free_count < cnt.v_free_reserved ||
2761		    pv_entry_count > pv_entry_high_water)
2762			break;
2763
2764		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2765		ptepindex = addr >> PDRSHIFT;
2766
2767		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2768		if (srcptepaddr == 0)
2769			continue;
2770
2771		if (srcptepaddr & PG_PS) {
2772			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2773				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2774				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2775			}
2776			continue;
2777		}
2778
2779		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2780		if ((srcmpte == NULL) ||
2781		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2782			continue;
2783
2784		if (pdnxt > end_addr)
2785			pdnxt = end_addr;
2786
2787		src_pte = vtopte(addr);
2788		dst_pte = avtopte(addr);
2789		while (addr < pdnxt) {
2790			pt_entry_t ptetemp;
2791			ptetemp = *src_pte;
2792			/*
2793			 * we only virtual copy managed pages
2794			 */
2795			if ((ptetemp & PG_MANAGED) != 0) {
2796				/*
2797				 * We have to check after allocpte for the
2798				 * pte still being around...  allocpte can
2799				 * block.
2800				 */
2801				dstmpte = pmap_allocpte(dst_pmap, addr);
2802				if ((APTDpde & PG_FRAME) !=
2803				    (saved_pde & PG_FRAME)) {
2804					APTDpde = saved_pde;
2805printf ("IT HAPPENNED!");
2806#if defined(SMP)
2807					cpu_invltlb();
2808#else
2809					invltlb();
2810#endif
2811				}
2812				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2813					/*
2814					 * Clear the modified and
2815					 * accessed (referenced) bits
2816					 * during the copy.
2817					 */
2818					m = PHYS_TO_VM_PAGE(ptetemp);
2819					*dst_pte = ptetemp & ~(PG_M | PG_A);
2820					dst_pmap->pm_stats.resident_count++;
2821					pmap_insert_entry(dst_pmap, addr,
2822						dstmpte, m);
2823	 			} else {
2824					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2825				}
2826				if (dstmpte->hold_count >= srcmpte->hold_count)
2827					break;
2828			}
2829			addr += PAGE_SIZE;
2830			src_pte++;
2831			dst_pte++;
2832		}
2833	}
2834}
2835
2836/*
2837 *	Routine:	pmap_kernel
2838 *	Function:
2839 *		Returns the physical map handle for the kernel.
2840 */
2841pmap_t
2842pmap_kernel()
2843{
2844	return (kernel_pmap);
2845}
2846
2847/*
2848 *	pmap_zero_page zeros the specified hardware page by mapping
2849 *	the page into KVM and using bzero to clear its contents.
2850 */
2851void
2852pmap_zero_page(vm_offset_t phys)
2853{
2854
2855	if (*CMAP2)
2856		panic("pmap_zero_page: CMAP2 busy");
2857
2858	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2859	invltlb_1pg((vm_offset_t)CADDR2);
2860
2861#if defined(I686_CPU)
2862	if (cpu_class == CPUCLASS_686)
2863		i686_pagezero(CADDR2);
2864	else
2865#endif
2866		bzero(CADDR2, PAGE_SIZE);
2867	*CMAP2 = 0;
2868}
2869
2870/*
2871 *	pmap_zero_page_area zeros the specified hardware page by mapping
2872 *	the page into KVM and using bzero to clear its contents.
2873 *
2874 *	off and size may not cover an area beyond a single hardware page.
2875 */
2876void
2877pmap_zero_page_area(vm_offset_t phys, int off, int size)
2878{
2879
2880	if (*CMAP2)
2881		panic("pmap_zero_page: CMAP2 busy");
2882
2883	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2884	invltlb_1pg((vm_offset_t)CADDR2);
2885
2886#if defined(I686_CPU)
2887	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2888		i686_pagezero(CADDR2);
2889	else
2890#endif
2891		bzero((char *)CADDR2 + off, size);
2892	*CMAP2 = 0;
2893}
2894
2895/*
2896 *	pmap_copy_page copies the specified (machine independent)
2897 *	page by mapping the page into virtual memory and using
2898 *	bcopy to copy the page, one machine dependent page at a
2899 *	time.
2900 */
2901void
2902pmap_copy_page(vm_offset_t src, vm_offset_t dst)
2903{
2904
2905	if (*CMAP1)
2906		panic("pmap_copy_page: CMAP1 busy");
2907	if (*CMAP2)
2908		panic("pmap_copy_page: CMAP2 busy");
2909
2910	*CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2911	*CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2912#ifdef I386_CPU
2913	invltlb();
2914#else
2915	invlpg((u_int)CADDR1);
2916	invlpg((u_int)CADDR2);
2917#endif
2918
2919	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2920
2921	*CMAP1 = 0;
2922	*CMAP2 = 0;
2923}
2924
2925
2926/*
2927 *	Routine:	pmap_pageable
2928 *	Function:
2929 *		Make the specified pages (by pmap, offset)
2930 *		pageable (or not) as requested.
2931 *
2932 *		A page which is not pageable may not take
2933 *		a fault; therefore, its page table entry
2934 *		must remain valid for the duration.
2935 *
2936 *		This routine is merely advisory; pmap_enter
2937 *		will specify that these pages are to be wired
2938 *		down (or not) as appropriate.
2939 */
2940void
2941pmap_pageable(pmap, sva, eva, pageable)
2942	pmap_t pmap;
2943	vm_offset_t sva, eva;
2944	boolean_t pageable;
2945{
2946}
2947
2948/*
2949 * this routine returns true if a physical page resides
2950 * in the given pmap.
2951 */
2952boolean_t
2953pmap_page_exists(pmap, m)
2954	pmap_t pmap;
2955	vm_page_t m;
2956{
2957	register pv_entry_t pv;
2958	int s;
2959
2960	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2961		return FALSE;
2962
2963	s = splvm();
2964
2965	/*
2966	 * Not found, check current mappings returning immediately if found.
2967	 */
2968	for (pv = TAILQ_FIRST(&m->md.pv_list);
2969		pv;
2970		pv = TAILQ_NEXT(pv, pv_list)) {
2971		if (pv->pv_pmap == pmap) {
2972			splx(s);
2973			return TRUE;
2974		}
2975	}
2976	splx(s);
2977	return (FALSE);
2978}
2979
2980#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2981/*
2982 * Remove all pages from specified address space
2983 * this aids process exit speeds.  Also, this code
2984 * is special cased for current process only, but
2985 * can have the more generic (and slightly slower)
2986 * mode enabled.  This is much faster than pmap_remove
2987 * in the case of running down an entire address space.
2988 */
2989void
2990pmap_remove_pages(pmap, sva, eva)
2991	pmap_t pmap;
2992	vm_offset_t sva, eva;
2993{
2994	pt_entry_t *pte, tpte;
2995	vm_page_t m;
2996	pv_entry_t pv, npv;
2997	int s;
2998
2999#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3000	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
3001		printf("warning: pmap_remove_pages called with non-current pmap\n");
3002		return;
3003	}
3004#endif
3005
3006	s = splvm();
3007	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
3008
3009		if (pv->pv_va >= eva || pv->pv_va < sva) {
3010			npv = TAILQ_NEXT(pv, pv_plist);
3011			continue;
3012		}
3013
3014#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3015		pte = vtopte(pv->pv_va);
3016#else
3017		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3018#endif
3019		tpte = *pte;
3020
3021		if (tpte == 0) {
3022			printf("TPTE at %p  IS ZERO @ VA %08x\n",
3023							pte, pv->pv_va);
3024			panic("bad pte");
3025		}
3026
3027/*
3028 * We cannot remove wired pages from a process' mapping at this time
3029 */
3030		if (tpte & PG_W) {
3031			npv = TAILQ_NEXT(pv, pv_plist);
3032			continue;
3033		}
3034
3035		m = PHYS_TO_VM_PAGE(tpte);
3036		KASSERT(m->phys_addr == (tpte & PG_FRAME),
3037		    ("vm_page_t %p phys_addr mismatch %08x %08x",
3038		    m, m->phys_addr, tpte));
3039
3040		KASSERT(m < &vm_page_array[vm_page_array_size],
3041			("pmap_remove_pages: bad tpte %x", tpte));
3042
3043		pv->pv_pmap->pm_stats.resident_count--;
3044
3045		*pte = 0;
3046
3047		/*
3048		 * Update the vm_page_t clean and reference bits.
3049		 */
3050		if (tpte & PG_M) {
3051			vm_page_dirty(m);
3052		}
3053
3054		npv = TAILQ_NEXT(pv, pv_plist);
3055		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
3056
3057		m->md.pv_list_count--;
3058		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3059		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
3060			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3061		}
3062
3063		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
3064		free_pv_entry(pv);
3065	}
3066	splx(s);
3067	pmap_invalidate_all(pmap);
3068}
3069
3070/*
3071 * pmap_testbit tests bits in pte's
3072 * note that the testbit/changebit routines are inline,
3073 * and a lot of things compile-time evaluate.
3074 */
3075static boolean_t
3076pmap_testbit(m, bit)
3077	vm_page_t m;
3078	int bit;
3079{
3080	pv_entry_t pv;
3081	pt_entry_t *pte;
3082	int s;
3083
3084	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3085		return FALSE;
3086
3087	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3088		return FALSE;
3089
3090	s = splvm();
3091
3092	for (pv = TAILQ_FIRST(&m->md.pv_list);
3093		pv;
3094		pv = TAILQ_NEXT(pv, pv_list)) {
3095
3096		/*
3097		 * if the bit being tested is the modified bit, then
3098		 * mark clean_map and ptes as never
3099		 * modified.
3100		 */
3101		if (bit & (PG_A|PG_M)) {
3102			if (!pmap_track_modified(pv->pv_va))
3103				continue;
3104		}
3105
3106#if defined(PMAP_DIAGNOSTIC)
3107		if (!pv->pv_pmap) {
3108			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3109			continue;
3110		}
3111#endif
3112		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3113		if (*pte & bit) {
3114			splx(s);
3115			return TRUE;
3116		}
3117	}
3118	splx(s);
3119	return (FALSE);
3120}
3121
3122/*
3123 * this routine is used to modify bits in ptes
3124 */
3125static __inline void
3126pmap_changebit(vm_page_t m, int bit, boolean_t setem)
3127{
3128	register pv_entry_t pv;
3129	register pt_entry_t *pte;
3130	int s;
3131
3132	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3133		return;
3134
3135	s = splvm();
3136
3137	/*
3138	 * Loop over all current mappings setting/clearing as appropos If
3139	 * setting RO do we need to clear the VAC?
3140	 */
3141	for (pv = TAILQ_FIRST(&m->md.pv_list);
3142		pv;
3143		pv = TAILQ_NEXT(pv, pv_list)) {
3144
3145		/*
3146		 * don't write protect pager mappings
3147		 */
3148		if (!setem && (bit == PG_RW)) {
3149			if (!pmap_track_modified(pv->pv_va))
3150				continue;
3151		}
3152
3153#if defined(PMAP_DIAGNOSTIC)
3154		if (!pv->pv_pmap) {
3155			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3156			continue;
3157		}
3158#endif
3159
3160		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3161
3162		if (setem) {
3163			*pte |= bit;
3164			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3165		} else {
3166			pt_entry_t pbits = *pte;
3167			if (pbits & bit) {
3168				if (bit == PG_RW) {
3169					if (pbits & PG_M) {
3170						vm_page_dirty(m);
3171					}
3172					*pte = pbits & ~(PG_M|PG_RW);
3173				} else {
3174					*pte = pbits & ~bit;
3175				}
3176				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3177			}
3178		}
3179	}
3180	splx(s);
3181}
3182
3183/*
3184 *      pmap_page_protect:
3185 *
3186 *      Lower the permission for all mappings to a given page.
3187 */
3188void
3189pmap_page_protect(vm_page_t m, vm_prot_t prot)
3190{
3191	if ((prot & VM_PROT_WRITE) == 0) {
3192		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3193			pmap_changebit(m, PG_RW, FALSE);
3194		} else {
3195			pmap_remove_all(m);
3196		}
3197	}
3198}
3199
3200vm_offset_t
3201pmap_phys_address(ppn)
3202	int ppn;
3203{
3204	return (i386_ptob(ppn));
3205}
3206
3207/*
3208 *	pmap_ts_referenced:
3209 *
3210 *	Return the count of reference bits for a page, clearing all of them.
3211 */
3212int
3213pmap_ts_referenced(vm_page_t m)
3214{
3215	register pv_entry_t pv, pvf, pvn;
3216	pt_entry_t *pte;
3217	int s;
3218	int rtval = 0;
3219
3220	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3221		return (rtval);
3222
3223	s = splvm();
3224
3225	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3226
3227		pvf = pv;
3228
3229		do {
3230			pvn = TAILQ_NEXT(pv, pv_list);
3231
3232			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3233
3234			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3235
3236			if (!pmap_track_modified(pv->pv_va))
3237				continue;
3238
3239			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3240
3241			if (pte && (*pte & PG_A)) {
3242				*pte &= ~PG_A;
3243
3244				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3245
3246				rtval++;
3247				if (rtval > 4) {
3248					break;
3249				}
3250			}
3251		} while ((pv = pvn) != NULL && pv != pvf);
3252	}
3253	splx(s);
3254
3255	return (rtval);
3256}
3257
3258/*
3259 *	pmap_is_modified:
3260 *
3261 *	Return whether or not the specified physical page was modified
3262 *	in any physical maps.
3263 */
3264boolean_t
3265pmap_is_modified(vm_page_t m)
3266{
3267	return pmap_testbit(m, PG_M);
3268}
3269
3270/*
3271 *	Clear the modify bits on the specified physical page.
3272 */
3273void
3274pmap_clear_modify(vm_page_t m)
3275{
3276	pmap_changebit(m, PG_M, FALSE);
3277}
3278
3279/*
3280 *	pmap_clear_reference:
3281 *
3282 *	Clear the reference bit on the specified physical page.
3283 */
3284void
3285pmap_clear_reference(vm_page_t m)
3286{
3287	pmap_changebit(m, PG_A, FALSE);
3288}
3289
3290/*
3291 * Miscellaneous support routines follow
3292 */
3293
3294static void
3295i386_protection_init()
3296{
3297	register int *kp, prot;
3298
3299	kp = protection_codes;
3300	for (prot = 0; prot < 8; prot++) {
3301		switch (prot) {
3302		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3303			/*
3304			 * Read access is also 0. There isn't any execute bit,
3305			 * so just make it readable.
3306			 */
3307		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3308		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3309		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3310			*kp++ = 0;
3311			break;
3312		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3313		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3314		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3315		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3316			*kp++ = PG_RW;
3317			break;
3318		}
3319	}
3320}
3321
3322/*
3323 * Map a set of physical memory pages into the kernel virtual
3324 * address space. Return a pointer to where it is mapped. This
3325 * routine is intended to be used for mapping device memory,
3326 * NOT real memory.
3327 */
3328void *
3329pmap_mapdev(pa, size)
3330	vm_offset_t pa;
3331	vm_size_t size;
3332{
3333	vm_offset_t va, tmpva, offset;
3334	pt_entry_t *pte;
3335
3336	offset = pa & PAGE_MASK;
3337	size = roundup(offset + size, PAGE_SIZE);
3338
3339	GIANT_REQUIRED;
3340
3341	va = kmem_alloc_pageable(kernel_map, size);
3342	if (!va)
3343		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3344
3345	pa = pa & PG_FRAME;
3346	for (tmpva = va; size > 0;) {
3347		pte = vtopte(tmpva);
3348		*pte = pa | PG_RW | PG_V | pgeflag;
3349		size -= PAGE_SIZE;
3350		tmpva += PAGE_SIZE;
3351		pa += PAGE_SIZE;
3352	}
3353	invltlb();
3354
3355	return ((void *)(va + offset));
3356}
3357
3358void
3359pmap_unmapdev(va, size)
3360	vm_offset_t va;
3361	vm_size_t size;
3362{
3363	vm_offset_t base, offset;
3364
3365	base = va & PG_FRAME;
3366	offset = va & PAGE_MASK;
3367	size = roundup(offset + size, PAGE_SIZE);
3368	kmem_free(kernel_map, base, size);
3369}
3370
3371/*
3372 * perform the pmap work for mincore
3373 */
3374int
3375pmap_mincore(pmap, addr)
3376	pmap_t pmap;
3377	vm_offset_t addr;
3378{
3379
3380	pt_entry_t *ptep, pte;
3381	vm_page_t m;
3382	int val = 0;
3383
3384	ptep = pmap_pte(pmap, addr);
3385	if (ptep == 0) {
3386		return 0;
3387	}
3388
3389	if ((pte = *ptep) != 0) {
3390		vm_offset_t pa;
3391
3392		val = MINCORE_INCORE;
3393		if ((pte & PG_MANAGED) == 0)
3394			return val;
3395
3396		pa = pte & PG_FRAME;
3397
3398		m = PHYS_TO_VM_PAGE(pa);
3399
3400		/*
3401		 * Modified by us
3402		 */
3403		if (pte & PG_M)
3404			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3405		/*
3406		 * Modified by someone
3407		 */
3408		else if (m->dirty || pmap_is_modified(m))
3409			val |= MINCORE_MODIFIED_OTHER;
3410		/*
3411		 * Referenced by us
3412		 */
3413		if (pte & PG_A)
3414			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3415
3416		/*
3417		 * Referenced by someone
3418		 */
3419		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3420			val |= MINCORE_REFERENCED_OTHER;
3421			vm_page_flag_set(m, PG_REFERENCED);
3422		}
3423	}
3424	return val;
3425}
3426
3427void
3428pmap_activate(struct thread *td)
3429{
3430	struct proc *p = td->td_proc;
3431	pmap_t	pmap;
3432	u_int32_t  cr3;
3433
3434	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3435#if defined(SMP)
3436	pmap->pm_active |= 1 << PCPU_GET(cpuid);
3437#else
3438	pmap->pm_active |= 1;
3439#endif
3440#if defined(SWTCH_OPTIM_STATS)
3441	tlb_flush_count++;
3442#endif
3443	cr3 = vtophys(pmap->pm_pdir);
3444	/* XXXKSE this is wrong.
3445	 * pmap_activate is for the current thread on the current cpu
3446	 */
3447	if (p->p_flag & P_KSES) {
3448		/* Make sure all other cr3 entries are updated. */
3449		/* what if they are running?  XXXKSE (maybe abort them) */
3450		FOREACH_THREAD_IN_PROC(p, td) {
3451			td->td_pcb->pcb_cr3 = cr3;
3452		}
3453	} else {
3454		td->td_pcb->pcb_cr3 = cr3;
3455	}
3456	load_cr3(cr3);
3457}
3458
3459vm_offset_t
3460pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3461{
3462
3463	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3464		return addr;
3465	}
3466
3467	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3468	return addr;
3469}
3470
3471
#if defined(PMAP_DEBUG)
/*
 * Debugging aid: print the valid user mappings of the process with the
 * given pid, two per output line, together with the hold count, wire
 * count and flags of each mapped page.
 *
 * The walk ends, and the function returns, at the first virtual address
 * at or above VM_MIN_KERNEL_ADDRESS.  allproc_lock is held shared for
 * the duration.
 *
 * Returns the number of valid ptes printed (0 when no process matches).
 */
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		if (p->p_pid != pid)
			continue;

		if (p->p_vmspace) {
			int i,j;
			index = 0;	/* mappings printed on the current line */
			pmap = vmspace_pmap(p->p_vmspace);
			for (i = 0; i < NPDEPG; i++) {
				pd_entry_t *pde;
				pt_entry_t *pte;
				vm_offset_t base = i << PDRSHIFT;

				pde = &pmap->pm_pdir[i];
				if (pde && pmap_pde_v(pde)) {
					for (j = 0; j < NPTEPG; j++) {
						vm_offset_t va = base + (j << PAGE_SHIFT);
						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
							/* Finish a partially filled line. */
							if (index) {
								index = 0;
								printf("\n");
							}
							sx_sunlock(&allproc_lock);
							return npte;
						}
						pte = pmap_pte_quick(pmap, va);
						if (pte && pmap_pte_v(pte)) {
							pt_entry_t pa;
							vm_page_t m;
							pa = *pte;
							m = PHYS_TO_VM_PAGE(pa);
							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
								va, pa, m->hold_count, m->wire_count, m->flags);
							npte++;
							index++;
							if (index >= 2) {
								index = 0;
								printf("\n");
							} else {
								printf(" ");
							}
						}
					}
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return npte;
}
#endif
3532
3533#if defined(DEBUG)
3534
3535static void	pads __P((pmap_t pm));
3536void		pmap_pvdump __P((vm_offset_t pa));
3537
3538/* print address space of pmap*/
3539static void
3540pads(pm)
3541	pmap_t pm;
3542{
3543	int i, j;
3544	vm_offset_t va;
3545	pt_entry_t *ptep;
3546
3547	if (pm == kernel_pmap)
3548		return;
3549	for (i = 0; i < NPDEPG; i++)
3550		if (pm->pm_pdir[i])
3551			for (j = 0; j < NPTEPG; j++) {
3552				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3553				if (pm == kernel_pmap && va < KERNBASE)
3554					continue;
3555				if (pm != kernel_pmap && va > VM_MAXUSER_ADDRESS)
3556					continue;
3557				ptep = pmap_pte_quick(pm, va);
3558				if (pmap_pte_v(ptep))
3559					printf("%x:%x ", va, *ptep);
3560			};
3561
3562}
3563
3564void
3565pmap_pvdump(pa)
3566	vm_offset_t pa;
3567{
3568	pv_entry_t pv;
3569	vm_page_t m;
3570
3571	printf("pa %x", pa);
3572	m = PHYS_TO_VM_PAGE(pa);
3573	for (pv = TAILQ_FIRST(&m->md.pv_list);
3574		pv;
3575		pv = TAILQ_NEXT(pv, pv_list)) {
3576		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3577		pads(pv->pv_pmap);
3578	}
3579	printf(" ");
3580}
3581#endif
3582