pmap.c revision 92770
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 92770 2002-03-20 08:56:31Z alfred $
43 */
44
45/*
46 *	Manages physical address maps.
47 *
48 *	In addition to hardware address maps, this
49 *	module is called upon to provide software-use-only
50 *	maps which may or may not be stored in the same
51 *	form as hardware maps.  These pseudo-maps are
52 *	used to store intermediate results from copy
53 *	operations to and from address spaces.
54 *
55 *	Since the information managed by this module is
56 *	also stored by the logical address mapping module,
57 *	this module may throw away valid virtual-to-physical
58 *	mappings at almost any time.  However, invalidations
59 *	of virtual-to-physical mappings must be done as
60 *	requested.
61 *
62 *	In order to cope with hardware architectures which
63 *	make virtual-to-physical map invalidates expensive,
64 *	this module may delay invalidation or reduced-protection
65 *	operations until such time as they are actually
66 *	necessary.  This module is given full information as
67 *	to which processors are currently using which maps,
68 *	and to when physical maps must be made correct.
69 */
70
71#include "opt_disable_pse.h"
72#include "opt_pmap.h"
73#include "opt_msgbuf.h"
74#include "opt_kstack_pages.h"
75
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/mman.h>
81#include <sys/msgbuf.h>
82#include <sys/mutex.h>
83#include <sys/proc.h>
84#include <sys/sx.h>
85#include <sys/user.h>
86#include <sys/vmmeter.h>
87#include <sys/sysctl.h>
88
89#include <vm/vm.h>
90#include <vm/vm_param.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_page.h>
93#include <vm/vm_map.h>
94#include <vm/vm_object.h>
95#include <vm/vm_extern.h>
96#include <vm/vm_pageout.h>
97#include <vm/vm_pager.h>
98#include <vm/vm_zone.h>
99
100#include <machine/cputypes.h>
101#include <machine/md_var.h>
102#include <machine/specialreg.h>
103#if defined(SMP) || defined(APIC_IO)
104#include <machine/smp.h>
105#include <machine/apic.h>
106#include <machine/segments.h>
107#include <machine/tss.h>
108#endif /* SMP || APIC_IO */
109
110#define PMAP_KEEP_PDIRS
111#ifndef PMAP_SHPGPERPROC
112#define PMAP_SHPGPERPROC 200
113#endif
114
115#if defined(DIAGNOSTIC)
116#define PMAP_DIAGNOSTIC
117#endif
118
119#define MINPV 2048
120
121#if !defined(PMAP_DIAGNOSTIC)
122#define PMAP_INLINE __inline
123#else
124#define PMAP_INLINE
125#endif
126
127/*
128 * Get PDEs and PTEs for user/kernel address space
129 */
130#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
131#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
132
133#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
134#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
135#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
136#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
137#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
138
139#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
140#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
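
/*
 * Illustrative note (not used by the code): with 4K pages an i386
 * linear address decomposes as
 *
 *	pd index = va >> PDRSHIFT			(bits 31..22)
 *	pt index = (va >> PAGE_SHIFT) & (NPTEPG - 1)	(bits 21..12)
 *	offset   = va & PAGE_MASK			(bits 11..0)
 *
 * which is what pmap_pde() above and the i386_btop() callers below
 * rely on.
 */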
141
142/*
143 * Given a map and a machine-independent protection code,
144 * convert to an i386 protection code.
145 */
146#define pte_prot(m, p)	(protection_codes[p])
147static int protection_codes[8];
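
/*
 * Illustrative note: the table is indexed by the 3-bit combination of
 * VM_PROT_READ, VM_PROT_WRITE and VM_PROT_EXECUTE, and is filled in by
 * i386_protection_init() with the PG_* bits to OR into a pte.  i386 page
 * tables cannot express execute permission separately, so effectively
 * only the write permission is reflected in the pte bits.
 */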
148
149static struct pmap kernel_pmap_store;
150pmap_t kernel_pmap;
151LIST_HEAD(pmaplist, pmap);
152struct pmaplist allpmaps;
153
154vm_offset_t avail_start;	/* PA of first available physical page */
155vm_offset_t avail_end;		/* PA of last available physical page */
156vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
157vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
158static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
159static int pgeflag;		/* PG_G or-in */
160static int pseflag;		/* PG_PS or-in */
161
162static vm_object_t kptobj;
163
164static int nkpt;
165vm_offset_t kernel_vm_end;
166
167/*
168 * Data for the pv entry allocation mechanism
169 */
170static vm_zone_t pvzone;
171static struct vm_object pvzone_obj;
172static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
173static int pmap_pagedaemon_waken = 0;
174
175/*
176 * All those kernel PT submaps that BSD is so fond of
177 */
178pt_entry_t *CMAP1 = 0;
179static pt_entry_t *CMAP2, *ptmmap;
180caddr_t CADDR1 = 0, ptvmmap = 0;
181static caddr_t CADDR2;
182static pt_entry_t *msgbufmap;
183struct msgbuf *msgbufp = 0;
184
185/*
186 * Crashdump maps.
187 */
188static pt_entry_t *pt_crashdumpmap;
189static caddr_t crashdumpmap;
190
191#ifdef SMP
192extern pt_entry_t *SMPpt;
193#endif
194static pt_entry_t *PMAP1 = 0;
195static pt_entry_t *PADDR1 = 0;
196
197static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
198static pt_entry_t *get_ptbase(pmap_t pmap);
199static pv_entry_t get_pv_entry(void);
200static void	i386_protection_init(void);
201static __inline void	pmap_changebit(vm_page_t m, int bit, boolean_t setem);
202
203static void	pmap_remove_all(vm_page_t m);
204static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va,
205				      vm_page_t m, vm_page_t mpte);
206static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
207static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
208static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
209					vm_offset_t va);
210static boolean_t pmap_testbit(vm_page_t m, int bit);
211static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
212		vm_page_t mpte, vm_page_t m);
213
214static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
215
216static int pmap_release_free_page(pmap_t pmap, vm_page_t p);
217static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex);
218static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
219static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex);
220static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
221static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
222static void *pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
223
224static pd_entry_t pdir4mb;
225
226/*
227 *	Routine:	pmap_pte
228 *	Function:
229 *		Extract the page table entry associated
230 *		with the given map/virtual_address pair.
231 */
232
233PMAP_INLINE pt_entry_t *
234pmap_pte(pmap, va)
235	register pmap_t pmap;
236	vm_offset_t va;
237{
238	pd_entry_t *pdeaddr;
239
240	if (pmap) {
241		pdeaddr = pmap_pde(pmap, va);
242		if (*pdeaddr & PG_PS)
243			return pdeaddr;
244		if (*pdeaddr) {
245			return get_ptbase(pmap) + i386_btop(va);
246		}
247	}
248	return (0);
249}
250
251/*
252 * Move the kernel virtual free pointer to the next
253 * 4MB.  This is used to help improve performance
254 * by using a large (4MB) page for much of the kernel
255 * (.text, .data, .bss)
256 */
257static vm_offset_t
258pmap_kmem_choose(vm_offset_t addr)
259{
260	vm_offset_t newaddr = addr;
261#ifndef DISABLE_PSE
262	if (cpu_feature & CPUID_PSE) {
263		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
264	}
265#endif
266	return newaddr;
267}
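
/*
 * Example (illustrative numbers): with NBPDR == 4MB, an addr of
 * 0xc0251000 is rounded up to 0xc0400000, so the 4MB kernel page can
 * cover everything below the address that is handed back.
 */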
268
269/*
270 *	Bootstrap the system enough to run with virtual memory.
271 *
272 *	On the i386 this is called after mapping has already been enabled
273 *	and just syncs the pmap module with what has already been done.
274 *	[We can't call it easily with mapping off since the kernel is not
275 *	mapped with PA == VA, hence we would have to relocate every address
276 *	from the linked base (virtual) address "KERNBASE" to the actual
277 *	(physical) address starting relative to 0]
278 */
279void
280pmap_bootstrap(firstaddr, loadaddr)
281	vm_offset_t firstaddr;
282	vm_offset_t loadaddr;
283{
284	vm_offset_t va;
285	pt_entry_t *pte;
286	int i;
287
288	avail_start = firstaddr;
289
290	/*
291	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
292	 * large. It should instead be correctly calculated in locore.s and
293	 * not based on 'first' (which is a physical address, not a virtual
294	 * address, for the start of unused physical memory). The kernel
295	 * page tables are NOT double mapped and thus should not be included
296	 * in this calculation.
297	 */
298	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
299	virtual_avail = pmap_kmem_choose(virtual_avail);
300
301	virtual_end = VM_MAX_KERNEL_ADDRESS;
302
303	/*
304	 * Initialize protection array.
305	 */
306	i386_protection_init();
307
308	/*
309	 * The kernel's pmap is statically allocated so we don't have to use
310	 * pmap_create, which is unlikely to work correctly at this part of
311	 * the boot sequence (XXX and which no longer exists).
312	 */
313	kernel_pmap = &kernel_pmap_store;
314
315	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
316	kernel_pmap->pm_count = 1;
317	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
318	TAILQ_INIT(&kernel_pmap->pm_pvlist);
319	LIST_INIT(&allpmaps);
320	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
321	nkpt = NKPT;
322
323	/*
324	 * Reserve some special page table entries/VA space for temporary
325	 * mapping of pages.
326	 */
327#define	SYSMAP(c, p, v, n)	\
328	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
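
	/*
	 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands
	 * (roughly) to
	 *
	 *	CADDR1 = (caddr_t)va; va += PAGE_SIZE; CMAP1 = pte; pte += 1;
	 *
	 * i.e. it hands out one page of KVA and remembers the pte that
	 * maps it, so the page can later be pointed at arbitrary
	 * physical memory.
	 */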
329
330	va = virtual_avail;
331	pte = (pt_entry_t *) pmap_pte(kernel_pmap, va);
332
333	/*
334	 * CMAP1/CMAP2 are used for zeroing and copying pages.
335	 */
336	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
337	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
338
339	/*
340	 * Crashdump maps.
341	 */
342	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
343
344	/*
345	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
346	 * XXX ptmmap is not used.
347	 */
348	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
349
350	/*
351	 * msgbufp is used to map the system message buffer.
352	 * XXX msgbufmap is not used.
353	 */
354	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
355	       atop(round_page(MSGBUF_SIZE)))
356
357	/*
358	 * ptemap is used for pmap_pte_quick
359	 */
360	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
361
362	virtual_avail = va;
363
364	*CMAP1 = *CMAP2 = 0;
365	for (i = 0; i < NKPT; i++)
366		PTD[i] = 0;
367
368	pgeflag = 0;
369#if !defined(SMP)			/* XXX - see also mp_machdep.c */
370	if (cpu_feature & CPUID_PGE) {
371		pgeflag = PG_G;
372	}
373#endif
374
375/*
376 * Initialize the 4MB page size flag
377 */
378	pseflag = 0;
379/*
380 * The 4MB page version of the initial
381 * kernel page mapping.
382 */
383	pdir4mb = 0;
384
385#if !defined(DISABLE_PSE)
386	if (cpu_feature & CPUID_PSE) {
387		pd_entry_t ptditmp;
388		/*
389		 * Note that we have enabled PSE mode
390		 */
391		pseflag = PG_PS;
392		ptditmp = *(PTmap + i386_btop(KERNBASE));
393		ptditmp &= ~(NBPDR - 1);
394		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
395		pdir4mb = ptditmp;
396
397#if !defined(SMP)
398		/*
399		 * Enable the PSE mode.
400		 */
401		load_cr4(rcr4() | CR4_PSE);
402
403		/*
404		 * We can do the mapping here for the single processor
405		 * case.  We simply ignore the old page table page from
406		 * now on.
407		 */
408		/*
409		 * For SMP, we still need 4K pages to bootstrap APs,
410		 * PSE will be enabled as soon as all APs are up.
411		 */
412		PTD[KPTDI] = (pd_entry_t) ptditmp;
413		kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp;
414		invltlb();
415#endif
416	}
417#endif
418
419#ifdef SMP
420	if (cpu_apic_address == 0)
421		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
422
423	/* local apic is mapped on last page */
424	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
425	    (cpu_apic_address & PG_FRAME));
426#endif
427
428	invltlb();
429}
430
431#ifdef SMP
432/*
433 * Set 4mb pdir for mp startup
434 */
435void
436pmap_set_opt(void)
437{
438	if (pseflag && (cpu_feature & CPUID_PSE)) {
439		load_cr4(rcr4() | CR4_PSE);
440		if (pdir4mb && PCPU_GET(cpuid) == 0) {	/* only on BSP */
441			kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
442			cpu_invltlb();
443		}
444	}
445}
446#endif
447
448void *
449pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
450{
451	*flags = UMA_SLAB_PRIV;
452	return (void *)kmem_alloc(kernel_map, bytes);
453}
454
455/*
456 *	Initialize the pmap module.
457 *	Called by vm_init, to initialize any structures that the pmap
458 *	system needs to map virtual memory.
459 *	pmap_init has been enhanced to support discontiguous physical
460 *	memory in a fairly consistent way.
461 */
462void
463pmap_init(phys_start, phys_end)
464	vm_offset_t phys_start, phys_end;
465{
466	int i;
467	int initial_pvs;
468
469	/*
470	 * object for kernel page table pages
471	 */
472	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
473
474	/*
475	 * Allocate memory for random pmap data structures.  Includes the
476	 * pv_head_table.
477	 */
478
479	for(i = 0; i < vm_page_array_size; i++) {
480		vm_page_t m;
481
482		m = &vm_page_array[i];
483		TAILQ_INIT(&m->md.pv_list);
484		m->md.pv_list_count = 0;
485	}
486
487	/*
488	 * init the pv free list
489	 */
490	initial_pvs = vm_page_array_size;
491	if (initial_pvs < MINPV)
492		initial_pvs = MINPV;
493#if 0
494	pvzone = &pvzone_store;
495	pvinit = (struct pv_entry *) kmem_alloc(kernel_map,
496		initial_pvs * sizeof (struct pv_entry));
497	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
498	    vm_page_array_size);
499#endif
500	pvzone = zinit("PV ENTRY", sizeof (struct pv_entry), 0, 0, 0);
501	uma_zone_set_allocf(pvzone, pmap_allocf);
502	uma_prealloc(pvzone, initial_pvs);
503
504	/*
505	 * Now it is safe to enable pv_table recording.
506	 */
507	pmap_initialized = TRUE;
508}
509
510/*
511 * Initialize the address space (zone) for the pv_entries.  Set a
512 * high water mark so that the system can recover from excessive
513 * numbers of pv entries.
514 */
515void
516pmap_init2()
517{
518	int shpgperproc = PMAP_SHPGPERPROC;
519
520	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
521	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
522	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
523	pv_entry_high_water = 9 * (pv_entry_max / 10);
524#if 0
525	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
526#endif
527	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
528}
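
/*
 * Worked example (illustrative numbers only): with the default
 * PMAP_SHPGPERPROC of 200, a maxproc of 512 and a vm_page_array_size of
 * 65536 pages, pv_entry_max is 200 * 512 + 65536 = 167936 and
 * pv_entry_high_water is 9 * (167936 / 10) = 151137, the point at which
 * get_pv_entry() starts waking the pagedaemon.
 */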
529
530
531/***************************************************
532 * Low level helper routines.....
533 ***************************************************/
534
535#if defined(PMAP_DIAGNOSTIC)
536
537/*
538 * This code checks for non-writeable/modified pages.
539 * This should be an invalid condition.
540 */
541static int
542pmap_nw_modified(pt_entry_t ptea)
543{
544	int pte;
545
546	pte = (int) ptea;
547
548	if ((pte & (PG_M|PG_RW)) == PG_M)
549		return 1;
550	else
551		return 0;
552}
553#endif
554
555
556/*
557 * Determine whether the modified bit should be tracked for the given
558 * va: mappings inside the kernel clean submap are not tracked.
559 */
560static PMAP_INLINE int
561pmap_track_modified(vm_offset_t va)
562{
563	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
564		return 1;
565	else
566		return 0;
567}
568
569static PMAP_INLINE void
570invltlb_1pg(vm_offset_t va)
571{
572#ifdef I386_CPU
573	invltlb();
574#else
575	invlpg(va);
576#endif
577}
578
579static __inline void
580pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
581{
582#if defined(SMP)
583	if (pmap->pm_active & PCPU_GET(cpumask))
584		cpu_invlpg((void *)va);
585	if (pmap->pm_active & PCPU_GET(other_cpus))
586		smp_invltlb();
587#else
588	if (pmap->pm_active)
589		invltlb_1pg(va);
590#endif
591}
592
593static __inline void
594pmap_invalidate_all(pmap_t pmap)
595{
596#if defined(SMP)
597	if (pmap->pm_active & PCPU_GET(cpumask))
598		cpu_invltlb();
599	if (pmap->pm_active & PCPU_GET(other_cpus))
600		smp_invltlb();
601#else
602	if (pmap->pm_active)
603		invltlb();
604#endif
605}
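
/*
 * Illustrative note on the SMP cases above: pm_active is a bit mask of
 * the CPUs currently using the pmap, so a pmap active only on this CPU
 * gets a cheap local invalidation while one active elsewhere forces a
 * cross-CPU shootdown via smp_invltlb().
 */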
606
607/*
608 * Return an address which is the base of the Virtual mapping of
609 * all the PTEs for the given pmap. Note this doesn't say that
610 * all the PTEs will be present or that the pages there are valid.
611 * The PTEs are made available by the recursive mapping trick.
612 * It will map in the alternate PTE space if needed.
613 */
614static pt_entry_t *
615get_ptbase(pmap)
616	pmap_t pmap;
617{
618	pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
619
620	/* are we current address space or kernel? */
621	if (pmap == kernel_pmap || frame == (PTDpde & PG_FRAME))
622		return PTmap;
623	/* otherwise, we are alternate address space */
624	if (frame != (APTDpde & PG_FRAME)) {
625		APTDpde = (pd_entry_t) (frame | PG_RW | PG_V);
626#if defined(SMP)
627		/* The page directory is not shared between CPUs */
628		cpu_invltlb();
629#else
630		invltlb();
631#endif
632	}
633	return APTmap;
634}
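
/*
 * Illustrative sketch of what the recursive mapping buys a caller: once
 * get_ptbase() has selected PTmap or APTmap, the pte for any va and the
 * physical address behind it are simply
 *
 *	pt_entry_t *pte = get_ptbase(pmap) + i386_btop(va);
 *	vm_offset_t pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 *
 * which is exactly the 4K-page path taken by pmap_extract() below.
 */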
635
636/*
637 * Super fast pmap_pte routine best used when scanning
638 * the pv lists.  This eliminates many coarse-grained
639 * invltlb calls.  Note that many of the pv list
640 * scans are across different pmaps.  It is very wasteful
641 * to do an entire invltlb for checking a single mapping.
642 */
643
644static pt_entry_t *
645pmap_pte_quick(pmap, va)
646	register pmap_t pmap;
647	vm_offset_t va;
648{
649	pd_entry_t pde, newpf;
650	pde = pmap->pm_pdir[va >> PDRSHIFT];
651	if (pde != 0) {
652		pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
653		unsigned index = i386_btop(va);
654		/* are we current address space or kernel? */
655		if (pmap == kernel_pmap || frame == (PTDpde & PG_FRAME))
656			return PTmap + index;
657		newpf = pde & PG_FRAME;
658		if (((*PMAP1) & PG_FRAME) != newpf) {
659			*PMAP1 = newpf | PG_RW | PG_V;
660			invltlb_1pg((vm_offset_t) PADDR1);
661		}
662		return PADDR1 + (index & (NPTEPG - 1));
663	}
664	return (0);
665}
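
/*
 * In other words, PMAP1/PADDR1 (set up in pmap_bootstrap) form a
 * one-page window: the pte mapping PADDR1 is redirected at the foreign
 * pmap's page table page, so only a single 4K TLB entry needs to be
 * invalidated instead of switching to the alternate PTE space.
 */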
666
667/*
668 *	Routine:	pmap_extract
669 *	Function:
670 *		Extract the physical page address associated
671 *		with the given map/virtual_address pair.
672 */
673vm_offset_t
674pmap_extract(pmap, va)
675	register pmap_t pmap;
676	vm_offset_t va;
677{
678	vm_offset_t rtval;	/* XXX FIXME */
679	vm_offset_t pdirindex;
680
681	if (pmap == 0)
682		return 0;
683	pdirindex = va >> PDRSHIFT;
684	rtval = pmap->pm_pdir[pdirindex];
685	if (rtval != 0) {
686		pt_entry_t *pte;
687		if ((rtval & PG_PS) != 0) {
688			rtval &= ~(NBPDR - 1);
689			rtval |= va & (NBPDR - 1);
690			return rtval;
691		}
692		pte = get_ptbase(pmap) + i386_btop(va);
693		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
694		return rtval;
695	}
696	return 0;
697
698}
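
/*
 * Example (illustrative values): for a 4MB mapping the low 22 bits of
 * the va are the offset, so a pde of 0x00400fe3 (frame 0x00400000) and
 * a va of 0xc0123456 yield a physical address of 0x00523456.
 */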
699
700/***************************************************
701 * Low level mapping routines.....
702 ***************************************************/
703
704/*
705 * add a wired page to the kva
706 * note that in order for the mapping to take effect -- you
707 * should do an invltlb after doing the pmap_kenter...
708 */
709PMAP_INLINE void
710pmap_kenter(vm_offset_t va, vm_offset_t pa)
711{
712	pt_entry_t *pte;
713	pt_entry_t npte, opte;
714
715	npte = pa | PG_RW | PG_V | pgeflag;
716	pte = vtopte(va);
717	opte = *pte;
718	*pte = npte;
719	invltlb_1pg(va);
720}
721
722/*
723 * remove a page from the kernel pagetables
724 */
725PMAP_INLINE void
726pmap_kremove(vm_offset_t va)
727{
728	register pt_entry_t *pte;
729
730	pte = vtopte(va);
731	*pte = 0;
732	invltlb_1pg(va);
733}
734
735/*
736 *	Used to map a range of physical addresses into kernel
737 *	virtual address space.
738 *
739 *	The value passed in '*virt' is a suggested virtual address for
740 *	the mapping. Architectures which can support a direct-mapped
741 *	physical to virtual region can return the appropriate address
742 *	within that region, leaving '*virt' unchanged. Other
743 *	architectures should map the pages starting at '*virt' and
744 *	update '*virt' with the first usable address after the mapped
745 *	region.
746 */
747vm_offset_t
748pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
749{
750	vm_offset_t sva = *virt;
751	vm_offset_t va = sva;
752	while (start < end) {
753		pmap_kenter(va, start);
754		va += PAGE_SIZE;
755		start += PAGE_SIZE;
756	}
757	*virt = va;
758	return (sva);
759}
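
/*
 * Usage sketch (hypothetical caller and names): to map a device's
 * physical range into KVA one could do
 *
 *	vm_offset_t va = kva_hint;
 *	pmap_map(&va, fb_start_pa, fb_end_pa, VM_PROT_READ | VM_PROT_WRITE);
 *
 * On i386 the pages are entered with pmap_kenter() starting at *virt,
 * *virt is advanced past the mapped region, and the prot argument is
 * not consulted here.
 */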
760
761
762/*
763 * Add a list of wired pages to the kva.
764 * This routine is only used for temporary
765 * kernel mappings that do not need to have
766 * page modification or references recorded.
767 * Note that old mappings are simply written
768 * over.  The page *must* be wired.
769 */
770void
771pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
772{
773	vm_offset_t end_va;
774
775	end_va = va + count * PAGE_SIZE;
776
777	while (va < end_va) {
778		pt_entry_t *pte;
779
780		pte = vtopte(va);
781		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
782#ifdef SMP
783		cpu_invlpg((void *)va);
784#else
785		invltlb_1pg(va);
786#endif
787		va += PAGE_SIZE;
788		m++;
789	}
790#ifdef SMP
791	smp_invltlb();
792#endif
793}
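
/*
 * A typical (illustrative) pairing: map a scatter list of vm_page_t's
 * as a temporary contiguous kernel buffer with pmap_qenter(va, m, count),
 * use the buffer, and then tear the mappings down again with
 * pmap_qremove(va, count).
 */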
794
795/*
796 * this routine jerks page mappings from the
797 * kernel -- it is meant only for temporary mappings.
798 */
799void
800pmap_qremove(vm_offset_t va, int count)
801{
802	vm_offset_t end_va;
803
804	end_va = va + count*PAGE_SIZE;
805
806	while (va < end_va) {
807		pt_entry_t *pte;
808
809		pte = vtopte(va);
810		*pte = 0;
811#ifdef SMP
812		cpu_invlpg((void *)va);
813#else
814		invltlb_1pg(va);
815#endif
816		va += PAGE_SIZE;
817	}
818#ifdef SMP
819	smp_invltlb();
820#endif
821}
822
823static vm_page_t
824pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
825{
826	vm_page_t m;
827retry:
828	m = vm_page_lookup(object, pindex);
829	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
830		goto retry;
831	return m;
832}
833
834/*
835 * Create the U area for a new process.
836 * This routine directly affects the fork performance of a process.
837 */
838void
839pmap_new_proc(struct proc *p)
840{
841#ifdef I386_CPU
842	int updateneeded = 0;
843#endif
844	int i;
845	vm_object_t upobj;
846	vm_offset_t up;
847	vm_page_t m;
848	pt_entry_t *ptek, oldpte;
849
850	/*
851	 * allocate object for the upages
852	 */
853	upobj = p->p_upages_obj;
854	if (upobj == NULL) {
855		upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
856		p->p_upages_obj = upobj;
857	}
858
859	/* get a kernel virtual address for the U area for this thread */
860	up = (vm_offset_t)p->p_uarea;
861	if (up == 0) {
862		up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
863		if (up == 0)
864			panic("pmap_new_proc: upage allocation failed");
865		p->p_uarea = (struct user *)up;
866	}
867
868	ptek = vtopte(up);
869
870	for (i = 0; i < UAREA_PAGES; i++) {
871		/*
872		 * Get a kernel stack page
873		 */
874		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
875
876		/*
877		 * Wire the page
878		 */
879		m->wire_count++;
880		cnt.v_wire_count++;
881
882		oldpte = *(ptek + i);
883		/*
884		 * Enter the page into the kernel address space.
885		 */
886		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
887		if (oldpte) {
888#ifdef I386_CPU
889			updateneeded = 1;
890#else
891			invlpg(up + i * PAGE_SIZE);
892#endif
893		}
894
895		vm_page_wakeup(m);
896		vm_page_flag_clear(m, PG_ZERO);
897		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
898		m->valid = VM_PAGE_BITS_ALL;
899	}
900#ifdef I386_CPU
901	if (updateneeded)
902		invltlb();
903#endif
904}
905
906/*
907 * Dispose of the U area for a process that has exited.
908 * This routine directly impacts the exit performance of a process.
909 */
910void
911pmap_dispose_proc(p)
912	struct proc *p;
913{
914	int i;
915	vm_object_t upobj;
916	vm_offset_t up;
917	vm_page_t m;
918	pt_entry_t *ptek, oldpte;
919
920	upobj = p->p_upages_obj;
921	up = (vm_offset_t)p->p_uarea;
922	ptek = vtopte(up);
923	for (i = 0; i < UAREA_PAGES; i++) {
924		m = vm_page_lookup(upobj, i);
925		if (m == NULL)
926			panic("pmap_dispose_proc: upage already missing?");
927		vm_page_busy(m);
928		oldpte = *(ptek + i);
929		*(ptek + i) = 0;
930#ifndef I386_CPU
931		invlpg(up + i * PAGE_SIZE);
932#endif
933		vm_page_unwire(m, 0);
934		vm_page_free(m);
935	}
936#ifdef I386_CPU
937	invltlb();
938#endif
939
940	/*
941	 * If the process got swapped out some of its UPAGES might have gotten
942	 * swapped.  Just get rid of the object to clean up the swap use
943	 * proactively.  NOTE! might block waiting for paging I/O to complete.
944	 */
945	if (upobj->type == OBJT_SWAP) {
946		p->p_upages_obj = NULL;
947		vm_object_deallocate(upobj);
948	}
949}
950
951/*
952 * Allow the U area for a process to be prejudicially paged out.
953 */
954void
955pmap_swapout_proc(p)
956	struct proc *p;
957{
958	int i;
959	vm_object_t upobj;
960	vm_offset_t up;
961	vm_page_t m;
962
963	upobj = p->p_upages_obj;
964	up = (vm_offset_t)p->p_uarea;
965	for (i = 0; i < UAREA_PAGES; i++) {
966		m = vm_page_lookup(upobj, i);
967		if (m == NULL)
968			panic("pmap_swapout_proc: upage already missing?");
969		vm_page_dirty(m);
970		vm_page_unwire(m, 0);
971		pmap_kremove(up + i * PAGE_SIZE);
972	}
973}
974
975/*
976 * Bring the U-Area for a specified process back in.
977 */
978void
979pmap_swapin_proc(p)
980	struct proc *p;
981{
982	int i, rv;
983	vm_object_t upobj;
984	vm_offset_t up;
985	vm_page_t m;
986
987	upobj = p->p_upages_obj;
988	up = (vm_offset_t)p->p_uarea;
989	for (i = 0; i < UAREA_PAGES; i++) {
990		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
991		pmap_kenter(up + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
992		if (m->valid != VM_PAGE_BITS_ALL) {
993			rv = vm_pager_get_pages(upobj, &m, 1, 0);
994			if (rv != VM_PAGER_OK)
995				panic("pmap_swapin_proc: cannot get upage for proc: %d\n", p->p_pid);
996			m = vm_page_lookup(upobj, i);
997			m->valid = VM_PAGE_BITS_ALL;
998		}
999		vm_page_wire(m);
1000		vm_page_wakeup(m);
1001		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1002	}
1003}
1004
1005/*
1006 * Create the kernel stack (including pcb for i386) for a new thread.
1007 * This routine directly affects the fork performance of a process and
1008 * the creation performance of a thread.
1009 */
1010void
1011pmap_new_thread(struct thread *td)
1012{
1013#ifdef I386_CPU
1014	int updateneeded = 0;
1015#endif
1016	int i;
1017	vm_object_t ksobj;
1018	vm_page_t m;
1019	vm_offset_t ks;
1020	pt_entry_t *ptek, oldpte;
1021
1022	/*
1023	 * allocate object for the kstack
1024	 */
1025	ksobj = td->td_kstack_obj;
1026	if (ksobj == NULL) {
1027		ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES);
1028		td->td_kstack_obj = ksobj;
1029	}
1030
1031#ifdef KSTACK_GUARD
1032	/* get a kernel virtual address for the kstack for this thread */
1033	ks = td->td_kstack;
1034	if (ks == 0) {
1035		ks = kmem_alloc_nofault(kernel_map,
1036		    (KSTACK_PAGES + 1) * PAGE_SIZE);
1037		if (ks == 0)
1038			panic("pmap_new_thread: kstack allocation failed");
1039		ks += PAGE_SIZE;
1040		td->td_kstack = ks;
1041	}
1042
1043	ptek = vtopte(ks - PAGE_SIZE);
1044	oldpte = *ptek;
1045	*ptek = 0;
1046	if (oldpte) {
1047#ifdef I386_CPU
1048		updateneeded = 1;
1049#else
1050		invlpg(ks - PAGE_SIZE);
1051#endif
1052	}
1053	ptek++;
1054#else
1055	/* get a kernel virtual address for the kstack for this thread */
1056	ks = td->td_kstack;
1057	if (ks == 0) {
1058		ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE);
1059		if (ks == 0)
1060			panic("pmap_new_thread: kstack allocation failed");
1061		td->td_kstack = ks;
1062	}
1063	ptek = vtopte(ks);
1064#endif
1065	for (i = 0; i < KSTACK_PAGES; i++) {
1066		/*
1067		 * Get a kernel stack page
1068		 */
1069		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1070
1071		/*
1072		 * Wire the page
1073		 */
1074		m->wire_count++;
1075		cnt.v_wire_count++;
1076
1077		oldpte = *(ptek + i);
1078		/*
1079		 * Enter the page into the kernel address space.
1080		 */
1081		*(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag;
1082		if (oldpte) {
1083#ifdef I386_CPU
1084			updateneeded = 1;
1085#else
1086			invlpg(ks + i * PAGE_SIZE);
1087#endif
1088		}
1089
1090		vm_page_wakeup(m);
1091		vm_page_flag_clear(m, PG_ZERO);
1092		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1093		m->valid = VM_PAGE_BITS_ALL;
1094	}
1095#ifdef I386_CPU
1096	if (updateneeded)
1097		invltlb();
1098#endif
1099}
1100
1101/*
1102 * Dispose of the kernel stack for a thread that has exited.
1103 * This routine directly impacts the exit performance of a process and thread.
1104 */
1105void
1106pmap_dispose_thread(td)
1107	struct thread *td;
1108{
1109	int i;
1110	vm_object_t ksobj;
1111	vm_offset_t ks;
1112	vm_page_t m;
1113	pt_entry_t *ptek, oldpte;
1114
1115	ksobj = td->td_kstack_obj;
1116	ks = td->td_kstack;
1117	ptek = vtopte(ks);
1118	for (i = 0; i < KSTACK_PAGES; i++) {
1119		m = vm_page_lookup(ksobj, i);
1120		if (m == NULL)
1121			panic("pmap_dispose_thread: kstack already missing?");
1122		vm_page_busy(m);
1123		oldpte = *(ptek + i);
1124		*(ptek + i) = 0;
1125#ifndef I386_CPU
1126		invlpg(ks + i * PAGE_SIZE);
1127#endif
1128		vm_page_unwire(m, 0);
1129		vm_page_free(m);
1130	}
1131#ifdef I386_CPU
1132	invltlb();
1133#endif
1134
1135	/*
1136	 * If the thread got swapped out some of its KSTACK might have gotten
1137	 * swapped.  Just get rid of the object to clean up the swap use
1138	 * proactively.  NOTE! might block waiting for paging I/O to complete.
1139	 */
1140	if (ksobj->type == OBJT_SWAP) {
1141		td->td_kstack_obj = NULL;
1142		vm_object_deallocate(ksobj);
1143	}
1144}
1145
1146/*
1147 * Allow the Kernel stack for a thread to be prejudicially paged out.
1148 */
1149void
1150pmap_swapout_thread(td)
1151	struct thread *td;
1152{
1153	int i;
1154	vm_object_t ksobj;
1155	vm_offset_t ks;
1156	vm_page_t m;
1157
1158	ksobj = td->td_kstack_obj;
1159	ks = td->td_kstack;
1160	for (i = 0; i < KSTACK_PAGES; i++) {
1161		m = vm_page_lookup(ksobj, i);
1162		if (m == NULL)
1163			panic("pmap_swapout_thread: kstack already missing?");
1164		vm_page_dirty(m);
1165		vm_page_unwire(m, 0);
1166		pmap_kremove(ks + i * PAGE_SIZE);
1167	}
1168}
1169
1170/*
1171 * Bring the kernel stack for a specified thread back in.
1172 */
1173void
1174pmap_swapin_thread(td)
1175	struct thread *td;
1176{
1177	int i, rv;
1178	vm_object_t ksobj;
1179	vm_offset_t ks;
1180	vm_page_t m;
1181
1182	ksobj = td->td_kstack_obj;
1183	ks = td->td_kstack;
1184	for (i = 0; i < KSTACK_PAGES; i++) {
1185		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1186		pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m));
1187		if (m->valid != VM_PAGE_BITS_ALL) {
1188			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1189			if (rv != VM_PAGER_OK)
1190				panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1191			m = vm_page_lookup(ksobj, i);
1192			m->valid = VM_PAGE_BITS_ALL;
1193		}
1194		vm_page_wire(m);
1195		vm_page_wakeup(m);
1196		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1197	}
1198}
1199
1200/***************************************************
1201 * Page table page management routines.....
1202 ***************************************************/
1203
1204/*
1205 * This routine unholds page table pages, and if the hold count
1206 * drops to zero, then it decrements the wire count.
1207 */
1208static int
1209_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1210{
1211
1212	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1213		;
1214
1215	if (m->hold_count == 0) {
1216		vm_offset_t pteva;
1217		/*
1218		 * unmap the page table page
1219		 */
1220		pmap->pm_pdir[m->pindex] = 0;
1221		--pmap->pm_stats.resident_count;
1222		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1223		    (PTDpde & PG_FRAME)) {
1224			/*
1225			 * Do a invltlb to make the invalidated mapping
1226			 * take effect immediately.
1227			 */
1228			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1229			pmap_invalidate_page(pmap, pteva);
1230		}
1231
1232		if (pmap->pm_ptphint == m)
1233			pmap->pm_ptphint = NULL;
1234
1235		/*
1236		 * If the page is finally unwired, simply free it.
1237		 */
1238		--m->wire_count;
1239		if (m->wire_count == 0) {
1240
1241			vm_page_flash(m);
1242			vm_page_busy(m);
1243			vm_page_free_zero(m);
1244			--cnt.v_wire_count;
1245		}
1246		return 1;
1247	}
1248	return 0;
1249}
1250
1251static PMAP_INLINE int
1252pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1253{
1254	vm_page_unhold(m);
1255	if (m->hold_count == 0)
1256		return _pmap_unwire_pte_hold(pmap, m);
1257	else
1258		return 0;
1259}
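
/*
 * Illustrative note: a page table page's hold_count tracks the number of
 * mappings entered through it, while wire_count keeps the page resident;
 * once the last mapping is gone the page is unmapped from the page
 * directory and, when unwired, freed pre-zeroed via vm_page_free_zero().
 */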
1260
1261/*
1262 * After removing a page table entry, this routine is used to
1263 * conditionally free the page, and manage the hold/wire counts.
1264 */
1265static int
1266pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1267{
1268	unsigned ptepindex;
1269	if (va >= VM_MAXUSER_ADDRESS)
1270		return 0;
1271
1272	if (mpte == NULL) {
1273		ptepindex = (va >> PDRSHIFT);
1274		if (pmap->pm_ptphint &&
1275			(pmap->pm_ptphint->pindex == ptepindex)) {
1276			mpte = pmap->pm_ptphint;
1277		} else {
1278			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1279			pmap->pm_ptphint = mpte;
1280		}
1281	}
1282
1283	return pmap_unwire_pte_hold(pmap, mpte);
1284}
1285
1286void
1287pmap_pinit0(pmap)
1288	struct pmap *pmap;
1289{
1290	pmap->pm_pdir =
1291		(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1292	pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD);
1293	pmap->pm_count = 1;
1294	pmap->pm_ptphint = NULL;
1295	pmap->pm_active = 0;
1296	TAILQ_INIT(&pmap->pm_pvlist);
1297	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1298	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1299}
1300
1301/*
1302 * Initialize a preallocated and zeroed pmap structure,
1303 * such as one in a vmspace structure.
1304 */
1305void
1306pmap_pinit(pmap)
1307	register struct pmap *pmap;
1308{
1309	vm_page_t ptdpg;
1310
1311	/*
1312	 * No need to allocate page table space yet but we do need a valid
1313	 * page directory table.
1314	 */
1315	if (pmap->pm_pdir == NULL)
1316		pmap->pm_pdir =
1317			(pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE);
1318
1319	/*
1320	 * allocate object for the ptes
1321	 */
1322	if (pmap->pm_pteobj == NULL)
1323		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1324
1325	/*
1326	 * allocate the page directory page
1327	 */
1328	ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
1329			VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1330
1331	ptdpg->wire_count = 1;
1332	++cnt.v_wire_count;
1333
1334
1335	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/
1336	ptdpg->valid = VM_PAGE_BITS_ALL;
1337
1338	pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1339	if ((ptdpg->flags & PG_ZERO) == 0)
1340		bzero(pmap->pm_pdir, PAGE_SIZE);
1341
1342	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1343	/* Wire in kernel global address entries. */
1344	/* XXX copies current process, does not fill in MPPTDI */
1345	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1346#ifdef SMP
1347	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1348#endif
1349
1350	/* install self-referential address mapping entry */
1351	pmap->pm_pdir[PTDPTDI] =
1352		VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1353
1354	pmap->pm_count = 1;
1355	pmap->pm_active = 0;
1356	pmap->pm_ptphint = NULL;
1357	TAILQ_INIT(&pmap->pm_pvlist);
1358	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1359}
1360
1361/*
1362 * Wire in kernel global address entries.  To avoid a race condition
1363 * between pmap initialization and pmap_growkernel, this procedure
1364 * should be called after the vmspace is attached to the process
1365 * but before this pmap is activated.
1366 */
1367void
1368pmap_pinit2(pmap)
1369	struct pmap *pmap;
1370{
1371	/* XXX: Remove this stub when no longer called */
1372}
1373
1374static int
1375pmap_release_free_page(pmap_t pmap, vm_page_t p)
1376{
1377	pd_entry_t *pde = pmap->pm_pdir;
1378	/*
1379	 * This code optimizes the case of freeing non-busy
1380	 * page-table pages.  Those pages are zero now, and
1381	 * might as well be placed directly into the zero queue.
1382	 */
1383	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1384		return 0;
1385
1386	vm_page_busy(p);
1387
1388	/*
1389	 * Remove the page table page from the process's address space.
1390	 */
1391	pde[p->pindex] = 0;
1392	pmap->pm_stats.resident_count--;
1393
1394	if (p->hold_count)  {
1395		panic("pmap_release: freeing held page table page");
1396	}
1397	/*
1398	 * Page directory pages need to have the kernel
1399	 * stuff cleared, so they can go into the zero queue also.
1400	 */
1401	if (p->pindex == PTDPTDI) {
1402		bzero(pde + KPTDI, nkpt * PTESIZE);
1403#ifdef SMP
1404		pde[MPPTDI] = 0;
1405#endif
1406		pde[APTDPTDI] = 0;
1407		pmap_kremove((vm_offset_t) pmap->pm_pdir);
1408	}
1409
1410	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1411		pmap->pm_ptphint = NULL;
1412
1413	p->wire_count--;
1414	cnt.v_wire_count--;
1415	vm_page_free_zero(p);
1416	return 1;
1417}
1418
1419/*
1420 * This routine is called if the required page table page is not
1421 * mapped; it allocates and maps a new one.
1422 */
1423static vm_page_t
1424_pmap_allocpte(pmap, ptepindex)
1425	pmap_t	pmap;
1426	unsigned ptepindex;
1427{
1428	vm_offset_t pteva, ptepa;	/* XXXPA */
1429	vm_page_t m;
1430
1431	/*
1432	 * Find or fabricate a new pagetable page
1433	 */
1434	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1435			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1436
1437	KASSERT(m->queue == PQ_NONE,
1438		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1439
1440	if (m->wire_count == 0)
1441		cnt.v_wire_count++;
1442	m->wire_count++;
1443
1444	/*
1445	 * Increment the hold count for the page table page
1446	 * (denoting a new mapping.)
1447	 */
1448	m->hold_count++;
1449
1450	/*
1451	 * Map the pagetable page into the process address space, if
1452	 * it isn't already there.
1453	 */
1454
1455	pmap->pm_stats.resident_count++;
1456
1457	ptepa = VM_PAGE_TO_PHYS(m);
1458	pmap->pm_pdir[ptepindex] =
1459		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1460
1461	/*
1462	 * Set the page table hint
1463	 */
1464	pmap->pm_ptphint = m;
1465
1466	/*
1467	 * Try to use the new mapping, but if we cannot, then
1468	 * do it with the routine that maps the page explicitly.
1469	 */
1470	if ((m->flags & PG_ZERO) == 0) {
1471		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1472		    (PTDpde & PG_FRAME)) {
1473			pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex);
1474			bzero((caddr_t) pteva, PAGE_SIZE);
1475		} else {
1476			pmap_zero_page(ptepa);
1477		}
1478	}
1479
1480	m->valid = VM_PAGE_BITS_ALL;
1481	vm_page_flag_clear(m, PG_ZERO);
1482	vm_page_flag_set(m, PG_MAPPED);
1483	vm_page_wakeup(m);
1484
1485	return m;
1486}
1487
1488static vm_page_t
1489pmap_allocpte(pmap_t pmap, vm_offset_t va)
1490{
1491	unsigned ptepindex;
1492	pd_entry_t ptepa;
1493	vm_page_t m;
1494
1495	/*
1496	 * Calculate pagetable page index
1497	 */
1498	ptepindex = va >> PDRSHIFT;
1499
1500	/*
1501	 * Get the page directory entry
1502	 */
1503	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1504
1505	/*
1506	 * This supports switching from a 4MB page to a
1507	 * normal 4K page.
1508	 */
1509	if (ptepa & PG_PS) {
1510		pmap->pm_pdir[ptepindex] = 0;
1511		ptepa = 0;
1512		invltlb();
1513	}
1514
1515	/*
1516	 * If the page table page is mapped, we just increment the
1517	 * hold count, and activate it.
1518	 */
1519	if (ptepa) {
1520		/*
1521		 * In order to get the page table page, try the
1522		 * hint first.
1523		 */
1524		if (pmap->pm_ptphint &&
1525			(pmap->pm_ptphint->pindex == ptepindex)) {
1526			m = pmap->pm_ptphint;
1527		} else {
1528			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1529			pmap->pm_ptphint = m;
1530		}
1531		m->hold_count++;
1532		return m;
1533	}
1534	/*
1535	 * Here if the pte page isn't mapped, or if it has been deallocated.
1536	 */
1537	return _pmap_allocpte(pmap, ptepindex);
1538}
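
/*
 * Note (illustrative): ptepindex is simply va >> PDRSHIFT, so each page
 * table page is named by the 4MB slice of address space it maps; e.g. a
 * va of 0x0804a000 gives ptepindex 0x20, the 33rd page directory slot.
 */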
1539
1540
1541/***************************************************
1542 * Pmap allocation/deallocation routines.
1543 ***************************************************/
1544
1545/*
1546 * Release any resources held by the given physical map.
1547 * Called when a pmap initialized by pmap_pinit is being released.
1548 * Should only be called if the map contains no valid mappings.
1549 */
1550void
1551pmap_release(pmap_t pmap)
1552{
1553	vm_page_t p,n,ptdpg;
1554	vm_object_t object = pmap->pm_pteobj;
1555	int curgeneration;
1556
1557#if defined(DIAGNOSTIC)
1558	if (object->ref_count != 1)
1559		panic("pmap_release: pteobj reference count != 1");
1560#endif
1561
1562	ptdpg = NULL;
1563	LIST_REMOVE(pmap, pm_list);
1564retry:
1565	curgeneration = object->generation;
1566	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
1567		n = TAILQ_NEXT(p, listq);
1568		if (p->pindex == PTDPTDI) {
1569			ptdpg = p;
1570			continue;
1571		}
1572		while (1) {
1573			if (!pmap_release_free_page(pmap, p) &&
1574				(object->generation != curgeneration))
1575				goto retry;
1576		}
1577	}
1578
1579	if (ptdpg && !pmap_release_free_page(pmap, ptdpg))
1580		goto retry;
1581}
1582
1583static int
1584kvm_size(SYSCTL_HANDLER_ARGS)
1585{
1586	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1587
1588	return sysctl_handle_long(oidp, &ksize, 0, req);
1589}
1590SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1591    0, 0, kvm_size, "IU", "Size of KVM");
1592
1593static int
1594kvm_free(SYSCTL_HANDLER_ARGS)
1595{
1596	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1597
1598	return sysctl_handle_long(oidp, &kfree, 0, req);
1599}
1600SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1601    0, 0, kvm_free, "IU", "Amount of KVM free");
1602
1603/*
1604 * grow the number of kernel page table entries, if needed
1605 */
1606void
1607pmap_growkernel(vm_offset_t addr)
1608{
1609	struct pmap *pmap;
1610	int s;
1611	vm_offset_t ptppaddr;
1612	vm_page_t nkpg;
1613	pd_entry_t newpdir;
1614
1615	s = splhigh();
1616	if (kernel_vm_end == 0) {
1617		kernel_vm_end = KERNBASE;
1618		nkpt = 0;
1619		while (pdir_pde(PTD, kernel_vm_end)) {
1620			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1621			nkpt++;
1622		}
1623	}
1624	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1625	while (kernel_vm_end < addr) {
1626		if (pdir_pde(PTD, kernel_vm_end)) {
1627			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1628			continue;
1629		}
1630
1631		/*
1632		 * This index is bogus, but out of the way
1633		 */
1634		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
1635		if (!nkpg)
1636			panic("pmap_growkernel: no memory to grow kernel");
1637
1638		nkpt++;
1639
1640		vm_page_wire(nkpg);
1641		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1642		pmap_zero_page(ptppaddr);
1643		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1644		pdir_pde(PTD, kernel_vm_end) = newpdir;
1645
1646		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1647			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1648		}
1649		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1650	}
1651	splx(s);
1652}
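
/*
 * Illustrative note: each iteration above adds one page table page and
 * therefore NPTEPG * PAGE_SIZE (4MB) of kernel address space, and the
 * new pde is copied into every pmap on the allpmaps list so that all
 * address spaces see the grown kernel map.
 */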
1653
1654/*
1655 *	Retire the given physical map from service.
1656 *	Should only be called if the map contains
1657 *	no valid mappings.
1658 */
1659void
1660pmap_destroy(pmap_t pmap)
1661{
1662	int count;
1663
1664	if (pmap == NULL)
1665		return;
1666
1667	count = --pmap->pm_count;
1668	if (count == 0) {
1669		pmap_release(pmap);
1670		panic("destroying a pmap is not yet implemented");
1671	}
1672}
1673
1674/*
1675 *	Add a reference to the specified pmap.
1676 */
1677void
1678pmap_reference(pmap_t pmap)
1679{
1680	if (pmap != NULL) {
1681		pmap->pm_count++;
1682	}
1683}
1684
1685/***************************************************
1686 * Page management routines.
1687 ***************************************************/
1688
1689/*
1690 * free the pv_entry back to the free list
1691 */
1692static PMAP_INLINE void
1693free_pv_entry(pv_entry_t pv)
1694{
1695	pv_entry_count--;
1696	zfree(pvzone, pv);
1697}
1698
1699/*
1700 * get a new pv_entry, allocating a block from the system
1701 * when needed.
1702 * the memory allocation is performed bypassing the malloc code
1703 * because of the possibility of allocations at interrupt time.
1704 */
1705static pv_entry_t
1706get_pv_entry(void)
1707{
1708	pv_entry_count++;
1709	if (pv_entry_high_water &&
1710		(pv_entry_count > pv_entry_high_water) &&
1711		(pmap_pagedaemon_waken == 0)) {
1712		pmap_pagedaemon_waken = 1;
1713		wakeup (&vm_pages_needed);
1714	}
1715	return zalloc(pvzone);
1716}
1717
1718/*
1719 * This routine is very drastic, but can save the system
1720 * in a pinch.
1721 */
1722void
1723pmap_collect()
1724{
1725	int i;
1726	vm_page_t m;
1727	static int warningdone = 0;
1728
1729	if (pmap_pagedaemon_waken == 0)
1730		return;
1731
1732	if (warningdone < 5) {
1733		printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1734		warningdone++;
1735	}
1736
1737	for(i = 0; i < vm_page_array_size; i++) {
1738		m = &vm_page_array[i];
1739		if (m->wire_count || m->hold_count || m->busy ||
1740		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
1741			continue;
1742		pmap_remove_all(m);
1743	}
1744	pmap_pagedaemon_waken = 0;
1745}
1746
1747
1748/*
1749 * If it is the first entry on the list, it is actually
1750 * in the header and we must copy the following entry up
1751 * to the header.  Otherwise we must search the list for
1752 * the entry.  In either case we free the now unused entry.
1753 */
1754
1755static int
1756pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1757{
1758	pv_entry_t pv;
1759	int rtval;
1760	int s;
1761
1762	s = splvm();
1763	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1764		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1765			if (pmap == pv->pv_pmap && va == pv->pv_va)
1766				break;
1767		}
1768	} else {
1769		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1770			if (va == pv->pv_va)
1771				break;
1772		}
1773	}
1774
1775	rtval = 0;
1776	if (pv) {
1777		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1778		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1779		m->md.pv_list_count--;
1780		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1781			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1782
1783		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1784		free_pv_entry(pv);
1785	}
1786
1787	splx(s);
1788	return rtval;
1789}
1790
1791/*
1792 * Create a pv entry for page at pa for
1793 * (pmap, va).
1794 */
1795static void
1796pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1797{
1798
1799	int s;
1800	pv_entry_t pv;
1801
1802	s = splvm();
1803	pv = get_pv_entry();
1804	pv->pv_va = va;
1805	pv->pv_pmap = pmap;
1806	pv->pv_ptem = mpte;
1807
1808	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1809	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1810	m->md.pv_list_count++;
1811
1812	splx(s);
1813}
1814
1815/*
1816 * pmap_remove_pte: do the things to unmap a page in a process
1817 */
1818static int
1819pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1820{
1821	pt_entry_t oldpte;
1822	vm_page_t m;
1823
1824	oldpte = atomic_readandclear_int(ptq);
1825	if (oldpte & PG_W)
1826		pmap->pm_stats.wired_count -= 1;
1827	/*
1828	 * Machines that don't support invlpg also don't support
1829	 * PG_G.
1830	 */
1831	if (oldpte & PG_G)
1832		invlpg(va);
1833	pmap->pm_stats.resident_count -= 1;
1834	if (oldpte & PG_MANAGED) {
1835		m = PHYS_TO_VM_PAGE(oldpte);
1836		if (oldpte & PG_M) {
1837#if defined(PMAP_DIAGNOSTIC)
1838			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1839				printf(
1840	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1841				    va, oldpte);
1842			}
1843#endif
1844			if (pmap_track_modified(va))
1845				vm_page_dirty(m);
1846		}
1847		if (oldpte & PG_A)
1848			vm_page_flag_set(m, PG_REFERENCED);
1849		return pmap_remove_entry(pmap, m, va);
1850	} else {
1851		return pmap_unuse_pt(pmap, va, NULL);
1852	}
1853
1854	return 0;
1855}
1856
1857/*
1858 * Remove a single page from a process address space
1859 */
1860static void
1861pmap_remove_page(pmap_t pmap, vm_offset_t va)
1862{
1863	register pt_entry_t *ptq;
1864
1865	/*
1866	 * if there is no pte for this address, just skip it!!!
1867	 */
1868	if (*pmap_pde(pmap, va) == 0) {
1869		return;
1870	}
1871
1872	/*
1873	 * get a local va for mappings for this pmap.
1874	 */
1875	ptq = get_ptbase(pmap) + i386_btop(va);
1876	if (*ptq) {
1877		(void) pmap_remove_pte(pmap, ptq, va);
1878		pmap_invalidate_page(pmap, va);
1879	}
1880	return;
1881}
1882
1883/*
1884 *	Remove the given range of addresses from the specified map.
1885 *
1886 *	It is assumed that the start and end are properly
1887 *	rounded to the page size.
1888 */
1889void
1890pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1891{
1892	register pt_entry_t *ptbase;
1893	vm_offset_t pdnxt;
1894	pd_entry_t ptpaddr;
1895	vm_offset_t sindex, eindex;
1896	int anyvalid;
1897
1898	if (pmap == NULL)
1899		return;
1900
1901	if (pmap->pm_stats.resident_count == 0)
1902		return;
1903
1904	/*
1905	 * Special handling for removing a single page: a very
1906	 * common operation for which we can easily short-circuit
1907	 * some code.
1908	 */
1909	if ((sva + PAGE_SIZE == eva) &&
1910	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1911		pmap_remove_page(pmap, sva);
1912		return;
1913	}
1914
1915	anyvalid = 0;
1916
1917	/*
1918	 * Get a local virtual address for the mappings that are being
1919	 * worked with.
1920	 */
1921	ptbase = get_ptbase(pmap);
1922
1923	sindex = i386_btop(sva);
1924	eindex = i386_btop(eva);
1925
1926	for (; sindex < eindex; sindex = pdnxt) {
1927		unsigned pdirindex;
1928
1929		/*
1930		 * Calculate index for next page table.
1931		 */
1932		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1933		if (pmap->pm_stats.resident_count == 0)
1934			break;
1935
1936		pdirindex = sindex / NPDEPG;
1937		ptpaddr = pmap->pm_pdir[pdirindex];
1938		if ((ptpaddr & PG_PS) != 0) {
1939			pmap->pm_pdir[pdirindex] = 0;
1940			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1941			anyvalid++;
1942			continue;
1943		}
1944
1945		/*
1946		 * Weed out invalid mappings. Note: we assume that the page
1947		 * directory table is always allocated, and in kernel virtual.
1948		 */
1949		if (ptpaddr == 0)
1950			continue;
1951
1952		/*
1953		 * Limit our scan to either the end of the va represented
1954		 * by the current page table page, or to the end of the
1955		 * range being removed.
1956		 */
1957		if (pdnxt > eindex) {
1958			pdnxt = eindex;
1959		}
1960
1961		for (; sindex != pdnxt; sindex++) {
1962			vm_offset_t va;
1963			if (ptbase[sindex] == 0) {
1964				continue;
1965			}
1966			va = i386_ptob(sindex);
1967
1968			anyvalid++;
1969			if (pmap_remove_pte(pmap,
1970				ptbase + sindex, va))
1971				break;
1972		}
1973	}
1974
1975	if (anyvalid)
1976		pmap_invalidate_all(pmap);
1977}
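
/*
 * Note (illustrative): 'anyvalid' batches the TLB work -- the loop above
 * only clears ptes (except for PG_G pages) and a single
 * pmap_invalidate_all() at the end flushes them, rather than issuing one
 * invlpg per removed page.
 */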
1978
1979/*
1980 *	Routine:	pmap_remove_all
1981 *	Function:
1982 *		Removes this physical page from
1983 *		all physical maps in which it resides.
1984 *		Reflects back modify bits to the pager.
1985 *
1986 *	Notes:
1987 *		Original versions of this routine were very
1988 *		inefficient because they iteratively called
1989 *		pmap_remove (slow...)
1990 */
1991
1992static void
1993pmap_remove_all(vm_page_t m)
1994{
1995	register pv_entry_t pv;
1996	pt_entry_t *pte, tpte;
1997	int s;
1998
1999#if defined(PMAP_DIAGNOSTIC)
2000	/*
2001	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
2002	 * pages!
2003	 */
2004	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
2005		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
2006	}
2007#endif
2008
2009	s = splvm();
2010	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2011		pv->pv_pmap->pm_stats.resident_count--;
2012
2013		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2014
2015		tpte = atomic_readandclear_int(pte);
2016		if (tpte & PG_W)
2017			pv->pv_pmap->pm_stats.wired_count--;
2018
2019		if (tpte & PG_A)
2020			vm_page_flag_set(m, PG_REFERENCED);
2021
2022		/*
2023		 * Update the vm_page_t clean and reference bits.
2024		 */
2025		if (tpte & PG_M) {
2026#if defined(PMAP_DIAGNOSTIC)
2027			if (pmap_nw_modified((pt_entry_t) tpte)) {
2028				printf(
2029	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
2030				    pv->pv_va, tpte);
2031			}
2032#endif
2033			if (pmap_track_modified(pv->pv_va))
2034				vm_page_dirty(m);
2035		}
2036		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2037
2038		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2039		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2040		m->md.pv_list_count--;
2041		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2042		free_pv_entry(pv);
2043	}
2044
2045	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2046
2047	splx(s);
2048}
2049
2050/*
2051 *	Set the physical protection on the
2052 *	specified range of this map as requested.
2053 */
2054void
2055pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2056{
2057	register pt_entry_t *ptbase;
2058	vm_offset_t pdnxt;
2059	pd_entry_t ptpaddr;
2060	vm_pindex_t sindex, eindex;
2061	int anychanged;
2062
2063	if (pmap == NULL)
2064		return;
2065
2066	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2067		pmap_remove(pmap, sva, eva);
2068		return;
2069	}
2070
2071	if (prot & VM_PROT_WRITE)
2072		return;
2073
2074	anychanged = 0;
2075
2076	ptbase = get_ptbase(pmap);
2077
2078	sindex = i386_btop(sva);
2079	eindex = i386_btop(eva);
2080
2081	for (; sindex < eindex; sindex = pdnxt) {
2082
2083		unsigned pdirindex;
2084
2085		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2086
2087		pdirindex = sindex / NPDEPG;
2088		ptpaddr = pmap->pm_pdir[pdirindex];
2089		if ((ptpaddr & PG_PS) != 0) {
2090			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2091			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2092			anychanged++;
2093			continue;
2094		}
2095
2096		/*
2097		 * Weed out invalid mappings. Note: we assume that the page
2098		 * directory table is always allocated, and in kernel virtual.
2099		 */
2100		if (ptpaddr == 0)
2101			continue;
2102
2103		if (pdnxt > eindex) {
2104			pdnxt = eindex;
2105		}
2106
2107		for (; sindex != pdnxt; sindex++) {
2108
2109			pt_entry_t pbits;
2110			vm_page_t m;
2111
2112			pbits = ptbase[sindex];
2113
2114			if (pbits & PG_MANAGED) {
2115				m = NULL;
2116				if (pbits & PG_A) {
2117					m = PHYS_TO_VM_PAGE(pbits);
2118					vm_page_flag_set(m, PG_REFERENCED);
2119					pbits &= ~PG_A;
2120				}
2121				if (pbits & PG_M) {
2122					if (pmap_track_modified(i386_ptob(sindex))) {
2123						if (m == NULL)
2124							m = PHYS_TO_VM_PAGE(pbits);
2125						vm_page_dirty(m);
2126						pbits &= ~PG_M;
2127					}
2128				}
2129			}
2130
2131			pbits &= ~PG_RW;
2132
2133			if (pbits != ptbase[sindex]) {
2134				ptbase[sindex] = pbits;
2135				anychanged = 1;
2136			}
2137		}
2138	}
2139	if (anychanged)
2140		pmap_invalidate_all(pmap);
2141}
2142
2143/*
2144 *	Insert the given physical page (p) at
2145 *	the specified virtual address (v) in the
2146 *	target physical map with the protection requested.
2147 *
2148 *	If specified, the page will be wired down, meaning
2149 *	that the related pte can not be reclaimed.
2150 *
2151 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2152 *	or lose information.  That is, this routine must actually
2153 *	insert this page into the given map NOW.
2154 */
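/*
 *	For illustration only (the call site shown is an assumption, not
 *	taken from this file), the fault path would enter the page it just
 *	produced with something like:
 *
 *		pmap_enter(vmspace_pmap(p->p_vmspace), va, m,
 *		    VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 *	Passing wired == TRUE only sets the PG_W software bit and bumps
 *	pm_stats.wired_count; the hardware protection bits come from
 *	pte_prot() below.
 */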
2155void
2156pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2157	   boolean_t wired)
2158{
2159	vm_offset_t pa;
2160	register pt_entry_t *pte;
2161	vm_offset_t opa;
2162	pt_entry_t origpte, newpte;
2163	vm_page_t mpte;
2164
2165	if (pmap == NULL)
2166		return;
2167
2168	va &= PG_FRAME;
2169#ifdef PMAP_DIAGNOSTIC
2170	if (va > VM_MAX_KERNEL_ADDRESS)
2171		panic("pmap_enter: toobig");
2172	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2173		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2174#endif
2175
2176	mpte = NULL;
2177	/*
2178	 * In the case that a page table page is not
2179	 * resident, we are creating it here.
2180	 */
2181	if (va < VM_MAXUSER_ADDRESS) {
2182		mpte = pmap_allocpte(pmap, va);
2183	}
2184#if 0 && defined(PMAP_DIAGNOSTIC)
2185	else {
2186		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2187		origpte = *pdeaddr;
2188		if ((origpte & PG_V) == 0) {
2189			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=0x%x, va=0x%x\n",
2190				(void *)pmap->pm_pdir[PTDPTDI], origpte, va);
2191		}
2192	}
2193#endif
2194
2195	pte = pmap_pte(pmap, va);
2196
2197	/*
2198	 * The page directory entry should be valid at this point; panic if the PT page is missing.
2199	 */
2200	if (pte == NULL) {
2201		panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n",
2202			(void *)pmap->pm_pdir[PTDPTDI], va);
2203	}
2204
2205	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2206	origpte = *(vm_offset_t *)pte;
2207	opa = origpte & PG_FRAME;
2208
2209	if (origpte & PG_PS)
2210		panic("pmap_enter: attempted pmap_enter on 4MB page");
2211
2212	/*
2213	 * Mapping has not changed, must be protection or wiring change.
2214	 */
2215	if (origpte && (opa == pa)) {
2216		/*
2217		 * Wiring change, just update stats. We don't worry about
2218		 * wiring PT pages as they remain resident as long as there
2219		 * are valid mappings in them. Hence, if a user page is wired,
2220		 * the PT page will be also.
2221		 */
2222		if (wired && ((origpte & PG_W) == 0))
2223			pmap->pm_stats.wired_count++;
2224		else if (!wired && (origpte & PG_W))
2225			pmap->pm_stats.wired_count--;
2226
2227#if defined(PMAP_DIAGNOSTIC)
2228		if (pmap_nw_modified((pt_entry_t) origpte)) {
2229			printf(
2230	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2231			    va, origpte);
2232		}
2233#endif
2234
2235		/*
2236		 * Remove extra pte reference
2237		 */
2238		if (mpte)
2239			mpte->hold_count--;
2240
2241		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2242			if ((origpte & PG_RW) == 0) {
2243				*pte |= PG_RW;
2244#ifdef SMP
2245				cpu_invlpg((void *)va);
2246				if (pmap->pm_active & PCPU_GET(other_cpus))
2247					smp_invltlb();
2248#else
2249				invltlb_1pg(va);
2250#endif
2251			}
2252			return;
2253		}
2254
2255		/*
2256		 * We might be turning off write access to the page,
2257		 * so we go ahead and sense modify status.
2258		 */
2259		if (origpte & PG_MANAGED) {
2260			if ((origpte & PG_M) && pmap_track_modified(va)) {
2261				vm_page_t om;
2262				om = PHYS_TO_VM_PAGE(opa);
2263				vm_page_dirty(om);
2264			}
2265			pa |= PG_MANAGED;
2266		}
2267		goto validate;
2268	}
2269	/*
2270	 * Mapping has changed, invalidate old range and fall through to
2271	 * handle validating new mapping.
2272	 */
2273	if (opa) {
2274		int err;
2275		err = pmap_remove_pte(pmap, pte, va);
2276		if (err)
2277			panic("pmap_enter: pte vanished, va: 0x%x", va);
2278	}
2279
2280	/*
2281	 * Enter on the PV list if part of our managed memory. Note that we
2282	 * raise IPL while manipulating pv_table since pmap_enter can be
2283	 * called at interrupt time.
2284	 */
2285	if (pmap_initialized &&
2286	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2287		pmap_insert_entry(pmap, va, mpte, m);
2288		pa |= PG_MANAGED;
2289	}
2290
2291	/*
2292	 * Increment counters
2293	 */
2294	pmap->pm_stats.resident_count++;
2295	if (wired)
2296		pmap->pm_stats.wired_count++;
2297
2298validate:
2299	/*
2300	 * Now validate mapping with desired protection/wiring.
2301	 */
2302	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2303
2304	if (wired)
2305		newpte |= PG_W;
2306	if (va < VM_MAXUSER_ADDRESS)
2307		newpte |= PG_U;
2308	if (pmap == kernel_pmap)
2309		newpte |= pgeflag;
2310
2311	/*
2312	 * if the mapping or permission bits are different, we need
2313	 * to update the pte.
2314	 */
2315	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2316		*pte = newpte | PG_A;
2317		/*if (origpte)*/ {
2318#ifdef SMP
2319			cpu_invlpg((void *)va);
2320			if (pmap->pm_active & PCPU_GET(other_cpus))
2321				smp_invltlb();
2322#else
2323			invltlb_1pg(va);
2324#endif
2325		}
2326	}
2327}
2328
2329/*
2330 * this code makes some *MAJOR* assumptions:
2331 * 1. The current pmap and the target pmap exist.
2332 * 2. Not wired.
2333 * 3. Read access.
2334 * 4. No page table pages.
2335 * 5. The TLB flush is deferred to the calling procedure.
2336 * 6. Page IS managed.
2337 * but is *MUCH* faster than pmap_enter...
2338 */
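/*
 * Both in-file callers (pmap_object_init_pt() and pmap_prefault()) feed the
 * returned page table page back in as 'mpte' on the next call:
 *
 *	mpte = pmap_enter_quick(pmap, addr, m, mpte);
 *
 * so consecutive entries that land in the same page table page can skip the
 * lookup (the hold count is still bumped for each mapping).
 */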
2339
2340static vm_page_t
2341pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2342{
2343	pt_entry_t *pte;
2344	vm_offset_t pa;
2345
2346	/*
2347	 * In the case that a page table page is not
2348	 * resident, we are creating it here.
2349	 */
2350	if (va < VM_MAXUSER_ADDRESS) {
2351		unsigned ptepindex;
2352		pd_entry_t ptepa;
2353
2354		/*
2355		 * Calculate pagetable page index
2356		 */
2357		ptepindex = va >> PDRSHIFT;
2358		if (mpte && (mpte->pindex == ptepindex)) {
2359			mpte->hold_count++;
2360		} else {
2361retry:
2362			/*
2363			 * Get the page directory entry
2364			 */
2365			ptepa = pmap->pm_pdir[ptepindex];
2366
2367			/*
2368			 * If the page table page is mapped, we just increment
2369			 * the hold count, and activate it.
2370			 */
2371			if (ptepa) {
2372				if (ptepa & PG_PS)
2373					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2374				if (pmap->pm_ptphint &&
2375					(pmap->pm_ptphint->pindex == ptepindex)) {
2376					mpte = pmap->pm_ptphint;
2377				} else {
2378					mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2379					pmap->pm_ptphint = mpte;
2380				}
2381				if (mpte == NULL)
2382					goto retry;
2383				mpte->hold_count++;
2384			} else {
2385				mpte = _pmap_allocpte(pmap, ptepindex);
2386			}
2387		}
2388	} else {
2389		mpte = NULL;
2390	}
2391
2392	/*
2393	 * This call to vtopte makes the assumption that we are
2394	 * entering the page into the current pmap.  In order to support
2395	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2396	 * But that isn't as quick as vtopte.
2397	 */
2398	pte = vtopte(va);
2399	if (*pte) {
2400		if (mpte)
2401			pmap_unwire_pte_hold(pmap, mpte);
2402		return NULL;
2403	}
2404
2405	/*
2406	 * Enter on the PV list if part of our managed memory. Note that we
2407	 * raise IPL while manipulating pv_table since pmap_enter can be
2408	 * called at interrupt time.
2409	 */
2410	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2411		pmap_insert_entry(pmap, va, mpte, m);
2412
2413	/*
2414	 * Increment counters
2415	 */
2416	pmap->pm_stats.resident_count++;
2417
2418	pa = VM_PAGE_TO_PHYS(m);
2419
2420	/*
2421	 * Now validate mapping with RO protection
2422	 */
2423	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2424		*pte = pa | PG_V | PG_U;
2425	else
2426		*pte = pa | PG_V | PG_U | PG_MANAGED;
2427
2428	return mpte;
2429}
2430
2431/*
2432 * Make a temporary mapping for a physical address.  This is only intended
2433 * to be used for panic dumps.
2434 */
2435void *
2436pmap_kenter_temporary(vm_offset_t pa, int i)
2437{
2438	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2439	return ((void *)crashdumpmap);
2440}
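/*
 * Note that page 'i' is entered at crashdumpmap + i * PAGE_SIZE but the
 * base of crashdumpmap is what gets returned, so the dump code is
 * presumably expected to do its own indexing from that base.
 */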
2441
2442#define MAX_INIT_PT (96)
2443/*
2444 * pmap_object_init_pt preloads the ptes for a given object
2445 * into the specified pmap.  This eliminates the blast of soft
2446 * faults on process startup and immediately after an mmap.
2447 */
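/*
 * For illustration only (the call site is an assumption, not taken from
 * this file), the mmap path would preload a freshly mapped region with
 * something like:
 *
 *	pmap_object_init_pt(pmap, addr, object, pindex, size,
 *	    MAP_PREFAULT_PARTIAL);
 *
 * Device objects whose address and size are both 4MB-aligned take the
 * PG_PS shortcut below and are mapped with 4MB page directory entries.
 */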
2448void
2449pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2450		    vm_object_t object, vm_pindex_t pindex,
2451		    vm_size_t size, int limit)
2452{
2453	vm_offset_t tmpidx;
2454	int psize;
2455	vm_page_t p, mpte;
2456	int objpgs;
2457
2458	if (pmap == NULL || object == NULL)
2459		return;
2460
2461	/*
2462	 * This code maps large physical mmap regions into the
2463	 * processor address space.  Note that some shortcuts
2464	 * are taken, but the code works.
2465	 */
2466	if (pseflag && (object->type == OBJT_DEVICE) &&
2467	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2468		int i;
2469		vm_page_t m[1];
2470		unsigned int ptepindex;
2471		int npdes;
2472		pd_entry_t ptepa;
2473
2474		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2475			return;
2476
2477retry:
2478		p = vm_page_lookup(object, pindex);
2479		if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
2480			goto retry;
2481
2482		if (p == NULL) {
2483			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2484			if (p == NULL)
2485				return;
2486			m[0] = p;
2487
2488			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2489				vm_page_free(p);
2490				return;
2491			}
2492
2493			p = vm_page_lookup(object, pindex);
2494			vm_page_wakeup(p);
2495		}
2496
2497		ptepa = VM_PAGE_TO_PHYS(p);
2498		if (ptepa & (NBPDR - 1)) {
2499			return;
2500		}
2501
2502		p->valid = VM_PAGE_BITS_ALL;
2503
2504		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2505		npdes = size >> PDRSHIFT;
2506		for(i = 0; i < npdes; i++) {
2507			pmap->pm_pdir[ptepindex] =
2508			    ptepa | PG_U | PG_RW | PG_V | PG_PS;
2509			ptepa += NBPDR;
2510			ptepindex += 1;
2511		}
2512		vm_page_flag_set(p, PG_MAPPED);
2513		invltlb();
2514		return;
2515	}
2516
2517	psize = i386_btop(size);
2518
2519	if ((object->type != OBJT_VNODE) ||
2520	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2521	     (object->resident_page_count > MAX_INIT_PT))) {
2522		return;
2523	}
2524
2525	if (psize + pindex > object->size) {
2526		if (object->size < pindex)
2527			return;
2528		psize = object->size - pindex;
2529	}
2530
2531	mpte = NULL;
2532	/*
2533	 * if we are processing a major portion of the object, then scan the
2534	 * entire thing.
2535	 */
2536	if (psize > (object->resident_page_count >> 2)) {
2537		objpgs = psize;
2538
2539		for (p = TAILQ_FIRST(&object->memq);
2540		    ((objpgs > 0) && (p != NULL));
2541		    p = TAILQ_NEXT(p, listq)) {
2542
2543			tmpidx = p->pindex;
2544			if (tmpidx < pindex) {
2545				continue;
2546			}
2547			tmpidx -= pindex;
2548			if (tmpidx >= psize) {
2549				continue;
2550			}
2551			/*
2552			 * Don't allow an madvise to blow away our really
2553			 * free pages by allocating pv entries.
2554			 */
2555			if ((limit & MAP_PREFAULT_MADVISE) &&
2556			    cnt.v_free_count < cnt.v_free_reserved) {
2557				break;
2558			}
2559			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2560				(p->busy == 0) &&
2561			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2562				if ((p->queue - p->pc) == PQ_CACHE)
2563					vm_page_deactivate(p);
2564				vm_page_busy(p);
2565				mpte = pmap_enter_quick(pmap,
2566					addr + i386_ptob(tmpidx), p, mpte);
2567				vm_page_flag_set(p, PG_MAPPED);
2568				vm_page_wakeup(p);
2569			}
2570			objpgs -= 1;
2571		}
2572	} else {
2573		/*
2574		 * else lookup the pages one-by-one.
2575		 */
2576		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
2577			/*
2578			 * Don't allow an madvise to blow away our really
2579			 * free pages by allocating pv entries.
2580			 */
2581			if ((limit & MAP_PREFAULT_MADVISE) &&
2582			    cnt.v_free_count < cnt.v_free_reserved) {
2583				break;
2584			}
2585			p = vm_page_lookup(object, tmpidx + pindex);
2586			if (p &&
2587			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2588				(p->busy == 0) &&
2589			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2590				if ((p->queue - p->pc) == PQ_CACHE)
2591					vm_page_deactivate(p);
2592				vm_page_busy(p);
2593				mpte = pmap_enter_quick(pmap,
2594					addr + i386_ptob(tmpidx), p, mpte);
2595				vm_page_flag_set(p, PG_MAPPED);
2596				vm_page_wakeup(p);
2597			}
2598		}
2599	}
2600	return;
2601}
2602
2603/*
2604 * pmap_prefault provides a quick way of clustering
2605 * pagefaults into a process's address space.  It is a "cousin"
2606 * of pmap_object_init_pt, except it runs at page fault time instead
2607 * of mmap time.
2608 */
2609#define PFBAK 4
2610#define PFFOR 4
2611#define PAGEORDER_SIZE (PFBAK+PFFOR)
2612
2613static int pmap_prefault_pageorder[] = {
2614	-PAGE_SIZE, PAGE_SIZE,
2615	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2616	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2617	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2618};
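/*
 * The table above makes pmap_prefault() probe alternately behind and ahead
 * of the faulting address, nearest pages first.  Assuming PAGE_SIZE is
 * 4096, a fault at address A yields the candidate order:
 *
 *	A-4096, A+4096, A-8192, A+8192, A-12288, A+12288, A-16384, A+16384
 *
 * Candidates that fall outside the map entry, lack a page directory entry,
 * or are already mapped are skipped in the loop below.
 */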
2619
2620void
2621pmap_prefault(pmap, addra, entry)
2622	pmap_t pmap;
2623	vm_offset_t addra;
2624	vm_map_entry_t entry;
2625{
2626	int i;
2627	vm_offset_t starta;
2628	vm_offset_t addr;
2629	vm_pindex_t pindex;
2630	vm_page_t m, mpte;
2631	vm_object_t object;
2632
2633	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2634		return;
2635
2636	object = entry->object.vm_object;
2637
2638	starta = addra - PFBAK * PAGE_SIZE;
2639	if (starta < entry->start) {
2640		starta = entry->start;
2641	} else if (starta > addra) {
2642		starta = 0;
2643	}
2644
2645	mpte = NULL;
2646	for (i = 0; i < PAGEORDER_SIZE; i++) {
2647		vm_object_t lobject;
2648		pt_entry_t *pte;
2649
2650		addr = addra + pmap_prefault_pageorder[i];
2651		if (addr > addra + (PFFOR * PAGE_SIZE))
2652			addr = 0;
2653
2654		if (addr < starta || addr >= entry->end)
2655			continue;
2656
2657		if ((*pmap_pde(pmap, addr)) == 0)
2658			continue;
2659
2660		pte = vtopte(addr);
2661		if (*pte)
2662			continue;
2663
2664		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2665		lobject = object;
2666		for (m = vm_page_lookup(lobject, pindex);
2667		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2668		    lobject = lobject->backing_object) {
2669			if (lobject->backing_object_offset & PAGE_MASK)
2670				break;
2671			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2672			m = vm_page_lookup(lobject->backing_object, pindex);
2673		}
2674
2675		/*
2676		 * Give up when a page is not in memory.
2677		 */
2678		if (m == NULL)
2679			break;
2680
2681		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2682			(m->busy == 0) &&
2683		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2684
2685			if ((m->queue - m->pc) == PQ_CACHE) {
2686				vm_page_deactivate(m);
2687			}
2688			vm_page_busy(m);
2689			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2690			vm_page_flag_set(m, PG_MAPPED);
2691			vm_page_wakeup(m);
2692		}
2693	}
2694}
2695
2696/*
2697 *	Routine:	pmap_change_wiring
2698 *	Function:	Change the wiring attribute for a map/virtual-address
2699 *			pair.
2700 *	In/out conditions:
2701 *			The mapping must already exist in the pmap.
2702 */
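/*
 *	Illustrative only (the caller is an assumption): wiring code would
 *	flip the attribute one page at a time, e.g.
 *
 *		pmap_change_wiring(pmap, va, TRUE);
 *
 *	and pass FALSE to unwire.  Only the wired (PG_W) software bit and
 *	pm_stats.wired_count change, which is why no TLB invalidation is
 *	needed.
 */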
2703void
2704pmap_change_wiring(pmap, va, wired)
2705	register pmap_t pmap;
2706	vm_offset_t va;
2707	boolean_t wired;
2708{
2709	register pt_entry_t *pte;
2710
2711	if (pmap == NULL)
2712		return;
2713
2714	pte = pmap_pte(pmap, va);
2715
2716	if (wired && !pmap_pte_w(pte))
2717		pmap->pm_stats.wired_count++;
2718	else if (!wired && pmap_pte_w(pte))
2719		pmap->pm_stats.wired_count--;
2720
2721	/*
2722	 * Wiring is not a hardware characteristic so there is no need to
2723	 * invalidate TLB.
2724	 */
2725	pmap_pte_set_w(pte, wired);
2726}
2727
2728
2729
2730/*
2731 *	Copy the range specified by src_addr/len
2732 *	from the source map to the range dst_addr/len
2733 *	in the destination map.
2734 *
2735 *	This routine is only advisory and need not do anything.
2736 */
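/*
 *	For illustration only (the caller is an assumption, not taken from
 *	this file), fork-style address space duplication would call
 *
 *		pmap_copy(dst_pmap, src_pmap, addr, len, addr);
 *
 *	with dst_addr == src_addr; anything else is rejected immediately
 *	below.  Only managed ptes are copied, PG_M and PG_A are cleared in
 *	the copies, and 4MB (PG_PS) pdes are shared outright when the
 *	destination slot is empty, so the destination may still take soft
 *	faults for whatever was skipped.
 */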
2737
2738void
2739pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2740	  vm_offset_t src_addr)
2741{
2742	vm_offset_t addr;
2743	vm_offset_t end_addr = src_addr + len;
2744	vm_offset_t pdnxt;
2745	pd_entry_t src_frame, dst_frame;
2746	vm_page_t m;
2747	pd_entry_t saved_pde;
2748
2749	if (dst_addr != src_addr)
2750		return;
2751
2752	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2753	if (src_frame != (PTDpde & PG_FRAME))
2754		return;
2755
2756	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2757	if (dst_frame != (APTDpde & PG_FRAME)) {
2758		APTDpde = dst_frame | PG_RW | PG_V;
2759#if defined(SMP)
2760		/* The page directory is not shared between CPUs */
2761		cpu_invltlb();
2762#else
2763		invltlb();
2764#endif
2765	}
2766 	saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V);
2767	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
2768		pt_entry_t *src_pte, *dst_pte;
2769		vm_page_t dstmpte, srcmpte;
2770		pd_entry_t srcptepaddr;
2771		unsigned ptepindex;
2772
2773		if (addr >= UPT_MIN_ADDRESS)
2774			panic("pmap_copy: invalid to pmap_copy page tables\n");
2775
2776		/*
2777		 * Don't let optional prefaulting of pages make us go
2778		 * way below the low water mark of free pages or way
2779		 * above the high water mark of used pv entries.
2780		 */
2781		if (cnt.v_free_count < cnt.v_free_reserved ||
2782		    pv_entry_count > pv_entry_high_water)
2783			break;
2784
2785		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2786		ptepindex = addr >> PDRSHIFT;
2787
2788		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2789		if (srcptepaddr == 0)
2790			continue;
2791
2792		if (srcptepaddr & PG_PS) {
2793			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2794				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2795				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2796			}
2797			continue;
2798		}
2799
2800		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2801		if ((srcmpte == NULL) ||
2802		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2803			continue;
2804
2805		if (pdnxt > end_addr)
2806			pdnxt = end_addr;
2807
2808		src_pte = vtopte(addr);
2809		dst_pte = avtopte(addr);
2810		while (addr < pdnxt) {
2811			pt_entry_t ptetemp;
2812			ptetemp = *src_pte;
2813			/*
2814			 * we only virtual copy managed pages
2815			 */
2816			if ((ptetemp & PG_MANAGED) != 0) {
2817				/*
2818				 * We have to check after allocpte for the
2819				 * pte still being around...  allocpte can
2820				 * block.
2821				 */
2822				dstmpte = pmap_allocpte(dst_pmap, addr);
2823				if ((APTDpde & PG_FRAME) !=
2824				    (saved_pde & PG_FRAME)) {
2825					APTDpde = saved_pde;
2826					printf("IT HAPPENED!\n");
2827#if defined(SMP)
2828					cpu_invltlb();
2829#else
2830					invltlb();
2831#endif
2832				}
2833				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2834					/*
2835					 * Clear the modified and
2836					 * accessed (referenced) bits
2837					 * during the copy.
2838					 */
2839					m = PHYS_TO_VM_PAGE(ptetemp);
2840					*dst_pte = ptetemp & ~(PG_M | PG_A);
2841					dst_pmap->pm_stats.resident_count++;
2842					pmap_insert_entry(dst_pmap, addr,
2843						dstmpte, m);
2844	 			} else {
2845					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2846				}
2847				if (dstmpte->hold_count >= srcmpte->hold_count)
2848					break;
2849			}
2850			addr += PAGE_SIZE;
2851			src_pte++;
2852			dst_pte++;
2853		}
2854	}
2855}
2856
2857/*
2858 *	Routine:	pmap_kernel
2859 *	Function:
2860 *		Returns the physical map handle for the kernel.
2861 */
2862pmap_t
2863pmap_kernel()
2864{
2865	return (kernel_pmap);
2866}
2867
2868/*
2869 *	pmap_zero_page zeros the specified hardware page by mapping
2870 *	the page into KVM and using bzero to clear its contents.
2871 */
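/*
 *	The zero/copy routines below all rely on the same trick: a reserved
 *	kernel pte (CMAP1/CMAP2) is pointed at the target physical page,
 *	the stale TLB entry for the matching virtual window (CADDR1/CADDR2)
 *	is invalidated, and bzero()/bcopy() then operate through that
 *	window.  PG_A and PG_M are preset on the writable windows,
 *	presumably so the hardware never has to set them itself.
 */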
2872void
2873pmap_zero_page(vm_offset_t phys)
2874{
2875
2876	if (*CMAP2)
2877		panic("pmap_zero_page: CMAP2 busy");
2878
2879	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2880	invltlb_1pg((vm_offset_t)CADDR2);
2881
2882#if defined(I686_CPU)
2883	if (cpu_class == CPUCLASS_686)
2884		i686_pagezero(CADDR2);
2885	else
2886#endif
2887		bzero(CADDR2, PAGE_SIZE);
2888	*CMAP2 = 0;
2889}
2890
2891/*
2892 *	pmap_zero_page_area zeros the specified hardware page by mapping
2893 *	the page into KVM and using bzero to clear its contents.
2894 *
2895 *	off and size may not cover an area beyond a single hardware page.
2896 */
2897void
2898pmap_zero_page_area(vm_offset_t phys, int off, int size)
2899{
2900
2901	if (*CMAP2)
2902		panic("pmap_zero_page_area: CMAP2 busy");
2903
2904	*CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2905	invltlb_1pg((vm_offset_t)CADDR2);
2906
2907#if defined(I686_CPU)
2908	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2909		i686_pagezero(CADDR2);
2910	else
2911#endif
2912		bzero((char *)CADDR2 + off, size);
2913	*CMAP2 = 0;
2914}
2915
2916/*
2917 *	pmap_copy_page copies the specified (machine independent)
2918 *	page by mapping the page into virtual memory and using
2919 *	bcopy to copy the page, one machine dependent page at a
2920 *	time.
2921 */
2922void
2923pmap_copy_page(vm_offset_t src, vm_offset_t dst)
2924{
2925
2926	if (*CMAP1)
2927		panic("pmap_copy_page: CMAP1 busy");
2928	if (*CMAP2)
2929		panic("pmap_copy_page: CMAP2 busy");
2930
2931	*CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2932	*CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2933#ifdef I386_CPU
2934	invltlb();
2935#else
2936	invlpg((u_int)CADDR1);
2937	invlpg((u_int)CADDR2);
2938#endif
2939
2940	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2941
2942	*CMAP1 = 0;
2943	*CMAP2 = 0;
2944}
2945
2946
2947/*
2948 *	Routine:	pmap_pageable
2949 *	Function:
2950 *		Make the specified pages (by pmap, offset)
2951 *		pageable (or not) as requested.
2952 *
2953 *		A page which is not pageable may not take
2954 *		a fault; therefore, its page table entry
2955 *		must remain valid for the duration.
2956 *
2957 *		This routine is merely advisory; pmap_enter
2958 *		will specify that these pages are to be wired
2959 *		down (or not) as appropriate.
2960 */
2961void
2962pmap_pageable(pmap, sva, eva, pageable)
2963	pmap_t pmap;
2964	vm_offset_t sva, eva;
2965	boolean_t pageable;
2966{
2967}
2968
2969/*
2970 * Returns true if the pmap's pv is one of the first
2971 * 16 pvs linked to from this page.  This count may
2972 * be changed upwards or downwards in the future; it
2973 * is only necessary that true be returned for a small
2974 * subset of pmaps for proper page aging.
2975 */
2976boolean_t
2977pmap_page_exists_quick(pmap, m)
2978	pmap_t pmap;
2979	vm_page_t m;
2980{
2981	pv_entry_t pv;
2982	int loops = 0;
2983	int s;
2984
2985	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2986		return FALSE;
2987
2988	s = splvm();
2989
2990	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2991		if (pv->pv_pmap == pmap) {
2992			splx(s);
2993			return TRUE;
2994		}
2995		loops++;
2996		if (loops >= 16)
2997			break;
2998	}
2999	splx(s);
3000	return (FALSE);
3001}
3002
3003#define PMAP_REMOVE_PAGES_CURPROC_ONLY
3004/*
3005 * Remove all pages from the specified address space;
3006 * this aids process exit speeds.  Also, this code
3007 * is special-cased for the current process only, but
3008 * can have the more generic (and slightly slower)
3009 * mode enabled.  This is much faster than pmap_remove
3010 * in the case of running down an entire address space.
3011 */
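/*
 * For illustration only (the caller is an assumption, not taken from this
 * file), process teardown would use something like
 *
 *	pmap_remove_pages(vmspace_pmap(vm), 0, VM_MAXUSER_ADDRESS);
 *
 * The scan is driven by the pmap's pv list rather than by walking the
 * address range, so the cost tracks the number of managed mappings
 * actually present; wired pages are skipped.
 */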
3012void
3013pmap_remove_pages(pmap, sva, eva)
3014	pmap_t pmap;
3015	vm_offset_t sva, eva;
3016{
3017	pt_entry_t *pte, tpte;
3018	vm_page_t m;
3019	pv_entry_t pv, npv;
3020	int s;
3021
3022#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3023	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
3024		printf("warning: pmap_remove_pages called with non-current pmap\n");
3025		return;
3026	}
3027#endif
3028
3029	s = splvm();
3030	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
3031
3032		if (pv->pv_va >= eva || pv->pv_va < sva) {
3033			npv = TAILQ_NEXT(pv, pv_plist);
3034			continue;
3035		}
3036
3037#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
3038		pte = vtopte(pv->pv_va);
3039#else
3040		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3041#endif
3042		tpte = *pte;
3043
3044		if (tpte == 0) {
3045			printf("TPTE at %p  IS ZERO @ VA %08x\n",
3046							pte, pv->pv_va);
3047			panic("bad pte");
3048		}
3049
3050/*
3051 * We cannot remove wired pages from a process' mapping at this time
3052 */
3053		if (tpte & PG_W) {
3054			npv = TAILQ_NEXT(pv, pv_plist);
3055			continue;
3056		}
3057
3058		m = PHYS_TO_VM_PAGE(tpte);
3059		KASSERT(m->phys_addr == (tpte & PG_FRAME),
3060		    ("vm_page_t %p phys_addr mismatch %08x %08x",
3061		    m, m->phys_addr, tpte));
3062
3063		KASSERT(m < &vm_page_array[vm_page_array_size],
3064			("pmap_remove_pages: bad tpte %x", tpte));
3065
3066		pv->pv_pmap->pm_stats.resident_count--;
3067
3068		*pte = 0;
3069
3070		/*
3071		 * Update the vm_page_t clean and reference bits.
3072		 */
3073		if (tpte & PG_M) {
3074			vm_page_dirty(m);
3075		}
3076
3077		npv = TAILQ_NEXT(pv, pv_plist);
3078		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
3079
3080		m->md.pv_list_count--;
3081		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3082		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
3083			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3084		}
3085
3086		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
3087		free_pv_entry(pv);
3088	}
3089	splx(s);
3090	pmap_invalidate_all(pmap);
3091}
3092
3093/*
3094 * pmap_testbit tests bits in ptes.
3095 * Note that the testbit/changebit routines are inline,
3096 * so a lot of things are evaluated at compile time.
3097 */
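/*
 * pmap_testbit()/pmap_changebit() carry the per-mapping work for the small
 * wrappers further down in this file:
 *
 *	pmap_is_modified(m)		-> pmap_testbit(m, PG_M)
 *	pmap_clear_modify(m)		-> pmap_changebit(m, PG_M, FALSE)
 *	pmap_clear_reference(m)		-> pmap_changebit(m, PG_A, FALSE)
 *	pmap_page_protect(m, read-only)	-> pmap_changebit(m, PG_RW, FALSE)
 */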
3098static boolean_t
3099pmap_testbit(m, bit)
3100	vm_page_t m;
3101	int bit;
3102{
3103	pv_entry_t pv;
3104	pt_entry_t *pte;
3105	int s;
3106
3107	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3108		return FALSE;
3109
3110	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3111		return FALSE;
3112
3113	s = splvm();
3114
3115	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3116		/*
3117		 * If the bit being tested is the modified or accessed bit,
3118		 * only consider mappings whose modified state we actually
3119		 * track; clean map ptes are treated as never modified.
3120		 */
3121		if (bit & (PG_A|PG_M)) {
3122			if (!pmap_track_modified(pv->pv_va))
3123				continue;
3124		}
3125
3126#if defined(PMAP_DIAGNOSTIC)
3127		if (!pv->pv_pmap) {
3128			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3129			continue;
3130		}
3131#endif
3132		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3133		if (*pte & bit) {
3134			splx(s);
3135			return TRUE;
3136		}
3137	}
3138	splx(s);
3139	return (FALSE);
3140}
3141
3142/*
3143 * this routine is used to modify bits in ptes
3144 */
3145static __inline void
3146pmap_changebit(vm_page_t m, int bit, boolean_t setem)
3147{
3148	register pv_entry_t pv;
3149	register pt_entry_t *pte;
3150	int s;
3151
3152	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3153		return;
3154
3155	s = splvm();
3156
3157	/*
3158	 * Loop over all current mappings, setting/clearing as appropriate.
3159	 * If setting RO, do we need to clear the VAC?
3160	 */
3161	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3162		/*
3163		 * don't write protect pager mappings
3164		 */
3165		if (!setem && (bit == PG_RW)) {
3166			if (!pmap_track_modified(pv->pv_va))
3167				continue;
3168		}
3169
3170#if defined(PMAP_DIAGNOSTIC)
3171		if (!pv->pv_pmap) {
3172			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3173			continue;
3174		}
3175#endif
3176
3177		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3178
3179		if (setem) {
3180			*pte |= bit;
3181			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3182		} else {
3183			pt_entry_t pbits = *pte;
3184			if (pbits & bit) {
3185				if (bit == PG_RW) {
3186					if (pbits & PG_M) {
3187						vm_page_dirty(m);
3188					}
3189					*pte = pbits & ~(PG_M|PG_RW);
3190				} else {
3191					*pte = pbits & ~bit;
3192				}
3193				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3194			}
3195		}
3196	}
3197	splx(s);
3198}
3199
3200/*
3201 *      pmap_page_protect:
3202 *
3203 *      Lower the permission for all mappings to a given page.
3204 */
3205void
3206pmap_page_protect(vm_page_t m, vm_prot_t prot)
3207{
3208	if ((prot & VM_PROT_WRITE) == 0) {
3209		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3210			pmap_changebit(m, PG_RW, FALSE);
3211		} else {
3212			pmap_remove_all(m);
3213		}
3214	}
3215}
3216
3217vm_offset_t
3218pmap_phys_address(ppn)
3219	int ppn;
3220{
3221	return (i386_ptob(ppn));
3222}
3223
3224/*
3225 *	pmap_ts_referenced:
3226 *
3227 *	Return a count of reference bits for a page, clearing those bits.
3228 *	It is not necessary for every reference bit to be cleared, but it
3229 *	is necessary that 0 only be returned when there are truly no
3230 *	reference bits set.
3231 *
3232 *	XXX: The exact number of bits to check and clear is a matter that
3233 *	should be tested and standardized at some point in the future for
3234 *	optimal aging of shared pages.
3235 */
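/*
 *	Implementation note: each pv visited is rotated to the tail of the
 *	page's pv list, so successive calls sample different mappings
 *	first, and the walk stops after five referenced mappings have been
 *	found and cleared (the rtval > 4 check below).  Mappings whose
 *	modified/referenced state is not tracked are skipped entirely.
 */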
3236int
3237pmap_ts_referenced(vm_page_t m)
3238{
3239	register pv_entry_t pv, pvf, pvn;
3240	pt_entry_t *pte;
3241	int s;
3242	int rtval = 0;
3243
3244	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3245		return (rtval);
3246
3247	s = splvm();
3248
3249	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3250
3251		pvf = pv;
3252
3253		do {
3254			pvn = TAILQ_NEXT(pv, pv_list);
3255
3256			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3257
3258			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3259
3260			if (!pmap_track_modified(pv->pv_va))
3261				continue;
3262
3263			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3264
3265			if (pte && (*pte & PG_A)) {
3266				*pte &= ~PG_A;
3267
3268				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3269
3270				rtval++;
3271				if (rtval > 4) {
3272					break;
3273				}
3274			}
3275		} while ((pv = pvn) != NULL && pv != pvf);
3276	}
3277	splx(s);
3278
3279	return (rtval);
3280}
3281
3282/*
3283 *	pmap_is_modified:
3284 *
3285 *	Return whether or not the specified physical page was modified
3286 *	in any physical maps.
3287 */
3288boolean_t
3289pmap_is_modified(vm_page_t m)
3290{
3291	return pmap_testbit(m, PG_M);
3292}
3293
3294/*
3295 *	Clear the modify bits on the specified physical page.
3296 */
3297void
3298pmap_clear_modify(vm_page_t m)
3299{
3300	pmap_changebit(m, PG_M, FALSE);
3301}
3302
3303/*
3304 *	pmap_clear_reference:
3305 *
3306 *	Clear the reference bit on the specified physical page.
3307 */
3308void
3309pmap_clear_reference(vm_page_t m)
3310{
3311	pmap_changebit(m, PG_A, FALSE);
3312}
3313
3314/*
3315 * Miscellaneous support routines follow
3316 */
3317
3318static void
3319i386_protection_init()
3320{
3321	register int *kp, prot;
3322
3323	kp = protection_codes;
3324	for (prot = 0; prot < 8; prot++) {
3325		switch (prot) {
3326		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3327			/*
3328			 * Read access is also 0. There isn't any execute bit,
3329			 * so just make it readable.
3330			 */
3331		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3332		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3333		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3334			*kp++ = 0;
3335			break;
3336		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3337		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3338		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3339		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3340			*kp++ = PG_RW;
3341			break;
3342		}
3343	}
3344}
3345
3346/*
3347 * Map a set of physical memory pages into the kernel virtual
3348 * address space. Return a pointer to where it is mapped. This
3349 * routine is intended to be used for mapping device memory,
3350 * NOT real memory.
3351 */
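/*
 * A minimal usage sketch (the physical address here is hypothetical):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(0xfeb00000, 4096);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, 4096);
 *
 * The ptes are entered with PG_RW | PG_V (plus pgeflag when available) and
 * no pv entries are created, consistent with the note above that this is
 * for device memory rather than pages the VM system manages.
 */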
3352void *
3353pmap_mapdev(pa, size)
3354	vm_offset_t pa;
3355	vm_size_t size;
3356{
3357	vm_offset_t va, tmpva, offset;
3358	pt_entry_t *pte;
3359
3360	offset = pa & PAGE_MASK;
3361	size = roundup(offset + size, PAGE_SIZE);
3362
3363	GIANT_REQUIRED;
3364
3365	va = kmem_alloc_pageable(kernel_map, size);
3366	if (!va)
3367		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3368
3369	pa = pa & PG_FRAME;
3370	for (tmpva = va; size > 0;) {
3371		pte = vtopte(tmpva);
3372		*pte = pa | PG_RW | PG_V | pgeflag;
3373		size -= PAGE_SIZE;
3374		tmpva += PAGE_SIZE;
3375		pa += PAGE_SIZE;
3376	}
3377	invltlb();
3378
3379	return ((void *)(va + offset));
3380}
3381
3382void
3383pmap_unmapdev(va, size)
3384	vm_offset_t va;
3385	vm_size_t size;
3386{
3387	vm_offset_t base, offset;
3388
3389	base = va & PG_FRAME;
3390	offset = va & PAGE_MASK;
3391	size = roundup(offset + size, PAGE_SIZE);
3392	kmem_free(kernel_map, base, size);
3393}
3394
3395/*
3396 * perform the pmap work for mincore
3397 */
3398int
3399pmap_mincore(pmap, addr)
3400	pmap_t pmap;
3401	vm_offset_t addr;
3402{
3403	pt_entry_t *ptep, pte;
3404	vm_page_t m;
3405	int val = 0;
3406
3407	ptep = pmap_pte(pmap, addr);
3408	if (ptep == 0) {
3409		return 0;
3410	}
3411
3412	if ((pte = *ptep) != 0) {
3413		vm_offset_t pa;
3414
3415		val = MINCORE_INCORE;
3416		if ((pte & PG_MANAGED) == 0)
3417			return val;
3418
3419		pa = pte & PG_FRAME;
3420
3421		m = PHYS_TO_VM_PAGE(pa);
3422
3423		/*
3424		 * Modified by us
3425		 */
3426		if (pte & PG_M)
3427			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3428		/*
3429		 * Modified by someone
3430		 */
3431		else if (m->dirty || pmap_is_modified(m))
3432			val |= MINCORE_MODIFIED_OTHER;
3433		/*
3434		 * Referenced by us
3435		 */
3436		if (pte & PG_A)
3437			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3438
3439		/*
3440		 * Referenced by someone
3441		 */
3442		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3443			val |= MINCORE_REFERENCED_OTHER;
3444			vm_page_flag_set(m, PG_REFERENCED);
3445		}
3446	}
3447	return val;
3448}
3449
3450void
3451pmap_activate(struct thread *td)
3452{
3453	struct proc *p = td->td_proc;
3454	pmap_t	pmap;
3455	u_int32_t  cr3;
3456
3457	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3458#if defined(SMP)
3459	pmap->pm_active |= PCPU_GET(cpumask);
3460#else
3461	pmap->pm_active |= 1;
3462#endif
3463#if defined(SWTCH_OPTIM_STATS)
3464	tlb_flush_count++;
3465#endif
3466	cr3 = vtophys(pmap->pm_pdir);
3467	/* XXXKSE this is wrong.
3468	 * pmap_activate is for the current thread on the current cpu
3469	 */
3470	if (p->p_flag & P_KSES) {
3471		/* Make sure all other cr3 entries are updated. */
3472		/* what if they are running?  XXXKSE (maybe abort them) */
3473		FOREACH_THREAD_IN_PROC(p, td) {
3474			td->td_pcb->pcb_cr3 = cr3;
3475		}
3476	} else {
3477		td->td_pcb->pcb_cr3 = cr3;
3478	}
3479	load_cr3(cr3);
3480}
3481
3482vm_offset_t
3483pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3484{
3485
3486	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3487		return addr;
3488	}
3489
3490	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3491	return addr;
3492}
3493
3494
3495#if defined(PMAP_DEBUG)
3496pmap_pid_dump(int pid)
3497{
3498	pmap_t pmap;
3499	struct proc *p;
3500	int npte = 0;
3501	int index;
3502
3503	sx_slock(&allproc_lock);
3504	LIST_FOREACH(p, &allproc, p_list) {
3505		if (p->p_pid != pid)
3506			continue;
3507
3508		if (p->p_vmspace) {
3509			int i,j;
3510			index = 0;
3511			pmap = vmspace_pmap(p->p_vmspace);
3512			for (i = 0; i < NPDEPG; i++) {
3513				pd_entry_t *pde;
3514				pt_entry_t *pte;
3515				vm_offset_t base = i << PDRSHIFT;
3516
3517				pde = &pmap->pm_pdir[i];
3518				if (pde && pmap_pde_v(pde)) {
3519					for (j = 0; j < NPTEPG; j++) {
3520						vm_offset_t va = base + (j << PAGE_SHIFT);
3521						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3522							if (index) {
3523								index = 0;
3524								printf("\n");
3525							}
3526							sx_sunlock(&allproc_lock);
3527							return npte;
3528						}
3529						pte = pmap_pte_quick(pmap, va);
3530						if (pte && pmap_pte_v(pte)) {
3531							pt_entry_t pa;
3532							vm_page_t m;
3533							pa = *pte;
3534							m = PHYS_TO_VM_PAGE(pa);
3535							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3536								va, pa, m->hold_count, m->wire_count, m->flags);
3537							npte++;
3538							index++;
3539							if (index >= 2) {
3540								index = 0;
3541								printf("\n");
3542							} else {
3543								printf(" ");
3544							}
3545						}
3546					}
3547				}
3548			}
3549		}
3550	}
3551	sx_sunlock(&allproc_lock);
3552	return npte;
3553}
3554#endif
3555
3556#if defined(DEBUG)
3557
3558static void	pads(pmap_t pm);
3559void		pmap_pvdump(vm_offset_t pa);
3560
3561/* print address space of pmap */
3562static void
3563pads(pm)
3564	pmap_t pm;
3565{
3566	int i, j;
3567	vm_offset_t va;
3568	pt_entry_t *ptep;
3569
3570	if (pm == kernel_pmap)
3571		return;
3572	for (i = 0; i < NPDEPG; i++)
3573		if (pm->pm_pdir[i])
3574			for (j = 0; j < NPTEPG; j++) {
3575				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3576				if (pm == kernel_pmap && va < KERNBASE)
3577					continue;
3578				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3579					continue;
3580				ptep = pmap_pte_quick(pm, va);
3581				if (pmap_pte_v(ptep))
3582					printf("%x:%x ", va, *ptep);
3583			}
3584
3585}
3586
3587void
3588pmap_pvdump(pa)
3589	vm_offset_t pa;
3590{
3591	pv_entry_t pv;
3592	vm_page_t m;
3593
3594	printf("pa %x", pa);
3595	m = PHYS_TO_VM_PAGE(pa);
3596	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3597		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3598		pads(pv->pv_pmap);
3599	}
3600	printf(" ");
3601}
3602#endif
3603