pmap.c revision 14245
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 *	$Id: pmap.c,v 1.76 1996/02/25 03:02:44 dyson Exp $
43 */
44
45/*
46 * Derived from hp300 version by Mike Hibler, this version by William
47 * Jolitz uses a recursive map [a pde points to the page directory] to
48 * map the page tables using the pagetables themselves. This is done to
49 * reduce the impact on kernel virtual memory for lots of sparse address
50 * space, and to reduce the cost of memory to each process.
51 *
52 *	Derived from: hp300/@(#)pmap.c	7.1 (Berkeley) 12/5/90
53 */
54/*
55 * Major modifications by John S. Dyson primarily to support
56 * pageable page tables, eliminating pmap_attributes,
57 * discontiguous memory pages, and using more efficient string
58 * instructions. Jan 13, 1994.  Further modifications on Mar 2, 1994,
59 * general clean-up and efficiency mods.
60 */
61
62/*
63 *	Manages physical address maps.
64 *
65 *	In addition to hardware address maps, this
66 *	module is called upon to provide software-use-only
67 *	maps which may or may not be stored in the same
68 *	form as hardware maps.  These pseudo-maps are
69 *	used to store intermediate results from copy
70 *	operations to and from address spaces.
71 *
72 *	Since the information managed by this module is
73 *	also stored by the logical address mapping module,
74 *	this module may throw away valid virtual-to-physical
75 *	mappings at almost any time.  However, invalidations
76 *	of virtual-to-physical mappings must be done as
77 *	requested.
78 *
79 *	In order to cope with hardware architectures which
80 *	make virtual-to-physical map invalidates expensive,
81 *	this module may delay invalidate or reduced protection
82 *	operations until such time as they are actually
83 *	necessary.  This module is given full information as
84 *	to which processors are currently using which maps,
85 *	and to when physical maps must be made correct.
86 */
87
88#include <sys/param.h>
89#include <sys/systm.h>
90#include <sys/proc.h>
91#include <sys/malloc.h>
92#include <sys/msgbuf.h>
93#include <sys/queue.h>
94#include <sys/vmmeter.h>
95
96#include <vm/vm.h>
97#include <vm/vm_param.h>
98#include <vm/vm_prot.h>
99#include <vm/lock.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_page.h>
102#include <vm/vm_map.h>
103#include <vm/vm_object.h>
104#include <vm/vm_extern.h>
105
106#include <machine/pcb.h>
107#include <machine/cputypes.h>
108#include <machine/md_var.h>
109
110#include <i386/isa/isa.h>
111
/* Enables the page directory cache code guarded by #ifdef PMAP_KEEP_PDIRS below. */
112#define PMAP_KEEP_PDIRS
113
/* Forward declaration; the definition appears further down in this file. */
114static void	init_pv_entries __P((int));
115
116/*
117 * Get PDEs and PTEs for user/kernel address space
118 */
119#define	pmap_pde(m, v)	(&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023]))
120#define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023])
121
122#define pmap_pte_pa(pte)	(*(int *)(pte) & PG_FRAME)
123
124#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
125#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
126#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
127#define pmap_pte_u(pte)		((*(int *)pte & PG_U) != 0)
128#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
129
130#define pmap_pte_set_w(pte, v)		((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
131#define pmap_pte_set_prot(pte, v)	((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
132
133/*
134 * Given a map and a machine independent protection code,
135 * convert to an i386 page table protection code.
136 */
/* Protection lookup: m is not used; p indexes protection_codes[] directly. */
137#define pte_prot(m, p)	(protection_codes[p])
/* Table read by pte_prot(); presumably filled in by i386_protection_init() -- confirm. */
138static int protection_codes[8];
139
/* Backing storage for the kernel pmap; pmap_bootstrap() points kernel_pmap at it. */
140static struct pmap kernel_pmap_store;
141pmap_t kernel_pmap;
142
143vm_offset_t avail_start;	/* PA of first available physical page */
144vm_offset_t avail_end;		/* PA of last available physical page */
145vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
146vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
147static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
/* Set to phys_avail[0] by pmap_init(). */
148static vm_offset_t vm_first_phys;
149
/* Kernel page table page count: NKPT at bootstrap, updated by pmap_growkernel(). */
150static int nkpt;
151
152extern vm_offset_t clean_sva, clean_eva;
153extern int cpu_class;
154
155/*
156 * All those kernel PT submaps that BSD is so fond of
157 */
/*
 * Each pt_entry_t pointer / virtual address pair below is assigned by the
 * SYSMAP() invocations in pmap_bootstrap().
 */
158pt_entry_t *CMAP1;
159static pt_entry_t *CMAP2, *ptmmap;
/* Allocated in pmap_init(), one struct pv_entry per page of the managed range. */
160static pv_entry_t pv_table;
161caddr_t CADDR1, ptvmmap;
162static caddr_t CADDR2;
163static pt_entry_t *msgbufmap;
164struct msgbuf *msgbufp;
165
/* Forward declarations for pmap routines used before their definitions. */
166static void	free_pv_entry __P((pv_entry_t pv));
167static pt_entry_t *
168		get_pt_entry __P((pmap_t pmap));
169static pv_entry_t
170		get_pv_entry __P((void));
171static void	i386_protection_init __P((void));
172static void	pmap_alloc_pv_entry __P((void));
173static void	pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem));
174static void	pmap_enter_quick __P((pmap_t pmap, vm_offset_t va,
175				      vm_offset_t pa));
176static int	pmap_is_managed __P((vm_offset_t pa));
177static void	pmap_remove_all __P((vm_offset_t pa));
178static void	pmap_remove_entry __P((struct pmap *pmap, pv_entry_t pv,
179				       vm_offset_t va));
180static vm_page_t
181		pmap_pte_vm_page __P((pmap_t pmap, vm_offset_t pt));
182static boolean_t
183		pmap_testbit __P((vm_offset_t pa, int bit));
184static void *	pmap_getpdir __P((void));
/* Not static: pmap_prefault has external linkage. */
185void	pmap_prefault __P((pmap_t pmap, vm_offset_t addra,
186				   vm_map_entry_t entry, vm_object_t object));
187
188/*
189 * The below are finer grained pmap_update routines.  These eliminate
190 * the gratuitious tlb flushes on non-i386 architectures.
191 */
192static __inline void
193pmap_update_1pg( vm_offset_t va) {
194#if defined(I386_CPU)
195	if (cpu_class == CPUCLASS_386)
196		pmap_update();
197	else
198#endif
199		__asm __volatile(".byte 0xf,0x1,0x38": :"a" (va));
200}
201
202static __inline void
203pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) {
204#if defined(I386_CPU)
205	if (cpu_class == CPUCLASS_386) {
206		pmap_update();
207	} else
208#endif
209	{
210		__asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1));
211		__asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2));
212	}
213}
214
215/*
216 *	Routine:	pmap_pte
217 *	Function:
218 *		Extract the page table entry associated
219 *		with the given map/virtual_address pair.
220 * [ what about induced faults -wfj]
221 */
222
223__inline pt_entry_t * __pure
224pmap_pte(pmap, va)
225	register pmap_t pmap;
226	vm_offset_t va;
227{
228
229	if (pmap && *pmap_pde(pmap, va)) {
230		vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
231
232		/* are we current address space or kernel? */
233		if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME)))
234			return ((pt_entry_t *) vtopte(va));
235		/* otherwise, we are alternate address space */
236		else {
237			if (frame != ((int) APTDpde & PG_FRAME)) {
238				APTDpde = pmap->pm_pdir[PTDPTDI];
239				pmap_update();
240			}
241			return ((pt_entry_t *) avtopte(va));
242		}
243	}
244	return (0);
245}
246
247/*
248 *	Routine:	pmap_extract
249 *	Function:
250 *		Extract the physical page address associated
251 *		with the given map/virtual_address pair.
252 */
253
254vm_offset_t
255pmap_extract(pmap, va)
256	register pmap_t pmap;
257	vm_offset_t va;
258{
259	vm_offset_t pa;
260
261	if (pmap && *pmap_pde(pmap, va)) {
262		vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
263
264		/* are we current address space or kernel? */
265		if ((pmap == kernel_pmap)
266		    || (frame == ((int) PTDpde & PG_FRAME))) {
267			pa = *(int *) vtopte(va);
268			/* otherwise, we are alternate address space */
269		} else {
270			if (frame != ((int) APTDpde & PG_FRAME)) {
271				APTDpde = pmap->pm_pdir[PTDPTDI];
272				pmap_update();
273			}
274			pa = *(int *) avtopte(va);
275		}
276		return ((pa & PG_FRAME) | (va & ~PG_FRAME));
277	}
278	return 0;
279
280}
281
282/*
283 * determine if a page is managed (memory vs. device)
284 */
285static __inline int
286pmap_is_managed(pa)
287	vm_offset_t pa;
288{
289	int i;
290
291	if (!pmap_initialized)
292		return 0;
293
294	for (i = 0; phys_avail[i + 1]; i += 2) {
295		if (pa >= phys_avail[i] && pa < phys_avail[i + 1])
296			return 1;
297	}
298	return 0;
299}
300
301/*
302 * find the vm_page_t of a pte (only) given va of pte and pmap
303 */
304static __inline vm_page_t
305pmap_pte_vm_page(pmap, pt)
306	pmap_t pmap;
307	vm_offset_t pt;
308{
309	vm_page_t m;
310
311	pt = trunc_page(pt);
312	pt = (pt - UPT_MIN_ADDRESS) / PAGE_SIZE;
313	pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME;
314	m = PHYS_TO_VM_PAGE(pt);
315	return m;
316}
317
318/*
319 * Wire a page table page
320 */
321__inline vm_page_t
322pmap_use_pt(pmap, va)
323	pmap_t pmap;
324	vm_offset_t va;
325{
326	vm_offset_t pt;
327	vm_page_t m;
328
329	if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized)
330		return NULL;
331
332	pt = (vm_offset_t) vtopte(va);
333	m = pmap_pte_vm_page(pmap, pt);
334	vm_page_hold(m);
335	return m;
336}
337
338/*
339 * Unwire a page table page
340 */
341__inline void
342pmap_unuse_pt(pmap, va, mpte)
343	pmap_t pmap;
344	vm_offset_t va;
345	vm_page_t mpte;
346{
347
348	if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized)
349		return;
350
351	if (mpte == NULL) {
352		vm_offset_t pt;
353		pt = (vm_offset_t) vtopte(va);
354		mpte = pmap_pte_vm_page(pmap, pt);
355	}
356
357	vm_page_unhold(mpte);
358
359	if (pmap != kernel_pmap &&
360	    (mpte->hold_count == 0) &&
361	    (mpte->wire_count == 0) &&
362	    (va < KPT_MIN_ADDRESS)) {
363/*
364 * We don't free page-table-pages anymore because it can have a negative
365 * impact on perf at times.  Now we just deactivate, and it'll get cleaned
366 * up if needed...  Also, if the page ends up getting used, it will fault
367 * back into the process address space and be reactivated.
368 */
369#ifdef PMAP_FREE_OLD_PTES
370		pmap_page_protect(VM_PAGE_TO_PHYS(mpte), VM_PROT_NONE);
371		vm_page_free(mpte);
372#else
373		mpte->dirty = 0;
374		vm_page_deactivate(mpte);
375#endif
376	}
377}
378
379/*
380 *	Bootstrap the system enough to run with virtual memory.
381 *
382 *	On the i386 this is called after mapping has already been enabled
383 *	and just syncs the pmap module with what has already been done.
384 *	[We can't call it easily with mapping off since the kernel is not
385 *	mapped with PA == VA, hence we would have to relocate every address
386 *	from the linked base (virtual) address "KERNBASE" to the actual
387 *	(physical) address starting relative to 0]
388 */
389void
390pmap_bootstrap(firstaddr, loadaddr)
391	vm_offset_t firstaddr;
392	vm_offset_t loadaddr;
393{
394	vm_offset_t va;
395	pt_entry_t *pte;
396
397	avail_start = firstaddr;
398
399	/*
400	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
401	 * large. It should instead be correctly calculated in locore.s and
402	 * not based on 'first' (which is a physical address, not a virtual
403	 * address, for the start of unused physical memory). The kernel
404	 * page tables are NOT double mapped and thus should not be included
405	 * in this calculation.
406	 */
407	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
408	virtual_end = VM_MAX_KERNEL_ADDRESS;
409
410	/*
411	 * Initialize protection array.
412	 */
413	i386_protection_init();
414
415	/*
416	 * The kernel's pmap is statically allocated so we don't have to use
417	 * pmap_create, which is unlikely to work correctly at this part of
418	 * the boot sequence (XXX and which no longer exists).
419	 */
420	kernel_pmap = &kernel_pmap_store;
421
422	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD);
423
424	kernel_pmap->pm_count = 1;
425	nkpt = NKPT;
426
427	/*
428	 * Reserve some special page table entries/VA space for temporary
429	 * mapping of pages.
430	 */
431#define	SYSMAP(c, p, v, n)	\
432	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
433
434	va = virtual_avail;
435	pte = pmap_pte(kernel_pmap, va);
436
437	/*
438	 * CMAP1/CMAP2 are used for zeroing and copying pages.
439	 */
440	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
441	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
442
443	/*
444	 * ptmmap is used for reading arbitrary physical pages via /dev/mem.
445	 */
446	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
447
448	/*
449	 * msgbufmap is used to map the system message buffer.
450	 */
451	SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1)
452
453	virtual_avail = va;
454
455	*(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0;
456	pmap_update();
457}
458
459/*
460 *	Initialize the pmap module.
461 *	Called by vm_init, to initialize any structures that the pmap
462 *	system needs to map virtual memory.
463 *	pmap_init has been enhanced to support in a fairly consistant
464 *	way, discontiguous physical memory.
465 */
466void
467pmap_init(phys_start, phys_end)
468	vm_offset_t phys_start, phys_end;
469{
470	vm_offset_t addr;
471	vm_size_t npg, s;
472	int i;
473
474	/*
475	 * calculate the number of pv_entries needed
476	 */
477	vm_first_phys = phys_avail[0];
478	for (i = 0; phys_avail[i + 1]; i += 2);
479	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE;
480
481	/*
482	 * Allocate memory for random pmap data structures.  Includes the
483	 * pv_head_table.
484	 */
485	s = (vm_size_t) (sizeof(struct pv_entry) * npg);
486	s = round_page(s);
487	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
488	pv_table = (pv_entry_t) addr;
489
490	/*
491	 * init the pv free list
492	 */
493	init_pv_entries(npg);
494	/*
495	 * Now it is safe to enable pv_table recording.
496	 */
497	pmap_initialized = TRUE;
498}
499
500/*
501 *	Used to map a range of physical addresses into kernel
502 *	virtual address space.
503 *
504 *	For now, VM is already on, we only need to map the
505 *	specified memory.
506 */
507vm_offset_t
508pmap_map(virt, start, end, prot)
509	vm_offset_t virt;
510	vm_offset_t start;
511	vm_offset_t end;
512	int prot;
513{
514	while (start < end) {
515		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
516		virt += PAGE_SIZE;
517		start += PAGE_SIZE;
518	}
519	return (virt);
520}
521
#ifdef PMAP_KEEP_PDIRS
/*
 * Small cache of released page directory pages.  Cached pages are chained
 * through their first word, with pdirlist as the head and nfreepdir as
 * the number of pages on the chain.
 */
int nfreepdir;
caddr_t *pdirlist;
#define NFREEPDIR 3

/*
 * Return a page for use as a page directory: a zeroed page from the cache
 * if there is one, otherwise a fresh page from kmem_alloc().
 */
static void *
pmap_getpdir() {
	caddr_t *pdir;

	if (pdirlist == NULL)
		return (void *) kmem_alloc(kernel_map, PAGE_SIZE);

	/* unlink the head of the cache and wipe it */
	pdir = pdirlist;
	pdirlist = (caddr_t *) *pdir;
	--nfreepdir;
	bzero( (caddr_t) pdir, PAGE_SIZE);

	return (void *) pdir;
}

/*
 * Give back a page directory page.  It is pushed onto the cache unless
 * nfreepdir already exceeds NFREEPDIR, in which case it is freed.
 */
static void
pmap_freepdir(void *pdir) {
	caddr_t *entry = (caddr_t *) pdir;

	if (nfreepdir > NFREEPDIR) {
		kmem_free(kernel_map, (vm_offset_t) pdir, PAGE_SIZE);
		return;
	}

	*entry = (caddr_t) pdirlist;
	pdirlist = entry;
	++nfreepdir;
}
#endif
553
554/*
555 * Initialize a preallocated and zeroed pmap structure,
556 * such as one in a vmspace structure.
557 */
558void
559pmap_pinit(pmap)
560	register struct pmap *pmap;
561{
562	/*
563	 * No need to allocate page table space yet but we do need a valid
564	 * page directory table.
565	 */
566
567#ifdef PMAP_KEEP_PDIRS
568	pmap->pm_pdir = pmap_getpdir();
569#else
570	pmap->pm_pdir = (pd_entry_t *) kmem_alloc(kernel_map, PAGE_SIZE);
571#endif
572
573	/* wire in kernel global address entries */
574	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
575
576	/* install self-referential address mapping entry */
577	*(int *) (pmap->pm_pdir + PTDPTDI) =
578	    ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW;
579
580	pmap->pm_count = 1;
581}
582
583/*
584 * grow the number of kernel page table entries, if needed
585 */
586
587static vm_page_t nkpg;
588vm_offset_t kernel_vm_end;
589
590void
591pmap_growkernel(vm_offset_t addr)
592{
593	struct proc *p;
594	struct pmap *pmap;
595	int s;
596
597	s = splhigh();
598	if (kernel_vm_end == 0) {
599		kernel_vm_end = KERNBASE;
600		nkpt = 0;
601		while (pdir_pde(PTD, kernel_vm_end)) {
602			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
603			++nkpt;
604		}
605	}
606	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
607	while (kernel_vm_end < addr) {
608		if (pdir_pde(PTD, kernel_vm_end)) {
609			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
610			continue;
611		}
612		++nkpt;
613		if (!nkpg) {
614			nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM);
615			if (!nkpg)
616				panic("pmap_growkernel: no memory to grow kernel");
617			vm_page_wire(nkpg);
618			vm_page_remove(nkpg);
619			pmap_zero_page(VM_PAGE_TO_PHYS(nkpg));
620		}
621		pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW);
622		nkpg = NULL;
623
624		for (p = (struct proc *) allproc; p != NULL; p = p->p_next) {
625			if (p->p_vmspace) {
626				pmap = &p->p_vmspace->vm_pmap;
627				*pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end);
628			}
629		}
630		*pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end);
631		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
632	}
633	splx(s);
634}
635
636/*
637 *	Retire the given physical map from service.
638 *	Should only be called if the map contains
639 *	no valid mappings.
640 */
641void
642pmap_destroy(pmap)
643	register pmap_t pmap;
644{
645	int count;
646
647	if (pmap == NULL)
648		return;
649
650	count = --pmap->pm_count;
651	if (count == 0) {
652		pmap_release(pmap);
653		free((caddr_t) pmap, M_VMPMAP);
654	}
655}
656
657/*
658 * Release any resources held by the given physical map.
659 * Called when a pmap initialized by pmap_pinit is being released.
660 * Should only be called if the map contains no valid mappings.
661 */
662void
663pmap_release(pmap)
664	register struct pmap *pmap;
665{
666#ifdef PMAP_KEEP_PDIRS
667	pmap_freepdir( (void *)pmap->pm_pdir);
668#else
669	kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE);
670#endif
671}
672
673/*
674 *	Add a reference to the specified pmap.
675 */
676void
677pmap_reference(pmap)
678	pmap_t pmap;
679{
680	if (pmap != NULL) {
681		pmap->pm_count++;
682	}
683}
684
/*
 * Low-water mark for the pv entry free list: half the number of pv
 * entries that fit in one page.  get_pv_entry() asks for more entries
 * when the free count drops below this.
 */
685#define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2)
686
687/*
688 * Data for the pv entry allocation mechanism
689 */
/* Free entry count; temporarily raised inside pmap_alloc_pv_entry(). */
690static int pv_freelistcnt;
/* Head of the free list, linked through pv_next. */
691static pv_entry_t pv_freelist;
/* Next unused virtual address in the region reserved by init_pv_entries(). */
692static vm_offset_t pvva;
/* Pages of that reserved region not yet handed out. */
693static int npvvapg;
694
695/*
696 * free the pv_entry back to the free list
697 */
698static __inline void
699free_pv_entry(pv)
700	pv_entry_t pv;
701{
702	if (!pv)
703		return;
704	++pv_freelistcnt;
705	pv->pv_next = pv_freelist;
706	pv_freelist = pv;
707}
708
709/*
710 * get a new pv_entry, allocating a block from the system
711 * when needed.
712 * the memory allocation is performed bypassing the malloc code
713 * because of the possibility of allocations at interrupt time.
714 */
715static __inline pv_entry_t
716get_pv_entry()
717{
718	pv_entry_t tmp;
719
720	/*
721	 * get more pv_entry pages if needed
722	 */
723	if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) {
724		pmap_alloc_pv_entry();
725	}
726	/*
727	 * get a pv_entry off of the free list
728	 */
729	--pv_freelistcnt;
730	tmp = pv_freelist;
731	pv_freelist = tmp->pv_next;
732	return tmp;
733}
734
735/*
736 * this *strange* allocation routine *statistically* eliminates the
737 * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure.
738 * also -- this code is MUCH MUCH faster than the malloc equiv...
739 */
740static void
741pmap_alloc_pv_entry()
742{
743	/*
744	 * do we have any pre-allocated map-pages left?
745	 */
746	if (npvvapg) {
747		vm_page_t m;
748
749		/*
750		 * we do this to keep recursion away
751		 */
752		pv_freelistcnt += PV_FREELIST_MIN;
753		/*
754		 * allocate a physical page out of the vm system
755		 */
756		m = vm_page_alloc(kernel_object,
757		    OFF_TO_IDX(pvva - vm_map_min(kernel_map)),
758		    VM_ALLOC_INTERRUPT);
759		if (m) {
760			int newentries;
761			int i;
762			pv_entry_t entry;
763
764			newentries = (PAGE_SIZE / sizeof(struct pv_entry));
765			/*
766			 * wire the page
767			 */
768			vm_page_wire(m);
769			m->flags &= ~PG_BUSY;
770			/*
771			 * let the kernel see it
772			 */
773			pmap_kenter(pvva, VM_PAGE_TO_PHYS(m));
774
775			entry = (pv_entry_t) pvva;
776			/*
777			 * update the allocation pointers
778			 */
779			pvva += PAGE_SIZE;
780			--npvvapg;
781
782			/*
783			 * free the entries into the free list
784			 */
785			for (i = 0; i < newentries; i++) {
786				free_pv_entry(entry);
787				entry++;
788			}
789		}
790		pv_freelistcnt -= PV_FREELIST_MIN;
791	}
792	if (!pv_freelist)
793		panic("get_pv_entry: cannot get a pv_entry_t");
794}
795
796
797
798/*
799 * init the pv_entry allocation system
800 */
801#define PVSPERPAGE 64
802void
803init_pv_entries(npg)
804	int npg;
805{
806	/*
807	 * allocate enough kvm space for PVSPERPAGE entries per page (lots)
808	 * kvm space is fairly cheap, be generous!!!  (the system can panic if
809	 * this is too small.)
810	 */
811	npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry)
812		+ PAGE_SIZE - 1) / PAGE_SIZE;
813	pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE);
814	/*
815	 * get the first batch of entries
816	 */
817	free_pv_entry(get_pv_entry());
818}
819
820static pt_entry_t *
821get_pt_entry(pmap)
822	pmap_t pmap;
823{
824	vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
825
826	/* are we current address space or kernel? */
827	if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) {
828		return PTmap;
829	}
830	/* otherwise, we are alternate address space */
831	if (frame != ((int) APTDpde & PG_FRAME)) {
832		APTDpde = pmap->pm_pdir[PTDPTDI];
833		pmap_update();
834	}
835	return APTmap;
836}
837
838/*
839 * If it is the first entry on the list, it is actually
840 * in the header and we must copy the following entry up
841 * to the header.  Otherwise we must search the list for
842 * the entry.  In either case we free the now unused entry.
843 */
844static void
845pmap_remove_entry(pmap, pv, va)
846	struct pmap *pmap;
847	pv_entry_t pv;
848	vm_offset_t va;
849{
850	pv_entry_t npv;
851	int s;
852
853	s = splhigh();
854	if (pmap == pv->pv_pmap && va == pv->pv_va) {
855		pmap_unuse_pt(pmap, va, pv->pv_ptem);
856		npv = pv->pv_next;
857		if (npv) {
858			*pv = *npv;
859			free_pv_entry(npv);
860		} else {
861			pv->pv_pmap = NULL;
862		}
863	} else {
864		for (npv = pv->pv_next; npv; (pv = npv, npv = pv->pv_next)) {
865			if (pmap == npv->pv_pmap && va == npv->pv_va) {
866				pmap_unuse_pt(pmap, va, npv->pv_ptem);
867				pv->pv_next = npv->pv_next;
868				free_pv_entry(npv);
869				break;
870			}
871		}
872	}
873	splx(s);
874}
875
876/*
877 *	Remove the given range of addresses from the specified map.
878 *
879 *	It is assumed that the start and end are properly
880 *	rounded to the page size.
 *
 *	A NULL pmap is ignored.  A one-page range takes a short path that
 *	invalidates just that page's TLB entry; longer ranges are walked
 *	one page-directory span at a time and finish with a single
 *	pmap_update() if any page table entry was cleared.
881 */
882void
883pmap_remove(pmap, sva, eva)
884	struct pmap *pmap;
885	register vm_offset_t sva;
886	register vm_offset_t eva;
887{
888	register pt_entry_t *ptp, *ptq;
889	vm_offset_t pa;
890	register pv_entry_t pv;
891	vm_offset_t va;
892	pt_entry_t oldpte;
893	vm_offset_t pdnxt;
894	vm_offset_t ptepaddr;
895	vm_page_t mpte;
896	int update_needed;
897
898	if (pmap == NULL)
899		return;
900
	/* base of the page table array for this pmap, indexed by page number */
901	ptp = get_pt_entry(pmap);
902
903	/*
904	 * special handling of removing one page.  a very
905	 * common operation and easy to short circuit some
906	 * code.
907	 */
908	if ((sva + PAGE_SIZE) == eva) {
909
		/* no page table for this address: nothing is mapped */
910		if (*pmap_pde(pmap, sva) == 0)
911			return;
912
913		ptq = ptp + i386_btop(sva);
914
915		if (!*ptq)
916			return;
917
918		oldpte = *ptq;
919		if (((int)oldpte) & PG_W)
920			pmap->pm_stats.wired_count--;
921		pmap->pm_stats.resident_count--;
922
923		*ptq = 0;
924
925		pa = ((int)oldpte) & PG_FRAME;
926		if (pmap_is_managed(pa)) {
			/*
			 * Pass the hardware modified bit on to the vm_page,
			 * but only for addresses below the end of the user
			 * stack/u-area or kernel addresses outside
			 * [clean_sva, clean_eva).
			 */
927			if ((int) oldpte & PG_M) {
928				if (sva < USRSTACK + (UPAGES * PAGE_SIZE) ||
929				    (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) {
930					PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
931				}
932			}
			/* pmap_remove_entry() also drops the page table page hold */
933			pv = pa_to_pvh(pa);
934			pmap_remove_entry(pmap, pv, sva);
935		} else {
			/* unmanaged page: no pv entry, just drop the hold */
936			pmap_unuse_pt(pmap, sva, NULL);
937		}
938		pmap_update_1pg(sva);
939		return;
940	}
941
	/*
	 * General case.  From here on sva, eva and pdnxt are page numbers,
	 * not byte addresses; pdnxt is the first page number of the next
	 * page-directory span and ptepaddr is the current directory entry.
	 */
942	update_needed = 0;
943	sva = i386_btop(sva);
944	pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1));
945	ptepaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sva));
946	eva = i386_btop(eva);
947	mpte = NULL;
948
949	while (sva < eva) {
		/* crossed into a new page-directory span: reload its entry */
950		if (sva >= pdnxt) {
951			pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1));
952			ptepaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sva));
953			mpte = NULL;
954		}
955		/*
956		 * Weed out invalid mappings. Note: we assume that the page
957		 * directory table is always allocated, and in kernel virtual.
958		 */
959		if (ptepaddr == 0) {
960			sva = pdnxt;
961			continue;
962		}
963
		/*
		 * A page table page with neither holds nor wirings is
		 * skipped as a whole.
		 */
964		if (mpte == NULL)
965			mpte = PHYS_TO_VM_PAGE(i386_trunc_page(ptepaddr));
966		if ((mpte->hold_count == 0) && (mpte->wire_count == 0)) {
967			sva = pdnxt;
968			continue;
969		}
970
		/* do not scan past the end of the requested range */
971		if (pdnxt > eva)
972			pdnxt = eva;
973		/*
974		 * search for page table entries
975		 */
976		while ((sva < pdnxt) && (*(ptp + sva) == 0))
977			++sva;
978		if (sva == pdnxt) {
979			continue;
980		}
981
982		ptq = ptp + sva;
983		/*
984		 * Invalidate the PTEs. XXX: should cluster them up and
985		 * invalidate as many as possible at once.
986		 * Update statistics
987		 */
988		oldpte = *ptq;
989		*ptq = 0;
990		if (((int) oldpte) & PG_W)
991			pmap->pm_stats.wired_count--;
992		pmap->pm_stats.resident_count--;
993
994		va = i386_ptob(sva);
995
996		++update_needed;
997		pa = ((int) oldpte) & PG_FRAME;
		/* unmanaged page: no pv entry, just drop the hold */
998		if (!pmap_is_managed(pa)) {
999			pmap_unuse_pt(pmap, (vm_offset_t) va, NULL);
1000			++sva;
1001			continue;
1002		}
		/* same modified-bit filter as in the single-page path */
1003		if ((int) oldpte & PG_M) {
1004			if (va < USRSTACK + (UPAGES * PAGE_SIZE) ||
1005			    (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) {
1006				PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
1007			}
1008		}
1009		pv = pa_to_pvh(pa);
1010		pmap_remove_entry(pmap, pv, va);
1011		++sva;
1012	}
	/* one flush at the end covers every entry cleared above */
1013	if (update_needed)
1014		pmap_update();
1015}
1016
1017/*
1018 *	Routine:	pmap_remove_all
1019 *	Function:
1020 *		Removes this physical page from
1021 *		all physical maps in which it resides.
1022 *		Reflects back modify bits to the pager.
1023 *
1024 *	Notes:
1025 *		Original versions of this routine were very
1026 *		inefficient because they iteratively called
1027 *		pmap_remove (slow...)
1028 */
1029static void
1030pmap_remove_all(pa)
1031	vm_offset_t pa;
1032{
1033	register pv_entry_t pv, opv, npv;
1034	register pt_entry_t *pte, *ptp;
1035	vm_offset_t va;
1036	struct pmap *pmap;
1037	vm_page_t m;
1038	int s;
1039	int anyvalid = 0;
1040
1041	/*
1042	 * Not one of ours
1043	 */
1044	/*
1045	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1046	 * pages!
1047	 */
1048	if (!pmap_is_managed(pa))
1049		return;
1050
1051	pa = trunc_page(pa);
1052	opv = pa_to_pvh(pa);
1053	if (opv->pv_pmap == NULL)
1054		return;
1055
1056	m = PHYS_TO_VM_PAGE(pa);
1057	s = splhigh();
1058	pv = opv;
1059	while (pv && ((pmap = pv->pv_pmap) != NULL)) {
1060		int tpte;
1061		ptp = get_pt_entry(pmap);
1062		va = pv->pv_va;
1063		pte = ptp + i386_btop(va);
1064		if (tpte = ((int) *pte)) {
1065			*pte = 0;
1066			if (tpte & PG_W)
1067				pmap->pm_stats.wired_count--;
1068			pmap->pm_stats.resident_count--;
1069			if (curproc != pageproc)
1070				anyvalid++;
1071
1072			/*
1073			 * Update the vm_page_t clean and reference bits.
1074			 */
1075			if ((tpte & PG_M) != 0) {
1076				if (va < USRSTACK + (UPAGES * PAGE_SIZE) ||
1077				    (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) {
1078					m->dirty = VM_PAGE_BITS_ALL;
1079				}
1080			}
1081		}
1082		pv = pv->pv_next;
1083	}
1084
1085	for (pv = opv->pv_next; pv; pv = npv) {
1086		npv = pv->pv_next;
1087		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1088		free_pv_entry(pv);
1089	}
1090
1091	opv->pv_pmap = NULL;
1092	opv->pv_next = NULL;
1093
1094	splx(s);
1095	if (anyvalid)
1096		pmap_update();
1097}
1098
1099
1100/*
1101 *	Set the physical protection on the
1102 *	specified range of this map as requested.
1103 */
1104void
1105pmap_protect(pmap, sva, eva, prot)
1106	register pmap_t pmap;
1107	vm_offset_t sva, eva;
1108	vm_prot_t prot;
1109{
1110	register pt_entry_t *pte;
1111	register vm_offset_t va;
1112	int i386prot;
1113	register pt_entry_t *ptp;
1114	int anychanged = 0;
1115
1116	if (pmap == NULL)
1117		return;
1118
1119	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1120		pmap_remove(pmap, sva, eva);
1121		return;
1122	}
1123	if (prot & VM_PROT_WRITE)
1124		return;
1125
1126	ptp = get_pt_entry(pmap);
1127
1128	sva = i386_btop(sva);
1129	eva = i386_btop(eva);
1130
1131	while (sva < eva) {
1132		vm_offset_t pdnxt;
1133		vm_offset_t ptepaddr;
1134		vm_page_t mpte;
1135		int pprot;
1136		/*
1137		 * Weed out invalid mappings. Note: we assume that the page
1138		 * directory table is always allocated, and in kernel virtual.
1139		 */
1140
1141		pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1));
1142		ptepaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sva));
1143		if (ptepaddr == 0) {
1144			sva = pdnxt;
1145			continue;
1146		}
1147
1148		mpte = PHYS_TO_VM_PAGE(i386_trunc_page(ptepaddr));
1149		if ((mpte->hold_count == 0) && (mpte->wire_count == 0)) {
1150			sva = pdnxt;
1151			continue;
1152		}
1153
1154		if (pdnxt > eva)
1155			pdnxt = eva;
1156		/*
1157		 * search for page table entries
1158		 */
1159		while ((sva < pdnxt) && (*(ptp + sva) == 0))
1160			++sva;
1161
1162		if (sva == pdnxt)
1163			continue;
1164
1165		pte = ptp + sva;
1166
1167		va = i386_ptob(sva);
1168		i386prot = pte_prot(pmap, prot);
1169		if (va < UPT_MAX_ADDRESS) {
1170			i386prot |= PG_u;
1171			if (va >= UPT_MIN_ADDRESS)
1172				i386prot |= PG_RW;
1173		}
1174		pprot = *(int *)pte & PG_PROT;
1175		if (pprot != i386prot) {
1176			pmap_pte_set_prot(pte, i386prot);
1177			anychanged++;
1178		}
1179		++sva;
1180	}
1181	if (anychanged)
1182		pmap_update();
1183}
1184
1185/*
1186 *	Insert the given physical page (p) at
1187 *	the specified virtual address (v) in the
1188 *	target physical map with the protection requested.
1189 *
1190 *	If specified, the page will be wired down, meaning
1191 *	that the related pte can not be reclaimed.
1192 *
1193 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1194 *	or lose information.  That is, this routine must actually
1195 *	insert this page into the given map NOW.
1196 */
1197void
1198pmap_enter(pmap, va, pa, prot, wired)
1199	register pmap_t pmap;
1200	vm_offset_t va;
1201	register vm_offset_t pa;
1202	vm_prot_t prot;
1203	boolean_t wired;
1204{
1205	register pt_entry_t *pte;
1206	register pt_entry_t npte;
1207	vm_offset_t opa;
1208	register pv_entry_t pv, npv;
1209	int ptevalid = 0;
1210
1211	if (pmap == NULL)
1212		return;
1213
1214	pv = NULL;
1215
1216	va = trunc_page(va);
1217	pa = trunc_page(pa);
1218	if (va > VM_MAX_KERNEL_ADDRESS)
1219		panic("pmap_enter: toobig");
1220
1221	/*
1222	 * Page Directory table entry not valid, we need a new PT page
1223	 */
1224	pte = pmap_pte(pmap, va);
1225	if (pte == NULL) {
1226		printf("kernel page directory invalid pdir=%p, va=0x%lx\n",
1227			pmap->pm_pdir[PTDPTDI], va);
1228		panic("invalid kernel page directory");
1229	}
1230	opa = pmap_pte_pa(pte);
1231
1232	/*
1233	 * Mapping has not changed, must be protection or wiring change.
1234	 */
1235	if (opa == pa) {
1236		/*
1237		 * Wiring change, just update stats. We don't worry about
1238		 * wiring PT pages as they remain resident as long as there
1239		 * are valid mappings in them. Hence, if a user page is wired,
1240		 * the PT page will be also.
1241		 */
1242		if (wired && !pmap_pte_w(pte))
1243			pmap->pm_stats.wired_count++;
1244		else if (!wired && pmap_pte_w(pte))
1245			pmap->pm_stats.wired_count--;
1246
1247		goto validate;
1248	}
1249	/*
1250	 * Mapping has changed, invalidate old range and fall through to
1251	 * handle validating new mapping.
1252	 */
1253	if (opa) {
1254		pmap_remove(pmap, va, va + PAGE_SIZE);
1255	}
1256	/*
1257	 * Enter on the PV list if part of our managed memory Note that we
1258	 * raise IPL while manipulating pv_table since pmap_enter can be
1259	 * called at interrupt time.
1260	 */
1261	if (pmap_is_managed(pa)) {
1262		int s;
1263
1264		pv = pa_to_pvh(pa);
1265		s = splhigh();
1266		/*
1267		 * No entries yet, use header as the first entry
1268		 */
1269		if (pv->pv_pmap == NULL) {
1270			pv->pv_va = va;
1271			pv->pv_pmap = pmap;
1272			pv->pv_next = NULL;
1273			pv->pv_ptem = NULL;
1274		}
1275		/*
1276		 * There is at least one other VA mapping this page. Place
1277		 * this entry after the header.
1278		 */
1279		else {
1280			npv = get_pv_entry();
1281			npv->pv_va = va;
1282			npv->pv_pmap = pmap;
1283			npv->pv_next = pv->pv_next;
1284			pv->pv_next = npv;
1285			pv = npv;
1286			pv->pv_ptem = NULL;
1287		}
1288		splx(s);
1289	}
1290
1291	/*
1292	 * Increment counters
1293	 */
1294	pmap->pm_stats.resident_count++;
1295	if (wired)
1296		pmap->pm_stats.wired_count++;
1297
1298validate:
1299	/*
1300	 * Now validate mapping with desired protection/wiring.
1301	 */
1302	npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V));
1303
1304	/*
1305	 * When forking (copy-on-write, etc): A process will turn off write
1306	 * permissions for any of its writable pages.  If the data (object) is
1307	 * only referred to by one process, the processes map is modified
1308	 * directly as opposed to using the object manipulation routine.  When
1309	 * using pmap_protect, the modified bits are not kept in the vm_page_t
1310	 * data structure.  Therefore, when using pmap_enter in vm_fault to
1311	 * bring back writability of a page, there has been no memory of the
1312	 * modified or referenced bits except at the pte level.  this clause
1313	 * supports the carryover of the modified and used (referenced) bits.
1314	 */
1315	if (pa == opa)
1316		(int) npte |= (int) *pte & (PG_M | PG_U);
1317
1318	if (wired)
1319		(int) npte |= PG_W;
1320	if (va < UPT_MIN_ADDRESS)
1321		(int) npte |= PG_u;
1322	else if (va < UPT_MAX_ADDRESS)
1323		(int) npte |= PG_u | PG_RW;
1324
1325	if (*pte != npte) {
1326		if (*pte)
1327			ptevalid++;
1328		*pte = npte;
1329	}
1330	if (ptevalid) {
1331		pmap_update_1pg(va);
1332	} else {
1333		if (pv) {
1334			pv->pv_ptem = pmap_use_pt(pmap, va);
1335		}
1336	}
1337}
1338
1339/*
1340 * Add a list of wired pages to the kva
1341 * this routine is only used for temporary
1342 * kernel mappings that do not need to have
1343 * page modification or references recorded.
1344 * Note that old mappings are simply written
1345 * over.  The page *must* be wired.
1346 */
1347void
1348pmap_qenter(va, m, count)
1349	vm_offset_t va;
1350	vm_page_t *m;
1351	int count;
1352{
1353	int i;
1354	int anyvalid = 0;
1355	register pt_entry_t *pte;
1356
1357	for (i = 0; i < count; i++) {
1358		vm_offset_t tva = va + i * PAGE_SIZE;
1359		pt_entry_t npte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V));
1360		pte = vtopte(tva);
1361		if (*pte && (*pte != npte))
1362			pmap_update_1pg(tva);
1363		*pte = npte;
1364	}
1365}
1366/*
1367 * this routine jerks page mappings from the
1368 * kernel -- it is meant only for temporary mappings.
1369 */
1370void
1371pmap_qremove(va, count)
1372	vm_offset_t va;
1373	int count;
1374{
1375	int i;
1376	register pt_entry_t *pte;
1377
1378	for (i = 0; i < count; i++) {
1379		vm_offset_t tva = va + i * PAGE_SIZE;
1380		pte = vtopte(tva);
1381		*pte = 0;
1382		pmap_update_1pg(tva);
1383	}
1384}
1385
1386/*
1387 * add a wired page to the kva
1388 * note that in order for the mapping to take effect -- you
1389 * should do a pmap_update after doing the pmap_kenter...
1390 */
1391void
1392pmap_kenter(va, pa)
1393	vm_offset_t va;
1394	register vm_offset_t pa;
1395{
1396	register pt_entry_t *pte;
1397	int wasvalid = 0;
1398
1399	pte = vtopte(va);
1400
1401	if (*pte)
1402		wasvalid++;
1403
1404	*pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V));
1405
1406	if (wasvalid)
1407		pmap_update_1pg(va);
1408}
1409
1410/*
1411 * remove a page from the kernel pagetables
1412 */
1413void
1414pmap_kremove(va)
1415	vm_offset_t va;
1416{
1417	register pt_entry_t *pte;
1418
1419	pte = vtopte(va);
1420
1421	*pte = (pt_entry_t) 0;
1422	pmap_update_1pg(va);
1423}
1424
1425/*
1426 * this code makes some *MAJOR* assumptions:
1427 * 1. Current pmap & pmap exists.
1428 * 2. Not wired.
1429 * 3. Read access.
1430 * 4. No page table pages.
1431 * 5. Tlbflush is deferred to calling procedure.
1432 * 6. Page IS managed.
1433 * but is *MUCH* faster than pmap_enter...
1434 */
1435
1436static void
1437pmap_enter_quick(pmap, va, pa)
1438	register pmap_t pmap;
1439	vm_offset_t va;
1440	register vm_offset_t pa;
1441{
1442	register pt_entry_t *pte;
1443	register pv_entry_t pv, npv;
1444	int s;
1445
1446	/*
1447	 * Enter on the PV list if part of our managed memory Note that we
1448	 * raise IPL while manipulating pv_table since pmap_enter can be
1449	 * called at interrupt time.
1450	 */
1451
1452	pte = vtopte(va);
1453#if 1
1454	/* a fault on the page table might occur here */
1455	if (*pte) {
1456		pmap_remove(pmap, va, va + PAGE_SIZE);
1457	}
1458#endif
1459
1460	pv = pa_to_pvh(pa);
1461	s = splhigh();
1462	/*
1463	 * No entries yet, use header as the first entry
1464	 */
1465	if (pv->pv_pmap == NULL) {
1466		pv->pv_pmap = pmap;
1467		pv->pv_va = va;
1468		pv->pv_next = NULL;
1469	}
1470	/*
1471	 * There is at least one other VA mapping this page. Place this entry
1472	 * after the header.
1473	 */
1474	else {
1475		npv = get_pv_entry();
1476		npv->pv_va = va;
1477		npv->pv_pmap = pmap;
1478		npv->pv_next = pv->pv_next;
1479		pv->pv_next = npv;
1480		pv = npv;
1481	}
1482	splx(s);
1483	pv->pv_ptem = pmap_use_pt(pmap, va);
1484
1485	/*
1486	 * Increment counters
1487	 */
1488	pmap->pm_stats.resident_count++;
1489
1490	/*
1491	 * Now validate mapping with desired protection/wiring.
1492	 */
1493	*pte = (pt_entry_t) ((int) (pa | PG_V | PG_u));
1494
1495	return;
1496}
1497
1498#define MAX_INIT_PT (512)
1499/*
1500 * pmap_object_init_pt preloads the ptes for a given object
1501 * into the specified pmap.  This eliminates the blast of soft
1502 * faults on process startup and immediately after an mmap.
1503 */
1504void
1505pmap_object_init_pt(pmap, addr, object, pindex, size)
1506	pmap_t pmap;
1507	vm_offset_t addr;
1508	vm_object_t object;
1509	vm_pindex_t pindex;
1510	vm_size_t size;
1511{
1512	vm_offset_t tmpidx;
1513	int psize;
1514	vm_page_t p;
1515	int objpgs;
1516
1517	psize = (size >> PAGE_SHIFT);
1518
1519	if (!pmap || ((psize > MAX_INIT_PT) &&
1520		(object->resident_page_count > MAX_INIT_PT))) {
1521		return;
1522	}
1523
1524	/*
1525	 * remove any already used mappings
1526	 */
1527	pmap_remove( pmap, trunc_page(addr), round_page(addr + size));
1528
1529	/*
1530	 * if we are processing a major portion of the object, then scan the
1531	 * entire thing.
1532	 */
1533	if (psize > (object->size >> 2)) {
1534		objpgs = psize;
1535
1536		for (p = object->memq.tqh_first;
1537		    ((objpgs > 0) && (p != NULL));
1538		    p = p->listq.tqe_next) {
1539
1540			tmpidx = p->pindex;
1541			if (tmpidx < pindex) {
1542				continue;
1543			}
1544			tmpidx -= pindex;
1545			if (tmpidx >= psize) {
1546				continue;
1547			}
1548			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
1549			    (p->busy == 0) &&
1550			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
1551				if (p->queue == PQ_CACHE)
1552					vm_page_deactivate(p);
1553				vm_page_hold(p);
1554				p->flags |= PG_MAPPED;
1555				pmap_enter_quick(pmap,
1556					addr + (tmpidx << PAGE_SHIFT),
1557					VM_PAGE_TO_PHYS(p));
1558				vm_page_unhold(p);
1559			}
1560			objpgs -= 1;
1561		}
1562	} else {
1563		/*
1564		 * else lookup the pages one-by-one.
1565		 */
1566		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
1567			p = vm_page_lookup(object, tmpidx + pindex);
1568			if (p && (p->busy == 0) &&
1569			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
1570			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
1571				if (p->queue == PQ_CACHE)
1572					vm_page_deactivate(p);
1573				vm_page_hold(p);
1574				p->flags |= PG_MAPPED;
1575				pmap_enter_quick(pmap,
1576					addr + (tmpidx << PAGE_SHIFT),
1577					VM_PAGE_TO_PHYS(p));
1578				vm_page_unhold(p);
1579			}
1580		}
1581	}
1582}
1583
1584/*
1585 * pmap_prefault provides a quick way of clustering
1586 * pagefaults into a processes address space.  It is a "cousin"
1587 * of pmap_object_init_pt, except it runs at page fault time instead
1588 * of mmap time.
1589 */
1590#define PFBAK 2
1591#define PFFOR 2
1592#define PAGEORDER_SIZE (PFBAK+PFFOR)
1593
1594static int pmap_prefault_pageorder[] = {
1595	-NBPG, NBPG, -2 * NBPG, 2 * NBPG
1596};
1597
1598void
1599pmap_prefault(pmap, addra, entry, object)
1600	pmap_t pmap;
1601	vm_offset_t addra;
1602	vm_map_entry_t entry;
1603	vm_object_t object;
1604{
1605	int i;
1606	vm_offset_t starta;
1607	vm_offset_t addr;
1608	vm_pindex_t pindex;
1609	vm_page_t m;
1610	int pageorder_index;
1611
1612	if (entry->object.vm_object != object)
1613		return;
1614
1615	if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap))
1616		return;
1617
1618	starta = addra - PFBAK * PAGE_SIZE;
1619	if (starta < entry->start) {
1620		starta = entry->start;
1621	} else if (starta > addra) {
1622		starta = 0;
1623	}
1624
1625	for (i = 0; i < PAGEORDER_SIZE; i++) {
1626		vm_object_t lobject;
1627		pt_entry_t *pte;
1628
1629		addr = addra + pmap_prefault_pageorder[i];
1630		if (addr < starta || addr >= entry->end)
1631			continue;
1632
1633		pte = vtopte(addr);
1634		if (*pte)
1635			continue;
1636
1637		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
1638		lobject = object;
1639		for (m = vm_page_lookup(lobject, pindex);
1640		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
1641		    lobject = lobject->backing_object) {
1642			if (lobject->backing_object_offset & PAGE_MASK)
1643				break;
1644			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
1645			m = vm_page_lookup(lobject->backing_object, pindex);
1646		}
1647
1648		/*
1649		 * give-up when a page is not in memory
1650		 */
1651		if (m == NULL)
1652			break;
1653
1654		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
1655		    (m->busy == 0) &&
1656		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
1657
1658			if (m->queue == PQ_CACHE) {
1659				if ((cnt.v_free_count + cnt.v_cache_count) <
1660					cnt.v_free_min)
1661					break;
1662				vm_page_deactivate(m);
1663			}
1664			vm_page_hold(m);
1665			m->flags |= PG_MAPPED;
1666			pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m));
1667			vm_page_unhold(m);
1668		}
1669	}
1670}
1671
1672/*
1673 *	Routine:	pmap_change_wiring
1674 *	Function:	Change the wiring attribute for a map/virtual-address
1675 *			pair.
1676 *	In/out conditions:
1677 *			The mapping must already exist in the pmap.
1678 */
1679void
1680pmap_change_wiring(pmap, va, wired)
1681	register pmap_t pmap;
1682	vm_offset_t va;
1683	boolean_t wired;
1684{
1685	register pt_entry_t *pte;
1686
1687	if (pmap == NULL)
1688		return;
1689
1690	pte = pmap_pte(pmap, va);
1691
1692	if (wired && !pmap_pte_w(pte))
1693		pmap->pm_stats.wired_count++;
1694	else if (!wired && pmap_pte_w(pte))
1695		pmap->pm_stats.wired_count--;
1696
1697	/*
1698	 * Wiring is not a hardware characteristic so there is no need to
1699	 * invalidate TLB.
1700	 */
1701	pmap_pte_set_w(pte, wired);
1702}
1703
1704
1705
1706/*
1707 *	Copy the range specified by src_addr/len
1708 *	from the source map to the range dst_addr/len
1709 *	in the destination map.
1710 *
1711 *	This routine is only advisory and need not do anything.
1712 */
1713void
1714pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
1715	pmap_t dst_pmap, src_pmap;
1716	vm_offset_t dst_addr;
1717	vm_size_t len;
1718	vm_offset_t src_addr;
1719{
1720}
1721
1722/*
1723 *	Routine:	pmap_kernel
1724 *	Function:
1725 *		Returns the physical map handle for the kernel.
1726 */
1727pmap_t
1728pmap_kernel()
1729{
1730	return (kernel_pmap);
1731}
1732
1733/*
1734 *	pmap_zero_page zeros the specified (machine independent)
1735 *	page by mapping the page into virtual memory and using
1736 *	bzero to clear its contents, one machine dependent page
1737 *	at a time.
1738 */
1739void
1740pmap_zero_page(phys)
1741	vm_offset_t phys;
1742{
1743	if (*(int *) CMAP2)
1744		panic("pmap_zero_page: CMAP busy");
1745
1746	*(int *) CMAP2 = PG_V | PG_KW | trunc_page(phys);
1747	bzero(CADDR2, PAGE_SIZE);
1748
1749	*(int *) CMAP2 = 0;
1750	pmap_update_1pg((vm_offset_t) CADDR2);
1751}
1752
1753/*
1754 *	pmap_copy_page copies the specified (machine independent)
1755 *	page by mapping the page into virtual memory and using
1756 *	bcopy to copy the page, one machine dependent page at a
1757 *	time.
1758 */
1759void
1760pmap_copy_page(src, dst)
1761	vm_offset_t src;
1762	vm_offset_t dst;
1763{
1764	if (*(int *) CMAP1 || *(int *) CMAP2)
1765		panic("pmap_copy_page: CMAP busy");
1766
1767	*(int *) CMAP1 = PG_V | PG_KW | trunc_page(src);
1768	*(int *) CMAP2 = PG_V | PG_KW | trunc_page(dst);
1769
1770#if __GNUC__ > 1
1771	memcpy(CADDR2, CADDR1, PAGE_SIZE);
1772#else
1773	bcopy(CADDR1, CADDR2, PAGE_SIZE);
1774#endif
1775	*(int *) CMAP1 = 0;
1776	*(int *) CMAP2 = 0;
1777	pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2);
1778}
1779
1780
1781/*
1782 *	Routine:	pmap_pageable
1783 *	Function:
1784 *		Make the specified pages (by pmap, offset)
1785 *		pageable (or not) as requested.
1786 *
1787 *		A page which is not pageable may not take
1788 *		a fault; therefore, its page table entry
1789 *		must remain valid for the duration.
1790 *
1791 *		This routine is merely advisory; pmap_enter
1792 *		will specify that these pages are to be wired
1793 *		down (or not) as appropriate.
1794 */
1795void
1796pmap_pageable(pmap, sva, eva, pageable)
1797	pmap_t pmap;
1798	vm_offset_t sva, eva;
1799	boolean_t pageable;
1800{
1801}
1802
1803/*
1804 * this routine returns true if a physical page resides
1805 * in the given pmap.
1806 */
1807boolean_t
1808pmap_page_exists(pmap, pa)
1809	pmap_t pmap;
1810	vm_offset_t pa;
1811{
1812	register pv_entry_t pv;
1813	int s;
1814
1815	if (!pmap_is_managed(pa))
1816		return FALSE;
1817
1818	pv = pa_to_pvh(pa);
1819	s = splhigh();
1820
1821	/*
1822	 * Not found, check current mappings returning immediately if found.
1823	 */
1824	if (pv->pv_pmap != NULL) {
1825		for (; pv; pv = pv->pv_next) {
1826			if (pv->pv_pmap == pmap) {
1827				splx(s);
1828				return TRUE;
1829			}
1830		}
1831	}
1832	splx(s);
1833	return (FALSE);
1834}
1835
1836/*
1837 * pmap_testbit tests bits in pte's
1838 * note that the testbit/changebit routines are inline,
1839 * and a lot of things compile-time evaluate.
1840 */
1841static __inline boolean_t
1842pmap_testbit(pa, bit)
1843	register vm_offset_t pa;
1844	int bit;
1845{
1846	register pv_entry_t pv;
1847	pt_entry_t *pte;
1848	int s;
1849
1850	if (!pmap_is_managed(pa))
1851		return FALSE;
1852
1853	pv = pa_to_pvh(pa);
1854	s = splhigh();
1855
1856	/*
1857	 * Not found, check current mappings returning immediately if found.
1858	 */
1859	if (pv->pv_pmap != NULL) {
1860		for (; pv; pv = pv->pv_next) {
1861			/*
1862			 * if the bit being tested is the modified bit, then
1863			 * mark UPAGES as always modified, and ptes as never
1864			 * modified.
1865			 */
1866			if (bit & (PG_U|PG_M)) {
1867				if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) {
1868					continue;
1869				}
1870			}
1871			if (!pv->pv_pmap) {
1872				printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
1873				continue;
1874			}
1875			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
1876			if ((int) *pte & bit) {
1877				splx(s);
1878				return TRUE;
1879			}
1880		}
1881	}
1882	splx(s);
1883	return (FALSE);
1884}
1885
1886/*
1887 * this routine is used to modify bits in ptes
1888 */
1889static __inline void
1890pmap_changebit(pa, bit, setem)
1891	vm_offset_t pa;
1892	int bit;
1893	boolean_t setem;
1894{
1895	register pv_entry_t pv;
1896	register pt_entry_t *pte, npte;
1897	vm_offset_t va;
1898	int changed;
1899	int s;
1900
1901	if (!pmap_is_managed(pa))
1902		return;
1903
1904	pv = pa_to_pvh(pa);
1905	s = splhigh();
1906
1907	/*
1908	 * Loop over all current mappings setting/clearing as appropos If
1909	 * setting RO do we need to clear the VAC?
1910	 */
1911	if (pv->pv_pmap != NULL) {
1912		for (; pv; pv = pv->pv_next) {
1913			va = pv->pv_va;
1914
1915			/*
1916			 * don't write protect pager mappings
1917			 */
1918			if (!setem && (bit == PG_RW)) {
1919				if (va >= clean_sva && va < clean_eva)
1920					continue;
1921			}
1922			if (!pv->pv_pmap) {
1923				printf("Null pmap (cb) at va: 0x%lx\n", va);
1924				continue;
1925			}
1926			pte = pmap_pte(pv->pv_pmap, va);
1927			if (setem) {
1928				(int) npte = (int) *pte | bit;
1929			} else {
1930				(int) npte = (int) *pte & ~bit;
1931			}
1932			*pte = npte;
1933		}
1934	}
1935	splx(s);
1936	if (curproc != pageproc)
1937		pmap_update();
1938}
1939
1940/*
1941 *      pmap_page_protect:
1942 *
1943 *      Lower the permission for all mappings to a given page.
1944 */
1945void
1946pmap_page_protect(phys, prot)
1947	vm_offset_t phys;
1948	vm_prot_t prot;
1949{
1950	if ((prot & VM_PROT_WRITE) == 0) {
1951		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
1952			pmap_changebit(phys, PG_RW, FALSE);
1953		else
1954			pmap_remove_all(phys);
1955	}
1956}
1957
1958vm_offset_t
1959pmap_phys_address(ppn)
1960	int ppn;
1961{
1962	return (i386_ptob(ppn));
1963}
1964
1965/*
1966 *	pmap_is_referenced:
1967 *
1968 *	Return whether or not the specified physical page was referenced
1969 *	by any physical maps.
1970 */
1971boolean_t
1972pmap_is_referenced(vm_offset_t pa)
1973{
1974	return pmap_testbit((pa), PG_U);
1975}
1976
1977/*
1978 *	pmap_is_modified:
1979 *
1980 *	Return whether or not the specified physical page was modified
1981 *	in any physical maps.
1982 */
1983boolean_t
1984pmap_is_modified(vm_offset_t pa)
1985{
1986	return pmap_testbit((pa), PG_M);
1987}
1988
1989/*
1990 *	Clear the modify bits on the specified physical page.
1991 */
1992void
1993pmap_clear_modify(vm_offset_t pa)
1994{
1995	pmap_changebit((pa), PG_M, FALSE);
1996}
1997
1998/*
1999 *	pmap_clear_reference:
2000 *
2001 *	Clear the reference bit on the specified physical page.
2002 */
2003void
2004pmap_clear_reference(vm_offset_t pa)
2005{
2006	pmap_changebit((pa), PG_U, FALSE);
2007}
2008
2009/*
2010 * Miscellaneous support routines follow
2011 */
2012
2013static void
2014i386_protection_init()
2015{
2016	register int *kp, prot;
2017
2018	kp = protection_codes;
2019	for (prot = 0; prot < 8; prot++) {
2020		switch (prot) {
2021		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
2022			/*
2023			 * Read access is also 0. There isn't any execute bit,
2024			 * so just make it readable.
2025			 */
2026		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
2027		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
2028		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
2029			*kp++ = 0;
2030			break;
2031		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
2032		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
2033		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
2034		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
2035			*kp++ = PG_RW;
2036			break;
2037		}
2038	}
2039}
2040
2041/*
2042 * Map a set of physical memory pages into the kernel virtual
2043 * address space. Return a pointer to where it is mapped. This
2044 * routine is intended to be used for mapping device memory,
2045 * NOT real memory. The non-cacheable bits are set on each
2046 * mapped page.
2047 */
2048void *
2049pmap_mapdev(pa, size)
2050	vm_offset_t pa;
2051	vm_size_t size;
2052{
2053	vm_offset_t va, tmpva;
2054	pt_entry_t *pte;
2055
2056	pa = trunc_page(pa);
2057	size = roundup(size, PAGE_SIZE);
2058
2059	va = kmem_alloc_pageable(kernel_map, size);
2060	if (!va)
2061		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2062
2063	for (tmpva = va; size > 0;) {
2064		pte = vtopte(tmpva);
2065		*pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N));
2066		size -= PAGE_SIZE;
2067		tmpva += PAGE_SIZE;
2068		pa += PAGE_SIZE;
2069	}
2070	pmap_update();
2071
2072	return ((void *) va);
2073}
2074
#ifdef PMAP_DEBUG
/*
 * Debugging aid: print the valid ptes below VM_MIN_KERNEL_ADDRESS of
 * the process with the given pid, two per output line, together with
 * the hold count, wire count and flags of the page each one maps.
 * Returns the number of ptes printed (0 if the pid is not found).
 */
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	int npte = 0;
	int index;

	for (p = (struct proc *) allproc; p != NULL; p = p->p_next) {
		int i, j;

		if (p->p_pid != pid)
			continue;

		if (!p->p_vmspace)
			continue;

		index = 0;
		pmap = &p->p_vmspace->vm_pmap;
		for (i = 0; i < 1024; i++) {
			pd_entry_t *pde;
			pt_entry_t *pte;
			/* shift as unsigned: i << PD_SHIFT overflows int for large i */
			unsigned base = (unsigned) i << PD_SHIFT;

			pde = &pmap->pm_pdir[i];
			if (pmap_pde_v(pde)) {
				for (j = 0; j < 1024; j++) {
					unsigned va = base + (j << PG_SHIFT);

					/* Stop as soon as kernel addresses are reached. */
					if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
						if (index) {
							index = 0;
							printf("\n");
						}
						return npte;
					}
					pte = pmap_pte( pmap, va);
					if (pte && pmap_pte_v(pte)) {
						vm_offset_t pa;
						vm_page_t m;

						pa = *(int *)pte;
						m = PHYS_TO_VM_PAGE((pa & PG_FRAME));
						/* cast so the argument matches %x */
						printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
							va, (unsigned) pa, m->hold_count, m->wire_count, m->flags);
						npte++;
						index++;
						if (index >= 2) {
							index = 0;
							printf("\n");
						} else {
							printf(" ");
						}
					}
				}
			}
		}
	}
	return npte;
}
#endif
2130
2131#ifdef DEBUG
2132
2133static void	pads __P((pmap_t pm));
2134static void	pmap_pvdump __P((vm_offset_t pa));
2135
2136/* print address space of pmap*/
2137static void
2138pads(pm)
2139	pmap_t pm;
2140{
2141	unsigned va, i, j;
2142	pt_entry_t *ptep;
2143
2144	if (pm == kernel_pmap)
2145		return;
2146	for (i = 0; i < 1024; i++)
2147		if (pm->pm_pdir[i])
2148			for (j = 0; j < 1024; j++) {
2149				va = (i << PD_SHIFT) + (j << PG_SHIFT);
2150				if (pm == kernel_pmap && va < KERNBASE)
2151					continue;
2152				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
2153					continue;
2154				ptep = pmap_pte(pm, va);
2155				if (pmap_pte_v(ptep))
2156					printf("%x:%x ", va, *(int *) ptep);
2157			};
2158
2159}
2160
2161static void
2162pmap_pvdump(pa)
2163	vm_offset_t pa;
2164{
2165	register pv_entry_t pv;
2166
2167	printf("pa %x", pa);
2168	for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) {
2169#ifdef used_to_be
2170		printf(" -> pmap %x, va %x, flags %x",
2171		    pv->pv_pmap, pv->pv_va, pv->pv_flags);
2172#endif
2173		printf(" -> pmap %x, va %x",
2174		    pv->pv_pmap, pv->pv_va);
2175		pads(pv->pv_pmap);
2176	}
2177	printf(" ");
2178}
2179#endif
2180