pmap.c revision 112836
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 * $FreeBSD: head/sys/i386/i386/pmap.c 112836 2003-03-30 01:16:19Z jake $
43 */
44/*-
45 * Copyright (c) 2003 Networks Associates Technology, Inc.
46 * All rights reserved.
47 *
48 * This software was developed for the FreeBSD Project by Jake Burkholder,
49 * Safeport Network Services, and Network Associates Laboratories, the
50 * Security Research Division of Network Associates, Inc. under
51 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
52 * CHATS research program.
53 *
54 * Redistribution and use in source and binary forms, with or without
55 * modification, are permitted provided that the following conditions
56 * are met:
57 * 1. Redistributions of source code must retain the above copyright
58 *    notice, this list of conditions and the following disclaimer.
59 * 2. Redistributions in binary form must reproduce the above copyright
60 *    notice, this list of conditions and the following disclaimer in the
61 *    documentation and/or other materials provided with the distribution.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 */
75
76/*
77 *	Manages physical address maps.
78 *
79 *	In addition to hardware address maps, this
80 *	module is called upon to provide software-use-only
81 *	maps which may or may not be stored in the same
82 *	form as hardware maps.  These pseudo-maps are
83 *	used to store intermediate results from copy
84 *	operations to and from address spaces.
85 *
86 *	Since the information managed by this module is
87 *	also stored by the logical address mapping module,
88 *	this module may throw away valid virtual-to-physical
89 *	mappings at almost any time.  However, invalidations
90 *	of virtual-to-physical mappings must be done as
91 *	requested.
92 *
93 *	In order to cope with hardware architectures which
94 *	make virtual-to-physical map invalidates expensive,
95 *	this module may delay invalidation or protection-reduction
96 *	operations until such time as they are actually
97 *	necessary.  This module is given full information as
98 *	to which processors are currently using which maps,
99 *	and to when physical maps must be made correct.
100 */
101
102#include "opt_pmap.h"
103#include "opt_msgbuf.h"
104#include "opt_kstack_pages.h"
105
106#include <sys/param.h>
107#include <sys/systm.h>
108#include <sys/kernel.h>
109#include <sys/lock.h>
110#include <sys/mman.h>
111#include <sys/msgbuf.h>
112#include <sys/mutex.h>
113#include <sys/proc.h>
114#include <sys/sx.h>
115#include <sys/user.h>
116#include <sys/vmmeter.h>
117#include <sys/sysctl.h>
118#ifdef SMP
119#include <sys/smp.h>
120#endif
121
122#include <vm/vm.h>
123#include <vm/vm_param.h>
124#include <vm/vm_kern.h>
125#include <vm/vm_page.h>
126#include <vm/vm_map.h>
127#include <vm/vm_object.h>
128#include <vm/vm_extern.h>
129#include <vm/vm_pageout.h>
130#include <vm/vm_pager.h>
131#include <vm/uma.h>
132
133#include <machine/cpu.h>
134#include <machine/cputypes.h>
135#include <machine/md_var.h>
136#include <machine/specialreg.h>
137#if defined(SMP) || defined(APIC_IO)
138#include <machine/smp.h>
139#include <machine/apic.h>
140#include <machine/segments.h>
141#include <machine/tss.h>
142#endif /* SMP || APIC_IO */
143
144#define PMAP_KEEP_PDIRS
145#ifndef PMAP_SHPGPERPROC
146#define PMAP_SHPGPERPROC 200
147#endif
148
149#if defined(DIAGNOSTIC)
150#define PMAP_DIAGNOSTIC
151#endif
152
153#define MINPV 2048
154
155#if !defined(PMAP_DIAGNOSTIC)
156#define PMAP_INLINE __inline
157#else
158#define PMAP_INLINE
159#endif
160
161/*
162 * Get PDEs and PTEs for user/kernel address space
163 */
164#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
165#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
166
167#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
168#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
169#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
170#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
171#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
172
173#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
174#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
175
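/*
 * For illustration, assuming the non-PAE layout (4KB pages, 4MB page
 * directory entries, PDRSHIFT == 22, NPTEPG == 1024), a virtual address
 * decomposes as follows under these macros:
 *
 *	va                   = 0x08048123
 *	page directory index = va >> PDRSHIFT                    = 0x020
 *	page table index     = (va >> PAGE_SHIFT) & (NPTEPG - 1) = 0x048
 *	byte offset          = va & PAGE_MASK                    = 0x123
 *
 * pmap_pde(pmap, va) then returns &pm_pdir[0x020]; the PTE itself is
 * reached through the page table that this PDE points to (or via
 * vtopte() for the current address space).
 */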
176/*
177 * Given a map and a machine-independent protection code,
178 * convert to an i386 protection code.
179 */
180#define pte_prot(m, p)	(protection_codes[p])
181static int protection_codes[8];
182
183struct pmap kernel_pmap_store;
184LIST_HEAD(pmaplist, pmap);
185static struct pmaplist allpmaps;
186static struct mtx allpmaps_lock;
187
188vm_paddr_t avail_start;	/* PA of first available physical page */
189vm_paddr_t avail_end;	/* PA of last available physical page */
190vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
191vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
192static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
193static int pgeflag;		/* PG_G or-in */
194static int pseflag;		/* PG_PS or-in */
195
196static int nkpt;
197vm_offset_t kernel_vm_end;
198extern u_int32_t KERNend;
199
200/*
201 * Data for the pv entry allocation mechanism
202 */
203static uma_zone_t pvzone;
204static struct vm_object pvzone_obj;
205static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
206int pmap_pagedaemon_waken;
207
208/*
209 * All those kernel PT submaps that BSD is so fond of
210 */
211pt_entry_t *CMAP1 = 0;
212static pt_entry_t *CMAP2, *CMAP3, *ptmmap;
213caddr_t CADDR1 = 0, ptvmmap = 0;
214static caddr_t CADDR2, CADDR3;
215static struct mtx CMAPCADDR12_lock;
216static pt_entry_t *msgbufmap;
217struct msgbuf *msgbufp = 0;
218
219/*
220 * Crashdump maps.
221 */
222static pt_entry_t *pt_crashdumpmap;
223static caddr_t crashdumpmap;
224
225#ifdef SMP
226extern pt_entry_t *SMPpt;
227#endif
228static pt_entry_t *PMAP1 = 0;
229static pt_entry_t *PADDR1 = 0;
230
231static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
232static pv_entry_t get_pv_entry(void);
233static void	i386_protection_init(void);
234static __inline void	pmap_changebit(vm_page_t m, int bit, boolean_t setem);
235
236static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va,
237				      vm_page_t m, vm_page_t mpte);
238static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
239static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
240static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
241					vm_offset_t va);
242static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
243		vm_page_t mpte, vm_page_t m);
244
245static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
246
247static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex);
248static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex);
249static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
250static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
251static void *pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
252
253static pd_entry_t pdir4mb;
254
255CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
256CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
257
258/*
259 * Move the kernel virtual free pointer to the next
260 * 4MB.  This is used to help improve performance
261 * by using a large (4MB) page for much of the kernel
262 * (.text, .data, .bss)
263 */
264static vm_offset_t
265pmap_kmem_choose(vm_offset_t addr)
266{
267	vm_offset_t newaddr = addr;
268
269#ifdef I686_CPU_not	/* Problem seems to have gone away */
270	/* Deal with un-resolved Pentium4 issues */
271	if (cpu_class == CPUCLASS_686 &&
272	    strcmp(cpu_vendor, "GenuineIntel") == 0 &&
273	    (cpu_id & 0xf00) == 0xf00)
274		return newaddr;
275#endif
276#ifndef DISABLE_PSE
277	if (cpu_feature & CPUID_PSE)
278		newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
279#endif
280	return newaddr;
281}
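/*
 * Example of the rounding above, assuming PSE is available and
 * NBPDR == 4MB: an address of 0xC0345000 is advanced to the next 4MB
 * boundary, 0xC0400000, so that the KVA handed out afterwards starts
 * on a large-page boundary.
 */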
282
283/*
284 *	Bootstrap the system enough to run with virtual memory.
285 *
286 *	On the i386 this is called after mapping has already been enabled
287 *	and just syncs the pmap module with what has already been done.
288 *	[We can't call it easily with mapping off since the kernel is not
289 *	mapped with PA == VA, hence we would have to relocate every address
290 *	from the linked base (virtual) address "KERNBASE" to the actual
291 *	(physical) address starting relative to 0]
292 */
293void
294pmap_bootstrap(firstaddr, loadaddr)
295	vm_paddr_t firstaddr;
296	vm_paddr_t loadaddr;
297{
298	vm_offset_t va;
299	pt_entry_t *pte;
300	int i;
301
302	avail_start = firstaddr;
303
304	/*
305	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
306	 * large. It should instead be correctly calculated in locore.s and
307	 * not based on 'first' (which is a physical address, not a virtual
308	 * address, for the start of unused physical memory). The kernel
309	 * page tables are NOT double mapped and thus should not be included
310	 * in this calculation.
311	 */
312	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
313	virtual_avail = pmap_kmem_choose(virtual_avail);
314
315	virtual_end = VM_MAX_KERNEL_ADDRESS;
316
317	/*
318	 * Initialize protection array.
319	 */
320	i386_protection_init();
321
322	/*
323	 * Initialize the kernel pmap (which is statically allocated).
324	 */
325	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
326	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
327	TAILQ_INIT(&kernel_pmap->pm_pvlist);
328	LIST_INIT(&allpmaps);
329	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
330	mtx_lock_spin(&allpmaps_lock);
331	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
332	mtx_unlock_spin(&allpmaps_lock);
333	nkpt = NKPT;
334
335	/*
336	 * Reserve some special page table entries/VA space for temporary
337	 * mapping of pages.
338	 */
339#define	SYSMAP(c, p, v, n)	\
340	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
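/*
 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to
 *
 *	CADDR1 = (caddr_t)va; va += (1 * PAGE_SIZE); CMAP1 = pte; pte += 1;
 *
 * i.e. it carves one page of KVA out of 'va' and records the
 * corresponding kernel PTE so the mapping can be changed later.
 */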
341
342	va = virtual_avail;
343	pte = vtopte(va);
344
345	/*
346	 * CMAP1/CMAP2 are used for zeroing and copying pages.
347	 * CMAP3 is used for the idle process page zeroing.
348	 */
349	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
350	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
351	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
352
353	mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF);
354
355	/*
356	 * Crashdump maps.
357	 */
358	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
359
360	/*
361	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
362	 * XXX ptmmap is not used.
363	 */
364	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
365
366	/*
367	 * msgbufp is used to map the system message buffer.
368	 * XXX msgbufmap is not used.
369	 */
370	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
371	       atop(round_page(MSGBUF_SIZE)))
372
373	/*
374	 * PMAP1/PADDR1 are used by pmap_pte_quick to map page table pages
375	 */
376	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
377
378	virtual_avail = va;
379
380	*CMAP1 = *CMAP2 = 0;
381	for (i = 0; i < NKPT; i++)
382		PTD[i] = 0;
383
384	pgeflag = 0;
385#ifndef DISABLE_PG_G
386	if (cpu_feature & CPUID_PGE)
387		pgeflag = PG_G;
388#endif
389#ifdef I686_CPU_not	/* Problem seems to have gone away */
390	/* Deal with un-resolved Pentium4 issues */
391	if (cpu_class == CPUCLASS_686 &&
392	    strcmp(cpu_vendor, "GenuineIntel") == 0 &&
393	    (cpu_id & 0xf00) == 0xf00) {
394		printf("Warning: Pentium 4 cpu: PG_G disabled (global flag)\n");
395		pgeflag = 0;
396	}
397#endif
398
399/*
400 * Initialize the 4MB page size flag
401 */
402	pseflag = 0;
403/*
404 * The 4MB page version of the initial
405 * kernel page mapping.
406 */
407	pdir4mb = 0;
408
409#ifndef DISABLE_PSE
410	if (cpu_feature & CPUID_PSE)
411		pseflag = PG_PS;
412#endif
413#ifdef I686_CPU_not	/* Problem seems to have gone away */
414	/* Deal with un-resolved Pentium4 issues */
415	if (cpu_class == CPUCLASS_686 &&
416	    strcmp(cpu_vendor, "GenuineIntel") == 0 &&
417	    (cpu_id & 0xf00) == 0xf00) {
418		printf("Warning: Pentium 4 cpu: PG_PS disabled (4MB pages)\n");
419		pseflag = 0;
420	}
421#endif
422#ifndef DISABLE_PSE
423	if (pseflag) {
424		pd_entry_t ptditmp;
425		/*
426		 * Note that we have enabled PSE mode
427		 */
428		ptditmp = *(PTmap + i386_btop(KERNBASE));
429		ptditmp &= ~(NBPDR - 1);
430		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
431		pdir4mb = ptditmp;
432	}
433#endif
434#ifndef SMP
435	/*
436	 * Turn on PGE/PSE.  SMP does this later on since the
437	 * 4K page tables are required for AP boot (for now).
438	 * XXX fixme.
439	 */
440	pmap_set_opt();
441#endif
442#ifdef SMP
443	if (cpu_apic_address == 0)
444		panic("pmap_bootstrap: no local apic! (non-SMP hardware?)");
445
446	/* local apic is mapped on last page */
447	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
448	    (cpu_apic_address & PG_FRAME));
449#endif
450	invltlb();
451}
452
453/*
454 * Enable 4MB page mode for MP startup.  Turn on PG_G support.
455 * BSP will run this after all the AP's have started up.
456 */
457void
458pmap_set_opt(void)
459{
460	pt_entry_t *pte;
461	vm_offset_t va, endva;
462
463	if (pgeflag && (cpu_feature & CPUID_PGE)) {
464		load_cr4(rcr4() | CR4_PGE);
465		invltlb();		/* Insurance */
466	}
467#ifndef DISABLE_PSE
468	if (pseflag && (cpu_feature & CPUID_PSE)) {
469		load_cr4(rcr4() | CR4_PSE);
470		invltlb();		/* Insurance */
471	}
472#endif
473	if (PCPU_GET(cpuid) == 0) {
474#ifndef DISABLE_PSE
475		if (pdir4mb) {
476			kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb;
477			invltlb();	/* Insurance */
478		}
479#endif
480		if (pgeflag) {
481			/* Turn on PG_G for text, data, bss pages. */
482			va = (vm_offset_t)btext;
483#ifndef DISABLE_PSE
484			if (pseflag && (cpu_feature & CPUID_PSE)) {
485				if (va < KERNBASE + (1 << PDRSHIFT))
486					va = KERNBASE + (1 << PDRSHIFT);
487			}
488#endif
489			endva = KERNBASE + KERNend;
490			while (va < endva) {
491				pte = vtopte(va);
492				if (*pte)
493					*pte |= pgeflag;
494				va += PAGE_SIZE;
495			}
496			invltlb();	/* Insurance */
497		}
498		/*
499		 * We do not need to broadcast the invltlb here, because
500		 * each AP does it the moment it is released from the boot
501		 * lock.  See ap_init().
502		 */
503	}
504}
505
506static void *
507pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
508{
509	*flags = UMA_SLAB_PRIV;
510	return (void *)kmem_alloc(kernel_map, bytes);
511}
512
513/*
514 *	Initialize the pmap module.
515 *	Called by vm_init, to initialize any structures that the pmap
516 *	system needs to map virtual memory.
517 *	pmap_init has been enhanced to support discontiguous physical
518 *	memory in a fairly consistent way.
519 */
520void
521pmap_init(phys_start, phys_end)
522	vm_paddr_t phys_start, phys_end;
523{
524	int i;
525	int initial_pvs;
526
527	/*
528	 * Allocate memory for random pmap data structures.  Includes the
529	 * pv_head_table.
530	 */
531
532	for(i = 0; i < vm_page_array_size; i++) {
533		vm_page_t m;
534
535		m = &vm_page_array[i];
536		TAILQ_INIT(&m->md.pv_list);
537		m->md.pv_list_count = 0;
538	}
539
540	/*
541	 * init the pv free list
542	 */
543	initial_pvs = vm_page_array_size;
544	if (initial_pvs < MINPV)
545		initial_pvs = MINPV;
546	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
547	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
548	uma_zone_set_allocf(pvzone, pmap_allocf);
549	uma_prealloc(pvzone, initial_pvs);
550
551	/*
552	 * Now it is safe to enable pv_table recording.
553	 */
554	pmap_initialized = TRUE;
555}
556
557/*
558 * Initialize the address space (zone) for the pv_entries.  Set a
559 * high water mark so that the system can recover from excessive
560 * numbers of pv entries.
561 */
562void
563pmap_init2()
564{
565	int shpgperproc = PMAP_SHPGPERPROC;
566
567	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
568	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
569	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
570	pv_entry_high_water = 9 * (pv_entry_max / 10);
571	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
572}
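/*
 * A rough example of the sizing above (all numbers illustrative): with
 * the default shpgperproc of 200, maxproc == 1000, and a
 * vm_page_array_size of 65536 pages (256MB of RAM), pv_entry_max is
 * 200 * 1000 + 65536 = 265536 and pv_entry_high_water is
 * 9 * (265536 / 10) = 238977.
 */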
573
574
575/***************************************************
576 * Low level helper routines.....
577 ***************************************************/
578
579#if defined(PMAP_DIAGNOSTIC)
580
581/*
582 * This code checks for pages that are marked modified (PG_M) but not
583 * writable (PG_RW), which should be an invalid condition.
584 */
585static int
586pmap_nw_modified(pt_entry_t ptea)
587{
588	int pte;
589
590	pte = (int) ptea;
591
592	if ((pte & (PG_M|PG_RW)) == PG_M)
593		return 1;
594	else
595		return 0;
596}
597#endif
598
599
600/*
601 * This routine determines whether the modified bit for a given va should
602 * be tracked; addresses in [clean_sva, clean_eva) are not tracked.
603 */
604static PMAP_INLINE int
605pmap_track_modified(vm_offset_t va)
606{
607	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
608		return 1;
609	else
610		return 0;
611}
612
613#ifdef I386_CPU
614/*
615 * The 80386 only has "invalidate everything" (no invlpg) and no SMP to worry about.
616 */
617PMAP_INLINE void
618pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
619{
620
621	if (pmap == kernel_pmap || pmap->pm_active)
622		invltlb();
623}
624
625PMAP_INLINE void
626pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
627{
628
629	if (pmap == kernel_pmap || pmap->pm_active)
630		invltlb();
631}
632
633PMAP_INLINE void
634pmap_invalidate_all(pmap_t pmap)
635{
636
637	if (pmap == kernel_pmap || pmap->pm_active)
638		invltlb();
639}
640#else /* !I386_CPU */
641#ifdef SMP
642/*
643 * For SMP, these functions have to use the IPI mechanism for coherence.
644 */
645void
646pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
647{
648	u_int cpumask;
649	u_int other_cpus;
650
651	critical_enter();
652	/*
653	 * We need to disable interrupt preemption but MUST NOT have
654	 * interrupts disabled here.
655	 * XXX we may need to hold schedlock to get a coherent pm_active
656	 */
657	if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
658		invlpg(va);
659		smp_invlpg(va);
660	} else {
661		cpumask = PCPU_GET(cpumask);
662		other_cpus = PCPU_GET(other_cpus);
663		if (pmap->pm_active & cpumask)
664			invlpg(va);
665		if (pmap->pm_active & other_cpus)
666			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
667	}
668	critical_exit();
669}
670
671void
672pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
673{
674	u_int cpumask;
675	u_int other_cpus;
676	vm_offset_t addr;
677
678	critical_enter();
679	/*
680	 * We need to disable interrupt preemption but MUST NOT have
681	 * interrupts disabled here.
682	 * XXX we may need to hold schedlock to get a coherent pm_active
683	 */
684	if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
685		for (addr = sva; addr < eva; addr += PAGE_SIZE)
686			invlpg(addr);
687		smp_invlpg_range(sva, eva);
688	} else {
689		cpumask = PCPU_GET(cpumask);
690		other_cpus = PCPU_GET(other_cpus);
691		if (pmap->pm_active & cpumask)
692			for (addr = sva; addr < eva; addr += PAGE_SIZE)
693				invlpg(addr);
694		if (pmap->pm_active & other_cpus)
695			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
696			    sva, eva);
697	}
698	critical_exit();
699}
700
701void
702pmap_invalidate_all(pmap_t pmap)
703{
704	u_int cpumask;
705	u_int other_cpus;
706
707#ifdef SWTCH_OPTIM_STATS
708	tlb_flush_count++;
709#endif
710	critical_enter();
711	/*
712	 * We need to disable interrupt preemption but MUST NOT have
713	 * interrupts disabled here.
714	 * XXX we may need to hold schedlock to get a coherent pm_active
715	 */
716	if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) {
717		invltlb();
718		smp_invltlb();
719	} else {
720		cpumask = PCPU_GET(cpumask);
721		other_cpus = PCPU_GET(other_cpus);
722		if (pmap->pm_active & cpumask)
723			invltlb();
724		if (pmap->pm_active & other_cpus)
725			smp_masked_invltlb(pmap->pm_active & other_cpus);
726	}
727	critical_exit();
728}
729#else /* !SMP */
730/*
731 * Normal, non-SMP, 486+ invalidation functions.
732 * We inline these within pmap.c for speed.
733 */
734PMAP_INLINE void
735pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
736{
737
738	if (pmap == kernel_pmap || pmap->pm_active)
739		invlpg(va);
740}
741
742PMAP_INLINE void
743pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
744{
745	vm_offset_t addr;
746
747	if (pmap == kernel_pmap || pmap->pm_active)
748		for (addr = sva; addr < eva; addr += PAGE_SIZE)
749			invlpg(addr);
750}
751
752PMAP_INLINE void
753pmap_invalidate_all(pmap_t pmap)
754{
755
756	if (pmap == kernel_pmap || pmap->pm_active)
757		invltlb();
758}
759#endif /* !SMP */
760#endif /* !I386_CPU */
761
762/*
763 * Are we current address space or kernel?
764 */
765static __inline int
766pmap_is_current(pmap_t pmap)
767{
768	return (pmap == kernel_pmap ||
769	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME));
770}
771
772/*
773 * Super fast pmap_pte routine best used when scanning
774 * the pv lists.  This eliminates many coarse-grained
775 * invltlb calls.  Note that many of the pv list
776 * scans are across different pmaps.  It is very wasteful
777 * to do an entire invltlb for checking a single mapping.
778 */
779pt_entry_t *
780pmap_pte_quick(pmap, va)
781	register pmap_t pmap;
782	vm_offset_t va;
783{
784	pd_entry_t newpf;
785	pd_entry_t *pde;
786
787	pde = pmap_pde(pmap, va);
788	if (*pde & PG_PS)
789		return (pde);
790	if (*pde != 0) {
791		/* are we current address space or kernel? */
792		if (pmap_is_current(pmap))
793			return vtopte(va);
794		newpf = *pde & PG_FRAME;
795		if (((*PMAP1) & PG_FRAME) != newpf) {
796			*PMAP1 = newpf | PG_RW | PG_V;
797			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1);
798		}
799		return PADDR1 + (i386_btop(va) & (NPTEPG - 1));
800	}
801	return (0);
802}
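/*
 * PMAP1/PADDR1, reserved in pmap_bootstrap(), form a one-entry mapping
 * window: PMAP1 is a kernel PTE and PADDR1 the KVA it maps.  To read a
 * page table belonging to another pmap, pmap_pte_quick() retargets the
 * window at the foreign page table page and indexes into it, roughly:
 *
 *	*PMAP1 = (*pde & PG_FRAME) | PG_RW | PG_V;
 *	pte = PADDR1 + (i386_btop(va) & (NPTEPG - 1));
 *
 * so only a single-page invalidation is needed when the window moves,
 * rather than a full invltlb.
 */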
803
804/*
805 *	Routine:	pmap_extract
806 *	Function:
807 *		Extract the physical page address associated
808 *		with the given map/virtual_address pair.
809 */
810vm_paddr_t
811pmap_extract(pmap, va)
812	register pmap_t pmap;
813	vm_offset_t va;
814{
815	vm_paddr_t rtval;
816	pt_entry_t *pte;
817	pd_entry_t pde;
818
819	if (pmap == 0)
820		return 0;
821	pde = pmap->pm_pdir[va >> PDRSHIFT];
822	if (pde != 0) {
823		if ((pde & PG_PS) != 0) {
824			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
825			return rtval;
826		}
827		pte = pmap_pte_quick(pmap, va);
828		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
829		return rtval;
830	}
831	return 0;
832
833}
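/*
 * A worked example of the two cases above, with made-up values: for a
 * 4MB mapping whose PDE frame is 0x01000000 and va == 0xC0ABC123, the
 * result is (pde & ~PDRMASK) | (va & PDRMASK), i.e.
 * 0x01000000 | 0x2BC123 == 0x012BC123.  For an ordinary 4KB mapping the
 * frame comes from the PTE instead and only the low 12 bits of va are
 * kept.
 */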
834
835/***************************************************
836 * Low level mapping routines.....
837 ***************************************************/
838
839/*
840 * Add a wired page to the kva.
841 * Note: not SMP coherent.
842 */
843PMAP_INLINE void
844pmap_kenter(vm_offset_t va, vm_paddr_t pa)
845{
846	pt_entry_t *pte;
847
848	pte = vtopte(va);
849	*pte = pa | PG_RW | PG_V | pgeflag;
850}
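/*
 * A typical (hypothetical) use: map one physical page at a known KVA
 * and make the mapping visible on this CPU.  Since pmap_kenter() is not
 * SMP coherent, any required invalidation is the caller's job:
 *
 *	pmap_kenter(va, VM_PAGE_TO_PHYS(m));
 *	pmap_invalidate_page(kernel_pmap, va);
 */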
851
852/*
853 * Remove a page from the kernel pagetables.
854 * Note: not SMP coherent.
855 */
856PMAP_INLINE void
857pmap_kremove(vm_offset_t va)
858{
859	pt_entry_t *pte;
860
861	pte = vtopte(va);
862	*pte = 0;
863}
864
865/*
866 *	Used to map a range of physical addresses into kernel
867 *	virtual address space.
868 *
869 *	The value passed in '*virt' is a suggested virtual address for
870 *	the mapping. Architectures which can support a direct-mapped
871 *	physical to virtual region can return the appropriate address
872 *	within that region, leaving '*virt' unchanged. Other
873 *	architectures should map the pages starting at '*virt' and
874 *	update '*virt' with the first usable address after the mapped
875 *	region.
876 */
877vm_offset_t
878pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
879{
880	vm_offset_t va, sva;
881
882	va = sva = *virt;
883	while (start < end) {
884		pmap_kenter(va, start);
885		va += PAGE_SIZE;
886		start += PAGE_SIZE;
887	}
888	pmap_invalidate_range(kernel_pmap, sva, va);
889	*virt = va;
890	return (sva);
891}
892
893
894/*
895 * Add a list of wired pages to the kva.
896 * This routine is only used for temporary
897 * kernel mappings that do not need to have
898 * page modification or references recorded.
899 * Note that old mappings are simply written
900 * over.  The page *must* be wired.
901 * Note: SMP coherent.  Uses a ranged shootdown IPI.
902 */
903void
904pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
905{
906	vm_offset_t va;
907
908	va = sva;
909	while (count-- > 0) {
910		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
911		va += PAGE_SIZE;
912		m++;
913	}
914	pmap_invalidate_range(kernel_pmap, sva, va);
915}
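/*
 * Hypothetical usage sketch: callers reserve KVA first, then wire pages
 * into it and later tear the mappings down again, e.g.
 *
 *	vm_page_t ma[4];
 *	... fill ma[] with wired pages ...
 *	pmap_qenter(kva, ma, 4);
 *	... use the mapping ...
 *	pmap_qremove(kva, 4);
 *
 * This is the pattern pmap_new_thread() below uses for kernel stacks.
 */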
916
917/*
918 * This routine tears out page mappings from the
919 * kernel -- it is meant only for temporary mappings.
920 * Note: SMP coherent.  Uses a ranged shootdown IPI.
921 */
922void
923pmap_qremove(vm_offset_t sva, int count)
924{
925	vm_offset_t va;
926
927	va = sva;
928	while (count-- > 0) {
929		pmap_kremove(va);
930		va += PAGE_SIZE;
931	}
932	pmap_invalidate_range(kernel_pmap, sva, va);
933}
934
935static vm_page_t
936pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
937{
938	vm_page_t m;
939
940retry:
941	m = vm_page_lookup(object, pindex);
942	if (m != NULL) {
943		vm_page_lock_queues();
944		if (vm_page_sleep_if_busy(m, FALSE, "pplookp"))
945			goto retry;
946		vm_page_unlock_queues();
947	}
948	return m;
949}
950
951#ifndef KSTACK_MAX_PAGES
952#define KSTACK_MAX_PAGES 32
953#endif
954
955/*
956 * Create the kernel stack (including pcb for i386) for a new thread.
957 * This routine directly affects fork performance for a process and
958 * creation performance for a thread.
959 */
960void
961pmap_new_thread(struct thread *td, int pages)
962{
963	int i;
964	vm_page_t ma[KSTACK_MAX_PAGES];
965	vm_object_t ksobj;
966	vm_page_t m;
967	vm_offset_t ks;
968
969	/* Bounds check */
970	if (pages <= 1)
971		pages = KSTACK_PAGES;
972	else if (pages > KSTACK_MAX_PAGES)
973		pages = KSTACK_MAX_PAGES;
974
975	/*
976	 * allocate object for the kstack
977	 */
978	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
979	td->td_kstack_obj = ksobj;
980
981	/* get a kernel virtual address for the kstack for this thread */
982#ifdef KSTACK_GUARD
983	ks = kmem_alloc_nofault(kernel_map, (pages + 1) * PAGE_SIZE);
984	if (ks == 0)
985		panic("pmap_new_thread: kstack allocation failed");
986	if (*vtopte(ks) != 0)
987		pmap_qremove(ks, 1);
988	ks += PAGE_SIZE;
989	td->td_kstack = ks;
990#else
991	/* get a kernel virtual address for the kstack for this thread */
992	ks = kmem_alloc_nofault(kernel_map, pages * PAGE_SIZE);
993	if (ks == 0)
994		panic("pmap_new_thread: kstack allocation failed");
995	td->td_kstack = ks;
996#endif
997	/*
998	 * Knowing the number of pages allocated is useful when you
999	 * want to deallocate them.
1000	 */
1001	td->td_kstack_pages = pages;
1002
1003	/*
1004	 * For the length of the stack, link in a real page of ram for each
1005	 * page of stack.
1006	 */
1007	for (i = 0; i < pages; i++) {
1008		/*
1009		 * Get a kernel stack page
1010		 */
1011		m = vm_page_grab(ksobj, i,
1012		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
1013		ma[i] = m;
1014
1015		vm_page_lock_queues();
1016		vm_page_wakeup(m);
1017		vm_page_flag_clear(m, PG_ZERO);
1018		m->valid = VM_PAGE_BITS_ALL;
1019		vm_page_unlock_queues();
1020	}
1021	pmap_qenter(ks, ma, pages);
1022}
1023
1024/*
1025 * Dispose of the kernel stack for a thread that has exited.
1026 * This routine directly impacts the exit performance of a process and thread.
1027 */
1028void
1029pmap_dispose_thread(td)
1030	struct thread *td;
1031{
1032	int i;
1033	int pages;
1034	vm_object_t ksobj;
1035	vm_offset_t ks;
1036	vm_page_t m;
1037
1038	pages = td->td_kstack_pages;
1039	ksobj = td->td_kstack_obj;
1040	ks = td->td_kstack;
1041	pmap_qremove(ks, pages);
1042	for (i = 0; i < pages; i++) {
1043		m = vm_page_lookup(ksobj, i);
1044		if (m == NULL)
1045			panic("pmap_dispose_thread: kstack already missing?");
1046		vm_page_lock_queues();
1047		vm_page_busy(m);
1048		vm_page_unwire(m, 0);
1049		vm_page_free(m);
1050		vm_page_unlock_queues();
1051	}
1052	/*
1053	 * Free the space that this stack was mapped to in the kernel
1054	 * address map.
1055	 */
1056#ifdef KSTACK_GUARD
1057	kmem_free(kernel_map, ks - PAGE_SIZE, (pages + 1) * PAGE_SIZE);
1058#else
1059	kmem_free(kernel_map, ks, pages * PAGE_SIZE);
1060#endif
1061	vm_object_deallocate(ksobj);
1062}
1063
1064/*
1065 * Set up a variable sized alternate kstack.  Though it may look MI, it may
1066 * need to be different on certain arches like ia64.
1067 */
1068void
1069pmap_new_altkstack(struct thread *td, int pages)
1070{
1071	/* shuffle the original stack */
1072	td->td_altkstack_obj = td->td_kstack_obj;
1073	td->td_altkstack = td->td_kstack;
1074	td->td_altkstack_pages = td->td_kstack_pages;
1075
1076	pmap_new_thread(td, pages);
1077}
1078
1079void
1080pmap_dispose_altkstack(td)
1081	struct thread *td;
1082{
1083	pmap_dispose_thread(td);
1084
1085	/* restore the original kstack */
1086	td->td_kstack = td->td_altkstack;
1087	td->td_kstack_obj = td->td_altkstack_obj;
1088	td->td_kstack_pages = td->td_altkstack_pages;
1089	td->td_altkstack = 0;
1090	td->td_altkstack_obj = NULL;
1091	td->td_altkstack_pages = 0;
1092}
1093
1094/*
1095 * Allow the kernel stack for a thread to be prejudicially paged out.
1096 */
1097void
1098pmap_swapout_thread(td)
1099	struct thread *td;
1100{
1101	int i;
1102	int pages;
1103	vm_object_t ksobj;
1104	vm_offset_t ks;
1105	vm_page_t m;
1106
1107	pages = td->td_kstack_pages;
1108	ksobj = td->td_kstack_obj;
1109	ks = td->td_kstack;
1110	pmap_qremove(ks, pages);
1111	for (i = 0; i < pages; i++) {
1112		m = vm_page_lookup(ksobj, i);
1113		if (m == NULL)
1114			panic("pmap_swapout_thread: kstack already missing?");
1115		vm_page_lock_queues();
1116		vm_page_dirty(m);
1117		vm_page_unwire(m, 0);
1118		vm_page_unlock_queues();
1119	}
1120}
1121
1122/*
1123 * Bring the kernel stack for a specified thread back in.
1124 */
1125void
1126pmap_swapin_thread(td)
1127	struct thread *td;
1128{
1129	int i, rv;
1130	int pages;
1131	vm_page_t ma[KSTACK_MAX_PAGES];
1132	vm_object_t ksobj;
1133	vm_offset_t ks;
1134	vm_page_t m;
1135
1136	pages = td->td_kstack_pages;
1137	ksobj = td->td_kstack_obj;
1138	ks = td->td_kstack;
1139	for (i = 0; i < pages; i++) {
1140		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1141		if (m->valid != VM_PAGE_BITS_ALL) {
1142			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1143			if (rv != VM_PAGER_OK)
1144				panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1145			m = vm_page_lookup(ksobj, i);
1146			m->valid = VM_PAGE_BITS_ALL;
1147		}
1148		ma[i] = m;
1149		vm_page_lock_queues();
1150		vm_page_wire(m);
1151		vm_page_wakeup(m);
1152		vm_page_unlock_queues();
1153	}
1154	pmap_qenter(ks, ma, pages);
1155}
1156
1157/***************************************************
1158 * Page table page management routines.....
1159 ***************************************************/
1160
1161/*
1162 * This routine unholds page table pages, and if the hold count
1163 * drops to zero, then it decrements the wire count.
1164 */
1165static int
1166_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1167{
1168
1169	while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
1170		vm_page_lock_queues();
1171
1172	if (m->hold_count == 0) {
1173		vm_offset_t pteva;
1174		/*
1175		 * unmap the page table page
1176		 */
1177		pmap->pm_pdir[m->pindex] = 0;
1178		--pmap->pm_stats.resident_count;
1179		if (pmap_is_current(pmap)) {
1180			/*
1181			 * Do an invltlb to make the invalidated mapping
1182			 * take effect immediately.
1183			 */
1184			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1185			pmap_invalidate_page(pmap, pteva);
1186		}
1187
1188		/*
1189		 * If the page is finally unwired, simply free it.
1190		 */
1191		--m->wire_count;
1192		if (m->wire_count == 0) {
1193			vm_page_busy(m);
1194			vm_page_free_zero(m);
1195			atomic_subtract_int(&cnt.v_wire_count, 1);
1196		}
1197		return 1;
1198	}
1199	return 0;
1200}
1201
1202static PMAP_INLINE int
1203pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1204{
1205	vm_page_unhold(m);
1206	if (m->hold_count == 0)
1207		return _pmap_unwire_pte_hold(pmap, m);
1208	else
1209		return 0;
1210}
1211
1212/*
1213 * After removing a page table entry, this routine is used to
1214 * conditionally free the page, and manage the hold/wire counts.
1215 */
1216static int
1217pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1218{
1219	unsigned ptepindex;
1220	if (va >= VM_MAXUSER_ADDRESS)
1221		return 0;
1222
1223	if (mpte == NULL) {
1224		ptepindex = (va >> PDRSHIFT);
1225		if (pmap->pm_pteobj->root &&
1226			(pmap->pm_pteobj->root->pindex == ptepindex)) {
1227			mpte = pmap->pm_pteobj->root;
1228		} else {
1229			while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL &&
1230			       vm_page_sleep_if_busy(mpte, FALSE, "pulook"))
1231				vm_page_lock_queues();
1232		}
1233	}
1234
1235	return pmap_unwire_pte_hold(pmap, mpte);
1236}
1237
1238void
1239pmap_pinit0(pmap)
1240	struct pmap *pmap;
1241{
1242
1243	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1244	pmap->pm_active = 0;
1245	TAILQ_INIT(&pmap->pm_pvlist);
1246	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1247	mtx_lock_spin(&allpmaps_lock);
1248	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1249	mtx_unlock_spin(&allpmaps_lock);
1250}
1251
1252/*
1253 * Initialize a preallocated and zeroed pmap structure,
1254 * such as one in a vmspace structure.
1255 */
1256void
1257pmap_pinit(pmap)
1258	register struct pmap *pmap;
1259{
1260	vm_page_t ptdpg[NPGPTD];
1261	vm_paddr_t pa;
1262	int i;
1263
1264	/*
1265	 * No need to allocate page table space yet but we do need a valid
1266	 * page directory table.
1267	 */
1268	if (pmap->pm_pdir == NULL)
1269		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map,
1270		    NBPTD);
1271
1272	/*
1273	 * allocate object for the ptes
1274	 */
1275	if (pmap->pm_pteobj == NULL)
1276		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI +
1277		    NPGPTD);
1278
1279	/*
1280	 * allocate the page directory page(s)
1281	 */
1282	for (i = 0; i < NPGPTD; i++) {
1283		ptdpg[i] = vm_page_grab(pmap->pm_pteobj, PTDPTDI + i,
1284		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED |
1285		    VM_ALLOC_ZERO);
1286		vm_page_lock_queues();
1287		vm_page_flag_clear(ptdpg[i], PG_BUSY);
1288		ptdpg[i]->valid = VM_PAGE_BITS_ALL;
1289		vm_page_unlock_queues();
1290	}
1291
1292	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1293
1294	for (i = 0; i < NPGPTD; i++) {
1295		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1296			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1297	}
1298
1299	mtx_lock_spin(&allpmaps_lock);
1300	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1301	mtx_unlock_spin(&allpmaps_lock);
1302	/* Wire in kernel global address entries. */
1303	/* XXX copies current process, does not fill in MPPTDI */
1304	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1305#ifdef SMP
1306	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1307#endif
1308
1309	/* install self-referential address mapping entry(s) */
1310	for (i = 0; i < NPGPTD; i++) {
1311		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1312		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1313	}
1314
1315	pmap->pm_active = 0;
1316	TAILQ_INIT(&pmap->pm_pvlist);
1317	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1318}
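/*
 * A note on the self-referential entries installed above: because
 * pm_pdir[PTDPTDI + i] points back at the page directory page(s)
 * themselves, the page directory also serves as the page table for the
 * VA range starting at PTDPTDI << PDRSHIFT.  When this pmap is active,
 * every page table page therefore shows up at a fixed virtual address,
 * which is what vtopte() and the PTmap/PTD symbols rely on.
 */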
1319
1320/*
1321 * Wire in kernel global address entries.  To avoid a race condition
1322 * between pmap initialization and pmap_growkernel, this procedure
1323 * should be called after the vmspace is attached to the process
1324 * but before this pmap is activated.
1325 */
1326void
1327pmap_pinit2(pmap)
1328	struct pmap *pmap;
1329{
1330	/* XXX: Remove this stub when no longer called */
1331}
1332
1333/*
1334 * This routine is called when the page table page is not present;
1335 * it allocates, wires, and maps a new page table page.
1336 */
1337static vm_page_t
1338_pmap_allocpte(pmap, ptepindex)
1339	pmap_t	pmap;
1340	unsigned ptepindex;
1341{
1342	vm_paddr_t ptepa;
1343	vm_offset_t pteva;
1344	vm_page_t m;
1345
1346	/*
1347	 * Find or fabricate a new pagetable page
1348	 */
1349	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1350	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1351
1352	KASSERT(m->queue == PQ_NONE,
1353		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1354
1355	/*
1356	 * Increment the hold count for the page table page
1357	 * (denoting a new mapping.)
1358	 */
1359	m->hold_count++;
1360
1361	/*
1362	 * Map the pagetable page into the process address space, if
1363	 * it isn't already there.
1364	 */
1365
1366	pmap->pm_stats.resident_count++;
1367
1368	ptepa = VM_PAGE_TO_PHYS(m);
1369	pmap->pm_pdir[ptepindex] =
1370		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1371
1372	/*
1373	 * Try to use the new mapping, but if we cannot, then
1374	 * do it with the routine that maps the page explicitly.
1375	 */
1376	if ((m->flags & PG_ZERO) == 0) {
1377		if (pmap_is_current(pmap)) {
1378			pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex);
1379			bzero((caddr_t) pteva, PAGE_SIZE);
1380		} else {
1381			pmap_zero_page(m);
1382		}
1383	}
1384	vm_page_lock_queues();
1385	m->valid = VM_PAGE_BITS_ALL;
1386	vm_page_flag_clear(m, PG_ZERO);
1387	vm_page_wakeup(m);
1388	vm_page_unlock_queues();
1389
1390	return m;
1391}
1392
1393static vm_page_t
1394pmap_allocpte(pmap_t pmap, vm_offset_t va)
1395{
1396	unsigned ptepindex;
1397	pd_entry_t ptepa;
1398	vm_page_t m;
1399
1400	/*
1401	 * Calculate pagetable page index
1402	 */
1403	ptepindex = va >> PDRSHIFT;
1404
1405	/*
1406	 * Get the page directory entry
1407	 */
1408	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1409
1410	/*
1411	 * This supports switching from a 4MB page to a
1412	 * normal 4K page.
1413	 */
1414	if (ptepa & PG_PS) {
1415		pmap->pm_pdir[ptepindex] = 0;
1416		ptepa = 0;
1417		pmap_invalidate_all(kernel_pmap);
1418	}
1419
1420	/*
1421	 * If the page table page is mapped, we just increment the
1422	 * hold count, and activate it.
1423	 */
1424	if (ptepa) {
1425		/*
1426		 * In order to get the page table page, try the
1427		 * hint first.
1428		 */
1429		if (pmap->pm_pteobj->root &&
1430			(pmap->pm_pteobj->root->pindex == ptepindex)) {
1431			m = pmap->pm_pteobj->root;
1432		} else {
1433			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1434		}
1435		m->hold_count++;
1436		return m;
1437	}
1438	/*
1439	 * Here if the pte page isn't mapped, or if it has been deallocated.
1440	 */
1441	return _pmap_allocpte(pmap, ptepindex);
1442}
1443
1444
1445/***************************************************
1446 * Pmap allocation/deallocation routines.
1447 ***************************************************/
1448
1449/*
1450 * Release any resources held by the given physical map.
1451 * Called when a pmap initialized by pmap_pinit is being released.
1452 * Should only be called if the map contains no valid mappings.
1453 */
1454void
1455pmap_release(pmap_t pmap)
1456{
1457	vm_object_t object;
1458	vm_page_t m;
1459	int i;
1460
1461	object = pmap->pm_pteobj;
1462
1463	KASSERT(object->ref_count == 1,
1464	    ("pmap_release: pteobj reference count %d != 1",
1465	    object->ref_count));
1466	KASSERT(pmap->pm_stats.resident_count == 0,
1467	    ("pmap_release: pmap resident count %ld != 0",
1468	    pmap->pm_stats.resident_count));
1469
1470	mtx_lock_spin(&allpmaps_lock);
1471	LIST_REMOVE(pmap, pm_list);
1472	mtx_unlock_spin(&allpmaps_lock);
1473
1474	bzero(pmap->pm_pdir + KPTDI, nkpt * sizeof(*pmap->pm_pdir));
1475	for (i = 0; i < NPGPTD; i++) {
1476		pmap->pm_pdir[PTDPTDI + i] = 0;
1477		pmap->pm_pdir[APTDPTDI + i] = 0;
1478	}
1479#ifdef SMP
1480	pmap->pm_pdir[MPPTDI] = 0;
1481#endif
1482
1483	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1484
1485	vm_page_lock_queues();
1486	for (i = 0; i < NPGPTD; i++) {
1487		m = TAILQ_FIRST(&object->memq);
1488		m->wire_count--;
1489		atomic_subtract_int(&cnt.v_wire_count, 1);
1490		vm_page_busy(m);
1491		vm_page_free_zero(m);
1492	}
1493	KASSERT(TAILQ_EMPTY(&object->memq),
1494	    ("pmap_release: leaking page table pages"));
1495	vm_page_unlock_queues();
1496}
1497
1498static int
1499kvm_size(SYSCTL_HANDLER_ARGS)
1500{
1501	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1502
1503	return sysctl_handle_long(oidp, &ksize, 0, req);
1504}
1505SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1506    0, 0, kvm_size, "IU", "Size of KVM");
1507
1508static int
1509kvm_free(SYSCTL_HANDLER_ARGS)
1510{
1511	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1512
1513	return sysctl_handle_long(oidp, &kfree, 0, req);
1514}
1515SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1516    0, 0, kvm_free, "IU", "Amount of KVM free");
1517
1518/*
1519 * grow the number of kernel page table entries, if needed
1520 */
1521void
1522pmap_growkernel(vm_offset_t addr)
1523{
1524	struct pmap *pmap;
1525	int s;
1526	vm_paddr_t ptppaddr;
1527	vm_page_t nkpg;
1528	pd_entry_t newpdir;
1529
1530	s = splhigh();
1531	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1532	if (kernel_vm_end == 0) {
1533		kernel_vm_end = KERNBASE;
1534		nkpt = 0;
1535		while (pdir_pde(PTD, kernel_vm_end)) {
1536			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1537			nkpt++;
1538		}
1539	}
1540	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1541	while (kernel_vm_end < addr) {
1542		if (pdir_pde(PTD, kernel_vm_end)) {
1543			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1544			continue;
1545		}
1546
1547		/*
1548		 * This index is bogus, but out of the way
1549		 */
1550		nkpg = vm_page_alloc(NULL, nkpt,
1551		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1552		if (!nkpg)
1553			panic("pmap_growkernel: no memory to grow kernel");
1554
1555		nkpt++;
1556
1557		pmap_zero_page(nkpg);
1558		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1559		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1560		pdir_pde(PTD, kernel_vm_end) = newpdir;
1561
1562		mtx_lock_spin(&allpmaps_lock);
1563		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1564			*pmap_pde(pmap, kernel_vm_end) = newpdir;
1565		}
1566		mtx_unlock_spin(&allpmaps_lock);
1567		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1568	}
1569	splx(s);
1570}
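/*
 * An illustrative example (addresses made up): if kernel_vm_end is
 * 0xC0800000 and the map must grow to 0xC1300000, the target is rounded
 * up to 0xC1400000 and, assuming none of those slots is populated yet,
 * the loop allocates and wires three new page table pages, each
 * extending the kernel page table by 4MB (PAGE_SIZE * NPTEPG) and
 * propagating the new PDE to every pmap on the allpmaps list.
 */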
1571
1572
1573/***************************************************
1574 * page management routines.
1575 ***************************************************/
1576
1577/*
1578 * free the pv_entry back to the free list
1579 */
1580static PMAP_INLINE void
1581free_pv_entry(pv_entry_t pv)
1582{
1583	pv_entry_count--;
1584	uma_zfree(pvzone, pv);
1585}
1586
1587/*
1588 * get a new pv_entry, allocating a block from the system
1589 * when needed.
1590 * the memory allocation is performed bypassing the malloc code
1591 * because of the possibility of allocations at interrupt time.
1592 */
1593static pv_entry_t
1594get_pv_entry(void)
1595{
1596	pv_entry_count++;
1597	if (pv_entry_high_water &&
1598		(pv_entry_count > pv_entry_high_water) &&
1599		(pmap_pagedaemon_waken == 0)) {
1600		pmap_pagedaemon_waken = 1;
1601		wakeup (&vm_pages_needed);
1602	}
1603	return uma_zalloc(pvzone, M_NOWAIT);
1604}
1605
1606/*
1607 * Remove the pv entry for (pmap, va) from the page's pv list and from
1608 * the pmap's pv list, drop the reference on its page table page, and
1609 * free the now unused entry.  Whichever of the two lists is likely to
1610 * be shorter is the one that is searched.
1611 */
1612
1613static int
1614pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1615{
1616	pv_entry_t pv;
1617	int rtval;
1618	int s;
1619
1620	s = splvm();
1621	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1622	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1623		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1624			if (pmap == pv->pv_pmap && va == pv->pv_va)
1625				break;
1626		}
1627	} else {
1628		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1629			if (va == pv->pv_va)
1630				break;
1631		}
1632	}
1633
1634	rtval = 0;
1635	if (pv) {
1636		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1637		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1638		m->md.pv_list_count--;
1639		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1640			vm_page_flag_clear(m, PG_WRITEABLE);
1641
1642		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1643		free_pv_entry(pv);
1644	}
1645
1646	splx(s);
1647	return rtval;
1648}
1649
1650/*
1651 * Create a pv entry for page m, mapped at
1652 * (pmap, va).
1653 */
1654static void
1655pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1656{
1657
1658	int s;
1659	pv_entry_t pv;
1660
1661	s = splvm();
1662	pv = get_pv_entry();
1663	pv->pv_va = va;
1664	pv->pv_pmap = pmap;
1665	pv->pv_ptem = mpte;
1666
1667	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1668	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1669	m->md.pv_list_count++;
1670
1671	splx(s);
1672}
1673
1674/*
1675 * pmap_remove_pte: unmap a single page from a process address space
1676 */
1677static int
1678pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1679{
1680	pt_entry_t oldpte;
1681	vm_page_t m;
1682
1683	oldpte = atomic_readandclear_int(ptq);
1684	if (oldpte & PG_W)
1685		pmap->pm_stats.wired_count -= 1;
1686	/*
1687	 * Machines that don't support invlpg also don't support
1688	 * PG_G.
1689	 */
1690	if (oldpte & PG_G)
1691		pmap_invalidate_page(kernel_pmap, va);
1692	pmap->pm_stats.resident_count -= 1;
1693	if (oldpte & PG_MANAGED) {
1694		m = PHYS_TO_VM_PAGE(oldpte);
1695		if (oldpte & PG_M) {
1696#if defined(PMAP_DIAGNOSTIC)
1697			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1698				printf(
1699	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1700				    va, oldpte);
1701			}
1702#endif
1703			if (pmap_track_modified(va))
1704				vm_page_dirty(m);
1705		}
1706		if (oldpte & PG_A)
1707			vm_page_flag_set(m, PG_REFERENCED);
1708		return pmap_remove_entry(pmap, m, va);
1709	} else {
1710		return pmap_unuse_pt(pmap, va, NULL);
1711	}
1712
1713	return 0;
1714}
1715
1716/*
1717 * Remove a single page from a process address space
1718 */
1719static void
1720pmap_remove_page(pmap_t pmap, vm_offset_t va)
1721{
1722	pt_entry_t *pte;
1723
1724	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1725		return;
1726	pmap_remove_pte(pmap, pte, va);
1727	pmap_invalidate_page(pmap, va);
1728}
1729
1730/*
1731 *	Remove the given range of addresses from the specified map.
1732 *
1733 *	It is assumed that the start and end are properly
1734 *	rounded to the page size.
1735 */
1736void
1737pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1738{
1739	vm_offset_t pdnxt;
1740	pd_entry_t ptpaddr;
1741	pt_entry_t *pte;
1742	int anyvalid;
1743
1744	if (pmap == NULL)
1745		return;
1746
1747	if (pmap->pm_stats.resident_count == 0)
1748		return;
1749
1750	/*
1751	 * Special handling for removing a single page: a very
1752	 * common operation that lets us short-circuit a fair
1753	 * amount of code.
1754	 */
1755	if ((sva + PAGE_SIZE == eva) &&
1756	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1757		pmap_remove_page(pmap, sva);
1758		return;
1759	}
1760
1761	anyvalid = 0;
1762
1763	for (; sva < eva; sva = pdnxt) {
1764		unsigned pdirindex;
1765
1766		/*
1767		 * Calculate index for next page table.
1768		 */
1769		pdnxt = (sva + NBPDR) & ~PDRMASK;
1770		if (pmap->pm_stats.resident_count == 0)
1771			break;
1772
1773		pdirindex = sva >> PDRSHIFT;
1774		ptpaddr = pmap->pm_pdir[pdirindex];
1775
1776		/*
1777		 * Weed out invalid mappings. Note: we assume that the page
1778		 * directory table is always allocated, and in kernel virtual.
1779		 */
1780		if (ptpaddr == 0)
1781			continue;
1782
1783		/*
1784		 * Check for large page.
1785		 */
1786		if ((ptpaddr & PG_PS) != 0) {
1787			pmap->pm_pdir[pdirindex] = 0;
1788			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1789			anyvalid = 1;
1790			continue;
1791		}
1792
1793		/*
1794		 * Limit our scan to either the end of the va represented
1795		 * by the current page table page, or to the end of the
1796		 * range being removed.
1797		 */
1798		if (pdnxt > eva)
1799			pdnxt = eva;
1800
1801		for (; sva != pdnxt; sva += PAGE_SIZE) {
1802			if ((pte = pmap_pte_quick(pmap, sva)) == NULL ||
1803			    *pte == 0)
1804				continue;
1805			anyvalid = 1;
1806			if (pmap_remove_pte(pmap, pte, sva))
1807				break;
1808		}
1809	}
1810
1811	if (anyvalid)
1812		pmap_invalidate_all(pmap);
1813}
1814
1815/*
1816 *	Routine:	pmap_remove_all
1817 *	Function:
1818 *		Removes this physical page from
1819 *		all physical maps in which it resides.
1820 *		Reflects back modify bits to the pager.
1821 *
1822 *	Notes:
1823 *		Original versions of this routine were very
1824 *		inefficient because they iteratively called
1825 *		pmap_remove (slow...)
1826 */
1827
1828void
1829pmap_remove_all(vm_page_t m)
1830{
1831	register pv_entry_t pv;
1832	pt_entry_t *pte, tpte;
1833	int s;
1834
1835#if defined(PMAP_DIAGNOSTIC)
1836	/*
1837	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1838	 */
1839	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1840		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1841		    VM_PAGE_TO_PHYS(m));
1842	}
1843#endif
1844	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1845	s = splvm();
1846	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1847		pv->pv_pmap->pm_stats.resident_count--;
1848		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1849		tpte = atomic_readandclear_int(pte);
1850		if (tpte & PG_W)
1851			pv->pv_pmap->pm_stats.wired_count--;
1852		if (tpte & PG_A)
1853			vm_page_flag_set(m, PG_REFERENCED);
1854
1855		/*
1856		 * Update the vm_page_t clean and reference bits.
1857		 */
1858		if (tpte & PG_M) {
1859#if defined(PMAP_DIAGNOSTIC)
1860			if (pmap_nw_modified((pt_entry_t) tpte)) {
1861				printf(
1862	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1863				    pv->pv_va, tpte);
1864			}
1865#endif
1866			if (pmap_track_modified(pv->pv_va))
1867				vm_page_dirty(m);
1868		}
1869		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1870		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1871		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1872		m->md.pv_list_count--;
1873		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1874		free_pv_entry(pv);
1875	}
1876	vm_page_flag_clear(m, PG_WRITEABLE);
1877	splx(s);
1878}
1879
1880/*
1881 *	Set the physical protection on the
1882 *	specified range of this map as requested.
1883 */
1884void
1885pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1886{
1887	vm_offset_t pdnxt;
1888	pd_entry_t ptpaddr;
1889	int anychanged;
1890
1891	if (pmap == NULL)
1892		return;
1893
1894	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1895		pmap_remove(pmap, sva, eva);
1896		return;
1897	}
1898
1899	if (prot & VM_PROT_WRITE)
1900		return;
1901
1902	anychanged = 0;
1903
1904	for (; sva < eva; sva = pdnxt) {
1905		unsigned pdirindex;
1906
1907		pdnxt = (sva + NBPDR) & ~PDRMASK;
1908
1909		pdirindex = sva >> PDRSHIFT;
1910		ptpaddr = pmap->pm_pdir[pdirindex];
1911
1912		/*
1913		 * Weed out invalid mappings. Note: we assume that the page
1914		 * directory table is always allocated, and in kernel virtual.
1915		 */
1916		if (ptpaddr == 0)
1917			continue;
1918
1919		/*
1920		 * Check for large page.
1921		 */
1922		if ((ptpaddr & PG_PS) != 0) {
1923			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1924			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1925			anychanged = 1;
1926			continue;
1927		}
1928
1929		if (pdnxt > eva)
1930			pdnxt = eva;
1931
1932		for (; sva != pdnxt; sva += PAGE_SIZE) {
1933			pt_entry_t pbits;
1934			pt_entry_t *pte;
1935			vm_page_t m;
1936
1937			if ((pte = pmap_pte_quick(pmap, sva)) == NULL)
1938				continue;
1939			pbits = *pte;
1940			if (pbits & PG_MANAGED) {
1941				m = NULL;
1942				if (pbits & PG_A) {
1943					m = PHYS_TO_VM_PAGE(pbits);
1944					vm_page_flag_set(m, PG_REFERENCED);
1945					pbits &= ~PG_A;
1946				}
1947				if ((pbits & PG_M) != 0 &&
1948				    pmap_track_modified(sva)) {
1949					if (m == NULL)
1950						m = PHYS_TO_VM_PAGE(pbits);
1951					vm_page_dirty(m);
1952					pbits &= ~PG_M;
1953				}
1954			}
1955
1956			pbits &= ~PG_RW;
1957
1958			if (pbits != *pte) {
1959				*pte = pbits;
1960				anychanged = 1;
1961			}
1962		}
1963	}
1964	if (anychanged)
1965		pmap_invalidate_all(pmap);
1966}
1967
1968/*
1969 *	Insert the given physical page (p) at
1970 *	the specified virtual address (v) in the
1971 *	target physical map with the protection requested.
1972 *
1973 *	If specified, the page will be wired down, meaning
1974 *	that the related pte can not be reclaimed.
1975 *
1976 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1977 *	or lose information.  That is, this routine must actually
1978 *	insert this page into the given map NOW.
1979 */
1980void
1981pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1982	   boolean_t wired)
1983{
1984	vm_paddr_t pa;
1985	register pt_entry_t *pte;
1986	vm_paddr_t opa;
1987	pt_entry_t origpte, newpte;
1988	vm_page_t mpte;
1989
1990	if (pmap == NULL)
1991		return;
1992
1993	va &= PG_FRAME;
1994#ifdef PMAP_DIAGNOSTIC
1995	if (va > VM_MAX_KERNEL_ADDRESS)
1996		panic("pmap_enter: toobig");
1997	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1998		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1999#endif
2000
2001	mpte = NULL;
2002	/*
2003	 * In the case that a page table page is not
2004	 * resident, we are creating it here.
2005	 */
2006	if (va < VM_MAXUSER_ADDRESS) {
2007		mpte = pmap_allocpte(pmap, va);
2008	}
2009#if 0 && defined(PMAP_DIAGNOSTIC)
2010	else {
2011		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2012		origpte = *pdeaddr;
2013		if ((origpte & PG_V) == 0) {
2014			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2015				pmap->pm_pdir[PTDPTDI], origpte, va);
2016		}
2017	}
2018#endif
2019
2020	pte = pmap_pte_quick(pmap, va);
2021
2022	/*
2023	 * Page directory entry not valid; we need a new PT page
2024	 */
2025	if (pte == NULL) {
2026		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
2027			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
2028	}
2029
2030	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2031	origpte = *(vm_offset_t *)pte;
2032	opa = origpte & PG_FRAME;
2033
2034	if (origpte & PG_PS)
2035		panic("pmap_enter: attempted pmap_enter on 4MB page");
2036
2037	/*
2038	 * Mapping has not changed, must be protection or wiring change.
2039	 */
2040	if (origpte && (opa == pa)) {
2041		/*
2042		 * Wiring change, just update stats. We don't worry about
2043		 * wiring PT pages as they remain resident as long as there
2044		 * are valid mappings in them. Hence, if a user page is wired,
2045		 * the PT page will be also.
2046		 */
2047		if (wired && ((origpte & PG_W) == 0))
2048			pmap->pm_stats.wired_count++;
2049		else if (!wired && (origpte & PG_W))
2050			pmap->pm_stats.wired_count--;
2051
2052#if defined(PMAP_DIAGNOSTIC)
2053		if (pmap_nw_modified((pt_entry_t) origpte)) {
2054			printf(
2055	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2056			    va, origpte);
2057		}
2058#endif
2059
2060		/*
2061		 * Remove extra pte reference
2062		 */
2063		if (mpte)
2064			mpte->hold_count--;
2065
2066		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2067			if ((origpte & PG_RW) == 0) {
2068				*pte |= PG_RW;
2069				pmap_invalidate_page(pmap, va);
2070			}
2071			return;
2072		}
2073
2074		/*
2075		 * We might be turning off write access to the page,
2076		 * so we go ahead and sense modify status.
2077		 */
2078		if (origpte & PG_MANAGED) {
2079			if ((origpte & PG_M) && pmap_track_modified(va)) {
2080				vm_page_t om;
2081				om = PHYS_TO_VM_PAGE(opa);
2082				vm_page_dirty(om);
2083			}
2084			pa |= PG_MANAGED;
2085		}
2086		goto validate;
2087	}
2088	/*
2089	 * Mapping has changed, invalidate old range and fall through to
2090	 * handle validating new mapping.
2091	 */
2092	if (opa) {
2093		int err;
2094		vm_page_lock_queues();
2095		err = pmap_remove_pte(pmap, pte, va);
2096		vm_page_unlock_queues();
2097		if (err)
2098			panic("pmap_enter: pte vanished, va: 0x%x", va);
2099	}
2100
2101	/*
2102	 * Enter on the PV list if part of our managed memory. Note that we
2103	 * raise IPL while manipulating pv_table since pmap_enter can be
2104	 * called at interrupt time.
2105	 */
2106	if (pmap_initialized &&
2107	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2108		pmap_insert_entry(pmap, va, mpte, m);
2109		pa |= PG_MANAGED;
2110	}
2111
2112	/*
2113	 * Increment counters
2114	 */
2115	pmap->pm_stats.resident_count++;
2116	if (wired)
2117		pmap->pm_stats.wired_count++;
2118
2119validate:
2120	/*
2121	 * Now validate mapping with desired protection/wiring.
2122	 */
2123	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2124
2125	if (wired)
2126		newpte |= PG_W;
2127	if (va < VM_MAXUSER_ADDRESS)
2128		newpte |= PG_U;
2129	if (pmap == kernel_pmap)
2130		newpte |= pgeflag;
2131
2132	/*
2133	 * if the mapping or permission bits are different, we need
2134	 * to update the pte.
2135	 */
2136	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2137		*pte = newpte | PG_A;
2138		/*if (origpte)*/ {
2139			pmap_invalidate_page(pmap, va);
2140		}
2141	}
2142}
2143
2144/*
2145 * This code makes some *MAJOR* assumptions:
2146 * 1. The pmap exists and is the current pmap.
2147 * 2. The mapping is not wired.
2148 * 3. Read access only.
2149 * 4. No page table pages.
2150 * 5. The TLB flush is deferred to the calling procedure.
2151 * 6. The page IS managed.
2152 * It is, however, *MUCH* faster than pmap_enter.
2153 */
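/*
 * Note: as used by the callers below (pmap_object_init_pt and
 * pmap_prefault), the page table page returned here is passed back in
 * as "mpte" on the next call, so consecutive entries that fall within
 * the same 4MB region can reuse the page table page without another
 * lookup.
 */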
2154
2155static vm_page_t
2156pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2157{
2158	pt_entry_t *pte;
2159	vm_paddr_t pa;
2160
2161	/*
2162	 * In the case that a page table page is not
2163	 * resident, we are creating it here.
2164	 */
2165	if (va < VM_MAXUSER_ADDRESS) {
2166		unsigned ptepindex;
2167		pd_entry_t ptepa;
2168
2169		/*
2170		 * Calculate pagetable page index
2171		 */
2172		ptepindex = va >> PDRSHIFT;
2173		if (mpte && (mpte->pindex == ptepindex)) {
2174			mpte->hold_count++;
2175		} else {
2176retry:
2177			/*
2178			 * Get the page directory entry
2179			 */
2180			ptepa = pmap->pm_pdir[ptepindex];
2181
2182			/*
2183			 * If the page table page is mapped, we just increment
2184			 * the hold count, and activate it.
2185			 */
2186			if (ptepa) {
2187				if (ptepa & PG_PS)
2188					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2189				if (pmap->pm_pteobj->root &&
2190					(pmap->pm_pteobj->root->pindex == ptepindex)) {
2191					mpte = pmap->pm_pteobj->root;
2192				} else {
2193					mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2194				}
2195				if (mpte == NULL)
2196					goto retry;
2197				mpte->hold_count++;
2198			} else {
2199				mpte = _pmap_allocpte(pmap, ptepindex);
2200			}
2201		}
2202	} else {
2203		mpte = NULL;
2204	}
2205
2206	/*
2207	 * This call to vtopte makes the assumption that we are
2208	 * entering the page into the current pmap.  In order to support
2209	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2210	 * But that isn't as quick as vtopte.
2211	 */
2212	pte = vtopte(va);
2213	if (*pte) {
2214		if (mpte != NULL) {
2215			vm_page_lock_queues();
2216			pmap_unwire_pte_hold(pmap, mpte);
2217			vm_page_unlock_queues();
2218		}
2219		return 0;
2220	}
2221
2222	/*
2223	 * Enter on the PV list if part of our managed memory. Note that we
2224	 * raise IPL while manipulating pv_table since pmap_enter can be
2225	 * called at interrupt time.
2226	 */
2227	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2228		pmap_insert_entry(pmap, va, mpte, m);
2229
2230	/*
2231	 * Increment counters
2232	 */
2233	pmap->pm_stats.resident_count++;
2234
2235	pa = VM_PAGE_TO_PHYS(m);
2236
2237	/*
2238	 * Now validate mapping with RO protection
2239	 */
2240	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2241		*pte = pa | PG_V | PG_U;
2242	else
2243		*pte = pa | PG_V | PG_U | PG_MANAGED;
2244
2245	return mpte;
2246}
2247
2248/*
2249 * Make a temporary mapping for a physical address.  This is only intended
2250 * to be used for panic dumps.
2251 */
2252void *
2253pmap_kenter_temporary(vm_offset_t pa, int i)
2254{
2255	vm_offset_t va;
2256
2257	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2258	pmap_kenter(va, pa);
2259#ifndef I386_CPU
2260	invlpg(va);
2261#else
2262	invltlb();
2263#endif
2264	return ((void *)crashdumpmap);
2265}
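/*
 * Illustrative use (hypothetical caller): a dump routine can map the
 * i-th page of its output window with
 *
 *	buf = pmap_kenter_temporary(pa, i);
 *
 * which installs "pa" at crashdumpmap + i * PAGE_SIZE and returns the
 * base of crashdumpmap, avoiding any KVA allocation while the system
 * is panicking.
 */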
2266
2267#define MAX_INIT_PT (96)
2268/*
2269 * pmap_object_init_pt preloads the ptes for a given object
2270 * into the specified pmap.  This eliminates the blast of soft
2271 * faults on process startup and immediately after an mmap.
2272 */
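/*
 * Two cases are handled below: 4MB-aligned OBJT_DEVICE objects are
 * mapped directly with PG_PS page directory entries when the CPU
 * supports them (pseflag), while resident pages of OBJT_VNODE objects
 * are entered one at a time via pmap_enter_quick (skipped entirely for
 * large objects when MAP_PREFAULT_PARTIAL is requested).
 */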
2273void
2274pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2275		    vm_object_t object, vm_pindex_t pindex,
2276		    vm_size_t size, int limit)
2277{
2278	vm_offset_t tmpidx;
2279	int psize;
2280	vm_page_t p, mpte;
2281
2282	if (pmap == NULL || object == NULL)
2283		return;
2284
2285	/*
2286	 * This code maps large physical mmap regions into the
2287	 * processor address space.  Note that some shortcuts
2288	 * are taken, but the code works.
2289	 */
2290	if (pseflag && (object->type == OBJT_DEVICE) &&
2291	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2292		int i;
2293		vm_page_t m[1];
2294		unsigned int ptepindex;
2295		int npdes;
2296		pd_entry_t ptepa;
2297
2298		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2299			return;
2300
2301retry:
2302		p = vm_page_lookup(object, pindex);
2303		if (p != NULL) {
2304			vm_page_lock_queues();
2305			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2306				goto retry;
2307		} else {
2308			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2309			if (p == NULL)
2310				return;
2311			m[0] = p;
2312
2313			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2314				vm_page_lock_queues();
2315				vm_page_free(p);
2316				vm_page_unlock_queues();
2317				return;
2318			}
2319
2320			p = vm_page_lookup(object, pindex);
2321			vm_page_lock_queues();
2322			vm_page_wakeup(p);
2323		}
2324		vm_page_unlock_queues();
2325
2326		ptepa = VM_PAGE_TO_PHYS(p);
2327		if (ptepa & (NBPDR - 1)) {
2328			return;
2329		}
2330
2331		p->valid = VM_PAGE_BITS_ALL;
2332
2333		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2334		npdes = size >> PDRSHIFT;
2335		for(i = 0; i < npdes; i++) {
2336			pmap->pm_pdir[ptepindex] =
2337			    ptepa | PG_U | PG_RW | PG_V | PG_PS;
2338			ptepa += NBPDR;
2339			ptepindex += 1;
2340		}
2341		pmap_invalidate_all(kernel_pmap);
2342		return;
2343	}
2344
2345	psize = i386_btop(size);
2346
2347	if ((object->type != OBJT_VNODE) ||
2348	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2349	     (object->resident_page_count > MAX_INIT_PT))) {
2350		return;
2351	}
2352
2353	if (psize + pindex > object->size) {
2354		if (object->size < pindex)
2355			return;
2356		psize = object->size - pindex;
2357	}
2358
2359	mpte = NULL;
2360
2361	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
2362		if (p->pindex < pindex) {
2363			p = vm_page_splay(pindex, object->root);
2364			if ((object->root = p)->pindex < pindex)
2365				p = TAILQ_NEXT(p, listq);
2366		}
2367	}
2368	/*
2369	 * Assert: the variable p is either (1) the page with the
2370	 * least pindex greater than or equal to the parameter pindex
2371	 * or (2) NULL.
2372	 */
2373	for (;
2374	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
2375	     p = TAILQ_NEXT(p, listq)) {
2376		/*
2377		 * Don't allow an madvise to blow away our really
2378		 * free pages by allocating pv entries.
2379		 */
2380		if ((limit & MAP_PREFAULT_MADVISE) &&
2381		    cnt.v_free_count < cnt.v_free_reserved) {
2382			break;
2383		}
2384		vm_page_lock_queues();
2385		if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
2386		    (p->busy == 0) &&
2387		    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2388			if ((p->queue - p->pc) == PQ_CACHE)
2389				vm_page_deactivate(p);
2390			vm_page_busy(p);
2391			vm_page_unlock_queues();
2392			mpte = pmap_enter_quick(pmap,
2393				addr + i386_ptob(tmpidx), p, mpte);
2394			vm_page_lock_queues();
2395			vm_page_wakeup(p);
2396		}
2397		vm_page_unlock_queues();
2398	}
2399	return;
2400}
2401
2402/*
2403 * pmap_prefault provides a quick way of clustering
2404 * page faults into a process's address space.  It is a "cousin"
2405 * of pmap_object_init_pt, except it runs at page fault time instead
2406 * of mmap time.
2407 */
2408#define PFBAK 4
2409#define PFFOR 4
2410#define PAGEORDER_SIZE (PFBAK+PFFOR)
2411
2412static int pmap_prefault_pageorder[] = {
2413	-1 * PAGE_SIZE, 1 * PAGE_SIZE,
2414	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2415	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2416	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2417};
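/*
 * The table above makes pmap_prefault probe outward from the faulting
 * address, alternating one page back, one page forward, two pages
 * back, and so on, up to PFBAK pages behind and PFFOR pages ahead.
 */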
2418
2419void
2420pmap_prefault(pmap, addra, entry)
2421	pmap_t pmap;
2422	vm_offset_t addra;
2423	vm_map_entry_t entry;
2424{
2425	int i;
2426	vm_offset_t starta;
2427	vm_offset_t addr;
2428	vm_pindex_t pindex;
2429	vm_page_t m, mpte;
2430	vm_object_t object;
2431
2432	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2433		return;
2434
2435	object = entry->object.vm_object;
2436
2437	starta = addra - PFBAK * PAGE_SIZE;
2438	if (starta < entry->start) {
2439		starta = entry->start;
2440	} else if (starta > addra) {
2441		starta = 0;
2442	}
2443
2444	mpte = NULL;
2445	for (i = 0; i < PAGEORDER_SIZE; i++) {
2446		vm_object_t lobject;
2447		pt_entry_t *pte;
2448
2449		addr = addra + pmap_prefault_pageorder[i];
2450		if (addr > addra + (PFFOR * PAGE_SIZE))
2451			addr = 0;
2452
2453		if (addr < starta || addr >= entry->end)
2454			continue;
2455
2456		if ((*pmap_pde(pmap, addr)) == 0)
2457			continue;
2458
2459		pte = vtopte(addr);
2460		if (*pte)
2461			continue;
2462
2463		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2464		lobject = object;
2465		for (m = vm_page_lookup(lobject, pindex);
2466		    (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2467		    lobject = lobject->backing_object) {
2468			if (lobject->backing_object_offset & PAGE_MASK)
2469				break;
2470			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2471			m = vm_page_lookup(lobject->backing_object, pindex);
2472		}
2473
2474		/*
2475		 * Give up when a page is not in memory.
2476		 */
2477		if (m == NULL)
2478			break;
2479		vm_page_lock_queues();
2480		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2481			(m->busy == 0) &&
2482		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2483
2484			if ((m->queue - m->pc) == PQ_CACHE) {
2485				vm_page_deactivate(m);
2486			}
2487			vm_page_busy(m);
2488			vm_page_unlock_queues();
2489			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2490			vm_page_lock_queues();
2491			vm_page_wakeup(m);
2492		}
2493		vm_page_unlock_queues();
2494	}
2495}
2496
2497/*
2498 *	Routine:	pmap_change_wiring
2499 *	Function:	Change the wiring attribute for a map/virtual-address
2500 *			pair.
2501 *	In/out conditions:
2502 *			The mapping must already exist in the pmap.
2503 */
2504void
2505pmap_change_wiring(pmap, va, wired)
2506	register pmap_t pmap;
2507	vm_offset_t va;
2508	boolean_t wired;
2509{
2510	register pt_entry_t *pte;
2511
2512	if (pmap == NULL)
2513		return;
2514
2515	pte = pmap_pte_quick(pmap, va);
2516
2517	if (wired && !pmap_pte_w(pte))
2518		pmap->pm_stats.wired_count++;
2519	else if (!wired && pmap_pte_w(pte))
2520		pmap->pm_stats.wired_count--;
2521
2522	/*
2523	 * Wiring is not a hardware characteristic so there is no need to
2524	 * invalidate TLB.
2525	 */
2526	pmap_pte_set_w(pte, wired);
2527}
2528
2529
2530
2531/*
2532 *	Copy the range specified by src_addr/len
2533 *	from the source map to the range dst_addr/len
2534 *	in the destination map.
2535 *
2536 *	This routine is only advisory and need not do anything.
2537 */
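/*
 * In practice this is only useful when the source and destination
 * ranges coincide (as in fork()); the dst_addr != src_addr check below
 * simply declines to do anything otherwise, which is permitted since
 * the routine is advisory.
 */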
2538
2539void
2540pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2541	  vm_offset_t src_addr)
2542{
2543	vm_offset_t addr;
2544	vm_offset_t end_addr = src_addr + len;
2545	vm_offset_t pdnxt;
2546	vm_page_t m;
2547
2548	if (dst_addr != src_addr)
2549		return;
2550
2551	if (!pmap_is_current(src_pmap))
2552		return;
2553
2554	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2555		pt_entry_t *src_pte, *dst_pte;
2556		vm_page_t dstmpte, srcmpte;
2557		pd_entry_t srcptepaddr;
2558		unsigned ptepindex;
2559
2560		if (addr >= UPT_MIN_ADDRESS)
2561			panic("pmap_copy: invalid to pmap_copy page tables\n");
2562
2563		/*
2564		 * Don't let optional prefaulting of pages make us go
2565		 * way below the low water mark of free pages or way
2566		 * above the high water mark of used pv entries.
2567		 */
2568		if (cnt.v_free_count < cnt.v_free_reserved ||
2569		    pv_entry_count > pv_entry_high_water)
2570			break;
2571
2572		pdnxt = (addr + NBPDR) & ~PDRMASK;
2573		ptepindex = addr >> PDRSHIFT;
2574
2575		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2576		if (srcptepaddr == 0)
2577			continue;
2578
2579		if (srcptepaddr & PG_PS) {
2580			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2581				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2582				dst_pmap->pm_stats.resident_count +=
2583				    NBPDR / PAGE_SIZE;
2584			}
2585			continue;
2586		}
2587
2588		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2589		if ((srcmpte == NULL) ||
2590		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2591			continue;
2592
2593		if (pdnxt > end_addr)
2594			pdnxt = end_addr;
2595
2596		src_pte = vtopte(addr);
2597		while (addr < pdnxt) {
2598			pt_entry_t ptetemp;
2599			ptetemp = *src_pte;
2600			/*
2601			 * We only virtual-copy managed pages.
2602			 */
2603			if ((ptetemp & PG_MANAGED) != 0) {
2604				/*
2605				 * We have to check after allocpte for the
2606				 * pte still being around...  allocpte can
2607				 * block.
2608				 */
2609				dstmpte = pmap_allocpte(dst_pmap, addr);
2610				dst_pte = pmap_pte_quick(dst_pmap, addr);
2611				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2612					/*
2613					 * Clear the modified and
2614					 * accessed (referenced) bits
2615					 * during the copy.
2616					 */
2617					m = PHYS_TO_VM_PAGE(ptetemp);
2618					*dst_pte = ptetemp & ~(PG_M | PG_A);
2619					dst_pmap->pm_stats.resident_count++;
2620					pmap_insert_entry(dst_pmap, addr,
2621						dstmpte, m);
2622	 			} else {
2623					vm_page_lock_queues();
2624					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2625					vm_page_unlock_queues();
2626				}
2627				if (dstmpte->hold_count >= srcmpte->hold_count)
2628					break;
2629			}
2630			addr += PAGE_SIZE;
2631			src_pte++;
2632		}
2633	}
2634}
2635
2636#ifdef SMP
2637
2638/*
2639 *	pmap_zpi_switchin*()
2640 *
2641 *	These functions allow us to avoid doing IPIs altogether in certain
2642 *	temporary page-mapping situations (page zeroing).  Instead, to deal
2643 *	with being preempted and moved onto a different cpu, we invalidate
2644 *	the page when the scheduler switches us in.  This does not occur
2645 *	very often, so we remain relatively optimal with very little effort.
2646 */
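/*
 * The pattern used by the page zero/copy routines below is simply:
 *
 *	curthread->td_switchin = pmap_zpi_switchin2;
 *	invlpg((u_int)CADDR2);
 *	... use the temporary mapping ...
 *	curthread->td_switchin = NULL;
 *
 * so a preemption between the invlpg and the access re-runs the
 * invalidation on whichever cpu we are switched back in on, instead of
 * requiring an IPI.
 */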
2647static void
2648pmap_zpi_switchin12(void)
2649{
2650	invlpg((u_int)CADDR1);
2651	invlpg((u_int)CADDR2);
2652}
2653
2654static void
2655pmap_zpi_switchin2(void)
2656{
2657	invlpg((u_int)CADDR2);
2658}
2659
2660static void
2661pmap_zpi_switchin3(void)
2662{
2663	invlpg((u_int)CADDR3);
2664}
2665
2666#endif
2667
2668/*
2669 *	pmap_zero_page zeros the specified hardware page by mapping
2670 *	the page into KVM and using bzero to clear its contents.
2671 */
2672void
2673pmap_zero_page(vm_page_t m)
2674{
2675
2676	mtx_lock(&CMAPCADDR12_lock);
2677	if (*CMAP2)
2678		panic("pmap_zero_page: CMAP2 busy");
2679	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2680#ifdef I386_CPU
2681	invltlb();
2682#else
2683#ifdef SMP
2684	curthread->td_switchin = pmap_zpi_switchin2;
2685#endif
2686	invlpg((u_int)CADDR2);
2687#endif
2688#if defined(I686_CPU)
2689	if (cpu_class == CPUCLASS_686)
2690		i686_pagezero(CADDR2);
2691	else
2692#endif
2693		bzero(CADDR2, PAGE_SIZE);
2694#ifdef SMP
2695	curthread->td_switchin = NULL;
2696#endif
2697	*CMAP2 = 0;
2698	mtx_unlock(&CMAPCADDR12_lock);
2699}
2700
2701/*
2702 *	pmap_zero_page_area zeros the specified hardware page by mapping
2703 *	the page into KVM and using bzero to clear its contents.
2704 *
2705 *	off and size may not cover an area beyond a single hardware page.
2706 */
2707void
2708pmap_zero_page_area(vm_page_t m, int off, int size)
2709{
2710
2711	mtx_lock(&CMAPCADDR12_lock);
2712	if (*CMAP2)
2713		panic("pmap_zero_page: CMAP2 busy");
2714	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2715#ifdef I386_CPU
2716	invltlb();
2717#else
2718#ifdef SMP
2719	curthread->td_switchin = pmap_zpi_switchin2;
2720#endif
2721	invlpg((u_int)CADDR2);
2722#endif
2723#if defined(I686_CPU)
2724	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2725		i686_pagezero(CADDR2);
2726	else
2727#endif
2728		bzero((char *)CADDR2 + off, size);
2729#ifdef SMP
2730	curthread->td_switchin = NULL;
2731#endif
2732	*CMAP2 = 0;
2733	mtx_unlock(&CMAPCADDR12_lock);
2734}
2735
2736/*
2737 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2738 *	the page into KVM and using bzero to clear its contents.  This
2739 *	is intended to be called from the vm_pagezero process only and
2740 *	outside of Giant.
2741 */
2742void
2743pmap_zero_page_idle(vm_page_t m)
2744{
2745
2746	if (*CMAP3)
2747		panic("pmap_zero_page: CMAP3 busy");
2748	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2749#ifdef I386_CPU
2750	invltlb();
2751#else
2752#ifdef SMP
2753	curthread->td_switchin = pmap_zpi_switchin3;
2754#endif
2755	invlpg((u_int)CADDR3);
2756#endif
2757#if defined(I686_CPU)
2758	if (cpu_class == CPUCLASS_686)
2759		i686_pagezero(CADDR3);
2760	else
2761#endif
2762		bzero(CADDR3, PAGE_SIZE);
2763#ifdef SMP
2764	curthread->td_switchin = NULL;
2765#endif
2766	*CMAP3 = 0;
2767}
2768
2769/*
2770 *	pmap_copy_page copies the specified (machine independent)
2771 *	page by mapping the page into virtual memory and using
2772 *	bcopy to copy the page, one machine-dependent page at a
2773 *	time.
2774 */
2775void
2776pmap_copy_page(vm_page_t src, vm_page_t dst)
2777{
2778
2779	mtx_lock(&CMAPCADDR12_lock);
2780	if (*CMAP1)
2781		panic("pmap_copy_page: CMAP1 busy");
2782	if (*CMAP2)
2783		panic("pmap_copy_page: CMAP2 busy");
2784	*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2785	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2786#ifdef I386_CPU
2787	invltlb();
2788#else
2789#ifdef SMP
2790	curthread->td_switchin = pmap_zpi_switchin12;
2791#endif
2792	invlpg((u_int)CADDR1);
2793	invlpg((u_int)CADDR2);
2794#endif
2795	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2796#ifdef SMP
2797	curthread->td_switchin = NULL;
2798#endif
2799	*CMAP1 = 0;
2800	*CMAP2 = 0;
2801	mtx_unlock(&CMAPCADDR12_lock);
2802}
2803
2804/*
2805 * Returns true if the pmap's pv is one of the first
2806 * 16 pvs linked to from this page.  This count may
2807 * be changed upwards or downwards in the future; it
2808 * is only necessary that true be returned for a small
2809 * subset of pmaps for proper page aging.
2810 */
2811boolean_t
2812pmap_page_exists_quick(pmap, m)
2813	pmap_t pmap;
2814	vm_page_t m;
2815{
2816	pv_entry_t pv;
2817	int loops = 0;
2818	int s;
2819
2820	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2821		return FALSE;
2822
2823	s = splvm();
2824	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2825	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2826		if (pv->pv_pmap == pmap) {
2827			splx(s);
2828			return TRUE;
2829		}
2830		loops++;
2831		if (loops >= 16)
2832			break;
2833	}
2834	splx(s);
2835	return (FALSE);
2836}
2837
2838#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2839/*
2840 * Remove all pages from the specified address space;
2841 * this aids process exit speed.  Also, this code is
2842 * special-cased for the current process only, but can
2843 * have the more generic (and slightly slower) mode
2844 * enabled.  This is much faster than pmap_remove in
2845 * the case of running down an entire address space.
2846 */
2847void
2848pmap_remove_pages(pmap, sva, eva)
2849	pmap_t pmap;
2850	vm_offset_t sva, eva;
2851{
2852	pt_entry_t *pte, tpte;
2853	vm_page_t m;
2854	pv_entry_t pv, npv;
2855	int s;
2856
2857#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2858	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2859		printf("warning: pmap_remove_pages called with non-current pmap\n");
2860		return;
2861	}
2862#endif
2863	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2864	s = splvm();
2865	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2866
2867		if (pv->pv_va >= eva || pv->pv_va < sva) {
2868			npv = TAILQ_NEXT(pv, pv_plist);
2869			continue;
2870		}
2871
2872#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2873		pte = vtopte(pv->pv_va);
2874#else
2875		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2876#endif
2877		tpte = *pte;
2878
2879		if (tpte == 0) {
2880			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2881							pte, pv->pv_va);
2882			panic("bad pte");
2883		}
2884
2885/*
2886 * We cannot remove wired pages from a process' mapping at this time
2887 */
2888		if (tpte & PG_W) {
2889			npv = TAILQ_NEXT(pv, pv_plist);
2890			continue;
2891		}
2892
2893		m = PHYS_TO_VM_PAGE(tpte);
2894		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2895		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2896		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2897
2898		KASSERT(m < &vm_page_array[vm_page_array_size],
2899			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2900
2901		pv->pv_pmap->pm_stats.resident_count--;
2902
2903		*pte = 0;
2904
2905		/*
2906		 * Update the vm_page_t clean and reference bits.
2907		 */
2908		if (tpte & PG_M) {
2909			vm_page_dirty(m);
2910		}
2911
2912		npv = TAILQ_NEXT(pv, pv_plist);
2913		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2914
2915		m->md.pv_list_count--;
2916		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2917		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2918			vm_page_flag_clear(m, PG_WRITEABLE);
2919		}
2920
2921		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2922		free_pv_entry(pv);
2923	}
2924	splx(s);
2925	pmap_invalidate_all(pmap);
2926}
2927
2928/*
2929 *	pmap_is_modified:
2930 *
2931 *	Return whether or not the specified physical page was modified
2932 *	in any physical maps.
2933 */
2934boolean_t
2935pmap_is_modified(vm_page_t m)
2936{
2937	pv_entry_t pv;
2938	pt_entry_t *pte;
2939	int s;
2940
2941	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2942		return FALSE;
2943
2944	s = splvm();
2945	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2946	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2947		/*
2948		 * If the bit being tested is the modified bit, then
2949		 * treat ptes in the clean submap as never modified;
2950		 * modifications there are not tracked.
2951		 */
2952		if (!pmap_track_modified(pv->pv_va))
2953			continue;
2954#if defined(PMAP_DIAGNOSTIC)
2955		if (!pv->pv_pmap) {
2956			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2957			continue;
2958		}
2959#endif
2960		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2961		if (*pte & PG_M) {
2962			splx(s);
2963			return TRUE;
2964		}
2965	}
2966	splx(s);
2967	return (FALSE);
2968}
2969
2970/*
2971 * this routine is used to modify bits in ptes
2972 */
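/*
 * Illustrative callers (all defined below): pmap_clear_modify uses
 * pmap_changebit(m, PG_M, FALSE), pmap_clear_reference uses
 * pmap_changebit(m, PG_A, FALSE), and pmap_page_protect downgrades
 * writable mappings with pmap_changebit(m, PG_RW, FALSE).
 */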
2973static __inline void
2974pmap_changebit(vm_page_t m, int bit, boolean_t setem)
2975{
2976	register pv_entry_t pv;
2977	register pt_entry_t *pte;
2978	int s;
2979
2980	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2981	    (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2982		return;
2983
2984	s = splvm();
2985	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2986	/*
2987	 * Loop over all current mappings, setting/clearing as appropriate.
2988	 * If setting RO, do we need to clear the VAC?
2989	 */
2990	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2991		/*
2992		 * don't write protect pager mappings
2993		 */
2994		if (!setem && (bit == PG_RW)) {
2995			if (!pmap_track_modified(pv->pv_va))
2996				continue;
2997		}
2998
2999#if defined(PMAP_DIAGNOSTIC)
3000		if (!pv->pv_pmap) {
3001			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3002			continue;
3003		}
3004#endif
3005
3006		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3007
3008		if (setem) {
3009			*pte |= bit;
3010			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3011		} else {
3012			pt_entry_t pbits = *pte;
3013			if (pbits & bit) {
3014				if (bit == PG_RW) {
3015					if (pbits & PG_M) {
3016						vm_page_dirty(m);
3017					}
3018					*pte = pbits & ~(PG_M|PG_RW);
3019				} else {
3020					*pte = pbits & ~bit;
3021				}
3022				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3023			}
3024		}
3025	}
3026	if (!setem && bit == PG_RW)
3027		vm_page_flag_clear(m, PG_WRITEABLE);
3028	splx(s);
3029}
3030
3031/*
3032 *      pmap_page_protect:
3033 *
3034 *      Lower the permission for all mappings to a given page.
3035 */
3036void
3037pmap_page_protect(vm_page_t m, vm_prot_t prot)
3038{
3039	if ((prot & VM_PROT_WRITE) == 0) {
3040		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3041			pmap_changebit(m, PG_RW, FALSE);
3042		} else {
3043			pmap_remove_all(m);
3044		}
3045	}
3046}
3047
3048/*
3049 *	pmap_ts_referenced:
3050 *
3051 *	Return a count of reference bits for a page, clearing those bits.
3052 *	It is not necessary for every reference bit to be cleared, but it
3053 *	is necessary that 0 only be returned when there are truly no
3054 *	reference bits set.
3055 *
3056 *	XXX: The exact number of bits to check and clear is a matter that
3057 *	should be tested and standardized at some point in the future for
3058 *	optimal aging of shared pages.
3059 */
3060int
3061pmap_ts_referenced(vm_page_t m)
3062{
3063	register pv_entry_t pv, pvf, pvn;
3064	pt_entry_t *pte;
3065	int s;
3066	int rtval = 0;
3067
3068	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3069		return (rtval);
3070
3071	s = splvm();
3072	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3073	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3074
3075		pvf = pv;
3076
3077		do {
3078			pvn = TAILQ_NEXT(pv, pv_list);
3079
3080			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3081
3082			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3083
3084			if (!pmap_track_modified(pv->pv_va))
3085				continue;
3086
3087			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3088
3089			if (pte && (*pte & PG_A)) {
3090				*pte &= ~PG_A;
3091
3092				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
3093
3094				rtval++;
3095				if (rtval > 4) {
3096					break;
3097				}
3098			}
3099		} while ((pv = pvn) != NULL && pv != pvf);
3100	}
3101	splx(s);
3102
3103	return (rtval);
3104}
3105
3106/*
3107 *	Clear the modify bits on the specified physical page.
3108 */
3109void
3110pmap_clear_modify(vm_page_t m)
3111{
3112	pmap_changebit(m, PG_M, FALSE);
3113}
3114
3115/*
3116 *	pmap_clear_reference:
3117 *
3118 *	Clear the reference bit on the specified physical page.
3119 */
3120void
3121pmap_clear_reference(vm_page_t m)
3122{
3123	pmap_changebit(m, PG_A, FALSE);
3124}
3125
3126/*
3127 * Miscellaneous support routines follow
3128 */
3129
3130static void
3131i386_protection_init()
3132{
3133	register int *kp, prot;
3134
3135	kp = protection_codes;
3136	for (prot = 0; prot < 8; prot++) {
3137		switch (prot) {
3138		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3139			/*
3140			 * Read access is also 0. There isn't any execute bit,
3141			 * so just make it readable.
3142			 */
3143		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3144		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3145		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3146			*kp++ = 0;
3147			break;
3148		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3149		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3150		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3151		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3152			*kp++ = PG_RW;
3153			break;
3154		}
3155	}
3156}
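/*
 * The net effect of the table built above: because the i386 has no
 * per-page execute bit, any protection lacking VM_PROT_WRITE maps to 0
 * (read-only, since PG_V and PG_U are added elsewhere), and any
 * protection including VM_PROT_WRITE maps to PG_RW.
 */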
3157
3158/*
3159 * Map a set of physical memory pages into the kernel virtual
3160 * address space. Return a pointer to where it is mapped. This
3161 * routine is intended to be used for mapping device memory,
3162 * NOT real memory.
3163 */
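/*
 * Illustrative use (hypothetical driver; "bar_pa" and "bar_len" are
 * placeholders for a device's physical base and size):
 *
 *	regs = pmap_mapdev(bar_pa, bar_len);
 *	... access device registers through "regs" ...
 *	pmap_unmapdev((vm_offset_t)regs, bar_len);
 *
 * The returned pointer preserves the sub-page offset of "pa".
 */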
3164void *
3165pmap_mapdev(pa, size)
3166	vm_paddr_t pa;
3167	vm_size_t size;
3168{
3169	vm_offset_t va, tmpva, offset;
3170
3171	offset = pa & PAGE_MASK;
3172	size = roundup(offset + size, PAGE_SIZE);
3173
3174	GIANT_REQUIRED;
3175
3176	va = kmem_alloc_pageable(kernel_map, size);
3177	if (!va)
3178		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3179
3180	pa = pa & PG_FRAME;
3181	for (tmpva = va; size > 0; ) {
3182		pmap_kenter(tmpva, pa);
3183		size -= PAGE_SIZE;
3184		tmpva += PAGE_SIZE;
3185		pa += PAGE_SIZE;
3186	}
3187	pmap_invalidate_range(kernel_pmap, va, tmpva);
3188	return ((void *)(va + offset));
3189}
3190
3191void
3192pmap_unmapdev(va, size)
3193	vm_offset_t va;
3194	vm_size_t size;
3195{
3196	vm_offset_t base, offset, tmpva;
3197	pt_entry_t *pte;
3198
3199	base = va & PG_FRAME;
3200	offset = va & PAGE_MASK;
3201	size = roundup(offset + size, PAGE_SIZE);
3202	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
3203		pte = vtopte(tmpva);
3204		*pte = 0;
3205	}
3206	pmap_invalidate_range(kernel_pmap, va, tmpva);
3207	kmem_free(kernel_map, base, size);
3208}
3209
3210/*
3211 * perform the pmap work for mincore
3212 */
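/*
 * The return value is a combination of MINCORE_INCORE,
 * MINCORE_MODIFIED/MINCORE_MODIFIED_OTHER, and
 * MINCORE_REFERENCED/MINCORE_REFERENCED_OTHER, or 0 if no valid
 * mapping exists at "addr".
 */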
3213int
3214pmap_mincore(pmap, addr)
3215	pmap_t pmap;
3216	vm_offset_t addr;
3217{
3218	pt_entry_t *ptep, pte;
3219	vm_page_t m;
3220	int val = 0;
3221
3222	ptep = pmap_pte_quick(pmap, addr);
3223	if (ptep == 0) {
3224		return 0;
3225	}
3226
3227	if ((pte = *ptep) != 0) {
3228		vm_paddr_t pa;
3229
3230		val = MINCORE_INCORE;
3231		if ((pte & PG_MANAGED) == 0)
3232			return val;
3233
3234		pa = pte & PG_FRAME;
3235
3236		m = PHYS_TO_VM_PAGE(pa);
3237
3238		/*
3239		 * Modified by us
3240		 */
3241		if (pte & PG_M)
3242			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3243		else {
3244			/*
3245			 * Modified by someone else
3246			 */
3247			vm_page_lock_queues();
3248			if (m->dirty || pmap_is_modified(m))
3249				val |= MINCORE_MODIFIED_OTHER;
3250			vm_page_unlock_queues();
3251		}
3252		/*
3253		 * Referenced by us
3254		 */
3255		if (pte & PG_A)
3256			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3257		else {
3258			/*
3259			 * Referenced by someone else
3260			 */
3261			vm_page_lock_queues();
3262			if ((m->flags & PG_REFERENCED) ||
3263			    pmap_ts_referenced(m)) {
3264				val |= MINCORE_REFERENCED_OTHER;
3265				vm_page_flag_set(m, PG_REFERENCED);
3266			}
3267			vm_page_unlock_queues();
3268		}
3269	}
3270	return val;
3271}
3272
3273void
3274pmap_activate(struct thread *td)
3275{
3276	struct proc *p = td->td_proc;
3277	pmap_t	pmap;
3278	u_int32_t  cr3;
3279
3280	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3281#if defined(SMP)
3282	pmap->pm_active |= PCPU_GET(cpumask);
3283#else
3284	pmap->pm_active |= 1;
3285#endif
3286	cr3 = vtophys(pmap->pm_pdir);
3287	/* XXXKSE this is wrong.
3288	 * pmap_activate is for the current thread on the current cpu
3289	 */
3290	if (p->p_flag & P_THREADED) {
3291		/* Make sure all other cr3 entries are updated. */
3292		/* what if they are running?  XXXKSE (maybe abort them) */
3293		FOREACH_THREAD_IN_PROC(p, td) {
3294			td->td_pcb->pcb_cr3 = cr3;
3295		}
3296	} else {
3297		td->td_pcb->pcb_cr3 = cr3;
3298	}
3299	load_cr3(cr3);
3300#ifdef SWTCH_OPTIM_STATS
3301	tlb_flush_count++;
3302#endif
3303}
3304
3305vm_offset_t
3306pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3307{
3308
3309	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3310		return addr;
3311	}
3312
3313	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3314	return addr;
3315}
3316
3317
3318#if defined(PMAP_DEBUG)
3319pmap_pid_dump(int pid)
3320{
3321	pmap_t pmap;
3322	struct proc *p;
3323	int npte = 0;
3324	int index;
3325
3326	sx_slock(&allproc_lock);
3327	LIST_FOREACH(p, &allproc, p_list) {
3328		if (p->p_pid != pid)
3329			continue;
3330
3331		if (p->p_vmspace) {
3332			int i,j;
3333			index = 0;
3334			pmap = vmspace_pmap(p->p_vmspace);
3335			for (i = 0; i < NPDEPTD; i++) {
3336				pd_entry_t *pde;
3337				pt_entry_t *pte;
3338				vm_offset_t base = i << PDRSHIFT;
3339
3340				pde = &pmap->pm_pdir[i];
3341				if (pde && pmap_pde_v(pde)) {
3342					for (j = 0; j < NPTEPG; j++) {
3343						vm_offset_t va = base + (j << PAGE_SHIFT);
3344						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3345							if (index) {
3346								index = 0;
3347								printf("\n");
3348							}
3349							sx_sunlock(&allproc_lock);
3350							return npte;
3351						}
3352						pte = pmap_pte_quick(pmap, va);
3353						if (pte && pmap_pte_v(pte)) {
3354							pt_entry_t pa;
3355							vm_page_t m;
3356							pa = *pte;
3357							m = PHYS_TO_VM_PAGE(pa);
3358							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3359								va, pa, m->hold_count, m->wire_count, m->flags);
3360							npte++;
3361							index++;
3362							if (index >= 2) {
3363								index = 0;
3364								printf("\n");
3365							} else {
3366								printf(" ");
3367							}
3368						}
3369					}
3370				}
3371			}
3372		}
3373	}
3374	sx_sunlock(&allproc_lock);
3375	return npte;
3376}
3377#endif
3378
3379#if defined(DEBUG)
3380
3381static void	pads(pmap_t pm);
3382void		pmap_pvdump(vm_offset_t pa);
3383
3384/* print address space of pmap */
3385static void
3386pads(pm)
3387	pmap_t pm;
3388{
3389	int i, j;
3390	vm_paddr_t va;
3391	pt_entry_t *ptep;
3392
3393	if (pm == kernel_pmap)
3394		return;
3395	for (i = 0; i < NPDEPTD; i++)
3396		if (pm->pm_pdir[i])
3397			for (j = 0; j < NPTEPG; j++) {
3398				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3399				if (pm == kernel_pmap && va < KERNBASE)
3400					continue;
3401				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3402					continue;
3403				ptep = pmap_pte_quick(pm, va);
3404				if (pmap_pte_v(ptep))
3405					printf("%x:%x ", va, *ptep);
3406			};
3407
3408}
3409
3410void
3411pmap_pvdump(pa)
3412	vm_paddr_t pa;
3413{
3414	pv_entry_t pv;
3415	vm_page_t m;
3416
3417	printf("pa %x", pa);
3418	m = PHYS_TO_VM_PAGE(pa);
3419	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3420		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3421		pads(pv->pv_pmap);
3422	}
3423	printf(" ");
3424}
3425#endif
3426