1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 */
43/*-
44 * Copyright (c) 2003 Networks Associates Technology, Inc.
45 * All rights reserved.
46 *
47 * This software was developed for the FreeBSD Project by Jake Burkholder,
48 * Safeport Network Services, and Network Associates Laboratories, the
49 * Security Research Division of Network Associates, Inc. under
50 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
51 * CHATS research program.
52 *
53 * Redistribution and use in source and binary forms, with or without
54 * modification, are permitted provided that the following conditions
55 * are met:
56 * 1. Redistributions of source code must retain the above copyright
57 *    notice, this list of conditions and the following disclaimer.
58 * 2. Redistributions in binary form must reproduce the above copyright
59 *    notice, this list of conditions and the following disclaimer in the
60 *    documentation and/or other materials provided with the distribution.
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72 * SUCH DAMAGE.
73 */
74
75#include <sys/cdefs.h>
76__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 121996 2003-11-03 22:32:04Z jhb $");
77
78/*
79 *	Manages physical address maps.
80 *
81 *	In addition to hardware address maps, this
82 *	module is called upon to provide software-use-only
83 *	maps which may or may not be stored in the same
84 *	form as hardware maps.  These pseudo-maps are
85 *	used to store intermediate results from copy
86 *	operations to and from address spaces.
87 *
88 *	Since the information managed by this module is
89 *	also stored by the logical address mapping module,
90 *	this module may throw away valid virtual-to-physical
91 *	mappings at almost any time.  However, invalidations
92 *	of virtual-to-physical mappings must be done as
93 *	requested.
94 *
95 *	In order to cope with hardware architectures which
96 *	make virtual-to-physical map invalidates expensive,
97 *	this module may delay invalidation or reduced-protection
98 *	operations until such time as they are actually
99 *	necessary.  This module is given full information as
100 *	to which processors are currently using which maps,
101 *	and to when physical maps must be made correct.
102 */
103
104#include "opt_pmap.h"
105#include "opt_msgbuf.h"
106#include "opt_kstack_pages.h"
107
108#include <sys/param.h>
109#include <sys/systm.h>
110#include <sys/kernel.h>
111#include <sys/lock.h>
112#include <sys/mman.h>
113#include <sys/msgbuf.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/sx.h>
117#include <sys/user.h>
118#include <sys/vmmeter.h>
119#include <sys/sysctl.h>
120#ifdef SMP
121#include <sys/smp.h>
122#endif
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/uma.h>
134
135#include <machine/cpu.h>
136#include <machine/cputypes.h>
137#include <machine/md_var.h>
138#include <machine/specialreg.h>
139#ifdef SMP
140#include <machine/smp.h>
141#endif
142
143#define PMAP_KEEP_PDIRS
144#ifndef PMAP_SHPGPERPROC
145#define PMAP_SHPGPERPROC 200
146#endif
147
148#if defined(DIAGNOSTIC)
149#define PMAP_DIAGNOSTIC
150#endif
151
152#define MINPV 2048
153
154#if !defined(PMAP_DIAGNOSTIC)
155#define PMAP_INLINE __inline
156#else
157#define PMAP_INLINE
158#endif
159
160/*
161 * Get PDEs and PTEs for user/kernel address space
162 */
163#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
164#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
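/*
 * pmap_pde() returns a pointer to the page directory entry that covers
 * virtual address 'v' in pmap 'm'; pdir_pde() performs the same lookup
 * on a bare page directory array.  Both simply index by the upper bits
 * of the address (v >> PDRSHIFT).
 */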
165
166#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
167#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
168#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
169#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
170#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
171
172#define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W))
173#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
174
175/*
176 * Given a map and a machine independent protection code,
177 * convert to a vax protection code.
178 */
179#define pte_prot(m, p)	(protection_codes[p])
180static int protection_codes[8];
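/*
 * protection_codes[] is indexed by a VM_PROT_{READ,WRITE,EXECUTE} bit
 * combination (hence the eight entries) and yields the page-table bits
 * to use for that protection; it is filled in by i386_protection_init()
 * during pmap_bootstrap().
 */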
181
182struct pmap kernel_pmap_store;
183LIST_HEAD(pmaplist, pmap);
184static struct pmaplist allpmaps;
185static struct mtx allpmaps_lock;
186#ifdef SMP
187static struct mtx lazypmap_lock;
188#endif
189
190vm_paddr_t avail_start;	/* PA of first available physical page */
191vm_paddr_t avail_end;	/* PA of last available physical page */
192vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
193vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
194static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
195int pgeflag = 0;		/* PG_G or-in */
196int pseflag = 0;		/* PG_PS or-in */
197
198static int nkpt;
199vm_offset_t kernel_vm_end;
200extern u_int32_t KERNend;
201
202#ifdef PAE
203static uma_zone_t pdptzone;
204#endif
205
206/*
207 * Data for the pv entry allocation mechanism
208 */
209static uma_zone_t pvzone;
210static struct vm_object pvzone_obj;
211static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
212int pmap_pagedaemon_waken;
213
214/*
215 * All those kernel PT submaps that BSD is so fond of
216 */
217pt_entry_t *CMAP1 = 0;
218static pt_entry_t *CMAP2, *CMAP3, *ptmmap;
219caddr_t CADDR1 = 0, ptvmmap = 0;
220static caddr_t CADDR2, CADDR3;
221static struct mtx CMAPCADDR12_lock;
222static pt_entry_t *msgbufmap;
223struct msgbuf *msgbufp = 0;
224
225/*
226 * Crashdump maps.
227 */
228static pt_entry_t *pt_crashdumpmap;
229static caddr_t crashdumpmap;
230
231#ifdef SMP
232extern pt_entry_t *SMPpt;
233#endif
234static pt_entry_t *PMAP1 = 0;
235static pt_entry_t *PADDR1 = 0;
236
237static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
238static pv_entry_t get_pv_entry(void);
239static void	i386_protection_init(void);
240static void	pmap_clear_ptes(vm_page_t m, int bit)
241    __always_inline;
242
243static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
244static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
245static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
246					vm_offset_t va);
247static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
248		vm_page_t mpte, vm_page_t m);
249
250static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
251
252static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex);
253static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
254static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
255static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
256#ifdef PAE
257static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
258#endif
259
260CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
261CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
262
263/*
264 * Move the kernel virtual free pointer to the next
265 * 4MB.  This is used to help improve performance
266 * by using a large (4MB) page for much of the kernel
267 * (.text, .data, .bss)
268 */
269static vm_offset_t
270pmap_kmem_choose(vm_offset_t addr)
271{
272	vm_offset_t newaddr = addr;
273
274#ifndef DISABLE_PSE
275	if (cpu_feature & CPUID_PSE)
276		newaddr = (addr + PDRMASK) & ~PDRMASK;
277#endif
278	return newaddr;
279}
280
281/*
282 *	Bootstrap the system enough to run with virtual memory.
283 *
284 *	On the i386 this is called after mapping has already been enabled
285 *	and just syncs the pmap module with what has already been done.
286 *	[We can't call it easily with mapping off since the kernel is not
287 *	mapped with PA == VA, hence we would have to relocate every address
288 *	from the linked base (virtual) address "KERNBASE" to the actual
289 *	(physical) address starting relative to 0]
290 */
291void
292pmap_bootstrap(firstaddr, loadaddr)
293	vm_paddr_t firstaddr;
294	vm_paddr_t loadaddr;
295{
296	vm_offset_t va;
297	pt_entry_t *pte;
298	int i;
299
300	avail_start = firstaddr;
301
302	/*
303	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
304	 * large. It should instead be correctly calculated in locore.s and
305	 * not based on 'first' (which is a physical address, not a virtual
306	 * address, for the start of unused physical memory). The kernel
307	 * page tables are NOT double mapped and thus should not be included
308	 * in this calculation.
309	 */
310	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
311	virtual_avail = pmap_kmem_choose(virtual_avail);
312
313	virtual_end = VM_MAX_KERNEL_ADDRESS;
314
315	/*
316	 * Initialize protection array.
317	 */
318	i386_protection_init();
319
320	/*
321	 * Initialize the kernel pmap (which is statically allocated).
322	 */
323	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
324#ifdef PAE
325	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
326#endif
327	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
328	TAILQ_INIT(&kernel_pmap->pm_pvlist);
329	LIST_INIT(&allpmaps);
330#ifdef SMP
331	mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN);
332#endif
333	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
334	mtx_lock_spin(&allpmaps_lock);
335	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
336	mtx_unlock_spin(&allpmaps_lock);
337	nkpt = NKPT;
338
339	/*
340	 * Reserve some special page table entries/VA space for temporary
341	 * mapping of pages.
342	 */
343#define	SYSMAP(c, p, v, n)	\
344	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
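/*
 * SYSMAP(c, p, v, n) carves 'n' pages out of the VA/PTE cursors set up
 * below: it stores the current VA (cast to type 'c') in 'v', records a
 * pointer to its page table entry in 'p', and then advances both
 * cursors.  For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) reserves one
 * page of KVA at CADDR1 whose mapping is manipulated through *CMAP1.
 */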
345
346	va = virtual_avail;
347	pte = vtopte(va);
348
349	/*
350	 * CMAP1/CMAP2 are used for zeroing and copying pages.
351	 * CMAP3 is used for the idle process page zeroing.
352	 */
353	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
354	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
355	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
356	*CMAP3 = 0;
357
358	mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF);
359
360	/*
361	 * Crashdump maps.
362	 */
363	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
364
365	/*
366	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
367	 * XXX ptmmap is not used.
368	 */
369	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
370
371	/*
372	 * msgbufp is used to map the system message buffer.
373	 * XXX msgbufmap is not used.
374	 */
375	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
376	       atop(round_page(MSGBUF_SIZE)))
377
378	/*
379	 * ptemap is used for pmap_pte_quick
380	 */
381	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
382
383	virtual_avail = va;
384
385	*CMAP1 = *CMAP2 = 0;
386	for (i = 0; i < NKPT; i++)
387		PTD[i] = 0;
388
389	/* Turn on PG_G on kernel page(s) */
390	pmap_set_pg();
391}
392
393/*
394 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
395 */
396void
397pmap_set_pg(void)
398{
399	pd_entry_t pdir;
400	pt_entry_t *pte;
401	vm_offset_t va, endva;
402	int i;
403
404	if (pgeflag == 0)
405		return;
406
407	i = KERNLOAD/NBPDR;
408	endva = KERNBASE + KERNend;
409
410	if (pseflag) {
411		va = KERNBASE + KERNLOAD;
412		while (va < endva) {
413			pdir = kernel_pmap->pm_pdir[KPTDI+i];
414			pdir |= pgeflag;
415			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
416			invltlb();	/* Play it safe, invltlb() every time */
417			i++;
418			va += NBPDR;
419		}
420	} else {
421		va = (vm_offset_t)btext;
422		while (va < endva) {
423			pte = vtopte(va);
424			if (*pte)
425				*pte |= pgeflag;
426			invltlb();	/* Play it safe, invltlb() every time */
427			va += PAGE_SIZE;
428		}
429	}
430}
431
432static void *
433pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
434{
435	*flags = UMA_SLAB_PRIV;
436	return (void *)kmem_alloc(kernel_map, bytes);
437}
438
439#ifdef PAE
440static void *
441pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
442{
443	*flags = UMA_SLAB_PRIV;
444	return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0));
445}
446#endif
447
448/*
449 *	Initialize the pmap module.
450 *	Called by vm_init, to initialize any structures that the pmap
451 *	system needs to map virtual memory.
452 *	pmap_init has been enhanced to support discontiguous physical
453 *	memory in a fairly consistent way.
454 */
455void
456pmap_init(phys_start, phys_end)
457	vm_paddr_t phys_start, phys_end;
458{
459	int i;
460	int initial_pvs;
461
462	/*
463	 * Allocate memory for random pmap data structures.  Includes the
464	 * pv_head_table.
465	 */
466
467	for (i = 0; i < vm_page_array_size; i++) {
468		vm_page_t m;
469
470		m = &vm_page_array[i];
471		TAILQ_INIT(&m->md.pv_list);
472		m->md.pv_list_count = 0;
473	}
474
475	/*
476	 * init the pv free list
477	 */
478	initial_pvs = vm_page_array_size;
479	if (initial_pvs < MINPV)
480		initial_pvs = MINPV;
481	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
482	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
483	uma_zone_set_allocf(pvzone, pmap_pv_allocf);
484	uma_prealloc(pvzone, initial_pvs);
485
486#ifdef PAE
487	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
488	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
489	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
490	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
491#endif
492
493	/*
494	 * Now it is safe to enable pv_table recording.
495	 */
496	pmap_initialized = TRUE;
497}
498
499/*
500 * Initialize the address space (zone) for the pv_entries.  Set a
501 * high water mark so that the system can recover from excessive
502 * numbers of pv entries.
503 */
504void
505pmap_init2()
506{
507	int shpgperproc = PMAP_SHPGPERPROC;
508
509	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
510	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
511	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
512	pv_entry_high_water = 9 * (pv_entry_max / 10);
513	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
514}
515
516
517/***************************************************
518 * Low level helper routines.....
519 ***************************************************/
520
521#if defined(PMAP_DIAGNOSTIC)
522
523/*
524 * This code checks for pages that are modified but not writable,
525 * which should be an invalid condition.
526 */
527static int
528pmap_nw_modified(pt_entry_t ptea)
529{
530	int pte;
531
532	pte = (int) ptea;
533
534	if ((pte & (PG_M|PG_RW)) == PG_M)
535		return 1;
536	else
537		return 0;
538}
539#endif
540
541
542/*
543 * this routine defines the region(s) of memory that should
544 * not be tested for the modified bit.
545 */
546static PMAP_INLINE int
547pmap_track_modified(vm_offset_t va)
548{
549	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
550		return 1;
551	else
552		return 0;
553}
554
555#ifdef I386_CPU
556/*
557 * i386 only has "invalidate everything" and no SMP to worry about.
558 */
559PMAP_INLINE void
560pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
561{
562
563	if (pmap == kernel_pmap || pmap->pm_active)
564		invltlb();
565}
566
567PMAP_INLINE void
568pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
569{
570
571	if (pmap == kernel_pmap || pmap->pm_active)
572		invltlb();
573}
574
575PMAP_INLINE void
576pmap_invalidate_all(pmap_t pmap)
577{
578
579	if (pmap == kernel_pmap || pmap->pm_active)
580		invltlb();
581}
582#else /* !I386_CPU */
583#ifdef SMP
584/*
585 * For SMP, these functions have to use the IPI mechanism for coherence.
586 */
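/*
 * Each routine invalidates the local TLB when the pmap is active on the
 * current CPU and then, based on pm_active, broadcasts the shootdown to
 * the other CPUs with smp_invlpg()/smp_invltlb() or their masked
 * variants.  smp_tlb_mtx serializes shootdowns once the APs are running.
 */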
587void
588pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
589{
590	u_int cpumask;
591	u_int other_cpus;
592
593	if (smp_started) {
594		if (!(read_eflags() & PSL_I))
595			panic("%s: interrupts disabled", __func__);
596		mtx_lock_spin(&smp_tlb_mtx);
597	} else
598		critical_enter();
599	/*
600	 * We need to disable interrupt preemption but MUST NOT have
601	 * interrupts disabled here.
602	 * XXX we may need to hold schedlock to get a coherent pm_active
603	 * XXX critical sections disable interrupts again
604	 */
605	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
606		invlpg(va);
607		smp_invlpg(va);
608	} else {
609		cpumask = PCPU_GET(cpumask);
610		other_cpus = PCPU_GET(other_cpus);
611		if (pmap->pm_active & cpumask)
612			invlpg(va);
613		if (pmap->pm_active & other_cpus)
614			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
615	}
616	if (smp_started)
617		mtx_unlock_spin(&smp_tlb_mtx);
618	else
619		critical_exit();
620}
621
622void
623pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
624{
625	u_int cpumask;
626	u_int other_cpus;
627	vm_offset_t addr;
628
629	if (smp_started) {
630		if (!(read_eflags() & PSL_I))
631			panic("%s: interrupts disabled", __func__);
632		mtx_lock_spin(&smp_tlb_mtx);
633	} else
634		critical_enter();
635	/*
636	 * We need to disable interrupt preemption but MUST NOT have
637	 * interrupts disabled here.
638	 * XXX we may need to hold schedlock to get a coherent pm_active
639	 * XXX critical sections disable interrupts again
640	 */
641	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
642		for (addr = sva; addr < eva; addr += PAGE_SIZE)
643			invlpg(addr);
644		smp_invlpg_range(sva, eva);
645	} else {
646		cpumask = PCPU_GET(cpumask);
647		other_cpus = PCPU_GET(other_cpus);
648		if (pmap->pm_active & cpumask)
649			for (addr = sva; addr < eva; addr += PAGE_SIZE)
650				invlpg(addr);
651		if (pmap->pm_active & other_cpus)
652			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
653			    sva, eva);
654	}
655	if (smp_started)
656		mtx_unlock_spin(&smp_tlb_mtx);
657	else
658		critical_exit();
659}
660
661void
662pmap_invalidate_all(pmap_t pmap)
663{
664	u_int cpumask;
665	u_int other_cpus;
666
667	if (smp_started) {
668		if (!(read_eflags() & PSL_I))
669			panic("%s: interrupts disabled", __func__);
670		mtx_lock_spin(&smp_tlb_mtx);
671	} else
672		critical_enter();
673	/*
674	 * We need to disable interrupt preemption but MUST NOT have
675	 * interrupts disabled here.
676	 * XXX we may need to hold schedlock to get a coherent pm_active
677	 * XXX critical sections disable interrupts again
678	 */
679	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
680		invltlb();
681		smp_invltlb();
682	} else {
683		cpumask = PCPU_GET(cpumask);
684		other_cpus = PCPU_GET(other_cpus);
685		if (pmap->pm_active & cpumask)
686			invltlb();
687		if (pmap->pm_active & other_cpus)
688			smp_masked_invltlb(pmap->pm_active & other_cpus);
689	}
690	if (smp_started)
691		mtx_unlock_spin(&smp_tlb_mtx);
692	else
693		critical_exit();
694}
695#else /* !SMP */
696/*
697 * Normal, non-SMP, 486+ invalidation functions.
698 * We inline these within pmap.c for speed.
699 */
700PMAP_INLINE void
701pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
702{
703
704	if (pmap == kernel_pmap || pmap->pm_active)
705		invlpg(va);
706}
707
708PMAP_INLINE void
709pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
710{
711	vm_offset_t addr;
712
713	if (pmap == kernel_pmap || pmap->pm_active)
714		for (addr = sva; addr < eva; addr += PAGE_SIZE)
715			invlpg(addr);
716}
717
718PMAP_INLINE void
719pmap_invalidate_all(pmap_t pmap)
720{
721
722	if (pmap == kernel_pmap || pmap->pm_active)
723		invltlb();
724}
725#endif /* !SMP */
726#endif /* !I386_CPU */
727
728/*
729 * Are we current address space or kernel?  N.B. We return FALSE when
730 * a pmap's page table is in use because a kernel thread is borrowing
731 * it.  The borrowed page table can change spontaneously, making any
732 * dependence on its continued use subject to a race condition.
733 */
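/*
 * The comparison below checks the pmap's self-referential directory
 * entry (pm_pdir[PTDPTDI]) against PTDpde[0], which reflects the page
 * directory currently installed in the MMU, so the test is true only
 * when this pmap's page tables are the ones actually in use.
 */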
734static int
735pmap_is_current(pmap_t pmap)
736{
737	return (pmap == kernel_pmap ||
738		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
739	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
740}
741
742/*
743 * Super fast pmap_pte routine best used when scanning
744 * the pv lists.  This eliminates many coarse-grained
745 * invltlb calls.  Note that many of the pv list
746 * scans are across different pmaps.  It is very wasteful
747 * to do an entire invltlb for checking a single mapping.
748 */
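/*
 * When the pmap is not the current one, the needed page table page is
 * temporarily mapped through the reserved PMAP1 page table entry and
 * accessed at the corresponding virtual address PADDR1.
 */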
749pt_entry_t *
750pmap_pte_quick(pmap, va)
751	register pmap_t pmap;
752	vm_offset_t va;
753{
754	pd_entry_t newpf;
755	pd_entry_t *pde;
756
757	pde = pmap_pde(pmap, va);
758	if (*pde & PG_PS)
759		return (pde);
760	if (*pde != 0) {
761		/* are we current address space or kernel? */
762		if (pmap_is_current(pmap))
763			return vtopte(va);
764		newpf = *pde & PG_FRAME;
765		if (((*PMAP1) & PG_FRAME) != newpf) {
766			*PMAP1 = newpf | PG_RW | PG_V;
767			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1);
768		}
769		return PADDR1 + (i386_btop(va) & (NPTEPG - 1));
770	}
771	return (0);
772}
773
774/*
775 *	Routine:	pmap_extract
776 *	Function:
777 *		Extract the physical page address associated
778 *		with the given map/virtual_address pair.
779 */
780vm_paddr_t
781pmap_extract(pmap, va)
782	register pmap_t pmap;
783	vm_offset_t va;
784{
785	vm_paddr_t rtval;
786	pt_entry_t *pte;
787	pd_entry_t pde;
788
789	if (pmap == 0)
790		return 0;
791	pde = pmap->pm_pdir[va >> PDRSHIFT];
792	if (pde != 0) {
793		if ((pde & PG_PS) != 0) {
794			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
795			return rtval;
796		}
797		pte = pmap_pte_quick(pmap, va);
798		rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
799		return rtval;
800	}
801	return 0;
802
803}
804
805/*
806 *	Routine:	pmap_extract_and_hold
807 *	Function:
808 *		Atomically extract and hold the physical page
809 *		with the given pmap and virtual address pair
810 *		if that mapping permits the given protection.
811 */
812vm_page_t
813pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
814{
815	vm_paddr_t pa;
816	vm_page_t m;
817
818	m = NULL;
819	mtx_lock(&Giant);
820	if ((pa = pmap_extract(pmap, va)) != 0) {
821		m = PHYS_TO_VM_PAGE(pa);
822		vm_page_lock_queues();
823		vm_page_hold(m);
824		vm_page_unlock_queues();
825	}
826	mtx_unlock(&Giant);
827	return (m);
828}
829
830/***************************************************
831 * Low level mapping routines.....
832 ***************************************************/
833
834/*
835 * Add a wired page to the kva.
836 * Note: not SMP coherent.
837 */
838PMAP_INLINE void
839pmap_kenter(vm_offset_t va, vm_paddr_t pa)
840{
841	pt_entry_t *pte;
842
843	pte = vtopte(va);
844	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
845}
846
847/*
848 * Remove a page from the kernel pagetables.
849 * Note: not SMP coherent.
850 */
851PMAP_INLINE void
852pmap_kremove(vm_offset_t va)
853{
854	pt_entry_t *pte;
855
856	pte = vtopte(va);
857	pte_clear(pte);
858}
859
860/*
861 *	Used to map a range of physical addresses into kernel
862 *	virtual address space.
863 *
864 *	The value passed in '*virt' is a suggested virtual address for
865 *	the mapping. Architectures which can support a direct-mapped
866 *	physical to virtual region can return the appropriate address
867 *	within that region, leaving '*virt' unchanged. Other
868 *	architectures should map the pages starting at '*virt' and
869 *	update '*virt' with the first usable address after the mapped
870 *	region.
871 */
872vm_offset_t
873pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
874{
875	vm_offset_t va, sva;
876
877	va = sva = *virt;
878	while (start < end) {
879		pmap_kenter(va, start);
880		va += PAGE_SIZE;
881		start += PAGE_SIZE;
882	}
883	pmap_invalidate_range(kernel_pmap, sva, va);
884	*virt = va;
885	return (sva);
886}
887
888
889/*
890 * Add a list of wired pages to the kva.
891 * This routine is only used for temporary
892 * kernel mappings that do not need to have
893 * page modification or references recorded.
894 * Note that old mappings are simply written
895 * over.  The page *must* be wired.
896 * Note: SMP coherent.  Uses a ranged shootdown IPI.
897 */
898void
899pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
900{
901	vm_offset_t va;
902
903	va = sva;
904	while (count-- > 0) {
905		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
906		va += PAGE_SIZE;
907		m++;
908	}
909	pmap_invalidate_range(kernel_pmap, sva, va);
910}
911
912/*
913 * This routine tears out page mappings from the
914 * kernel -- it is meant only for temporary mappings.
915 * Note: SMP coherent.  Uses a ranged shootdown IPI.
916 */
917void
918pmap_qremove(vm_offset_t sva, int count)
919{
920	vm_offset_t va;
921
922	va = sva;
923	while (count-- > 0) {
924		pmap_kremove(va);
925		va += PAGE_SIZE;
926	}
927	pmap_invalidate_range(kernel_pmap, sva, va);
928}
929
930/***************************************************
931 * Page table page management routines.....
932 ***************************************************/
933
934/*
935 * This routine unholds page table pages, and if the hold count
936 * drops to zero, then it decrements the wire count.
937 */
938static int
939_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
940{
941
942	while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
943		vm_page_lock_queues();
944
945	if (m->hold_count == 0) {
946		vm_offset_t pteva;
947		/*
948		 * unmap the page table page
949		 */
950		pmap->pm_pdir[m->pindex] = 0;
951		--pmap->pm_stats.resident_count;
952		/*
953		 * We never unwire a kernel page table page, making a
954		 * check for the kernel_pmap unnecessary.
955		 */
956		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) {
957			/*
958			 * Do an invltlb to make the invalidated mapping
959			 * take effect immediately.
960			 */
961			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
962			pmap_invalidate_page(pmap, pteva);
963		}
964
965		/*
966		 * If the page is finally unwired, simply free it.
967		 */
968		--m->wire_count;
969		if (m->wire_count == 0) {
970			vm_page_busy(m);
971			vm_page_free_zero(m);
972			atomic_subtract_int(&cnt.v_wire_count, 1);
973		}
974		return 1;
975	}
976	return 0;
977}
978
979static PMAP_INLINE int
980pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
981{
982	vm_page_unhold(m);
983	if (m->hold_count == 0)
984		return _pmap_unwire_pte_hold(pmap, m);
985	else
986		return 0;
987}
988
989/*
990 * After removing a page table entry, this routine is used to
991 * conditionally free the page, and manage the hold/wire counts.
992 */
993static int
994pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
995{
996
997	if (va >= VM_MAXUSER_ADDRESS)
998		return 0;
999
1000	return pmap_unwire_pte_hold(pmap, mpte);
1001}
1002
1003void
1004pmap_pinit0(pmap)
1005	struct pmap *pmap;
1006{
1007
1008	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1009#ifdef PAE
1010	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1011#endif
1012	pmap->pm_active = 0;
1013	PCPU_SET(curpmap, pmap);
1014	TAILQ_INIT(&pmap->pm_pvlist);
1015	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1016	mtx_lock_spin(&allpmaps_lock);
1017	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1018	mtx_unlock_spin(&allpmaps_lock);
1019}
1020
1021/*
1022 * Initialize a preallocated and zeroed pmap structure,
1023 * such as one in a vmspace structure.
1024 */
1025void
1026pmap_pinit(pmap)
1027	register struct pmap *pmap;
1028{
1029	vm_page_t m, ptdpg[NPGPTD];
1030	vm_paddr_t pa;
1031	static int color;
1032	int i;
1033
1034	/*
1035	 * No need to allocate page table space yet but we do need a valid
1036	 * page directory table.
1037	 */
1038	if (pmap->pm_pdir == NULL) {
1039		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1040		    NBPTD);
1041#ifdef PAE
1042		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1043		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1044		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1045		    ("pmap_pinit: pdpt misaligned"));
1046		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1047		    ("pmap_pinit: pdpt above 4g"));
1048#endif
1049	}
1050
1051	/*
1052	 * allocate the page directory page(s)
1053	 */
1054	for (i = 0; i < NPGPTD;) {
1055		m = vm_page_alloc(NULL, color++,
1056		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1057		    VM_ALLOC_ZERO);
1058		if (m == NULL)
1059			VM_WAIT;
1060		else {
1061			vm_page_lock_queues();
1062			vm_page_flag_clear(m, PG_BUSY);
1063			vm_page_unlock_queues();
1064			ptdpg[i++] = m;
1065		}
1066	}
1067
1068	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1069
1070	for (i = 0; i < NPGPTD; i++) {
1071		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1072			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1073	}
1074
1075	mtx_lock_spin(&allpmaps_lock);
1076	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1077	mtx_unlock_spin(&allpmaps_lock);
1078	/* Wire in kernel global address entries. */
1079	/* XXX copies current process, does not fill in MPPTDI */
1080	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1081#ifdef SMP
1082	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1083#endif
1084
1085	/* install self-referential address mapping entry(s) */
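	/*
	 * Pointing the PTDPTDI slots back at the page directory page(s)
	 * creates the recursive mapping that exposes this pmap's page
	 * tables as a contiguous virtual array; vtopte() and the quick
	 * PTE lookups depend on that layout when the pmap is active.
	 */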
1086	for (i = 0; i < NPGPTD; i++) {
1087		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1088		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1089#ifdef PAE
1090		pmap->pm_pdpt[i] = pa | PG_V;
1091#endif
1092	}
1093
1094	pmap->pm_active = 0;
1095	TAILQ_INIT(&pmap->pm_pvlist);
1096	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1097}
1098
1099/*
1100 * Wire in kernel global address entries.  To avoid a race condition
1101 * between pmap initialization and pmap_growkernel, this procedure
1102 * should be called after the vmspace is attached to the process
1103 * but before this pmap is activated.
1104 */
1105void
1106pmap_pinit2(pmap)
1107	struct pmap *pmap;
1108{
1109	/* XXX: Remove this stub when no longer called */
1110}
1111
1112/*
1113 * this routine is called if the page table page is not
1114 * mapped correctly.
1115 */
1116static vm_page_t
1117_pmap_allocpte(pmap, ptepindex)
1118	pmap_t	pmap;
1119	unsigned ptepindex;
1120{
1121	vm_paddr_t ptepa;
1122	vm_page_t m;
1123
1124	/*
1125	 * Allocate a page table page.
1126	 */
1127	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1128	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1129		VM_WAIT;
1130		/*
1131		 * Indicate the need to retry.  While waiting, the page table
1132		 * page may have been allocated.
1133		 */
1134		return (NULL);
1135	}
1136	if ((m->flags & PG_ZERO) == 0)
1137		pmap_zero_page(m);
1138
1139	KASSERT(m->queue == PQ_NONE,
1140		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1141
1142	/*
1143	 * Increment the hold count for the page table page
1144	 * (denoting a new mapping).
1145	 */
1146	m->hold_count++;
1147
1148	/*
1149	 * Map the pagetable page into the process address space, if
1150	 * it isn't already there.
1151	 */
1152
1153	pmap->pm_stats.resident_count++;
1154
1155	ptepa = VM_PAGE_TO_PHYS(m);
1156	pmap->pm_pdir[ptepindex] =
1157		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1158
1159	vm_page_lock_queues();
1160	vm_page_flag_clear(m, PG_ZERO);
1161	vm_page_wakeup(m);
1162	vm_page_unlock_queues();
1163
1164	return m;
1165}
1166
1167static vm_page_t
1168pmap_allocpte(pmap_t pmap, vm_offset_t va)
1169{
1170	unsigned ptepindex;
1171	pd_entry_t ptepa;
1172	vm_page_t m;
1173
1174	/*
1175	 * Calculate pagetable page index
1176	 */
1177	ptepindex = va >> PDRSHIFT;
1178retry:
1179	/*
1180	 * Get the page directory entry
1181	 */
1182	ptepa = pmap->pm_pdir[ptepindex];
1183
1184	/*
1185	 * This supports switching from a 4MB page to a
1186	 * normal 4K page.
1187	 */
1188	if (ptepa & PG_PS) {
1189		pmap->pm_pdir[ptepindex] = 0;
1190		ptepa = 0;
1191		pmap_invalidate_all(kernel_pmap);
1192	}
1193
1194	/*
1195	 * If the page table page is mapped, we just increment the
1196	 * hold count, and activate it.
1197	 */
1198	if (ptepa) {
1199		m = PHYS_TO_VM_PAGE(ptepa);
1200		m->hold_count++;
1201	} else {
1202		/*
1203		 * Here if the pte page isn't mapped, or if it has
1204		 * been deallocated.
1205		 */
1206		m = _pmap_allocpte(pmap, ptepindex);
1207		if (m == NULL)
1208			goto retry;
1209	}
1210	return (m);
1211}
1212
1213
1214/***************************************************
1215 * Pmap allocation/deallocation routines.
1216 ***************************************************/
1217
1218#ifdef SMP
1219/*
1220 * Deal with an SMP shootdown of other users of the pmap that we are
1221 * trying to dispose of.  This can be a bit hairy.
1222 */
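/*
 * lazyptd holds the %cr3 value of the pmap being torn down, lazymask
 * points at its pm_active word, and lazywait is the acknowledgement
 * flag.  Each targeted CPU that is still lazily using the old page
 * tables switches back to its own pcb_cr3 and clears its bit in
 * lazymask; the initiating CPU spins (bounded) on lazywait.
 */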
1223static u_int *lazymask;
1224static u_int lazyptd;
1225static volatile u_int lazywait;
1226
1227void pmap_lazyfix_action(void);
1228
1229void
1230pmap_lazyfix_action(void)
1231{
1232	u_int mymask = PCPU_GET(cpumask);
1233
1234	if (rcr3() == lazyptd)
1235		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1236	atomic_clear_int(lazymask, mymask);
1237	atomic_store_rel_int(&lazywait, 1);
1238}
1239
1240static void
1241pmap_lazyfix_self(u_int mymask)
1242{
1243
1244	if (rcr3() == lazyptd)
1245		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1246	atomic_clear_int(lazymask, mymask);
1247}
1248
1249
1250static void
1251pmap_lazyfix(pmap_t pmap)
1252{
1253	u_int mymask = PCPU_GET(cpumask);
1254	u_int mask;
1255	register u_int spins;
1256
1257	while ((mask = pmap->pm_active) != 0) {
1258		spins = 50000000;
1259		mask = mask & -mask;	/* Find least significant set bit */
1260		mtx_lock_spin(&lazypmap_lock);
1261#ifdef PAE
1262		lazyptd = vtophys(pmap->pm_pdpt);
1263#else
1264		lazyptd = vtophys(pmap->pm_pdir);
1265#endif
1266		if (mask == mymask) {
1267			lazymask = &pmap->pm_active;
1268			pmap_lazyfix_self(mymask);
1269		} else {
1270			atomic_store_rel_int((u_int *)&lazymask,
1271			    (u_int)&pmap->pm_active);
1272			atomic_store_rel_int(&lazywait, 0);
1273			ipi_selected(mask, IPI_LAZYPMAP);
1274			while (lazywait == 0) {
1275				ia32_pause();
1276				if (--spins == 0)
1277					break;
1278			}
1279		}
1280		mtx_unlock_spin(&lazypmap_lock);
1281		if (spins == 0)
1282			printf("pmap_lazyfix: spun for 50000000\n");
1283	}
1284}
1285
1286#else	/* SMP */
1287
1288/*
1289 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1290 * unlikely to have to even execute this code, including the fact
1291 * that the cleanup is deferred until the parent does a wait(2), which
1292 * means that another userland process has run.
1293 */
1294static void
1295pmap_lazyfix(pmap_t pmap)
1296{
1297	u_int cr3;
1298
1299	cr3 = vtophys(pmap->pm_pdir);
1300	if (cr3 == rcr3()) {
1301		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1302		pmap->pm_active &= ~(PCPU_GET(cpumask));
1303	}
1304}
1305#endif	/* SMP */
1306
1307/*
1308 * Release any resources held by the given physical map.
1309 * Called when a pmap initialized by pmap_pinit is being released.
1310 * Should only be called if the map contains no valid mappings.
1311 */
1312void
1313pmap_release(pmap_t pmap)
1314{
1315	vm_page_t m, ptdpg[NPGPTD];
1316	int i;
1317
1318	KASSERT(pmap->pm_stats.resident_count == 0,
1319	    ("pmap_release: pmap resident count %ld != 0",
1320	    pmap->pm_stats.resident_count));
1321
1322	pmap_lazyfix(pmap);
1323	mtx_lock_spin(&allpmaps_lock);
1324	LIST_REMOVE(pmap, pm_list);
1325	mtx_unlock_spin(&allpmaps_lock);
1326
1327	for (i = 0; i < NPGPTD; i++)
1328		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1329
1330	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1331	    sizeof(*pmap->pm_pdir));
1332#ifdef SMP
1333	pmap->pm_pdir[MPPTDI] = 0;
1334#endif
1335
1336	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1337
1338	vm_page_lock_queues();
1339	for (i = 0; i < NPGPTD; i++) {
1340		m = ptdpg[i];
1341#ifdef PAE
1342		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1343		    ("pmap_release: got wrong ptd page"));
1344#endif
1345		m->wire_count--;
1346		atomic_subtract_int(&cnt.v_wire_count, 1);
1347		vm_page_busy(m);
1348		vm_page_free_zero(m);
1349	}
1350	vm_page_unlock_queues();
1351}
1352
1353static int
1354kvm_size(SYSCTL_HANDLER_ARGS)
1355{
1356	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1357
1358	return sysctl_handle_long(oidp, &ksize, 0, req);
1359}
1360SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1361    0, 0, kvm_size, "IU", "Size of KVM");
1362
1363static int
1364kvm_free(SYSCTL_HANDLER_ARGS)
1365{
1366	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1367
1368	return sysctl_handle_long(oidp, &kfree, 0, req);
1369}
1370SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1371    0, 0, kvm_free, "IU", "Amount of KVM free");
1372
1373/*
1374 * grow the number of kernel page table entries, if needed
1375 */
1376void
1377pmap_growkernel(vm_offset_t addr)
1378{
1379	struct pmap *pmap;
1380	int s;
1381	vm_paddr_t ptppaddr;
1382	vm_page_t nkpg;
1383	pd_entry_t newpdir;
1384	pt_entry_t *pde;
1385
1386	s = splhigh();
1387	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1388	if (kernel_vm_end == 0) {
1389		kernel_vm_end = KERNBASE;
1390		nkpt = 0;
1391		while (pdir_pde(PTD, kernel_vm_end)) {
1392			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1393			nkpt++;
1394		}
1395	}
1396	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1397	while (kernel_vm_end < addr) {
1398		if (pdir_pde(PTD, kernel_vm_end)) {
1399			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1400			continue;
1401		}
1402
1403		/*
1404		 * This index is bogus, but out of the way
1405		 */
1406		nkpg = vm_page_alloc(NULL, nkpt,
1407		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1408		if (!nkpg)
1409			panic("pmap_growkernel: no memory to grow kernel");
1410
1411		nkpt++;
1412
1413		pmap_zero_page(nkpg);
1414		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1415		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1416		pdir_pde(PTD, kernel_vm_end) = newpdir;
1417
1418		mtx_lock_spin(&allpmaps_lock);
1419		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1420			pde = pmap_pde(pmap, kernel_vm_end);
1421			pde_store(pde, newpdir);
1422		}
1423		mtx_unlock_spin(&allpmaps_lock);
1424		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1425	}
1426	splx(s);
1427}
1428
1429
1430/***************************************************
1431 * page management routines.
1432 ***************************************************/
1433
1434/*
1435 * free the pv_entry back to the free list
1436 */
1437static PMAP_INLINE void
1438free_pv_entry(pv_entry_t pv)
1439{
1440	pv_entry_count--;
1441	uma_zfree(pvzone, pv);
1442}
1443
1444/*
1445 * Get a new pv_entry, allocating a block from the system
1446 * when needed.
1447 * The memory allocation is performed bypassing the malloc code
1448 * because of the possibility of allocations at interrupt time.
1449 */
1450static pv_entry_t
1451get_pv_entry(void)
1452{
1453	pv_entry_count++;
1454	if (pv_entry_high_water &&
1455		(pv_entry_count > pv_entry_high_water) &&
1456		(pmap_pagedaemon_waken == 0)) {
1457		pmap_pagedaemon_waken = 1;
1458		wakeup (&vm_pages_needed);
1459	}
1460	return uma_zalloc(pvzone, M_NOWAIT);
1461}
1462
1463/*
1464 * If it is the first entry on the list, it is actually
1465 * in the header and we must copy the following entry up
1466 * to the header.  Otherwise we must search the list for
1467 * the entry.  In either case we free the now unused entry.
1468 */
1469
1470static int
1471pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1472{
1473	pv_entry_t pv;
1474	int rtval;
1475	int s;
1476
1477	s = splvm();
1478	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1479	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1480		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1481			if (pmap == pv->pv_pmap && va == pv->pv_va)
1482				break;
1483		}
1484	} else {
1485		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1486			if (va == pv->pv_va)
1487				break;
1488		}
1489	}
1490
1491	rtval = 0;
1492	if (pv) {
1493		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1494		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1495		m->md.pv_list_count--;
1496		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1497			vm_page_flag_clear(m, PG_WRITEABLE);
1498
1499		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1500		free_pv_entry(pv);
1501	}
1502
1503	splx(s);
1504	return rtval;
1505}
1506
1507/*
1508 * Create a pv entry for page at pa for
1509 * (pmap, va).
1510 */
1511static void
1512pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1513{
1514
1515	int s;
1516	pv_entry_t pv;
1517
1518	s = splvm();
1519	pv = get_pv_entry();
1520	pv->pv_va = va;
1521	pv->pv_pmap = pmap;
1522	pv->pv_ptem = mpte;
1523
1524	vm_page_lock_queues();
1525	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1526	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1527	m->md.pv_list_count++;
1528
1529	vm_page_unlock_queues();
1530	splx(s);
1531}
1532
1533/*
1534 * pmap_remove_pte: do the work needed to unmap a page in a process
1535 */
1536static int
1537pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1538{
1539	pt_entry_t oldpte;
1540	vm_page_t m, mpte;
1541
1542	oldpte = pte_load_clear(ptq);
1543	if (oldpte & PG_W)
1544		pmap->pm_stats.wired_count -= 1;
1545	/*
1546	 * Machines that don't support invlpg also don't support
1547	 * PG_G.
1548	 */
1549	if (oldpte & PG_G)
1550		pmap_invalidate_page(kernel_pmap, va);
1551	pmap->pm_stats.resident_count -= 1;
1552	if (oldpte & PG_MANAGED) {
1553		m = PHYS_TO_VM_PAGE(oldpte);
1554		if (oldpte & PG_M) {
1555#if defined(PMAP_DIAGNOSTIC)
1556			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1557				printf(
1558	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1559				    va, oldpte);
1560			}
1561#endif
1562			if (pmap_track_modified(va))
1563				vm_page_dirty(m);
1564		}
1565		if (oldpte & PG_A)
1566			vm_page_flag_set(m, PG_REFERENCED);
1567		return pmap_remove_entry(pmap, m, va);
1568	} else {
1569		mpte = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va));
1570		return pmap_unuse_pt(pmap, va, mpte);
1571	}
1572}
1573
1574/*
1575 * Remove a single page from a process address space
1576 */
1577static void
1578pmap_remove_page(pmap_t pmap, vm_offset_t va)
1579{
1580	pt_entry_t *pte;
1581
1582	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1583		return;
1584	pmap_remove_pte(pmap, pte, va);
1585	pmap_invalidate_page(pmap, va);
1586}
1587
1588/*
1589 *	Remove the given range of addresses from the specified map.
1590 *
1591 *	It is assumed that the start and end are properly
1592 *	rounded to the page size.
1593 */
1594void
1595pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1596{
1597	vm_offset_t pdnxt;
1598	pd_entry_t ptpaddr;
1599	pt_entry_t *pte;
1600	int anyvalid;
1601
1602	if (pmap == NULL)
1603		return;
1604
1605	if (pmap->pm_stats.resident_count == 0)
1606		return;
1607
1608	/*
1609	 * Special handling for removing a single page.  This is a very
1610	 * common operation, and it is easy to short-circuit some
1611	 * code.
1612	 */
1613	if ((sva + PAGE_SIZE == eva) &&
1614	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1615		pmap_remove_page(pmap, sva);
1616		return;
1617	}
1618
1619	anyvalid = 0;
1620
1621	for (; sva < eva; sva = pdnxt) {
1622		unsigned pdirindex;
1623
1624		/*
1625		 * Calculate index for next page table.
1626		 */
1627		pdnxt = (sva + NBPDR) & ~PDRMASK;
1628		if (pmap->pm_stats.resident_count == 0)
1629			break;
1630
1631		pdirindex = sva >> PDRSHIFT;
1632		ptpaddr = pmap->pm_pdir[pdirindex];
1633
1634		/*
1635		 * Weed out invalid mappings. Note: we assume that the page
1636		 * directory table is always allocated, and in kernel virtual.
1637		 */
1638		if (ptpaddr == 0)
1639			continue;
1640
1641		/*
1642		 * Check for large page.
1643		 */
1644		if ((ptpaddr & PG_PS) != 0) {
1645			pmap->pm_pdir[pdirindex] = 0;
1646			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1647			anyvalid = 1;
1648			continue;
1649		}
1650
1651		/*
1652		 * Limit our scan to either the end of the va represented
1653		 * by the current page table page, or to the end of the
1654		 * range being removed.
1655		 */
1656		if (pdnxt > eva)
1657			pdnxt = eva;
1658
1659		for (; sva != pdnxt; sva += PAGE_SIZE) {
1660			if ((pte = pmap_pte_quick(pmap, sva)) == NULL ||
1661			    *pte == 0)
1662				continue;
1663			anyvalid = 1;
1664			if (pmap_remove_pte(pmap, pte, sva))
1665				break;
1666		}
1667	}
1668
1669	if (anyvalid)
1670		pmap_invalidate_all(pmap);
1671}
1672
1673/*
1674 *	Routine:	pmap_remove_all
1675 *	Function:
1676 *		Removes this physical page from
1677 *		all physical maps in which it resides.
1678 *		Reflects back modify bits to the pager.
1679 *
1680 *	Notes:
1681 *		Original versions of this routine were very
1682 *		inefficient because they iteratively called
1683 *		pmap_remove (slow...)
1684 */
1685
1686void
1687pmap_remove_all(vm_page_t m)
1688{
1689	register pv_entry_t pv;
1690	pt_entry_t *pte, tpte;
1691	int s;
1692
1693#if defined(PMAP_DIAGNOSTIC)
1694	/*
1695	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1696	 */
1697	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1698		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1699		    VM_PAGE_TO_PHYS(m));
1700	}
1701#endif
1702	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1703	s = splvm();
1704	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1705		pv->pv_pmap->pm_stats.resident_count--;
1706		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1707		tpte = pte_load_clear(pte);
1708		if (tpte & PG_W)
1709			pv->pv_pmap->pm_stats.wired_count--;
1710		if (tpte & PG_A)
1711			vm_page_flag_set(m, PG_REFERENCED);
1712
1713		/*
1714		 * Update the vm_page_t clean and reference bits.
1715		 */
1716		if (tpte & PG_M) {
1717#if defined(PMAP_DIAGNOSTIC)
1718			if (pmap_nw_modified((pt_entry_t) tpte)) {
1719				printf(
1720	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1721				    pv->pv_va, tpte);
1722			}
1723#endif
1724			if (pmap_track_modified(pv->pv_va))
1725				vm_page_dirty(m);
1726		}
1727		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1728		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1729		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1730		m->md.pv_list_count--;
1731		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1732		free_pv_entry(pv);
1733	}
1734	vm_page_flag_clear(m, PG_WRITEABLE);
1735	splx(s);
1736}
1737
1738/*
1739 *	Set the physical protection on the
1740 *	specified range of this map as requested.
1741 */
1742void
1743pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1744{
1745	vm_offset_t pdnxt;
1746	pd_entry_t ptpaddr;
1747	int anychanged;
1748
1749	if (pmap == NULL)
1750		return;
1751
1752	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1753		pmap_remove(pmap, sva, eva);
1754		return;
1755	}
1756
1757	if (prot & VM_PROT_WRITE)
1758		return;
1759
1760	anychanged = 0;
1761
1762	for (; sva < eva; sva = pdnxt) {
1763		unsigned pdirindex;
1764
1765		pdnxt = (sva + NBPDR) & ~PDRMASK;
1766
1767		pdirindex = sva >> PDRSHIFT;
1768		ptpaddr = pmap->pm_pdir[pdirindex];
1769
1770		/*
1771		 * Weed out invalid mappings. Note: we assume that the page
1772		 * directory table is always allocated, and in kernel virtual.
1773		 */
1774		if (ptpaddr == 0)
1775			continue;
1776
1777		/*
1778		 * Check for large page.
1779		 */
1780		if ((ptpaddr & PG_PS) != 0) {
1781			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1782			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1783			anychanged = 1;
1784			continue;
1785		}
1786
1787		if (pdnxt > eva)
1788			pdnxt = eva;
1789
1790		for (; sva != pdnxt; sva += PAGE_SIZE) {
1791			pt_entry_t pbits;
1792			pt_entry_t *pte;
1793			vm_page_t m;
1794
1795			if ((pte = pmap_pte_quick(pmap, sva)) == NULL)
1796				continue;
1797			pbits = *pte;
1798			if (pbits & PG_MANAGED) {
1799				m = NULL;
1800				if (pbits & PG_A) {
1801					m = PHYS_TO_VM_PAGE(pbits);
1802					vm_page_flag_set(m, PG_REFERENCED);
1803					pbits &= ~PG_A;
1804				}
1805				if ((pbits & PG_M) != 0 &&
1806				    pmap_track_modified(sva)) {
1807					if (m == NULL)
1808						m = PHYS_TO_VM_PAGE(pbits);
1809					vm_page_dirty(m);
1810					pbits &= ~PG_M;
1811				}
1812			}
1813
1814			pbits &= ~PG_RW;
1815
1816			if (pbits != *pte) {
1817				pte_store(pte, pbits);
1818				anychanged = 1;
1819			}
1820		}
1821	}
1822	if (anychanged)
1823		pmap_invalidate_all(pmap);
1824}
1825
1826/*
1827 *	Insert the given physical page (p) at
1828 *	the specified virtual address (v) in the
1829 *	target physical map with the protection requested.
1830 *
1831 *	If specified, the page will be wired down, meaning
1832 *	that the related pte can not be reclaimed.
1833 *
1834 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1835 *	or lose information.  That is, this routine must actually
1836 *	insert this page into the given map NOW.
1837 */
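/*
 * Three cases are handled below: if the physical page is already mapped
 * at 'va', only the wiring and protection are updated; if a different
 * page is mapped there, the old PTE is removed first; otherwise a new
 * mapping is created, allocating a page table page and a pv entry as
 * needed, and the TLB entry is invalidated when the PTE changes.
 */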
1838void
1839pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1840	   boolean_t wired)
1841{
1842	vm_paddr_t pa;
1843	register pt_entry_t *pte;
1844	vm_paddr_t opa;
1845	pt_entry_t origpte, newpte;
1846	vm_page_t mpte;
1847
1848	if (pmap == NULL)
1849		return;
1850
1851	va &= PG_FRAME;
1852#ifdef PMAP_DIAGNOSTIC
1853	if (va > VM_MAX_KERNEL_ADDRESS)
1854		panic("pmap_enter: toobig");
1855	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1856		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1857#endif
1858
1859	mpte = NULL;
1860	/*
1861	 * In the case that a page table page is not
1862	 * resident, we are creating it here.
1863	 */
1864	if (va < VM_MAXUSER_ADDRESS) {
1865		mpte = pmap_allocpte(pmap, va);
1866	}
1867#if 0 && defined(PMAP_DIAGNOSTIC)
1868	else {
1869		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1870		origpte = *pdeaddr;
1871		if ((origpte & PG_V) == 0) {
1872			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
1873				pmap->pm_pdir[PTDPTDI], origpte, va);
1874		}
1875	}
1876#endif
1877
1878	pte = pmap_pte_quick(pmap, va);
1879
1880	/*
1881	 * Page Directory table entry not valid, we need a new PT page
1882	 */
1883	if (pte == NULL) {
1884		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
1885			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
1886	}
1887
1888	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
1889	origpte = *pte;
1890	opa = origpte & PG_FRAME;
1891
1892	if (origpte & PG_PS) {
1893		/*
1894		 * Yes, I know this will truncate upper address bits for PAE,
1895		 * but I'm actually more interested in the lower bits
1896		 */
1897		printf("pmap_enter: va %p, pte %p, origpte %p\n",
1898		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
1899		panic("pmap_enter: attempted pmap_enter on 4MB page");
1900	}
1901
1902	/*
1903	 * Mapping has not changed, must be protection or wiring change.
1904	 */
1905	if (origpte && (opa == pa)) {
1906		/*
1907		 * Wiring change, just update stats. We don't worry about
1908		 * wiring PT pages as they remain resident as long as there
1909		 * are valid mappings in them. Hence, if a user page is wired,
1910		 * the PT page will be also.
1911		 */
1912		if (wired && ((origpte & PG_W) == 0))
1913			pmap->pm_stats.wired_count++;
1914		else if (!wired && (origpte & PG_W))
1915			pmap->pm_stats.wired_count--;
1916
1917#if defined(PMAP_DIAGNOSTIC)
1918		if (pmap_nw_modified((pt_entry_t) origpte)) {
1919			printf(
1920	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
1921			    va, origpte);
1922		}
1923#endif
1924
1925		/*
1926		 * Remove extra pte reference
1927		 */
1928		if (mpte)
1929			mpte->hold_count--;
1930
1931		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
1932			if ((origpte & PG_RW) == 0) {
1933				pte_store(pte, origpte | PG_RW);
1934				pmap_invalidate_page(pmap, va);
1935			}
1936			return;
1937		}
1938
1939		/*
1940		 * We might be turning off write access to the page,
1941		 * so we go ahead and sense modify status.
1942		 */
1943		if (origpte & PG_MANAGED) {
1944			if ((origpte & PG_M) && pmap_track_modified(va)) {
1945				vm_page_t om;
1946				om = PHYS_TO_VM_PAGE(opa);
1947				vm_page_dirty(om);
1948			}
1949			pa |= PG_MANAGED;
1950		}
1951		goto validate;
1952	}
1953	/*
1954	 * Mapping has changed, invalidate old range and fall through to
1955	 * handle validating new mapping.
1956	 */
1957	if (opa) {
1958		int err;
1959		vm_page_lock_queues();
1960		err = pmap_remove_pte(pmap, pte, va);
1961		vm_page_unlock_queues();
1962		if (err)
1963			panic("pmap_enter: pte vanished, va: 0x%x", va);
1964	}
1965
1966	/*
1967	 * Enter on the PV list if part of our managed memory. Note that we
1968	 * raise IPL while manipulating pv_table since pmap_enter can be
1969	 * called at interrupt time.
1970	 */
1971	if (pmap_initialized &&
1972	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1973		pmap_insert_entry(pmap, va, mpte, m);
1974		pa |= PG_MANAGED;
1975	}
1976
1977	/*
1978	 * Increment counters
1979	 */
1980	pmap->pm_stats.resident_count++;
1981	if (wired)
1982		pmap->pm_stats.wired_count++;
1983
1984validate:
1985	/*
1986	 * Now validate mapping with desired protection/wiring.
1987	 */
1988	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);
1989
1990	if (wired)
1991		newpte |= PG_W;
1992	if (va < VM_MAXUSER_ADDRESS)
1993		newpte |= PG_U;
1994	if (pmap == kernel_pmap)
1995		newpte |= pgeflag;
1996
1997	/*
1998	 * if the mapping or permission bits are different, we need
1999	 * to update the pte.
2000	 */
2001	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2002		pte_store(pte, newpte | PG_A);
2003		/*if (origpte)*/ {
2004			pmap_invalidate_page(pmap, va);
2005		}
2006	}
2007}
2008
2009/*
2010 * this code makes some *MAJOR* assumptions:
2011 * 1. Current pmap & pmap exists.
2012 * 2. Not wired.
2013 * 3. Read access.
2014 * 4. No page table pages.
2015 * 5. Tlbflush is deferred to calling procedure.
2016 * 6. Page IS managed.
2017 * but is *MUCH* faster than pmap_enter...
2018 */
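/*
 * On success pmap_enter_quick() returns the page table page backing the
 * new user mapping (NULL for kernel addresses); it returns 0 when a
 * mapping already exists at 'va'.  Callers that loop over a range can
 * pass the returned 'mpte' back in to avoid repeated page directory
 * lookups.
 */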
2019
2020vm_page_t
2021pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2022{
2023	pt_entry_t *pte;
2024	vm_paddr_t pa;
2025
2026	/*
2027	 * In the case that a page table page is not
2028	 * resident, we are creating it here.
2029	 */
2030	if (va < VM_MAXUSER_ADDRESS) {
2031		unsigned ptepindex;
2032		pd_entry_t ptepa;
2033
2034		/*
2035		 * Calculate pagetable page index
2036		 */
2037		ptepindex = va >> PDRSHIFT;
2038		if (mpte && (mpte->pindex == ptepindex)) {
2039			mpte->hold_count++;
2040		} else {
2041retry:
2042			/*
2043			 * Get the page directory entry
2044			 */
2045			ptepa = pmap->pm_pdir[ptepindex];
2046
2047			/*
2048			 * If the page table page is mapped, we just increment
2049			 * the hold count, and activate it.
2050			 */
2051			if (ptepa) {
2052				if (ptepa & PG_PS)
2053					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2054				mpte = PHYS_TO_VM_PAGE(ptepa);
2055				mpte->hold_count++;
2056			} else {
2057				mpte = _pmap_allocpte(pmap, ptepindex);
2058				if (mpte == NULL)
2059					goto retry;
2060			}
2061		}
2062	} else {
2063		mpte = NULL;
2064	}
2065
2066	/*
2067	 * This call to vtopte makes the assumption that we are
2068	 * entering the page into the current pmap.  In order to support
2069	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2070	 * But that isn't as quick as vtopte.
2071	 */
2072	pte = vtopte(va);
2073	if (*pte) {
2074		if (mpte != NULL) {
2075			vm_page_lock_queues();
2076			pmap_unwire_pte_hold(pmap, mpte);
2077			vm_page_unlock_queues();
2078		}
2079		return 0;
2080	}
2081
2082	/*
2083	 * Enter on the PV list if part of our managed memory. Note that we
2084	 * raise IPL while manipulating pv_table since pmap_enter can be
2085	 * called at interrupt time.
2086	 */
2087	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2088		pmap_insert_entry(pmap, va, mpte, m);
2089
2090	/*
2091	 * Increment counters
2092	 */
2093	pmap->pm_stats.resident_count++;
2094
2095	pa = VM_PAGE_TO_PHYS(m);
2096
2097	/*
2098	 * Now validate mapping with RO protection
2099	 */
2100	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2101		pte_store(pte, pa | PG_V | PG_U);
2102	else
2103		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2104
2105	return mpte;
2106}
2107
2108/*
2109 * Make a temporary mapping for a physical address.  This is only intended
2110 * to be used for panic dumps.
2111 */
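/*
 * Illustrative sketch (hypothetical dump loop; write_block() is a made-up
 * name): successive physical pages are windowed through crashdumpmap one
 * at a time, reusing slot 0:
 *
 *	va = pmap_kenter_temporary(trunc_page(pa), 0);
 *	error = write_block(va, PAGE_SIZE);
 */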
2112void *
2113pmap_kenter_temporary(vm_offset_t pa, int i)
2114{
2115	vm_offset_t va;
2116
2117	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2118	pmap_kenter(va, pa);
2119#ifndef I386_CPU
2120	invlpg(va);
2121#else
2122	invltlb();
2123#endif
2124	return ((void *)crashdumpmap);
2125}
2126
2127/*
2128 * This code maps large physical mmap regions into the
2129 * processor address space.  Note that some shortcuts
2130 * are taken, but the code works.
2131 */
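/*
 * The fast path below only applies when PSE is available (pseflag) and
 * both "addr" and "size" are NBPDR-aligned; otherwise nothing is mapped
 * and the region is populated by ordinary faulting.  As a worked example
 * (illustrative only): with 4MB superpages, a 16MB device object needs
 * npdes = size >> PDRSHIFT = 4 page directory entries.
 */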
2132void
2133pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2134		    vm_object_t object, vm_pindex_t pindex,
2135		    vm_size_t size)
2136{
2137	vm_page_t p;
2138
2139	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2140	KASSERT(object->type == OBJT_DEVICE,
2141	    ("pmap_object_init_pt: non-device object"));
2142	if (pseflag &&
2143	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2144		int i;
2145		vm_page_t m[1];
2146		unsigned int ptepindex;
2147		int npdes;
2148		pd_entry_t ptepa;
2149
2150		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2151			return;
2152retry:
2153		p = vm_page_lookup(object, pindex);
2154		if (p != NULL) {
2155			vm_page_lock_queues();
2156			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2157				goto retry;
2158		} else {
2159			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2160			if (p == NULL)
2161				return;
2162			m[0] = p;
2163
2164			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2165				vm_page_lock_queues();
2166				vm_page_free(p);
2167				vm_page_unlock_queues();
2168				return;
2169			}
2170
2171			p = vm_page_lookup(object, pindex);
2172			vm_page_lock_queues();
2173			vm_page_wakeup(p);
2174		}
2175		vm_page_unlock_queues();
2176
2177		ptepa = VM_PAGE_TO_PHYS(p);
2178		if (ptepa & (NBPDR - 1))
2179			return;
2180
2181		p->valid = VM_PAGE_BITS_ALL;
2182
2183		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2184		npdes = size >> PDRSHIFT;
2185		for (i = 0; i < npdes; i++) {
2186			pde_store(&pmap->pm_pdir[ptepindex],
2187			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
2188			ptepa += NBPDR;
2189			ptepindex += 1;
2190		}
2191		pmap_invalidate_all(pmap);
2192	}
2193}
2194
2195/*
2196 *	Routine:	pmap_change_wiring
2197 *	Function:	Change the wiring attribute for a map/virtual-address
2198 *			pair.
2199 *	In/out conditions:
2200 *			The mapping must already exist in the pmap.
2201 */
2202void
2203pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2204{
2208	register pt_entry_t *pte;
2209
2210	if (pmap == NULL)
2211		return;
2212
2213	pte = pmap_pte_quick(pmap, va);
2214
2215	if (wired && !pmap_pte_w(pte))
2216		pmap->pm_stats.wired_count++;
2217	else if (!wired && pmap_pte_w(pte))
2218		pmap->pm_stats.wired_count--;
2219
2220	/*
2221	 * Wiring is not a hardware characteristic so there is no need to
2222	 * invalidate TLB.
2223	 */
2224	pmap_pte_set_w(pte, wired);
2225}
2226
2227
2228
2229/*
2230 *	Copy the range specified by src_addr/len
2231 *	from the source map to the range dst_addr/len
2232 *	in the destination map.
2233 *
2234 *	This routine is only advisory and need not do anything.
2235 */
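/*
 * Since this routine is only advisory, a minimal implementation could
 * simply return.  The copying below merely pre-populates the destination
 * pmap (typically the child of a fork(), though callers may vary) so
 * that soft faults are avoided later.
 */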
2236
2237void
2238pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2239	  vm_offset_t src_addr)
2240{
2241	vm_offset_t addr;
2242	vm_offset_t end_addr = src_addr + len;
2243	vm_offset_t pdnxt;
2244	vm_page_t m;
2245
2246	if (dst_addr != src_addr)
2247		return;
2248
2249	if (!pmap_is_current(src_pmap))
2250		return;
2251
2252	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2253		pt_entry_t *src_pte, *dst_pte;
2254		vm_page_t dstmpte, srcmpte;
2255		pd_entry_t srcptepaddr;
2256		unsigned ptepindex;
2257
2258		if (addr >= UPT_MIN_ADDRESS)
2259			panic("pmap_copy: invalid to pmap_copy page tables");
2260
2261		/*
2262		 * Don't let optional prefaulting of pages make us go
2263		 * way below the low water mark of free pages or way
2264		 * above high water mark of used pv entries.
2265		 */
2266		if (cnt.v_free_count < cnt.v_free_reserved ||
2267		    pv_entry_count > pv_entry_high_water)
2268			break;
2269
2270		pdnxt = (addr + NBPDR) & ~PDRMASK;
2271		ptepindex = addr >> PDRSHIFT;
2272
2273		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2274		if (srcptepaddr == 0)
2275			continue;
2276
2277		if (srcptepaddr & PG_PS) {
2278			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2279				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2280				dst_pmap->pm_stats.resident_count +=
2281				    NBPDR / PAGE_SIZE;
2282			}
2283			continue;
2284		}
2285
2286		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2287		if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY))
2288			continue;
2289
2290		if (pdnxt > end_addr)
2291			pdnxt = end_addr;
2292
2293		src_pte = vtopte(addr);
2294		while (addr < pdnxt) {
2295			pt_entry_t ptetemp;
2296			ptetemp = *src_pte;
2297			/*
2298			 * we only virtual copy managed pages
2299			 */
2300			if ((ptetemp & PG_MANAGED) != 0) {
2301				/*
2302				 * We have to check after allocpte for the
2303				 * pte still being around...  allocpte can
2304				 * block.
2305				 */
2306				dstmpte = pmap_allocpte(dst_pmap, addr);
2307				dst_pte = pmap_pte_quick(dst_pmap, addr);
2308				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2309					/*
2310					 * Clear the modified and
2311					 * accessed (referenced) bits
2312					 * during the copy.
2313					 */
2314					m = PHYS_TO_VM_PAGE(ptetemp);
2315					*dst_pte = ptetemp & ~(PG_M | PG_A);
2316					dst_pmap->pm_stats.resident_count++;
2317					pmap_insert_entry(dst_pmap, addr,
2318						dstmpte, m);
2319				} else {
2320					vm_page_lock_queues();
2321					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2322					vm_page_unlock_queues();
2323				}
2324				if (dstmpte->hold_count >= srcmpte->hold_count)
2325					break;
2326			}
2327			addr += PAGE_SIZE;
2328			src_pte++;
2329		}
2330	}
2331}
2332
2333#ifdef SMP
2334
2335/*
2336 *	pmap_zpi_switchout*()
2337 *
2338 *	These functions allow us to avoid doing IPIs altogether in certain
2339 *	temporary page-mapping situations (page zeroing).  Instead, to deal
2340 *	with being preempted and moved onto a different cpu, we invalidate
2341 *	the page when the scheduler switches us back in.  This does not occur
2342 *	very often, so we remain relatively optimal with very little effort.
2343 */
2344static void
2345pmap_zpi_switchout12(void)
2346{
2347	invlpg((u_int)CADDR1);
2348	invlpg((u_int)CADDR2);
2349}
2350
2351static void
2352pmap_zpi_switchout2(void)
2353{
2354	invlpg((u_int)CADDR2);
2355}
2356
2357static void
2358pmap_zpi_switchout3(void)
2359{
2360	invlpg((u_int)CADDR3);
2361}
2362
2363#endif
2364
2365static __inline void
2366pagezero(void *page)
2367{
2368#if defined(I686_CPU)
2369	if (cpu_class == CPUCLASS_686) {
2370#if defined(CPU_ENABLE_SSE)
2371		if (cpu_feature & CPUID_SSE2)
2372			sse2_pagezero(page);
2373		else
2374#endif
2375			i686_pagezero(page);
2376	} else
2377#endif
2378		bzero(page, PAGE_SIZE);
2379}
2380
2381static __inline void
2382invlcaddr(void *caddr)
2383{
2384#ifdef I386_CPU
2385	invltlb();
2386#else
2387	invlpg((u_int)caddr);
2388#endif
2389}
2390
2391/*
2392 *	pmap_zero_page zeros the specified hardware page by mapping
2393 *	the page into KVM and using bzero to clear its contents.
2394 */
2395void
2396pmap_zero_page(vm_page_t m)
2397{
2398
2399	mtx_lock(&CMAPCADDR12_lock);
2400	if (*CMAP2)
2401		panic("pmap_zero_page: CMAP2 busy");
2402#ifdef SMP
2403	curthread->td_pcb->pcb_switchout = pmap_zpi_switchout2;
2404#endif
2405	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2406#ifdef SMP
2407	invlpg((u_int)CADDR2);
2408#endif
2409	pagezero(CADDR2);
2410	*CMAP2 = 0;
2411	invlcaddr(CADDR2);
2412#ifdef SMP
2413	curthread->td_pcb->pcb_switchout = NULL;
2414#endif
2415	mtx_unlock(&CMAPCADDR12_lock);
2416}
2417
2418/*
2419 *	pmap_zero_page_area zeros the specified area of a hardware page by
2420 *	mapping the page into KVM and using bzero to clear the given range.
2421 *
2422 *	off and size may not cover an area beyond a single hardware page.
2423 */
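/*
 * Example (hypothetical caller): zeroing the invalid tail of a partially
 * valid page, where "valid_end" is an offset supplied by the caller:
 *
 *	pmap_zero_page_area(m, valid_end, PAGE_SIZE - valid_end);
 */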
2424void
2425pmap_zero_page_area(vm_page_t m, int off, int size)
2426{
2427
2428	mtx_lock(&CMAPCADDR12_lock);
2429	if (*CMAP2)
2430		panic("pmap_zero_page_area: CMAP2 busy");
2431#ifdef SMP
2432	curthread->td_pcb->pcb_switchout = pmap_zpi_switchout2;
2433#endif
2434	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2435#ifdef SMP
2436	invlpg((u_int)CADDR2);
2437#endif
2438	if (off == 0 && size == PAGE_SIZE)
2439		pagezero(CADDR2);
2440	else
2441		bzero((char *)CADDR2 + off, size);
2442	*CMAP2 = 0;
2443	invlcaddr(CADDR2);
2444#ifdef SMP
2445	curthread->td_pcb->pcb_switchout = NULL;
2446#endif
2447	mtx_unlock(&CMAPCADDR12_lock);
2448}
2449
2450/*
2451 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2452 *	the page into KVM and using bzero to clear its contents.  This
2453 *	is intended to be called from the vm_pagezero process only and
2454 *	outside of Giant.
2455 */
2456void
2457pmap_zero_page_idle(vm_page_t m)
2458{
2459
2460	if (*CMAP3)
2461		panic("pmap_zero_page_idle: CMAP3 busy");
2462#ifdef SMP
2463	curthread->td_pcb->pcb_switchout = pmap_zpi_switchout3;
2464#endif
2465	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2466#ifdef SMP
2467	invlpg((u_int)CADDR3);
2468#endif
2469	pagezero(CADDR3);
2470	*CMAP3 = 0;
2471	invlcaddr(CADDR3);
2472#ifdef SMP
2473	curthread->td_pcb->pcb_switchout = NULL;
2474#endif
2475}
2476
2477/*
2478 *	pmap_copy_page copies the specified (machine independent)
2479 *	page by mapping the page into virtual memory and using
2480 *	bcopy to copy the page, one machine dependent page at a
2481 *	time.
2482 */
2483void
2484pmap_copy_page(vm_page_t src, vm_page_t dst)
2485{
2486
2487	mtx_lock(&CMAPCADDR12_lock);
2488	if (*CMAP1)
2489		panic("pmap_copy_page: CMAP1 busy");
2490	if (*CMAP2)
2491		panic("pmap_copy_page: CMAP2 busy");
2492#ifdef SMP
2493	curthread->td_pcb->pcb_switchout = pmap_zpi_switchout12;
2494#endif
2495	*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2496	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2497#ifdef SMP
2498	invlpg((u_int)CADDR1);
2499	invlpg((u_int)CADDR2);
2500#endif
2501	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2502	*CMAP1 = 0;
2503	*CMAP2 = 0;
2504#ifdef I386_CPU
2505	invltlb();
2506#else
2507	invlpg((u_int)CADDR1);
2508	invlpg((u_int)CADDR2);
2509#endif
2510#ifdef SMP
2511	curthread->td_pcb->pcb_switchout = NULL;
2512#endif
2513	mtx_unlock(&CMAPCADDR12_lock);
2514}
2515
2516/*
2517 * Returns true if the pmap's pv is one of the first
2518 * 16 pvs linked to from this page.  This count may
2519 * be changed upwards or downwards in the future; it
2520 * is only necessary that true be returned for a small
2521 * subset of pmaps for proper page aging.
2522 */
2523boolean_t
2524pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2525{
2528	pv_entry_t pv;
2529	int loops = 0;
2530	int s;
2531
2532	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2533		return FALSE;
2534
2535	s = splvm();
2536	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2537	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2538		if (pv->pv_pmap == pmap) {
2539			splx(s);
2540			return TRUE;
2541		}
2542		loops++;
2543		if (loops >= 16)
2544			break;
2545	}
2546	splx(s);
2547	return (FALSE);
2548}
2549
2550#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2551/*
2552 * Remove all pages from the specified address space;
2553 * this aids process exit speeds.  Also, this code is
2554 * special cased for the current process only, but the
2555 * more generic (and slightly slower) mode can be
2556 * enabled.  This is much faster than pmap_remove in
2557 * the case of running down an entire address space.
2558 */
2559void
2560pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2561{
2564	pt_entry_t *pte, tpte;
2565	vm_page_t m;
2566	pv_entry_t pv, npv;
2567	int s;
2568
2569#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2570	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2571		printf("warning: pmap_remove_pages called with non-current pmap\n");
2572		return;
2573	}
2574#endif
2575	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2576	s = splvm();
2577	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2578
2579		if (pv->pv_va >= eva || pv->pv_va < sva) {
2580			npv = TAILQ_NEXT(pv, pv_plist);
2581			continue;
2582		}
2583
2584#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2585		pte = vtopte(pv->pv_va);
2586#else
2587		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2588#endif
2589		tpte = *pte;
2590
2591		if (tpte == 0) {
2592			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2593							pte, pv->pv_va);
2594			panic("bad pte");
2595		}
2596
2597		/*
2598		 * We cannot remove wired pages from a process' mapping at this time.
2599		 */
2600		if (tpte & PG_W) {
2601			npv = TAILQ_NEXT(pv, pv_plist);
2602			continue;
2603		}
2604
2605		m = PHYS_TO_VM_PAGE(tpte);
2606		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2607		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2608		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2609
2610		KASSERT(m < &vm_page_array[vm_page_array_size],
2611			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2612
2613		pv->pv_pmap->pm_stats.resident_count--;
2614
2615		pte_clear(pte);
2616
2617		/*
2618		 * Update the vm_page_t clean and reference bits.
2619		 */
2620		if (tpte & PG_M) {
2621			vm_page_dirty(m);
2622		}
2623
2624		npv = TAILQ_NEXT(pv, pv_plist);
2625		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2626
2627		m->md.pv_list_count--;
2628		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2629		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2630			vm_page_flag_clear(m, PG_WRITEABLE);
2631		}
2632
2633		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2634		free_pv_entry(pv);
2635	}
2636	splx(s);
2637	pmap_invalidate_all(pmap);
2638}
2639
2640/*
2641 *	pmap_is_modified:
2642 *
2643 *	Return whether or not the specified physical page was modified
2644 *	in any physical maps.
2645 */
2646boolean_t
2647pmap_is_modified(vm_page_t m)
2648{
2649	pv_entry_t pv;
2650	pt_entry_t *pte;
2651	int s;
2652
2653	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2654		return FALSE;
2655
2656	s = splvm();
2657	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2658	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2659		/*
2660		 * Modifications to pages mapped via the kernel's clean
2661		 * submap are not tracked, so skip those virtual
2662		 * addresses (see pmap_track_modified()).
2663		 */
2664		if (!pmap_track_modified(pv->pv_va))
2665			continue;
2666#if defined(PMAP_DIAGNOSTIC)
2667		if (!pv->pv_pmap) {
2668			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2669			continue;
2670		}
2671#endif
2672		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2673		if (*pte & PG_M) {
2674			splx(s);
2675			return TRUE;
2676		}
2677	}
2678	splx(s);
2679	return (FALSE);
2680}
2681
2682/*
2683 *	pmap_is_prefaultable:
2684 *
2685 *	Return whether or not the specified virtual address is eligible
2686 *	for prefault.
2687 */
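/*
 * Note: TRUE is returned only when the page directory entry for "addr"
 * is present but the pte itself is empty, i.e. a mapping could be added
 * without allocating a page table page or displacing an existing one.
 * As with pmap_enter_quick(), the vtopte() lookup assumes that "pmap"
 * is the current pmap.
 */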
2688boolean_t
2689pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2690{
2691	pt_entry_t *pte;
2692
2693	if ((*pmap_pde(pmap, addr)) == 0)
2694		return (FALSE);
2695	pte = vtopte(addr);
2696	if (*pte)
2697		return (FALSE);
2698	return (TRUE);
2699}
2700
2701/*
2702 *	Clear the given bit in each of the given page's ptes.
2703 */
2704static __inline void
2705pmap_clear_ptes(vm_page_t m, int bit)
2706{
2707	register pv_entry_t pv;
2708	pt_entry_t pbits, *pte;
2709	int s;
2710
2711	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2712	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2713		return;
2714
2715	s = splvm();
2716	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2717	/*
2718	 * Loop over all current mappings, setting/clearing as appropriate.
2719	 * If setting RO, do we need to clear the VAC?
2720	 */
2721	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2722		/*
2723		 * don't write protect pager mappings
2724		 */
2725		if (bit == PG_RW) {
2726			if (!pmap_track_modified(pv->pv_va))
2727				continue;
2728		}
2729
2730#if defined(PMAP_DIAGNOSTIC)
2731		if (!pv->pv_pmap) {
2732			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2733			continue;
2734		}
2735#endif
2736
2737		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2738		pbits = *pte;
2739		if (pbits & bit) {
2740			if (bit == PG_RW) {
2741				if (pbits & PG_M) {
2742					vm_page_dirty(m);
2743				}
2744				pte_store(pte, pbits & ~(PG_M|PG_RW));
2745			} else {
2746				pte_store(pte, pbits & ~bit);
2747			}
2748			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2749		}
2750	}
2751	if (bit == PG_RW)
2752		vm_page_flag_clear(m, PG_WRITEABLE);
2753	splx(s);
2754}
2755
2756/*
2757 *      pmap_page_protect:
2758 *
2759 *      Lower the permission for all mappings to a given page.
2760 */
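/*
 * For example, pmap_page_protect(m, VM_PROT_READ) write protects every
 * mapping of "m", while pmap_page_protect(m, VM_PROT_NONE) removes all
 * of them; requests that still include VM_PROT_WRITE are no-ops.
 */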
2761void
2762pmap_page_protect(vm_page_t m, vm_prot_t prot)
2763{
2764	if ((prot & VM_PROT_WRITE) == 0) {
2765		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2766			pmap_clear_ptes(m, PG_RW);
2767		} else {
2768			pmap_remove_all(m);
2769		}
2770	}
2771}
2772
2773/*
2774 *	pmap_ts_referenced:
2775 *
2776 *	Return a count of reference bits for a page, clearing those bits.
2777 *	It is not necessary for every reference bit to be cleared, but it
2778 *	is necessary that 0 only be returned when there are truly no
2779 *	reference bits set.
2780 *
2781 *	XXX: The exact number of bits to check and clear is a matter that
2782 *	should be tested and standardized at some point in the future for
2783 *	optimal aging of shared pages.
2784 */
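/*
 * In this implementation each examined pv entry is rotated to the tail
 * of the page's pv list, and at most five reference bits are cleared
 * per call (see the XXX note above).
 */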
2785int
2786pmap_ts_referenced(vm_page_t m)
2787{
2788	register pv_entry_t pv, pvf, pvn;
2789	pt_entry_t *pte;
2790	pt_entry_t v;
2791	int s;
2792	int rtval = 0;
2793
2794	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2795		return (rtval);
2796
2797	s = splvm();
2798	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2799	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2800
2801		pvf = pv;
2802
2803		do {
2804			pvn = TAILQ_NEXT(pv, pv_list);
2805
2806			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2807
2808			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2809
2810			if (!pmap_track_modified(pv->pv_va))
2811				continue;
2812
2813			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2814
2815			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2816				pte_store(pte, v & ~PG_A);
2817				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2818
2819				rtval++;
2820				if (rtval > 4) {
2821					break;
2822				}
2823			}
2824		} while ((pv = pvn) != NULL && pv != pvf);
2825	}
2826	splx(s);
2827
2828	return (rtval);
2829}
2830
2831/*
2832 *	Clear the modify bits on the specified physical page.
2833 */
2834void
2835pmap_clear_modify(vm_page_t m)
2836{
2837	pmap_clear_ptes(m, PG_M);
2838}
2839
2840/*
2841 *	pmap_clear_reference:
2842 *
2843 *	Clear the reference bit on the specified physical page.
2844 */
2845void
2846pmap_clear_reference(vm_page_t m)
2847{
2848	pmap_clear_ptes(m, PG_A);
2849}
2850
2851/*
2852 * Miscellaneous support routines follow
2853 */
2854
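/*
 * protection_codes[] is indexed by the three-bit VM_PROT_{READ,WRITE,
 * EXECUTE} mask and yields the extra PTE bits for that protection; it
 * is consulted, via pte_prot(), when new mappings are validated.
 * Because the i386 has no execute bit, only PG_RW is ever added.
 */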
2855static void
2856i386_protection_init(void)
2857{
2858	register int *kp, prot;
2859
2860	kp = protection_codes;
2861	for (prot = 0; prot < 8; prot++) {
2862		switch (prot) {
2863		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
2864			/*
2865			 * Read-only access needs no extra PTE bits; there is no
2866			 * execute bit, so execute is simply treated as readable.
2867			 */
2868		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
2869		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
2870		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
2871			*kp++ = 0;
2872			break;
2873		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
2874		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
2875		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
2876		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
2877			*kp++ = PG_RW;
2878			break;
2879		}
2880	}
2881}
2882
2883/*
2884 * Map a set of physical memory pages into the kernel virtual
2885 * address space. Return a pointer to where it is mapped. This
2886 * routine is intended to be used for mapping device memory,
2887 * NOT real memory.
2888 */
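/*
 * Sketch of a typical (hypothetical) driver usage; "sc" and its fields
 * are made-up names:
 *
 *	sc->sc_vaddr = pmap_mapdev(sc->sc_paddr, sc->sc_size);
 *	...
 *	pmap_unmapdev((vm_offset_t)sc->sc_vaddr, sc->sc_size);
 *
 * Physical addresses below KERNLOAD are returned via the kernel's
 * static mapping and require no KVA allocation.
 */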
2889void *
2890pmap_mapdev(vm_paddr_t pa, vm_size_t size)
2891{
2894	vm_offset_t va, tmpva, offset;
2895
2896	offset = pa & PAGE_MASK;
2897	size = roundup(offset + size, PAGE_SIZE);
2898	pa = pa & PG_FRAME;
2899
2900	if (pa < KERNLOAD && pa + size <= KERNLOAD)
2901		va = KERNBASE + pa;
2902	else
2903		va = kmem_alloc_nofault(kernel_map, size);
2904	if (!va)
2905		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2906
2907	for (tmpva = va; size > 0; ) {
2908		pmap_kenter(tmpva, pa);
2909		size -= PAGE_SIZE;
2910		tmpva += PAGE_SIZE;
2911		pa += PAGE_SIZE;
2912	}
2913	pmap_invalidate_range(kernel_pmap, va, tmpva);
2914	return ((void *)(va + offset));
2915}
2916
2917void
2918pmap_unmapdev(vm_offset_t va, vm_size_t size)
2919{
2922	vm_offset_t base, offset, tmpva;
2923
2924	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2925		return;
2926	base = va & PG_FRAME;
2927	offset = va & PAGE_MASK;
2928	size = roundup(offset + size, PAGE_SIZE);
2929	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2930		pmap_kremove(tmpva);
2931	pmap_invalidate_range(kernel_pmap, va, tmpva);
2932	kmem_free(kernel_map, base, size);
2933}
2934
2935/*
2936 * perform the pmap work for mincore
2937 */
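/*
 * The return value is 0 when no valid mapping exists; otherwise it is
 * MINCORE_INCORE, possibly augmented with the MINCORE_MODIFIED* and
 * MINCORE_REFERENCED* flags for managed pages, as computed below.
 */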
2938int
2939pmap_mincore(pmap_t pmap, vm_offset_t addr)
2940{
2943	pt_entry_t *ptep, pte;
2944	vm_page_t m;
2945	int val = 0;
2946
2947	ptep = pmap_pte_quick(pmap, addr);
2948	if (ptep == 0) {
2949		return 0;
2950	}
2951
2952	if ((pte = *ptep) != 0) {
2953		vm_paddr_t pa;
2954
2955		val = MINCORE_INCORE;
2956		if ((pte & PG_MANAGED) == 0)
2957			return val;
2958
2959		pa = pte & PG_FRAME;
2960
2961		m = PHYS_TO_VM_PAGE(pa);
2962
2963		/*
2964		 * Modified by us
2965		 */
2966		if (pte & PG_M)
2967			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2968		else {
2969			/*
2970			 * Modified by someone else
2971			 */
2972			vm_page_lock_queues();
2973			if (m->dirty || pmap_is_modified(m))
2974				val |= MINCORE_MODIFIED_OTHER;
2975			vm_page_unlock_queues();
2976		}
2977		/*
2978		 * Referenced by us
2979		 */
2980		if (pte & PG_A)
2981			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2982		else {
2983			/*
2984			 * Referenced by someone else
2985			 */
2986			vm_page_lock_queues();
2987			if ((m->flags & PG_REFERENCED) ||
2988			    pmap_ts_referenced(m)) {
2989				val |= MINCORE_REFERENCED_OTHER;
2990				vm_page_flag_set(m, PG_REFERENCED);
2991			}
2992			vm_page_unlock_queues();
2993		}
2994	}
2995	return val;
2996}
2997
2998void
2999pmap_activate(struct thread *td)
3000{
3001	struct proc *p = td->td_proc;
3002	pmap_t	pmap, oldpmap;
3003	u_int32_t  cr3;
3004
3005	critical_enter();
3006	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3007	oldpmap = PCPU_GET(curpmap);
3008#if defined(SMP)
3009	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3010	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3011#else
3012	oldpmap->pm_active &= ~1;
3013	pmap->pm_active |= 1;
3014#endif
3015#ifdef PAE
3016	cr3 = vtophys(pmap->pm_pdpt);
3017#else
3018	cr3 = vtophys(pmap->pm_pdir);
3019#endif
3020	/* XXXKSE this is wrong.
3021	 * pmap_activate is for the current thread on the current cpu
3022	 */
3023	if (p->p_flag & P_SA) {
3024		/* Make sure all other cr3 entries are updated. */
3025		/* what if they are running?  XXXKSE (maybe abort them) */
3026		FOREACH_THREAD_IN_PROC(p, td) {
3027			td->td_pcb->pcb_cr3 = cr3;
3028		}
3029	} else {
3030		td->td_pcb->pcb_cr3 = cr3;
3031	}
3032	load_cr3(cr3);
3033	PCPU_SET(curpmap, pmap);
3034	critical_exit();
3035}
3036
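/*
 * pmap_addr_hint rounds the suggested address up to the next NBPDR
 * boundary for device objects of at least NBPDR bytes, so that
 * pmap_object_init_pt() can later use PG_PS superpage mappings.
 */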
3037vm_offset_t
3038pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3039{
3040
3041	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3042		return addr;
3043	}
3044
3045	addr = (addr + PDRMASK) & ~PDRMASK;
3046	return addr;
3047}
3048
3049
3050#if defined(PMAP_DEBUG)
3051int
pmap_pid_dump(int pid)
3052{
3053	pmap_t pmap;
3054	struct proc *p;
3055	int npte = 0;
3056	int index;
3057
3058	sx_slock(&allproc_lock);
3059	LIST_FOREACH(p, &allproc, p_list) {
3060		if (p->p_pid != pid)
3061			continue;
3062
3063		if (p->p_vmspace) {
3064			int i,j;
3065			index = 0;
3066			pmap = vmspace_pmap(p->p_vmspace);
3067			for (i = 0; i < NPDEPTD; i++) {
3068				pd_entry_t *pde;
3069				pt_entry_t *pte;
3070				vm_offset_t base = i << PDRSHIFT;
3071
3072				pde = &pmap->pm_pdir[i];
3073				if (pde && pmap_pde_v(pde)) {
3074					for (j = 0; j < NPTEPG; j++) {
3075						vm_offset_t va = base + (j << PAGE_SHIFT);
3076						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3077							if (index) {
3078								index = 0;
3079								printf("\n");
3080							}
3081							sx_sunlock(&allproc_lock);
3082							return npte;
3083						}
3084						pte = pmap_pte_quick(pmap, va);
3085						if (pte && pmap_pte_v(pte)) {
3086							pt_entry_t pa;
3087							vm_page_t m;
3088							pa = *pte;
3089							m = PHYS_TO_VM_PAGE(pa);
3090							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3091								va, pa, m->hold_count, m->wire_count, m->flags);
3092							npte++;
3093							index++;
3094							if (index >= 2) {
3095								index = 0;
3096								printf("\n");
3097							} else {
3098								printf(" ");
3099							}
3100						}
3101					}
3102				}
3103			}
3104		}
3105	}
3106	sx_sunlock(&allproc_lock);
3107	return npte;
3108}
3109#endif
3110
3111#if defined(DEBUG)
3112
3113static void	pads(pmap_t pm);
3114void		pmap_pvdump(vm_offset_t pa);
3115
3116/* print address space of pmap*/
3117static void
3118pads(pmap_t pm)
3119{
3121	int i, j;
3122	vm_offset_t va;
3123	pt_entry_t *ptep;
3124
3125	if (pm == kernel_pmap)
3126		return;
3127	for (i = 0; i < NPDEPTD; i++)
3128		if (pm->pm_pdir[i])
3129			for (j = 0; j < NPTEPG; j++) {
3130				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3131				if (pm == kernel_pmap && va < KERNBASE)
3132					continue;
3133				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3134					continue;
3135				ptep = pmap_pte_quick(pm, va);
3136				if (pmap_pte_v(ptep))
3137					printf("%x:%x ", va, *ptep);
3138			};
3139
3140}
3141
3142void
3143pmap_pvdump(vm_paddr_t pa)
3144{
3146	pv_entry_t pv;
3147	vm_page_t m;
3148
3149	printf("pa %x", pa);
3150	m = PHYS_TO_VM_PAGE(pa);
3151	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3152		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3153		pads(pv->pv_pmap);
3154	}
3155	printf(" ");
3156}
3157#endif
3158