pmap.c revision 159803
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 159803 2006-06-20 20:52:11Z alc $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidation or protection-reduction
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_cpu.h"
107#include "opt_pmap.h"
108#include "opt_msgbuf.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/lock.h>
116#include <sys/malloc.h>
117#include <sys/mman.h>
118#include <sys/msgbuf.h>
119#include <sys/mutex.h>
120#include <sys/proc.h>
121#include <sys/sx.h>
122#include <sys/vmmeter.h>
123#include <sys/sched.h>
124#include <sys/sysctl.h>
125#ifdef SMP
126#include <sys/smp.h>
127#endif
128
129#include <vm/vm.h>
130#include <vm/vm_param.h>
131#include <vm/vm_kern.h>
132#include <vm/vm_page.h>
133#include <vm/vm_map.h>
134#include <vm/vm_object.h>
135#include <vm/vm_extern.h>
136#include <vm/vm_pageout.h>
137#include <vm/vm_pager.h>
138#include <vm/uma.h>
139
140#include <machine/cpu.h>
141#include <machine/cputypes.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144#include <machine/specialreg.h>
145#ifdef SMP
146#include <machine/smp.h>
147#endif
148
149#ifdef XBOX
150#include <machine/xbox.h>
151#endif
152
153#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
154#define CPU_ENABLE_SSE
155#endif
156
157#ifndef PMAP_SHPGPERPROC
158#define PMAP_SHPGPERPROC 200
159#endif
160
161#if defined(DIAGNOSTIC)
162#define PMAP_DIAGNOSTIC
163#endif
164
165#if !defined(PMAP_DIAGNOSTIC)
166#define PMAP_INLINE __inline
167#else
168#define PMAP_INLINE
169#endif
170
171#define PV_STATS
172#ifdef PV_STATS
173#define PV_STAT(x)	do { x ; } while (0)
174#else
175#define PV_STAT(x)	do { } while (0)
176#endif
177
178/*
179 * Get PDEs and PTEs for user/kernel address space
180 */
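/*
 * pmap_pde() and pdir_pde() select the page directory entry for a
 * virtual address by shifting out the low PDRSHIFT bits (22 without
 * PAE, 21 with PAE, where the NPGPTD directory pages are mapped
 * contiguously at pm_pdir).  The pmap_pte_*() macros simply test
 * individual flag bits in a pte.
 */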
181#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
182#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
183
184#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
185#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
186#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
187#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
188#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
189
190#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
191    atomic_clear_int((u_int *)(pte), PG_W))
192#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
193
194struct pmap kernel_pmap_store;
195LIST_HEAD(pmaplist, pmap);
196static struct pmaplist allpmaps;
197static struct mtx allpmaps_lock;
198
199vm_paddr_t avail_end;	/* PA of last available physical page */
200vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
201vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
202int pgeflag = 0;		/* PG_G or-in */
203int pseflag = 0;		/* PG_PS or-in */
204
205static int nkpt;
206vm_offset_t kernel_vm_end;
207extern u_int32_t KERNend;
208
209#ifdef PAE
210static uma_zone_t pdptzone;
211#endif
212
213/*
214 * Data for the pv entry allocation mechanism
215 */
216static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
217static int shpgperproc = PMAP_SHPGPERPROC;
218
219struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
220int pv_maxchunks;			/* How many chunks we have KVA for */
221vm_offset_t pv_vafree;			/* freelist stored in the PTE */
222
223/*
224 * All those kernel PT submaps that BSD is so fond of
225 */
226struct sysmaps {
227	struct	mtx lock;
228	pt_entry_t *CMAP1;
229	pt_entry_t *CMAP2;
230	caddr_t	CADDR1;
231	caddr_t	CADDR2;
232};
233static struct sysmaps sysmaps_pcpu[MAXCPU];
234pt_entry_t *CMAP1 = 0;
235static pt_entry_t *CMAP3;
236caddr_t CADDR1 = 0, ptvmmap = 0;
237static caddr_t CADDR3;
238struct msgbuf *msgbufp = 0;
239
240/*
241 * Crashdump maps.
242 */
243static caddr_t crashdumpmap;
244
245#ifdef SMP
246extern pt_entry_t *SMPpt;
247#endif
248static pt_entry_t *PMAP1 = 0, *PMAP2;
249static pt_entry_t *PADDR1 = 0, *PADDR2;
250#ifdef SMP
251static int PMAP1cpu;
252static int PMAP1changedcpu;
253SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
254	   &PMAP1changedcpu, 0,
255	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
256#endif
257static int PMAP1changed;
258SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
259	   &PMAP1changed, 0,
260	   "Number of times pmap_pte_quick changed PMAP1");
261static int PMAP1unchanged;
262SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
263	   &PMAP1unchanged, 0,
264	   "Number of times pmap_pte_quick didn't change PMAP1");
265static struct mtx PMAP2mutex;
266
267static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
268static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
269static void	pmap_clear_ptes(vm_page_t m, int bit);
270
271static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
272    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
273static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
274static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
275static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
276					vm_offset_t va);
277static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
278static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
279    vm_page_t m);
280
281static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
282
283static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
284static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
285static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
286static void pmap_pte_release(pt_entry_t *pte);
287static int pmap_unuse_pt(pmap_t, vm_offset_t);
288static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
289#ifdef PAE
290static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
291#endif
292
293CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
294CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
295
296/*
297 * Move the kernel virtual free pointer to the next
298 * 4MB.  This is used to help improve performance
299 * by using a large (4MB) page for much of the kernel
300 * (.text, .data, .bss)
301 */
302static vm_offset_t
303pmap_kmem_choose(vm_offset_t addr)
304{
305	vm_offset_t newaddr = addr;
306
307#ifndef DISABLE_PSE
308	if (cpu_feature & CPUID_PSE)
309		newaddr = (addr + PDRMASK) & ~PDRMASK;
310#endif
311	return newaddr;
312}
313
314/*
315 *	Bootstrap the system enough to run with virtual memory.
316 *
317 *	On the i386 this is called after mapping has already been enabled
318 *	and just syncs the pmap module with what has already been done.
319 *	[We can't call it easily with mapping off since the kernel is not
320 *	mapped with PA == VA, hence we would have to relocate every address
321 *	from the linked base (virtual) address "KERNBASE" to the actual
322 *	(physical) address starting relative to 0]
323 */
324void
325pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
328{
329	vm_offset_t va;
330	pt_entry_t *pte, *unused;
331	struct sysmaps *sysmaps;
332	int i;
333
334	/*
335	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
336	 * large. It should instead be correctly calculated in locore.s and
337	 * not based on 'first' (which is a physical address, not a virtual
338	 * address, for the start of unused physical memory). The kernel
339	 * page tables are NOT double mapped and thus should not be included
340	 * in this calculation.
341	 */
342	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
343	virtual_avail = pmap_kmem_choose(virtual_avail);
344
345	virtual_end = VM_MAX_KERNEL_ADDRESS;
346
347	/*
348	 * Initialize the kernel pmap (which is statically allocated).
349	 */
350	PMAP_LOCK_INIT(kernel_pmap);
351	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
352#ifdef PAE
353	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
354#endif
355	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
356	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
357	LIST_INIT(&allpmaps);
358	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
359	mtx_lock_spin(&allpmaps_lock);
360	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
361	mtx_unlock_spin(&allpmaps_lock);
362	nkpt = NKPT;
363
364	/*
365	 * Reserve some special page table entries/VA space for temporary
366	 * mapping of pages.
367	 */
368#define	SYSMAP(c, p, v, n)	\
369	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
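	/*
	 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) sets CADDR1 to the
	 * current va, points CMAP1 at the matching kernel pte, and advances
	 * both va and pte by one page, so that a later store through CMAP1
	 * maps a physical page at CADDR1.
	 */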
370
371	va = virtual_avail;
372	pte = vtopte(va);
373
374	/*
375	 * CMAP1/CMAP2 are used for zeroing and copying pages.
376	 * CMAP3 is used for the idle process page zeroing.
377	 */
378	for (i = 0; i < MAXCPU; i++) {
379		sysmaps = &sysmaps_pcpu[i];
380		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
381		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
382		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
383	}
384	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
385	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
386	*CMAP3 = 0;
387
388	/*
389	 * Crashdump maps.
390	 */
391	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
392
393	/*
394	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
395	 */
396	SYSMAP(caddr_t, unused, ptvmmap, 1)
397
398	/*
399	 * msgbufp is used to map the system message buffer.
400	 */
401	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
402
403	/*
404	 * ptemap is used for pmap_pte_quick
405	 */
406	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
407	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
408
409	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
410
411	virtual_avail = va;
412
413	*CMAP1 = 0;
414
415#ifdef XBOX
416	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
417	 * an early stage of booting, we cannot yet neatly map video memory ... :-(
418	 * Better fixes are very welcome! */
419	if (!arch_i386_is_xbox)
420#endif
421	for (i = 0; i < NKPT; i++)
422		PTD[i] = 0;
423
424	/* Initialize the PAT MSR if present. */
425	pmap_init_pat();
426
427	/* Turn on PG_G on kernel page(s) */
428	pmap_set_pg();
429}
430
431/*
432 * Set up the PAT MSR.
433 */
434void
435pmap_init_pat(void)
436{
437	uint64_t pat_msr;
438
439	/* Bail if this CPU doesn't implement PAT. */
440	if (!(cpu_feature & CPUID_PAT))
441		return;
442
443#ifdef PAT_WORKS
444	/*
445	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
446	 * Program 4 and 5 as WP and WC.
447	 * Leave 6 and 7 at the default of UC- and UC.
448	 */
449	pat_msr = rdmsr(MSR_PAT);
450	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
451	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
452	    PAT_VALUE(5, PAT_WRITE_COMBINING);
453#else
454	/*
455	 * Due to some Intel errata, we can only safely use the lower 4
456	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
457	 * of UC-.
458	 *
459	 *   Intel Pentium III Processor Specification Update
460	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
461	 * or Mode C Paging)
462	 *
463	 *   Intel Pentium IV  Processor Specification Update
464	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
465	 */
466	pat_msr = rdmsr(MSR_PAT);
467	pat_msr &= ~PAT_MASK(2);
468	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
469#endif
470	wrmsr(MSR_PAT, pat_msr);
471}
472
473/*
474 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
475 */
476void
477pmap_set_pg(void)
478{
479	pd_entry_t pdir;
480	pt_entry_t *pte;
481	vm_offset_t va, endva;
482	int i;
483
484	if (pgeflag == 0)
485		return;
486
487	i = KERNLOAD/NBPDR;
488	endva = KERNBASE + KERNend;
489
490	if (pseflag) {
491		va = KERNBASE + KERNLOAD;
492		while (va  < endva) {
493			pdir = kernel_pmap->pm_pdir[KPTDI+i];
494			pdir |= pgeflag;
495			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
496			invltlb();	/* Play it safe, invltlb() every time */
497			i++;
498			va += NBPDR;
499		}
500	} else {
501		va = (vm_offset_t)btext;
502		while (va < endva) {
503			pte = vtopte(va);
504			if (*pte)
505				*pte |= pgeflag;
506			invltlb();	/* Play it safe, invltlb() every time */
507			va += PAGE_SIZE;
508		}
509	}
510}
511
512/*
513 * Initialize a vm_page's machine-dependent fields.
514 */
515void
516pmap_page_init(vm_page_t m)
517{
518
519	TAILQ_INIT(&m->md.pv_list);
520	m->md.pv_list_count = 0;
521}
522
523#ifdef PAE
524
525static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
526
527static void *
528pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
529{
530	*flags = UMA_SLAB_PRIV;
531	return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
532	    1, 0));
533}
534#endif
535
536/*
537 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
538 * Requirements:
539 *  - Must deal with pages in order to ensure that none of the PG_* bits
540 *    are ever set, PG_V in particular.
541 *  - Assumes we can write to ptes without pte_store() atomic ops, even
542 *    on PAE systems.  This should be ok.
543 *  - Assumes nothing will ever test these addresses for 0 to indicate
544 *    no mapping instead of correctly checking PG_V.
545 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
546 * Because PG_V is never set, there can be no mappings to invalidate.
547 */
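/*
 * Concretely: pv_vafree (the list head) holds the va of the first free
 * page of pv-chunk kva; the kernel pte for each free va stores the va of
 * the next free page, and a value of 0 terminates the list.
 */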
548static vm_offset_t
549pmap_ptelist_alloc(vm_offset_t *head)
550{
551	pt_entry_t *pte;
552	vm_offset_t va;
553
554	va = *head;
555	if (va == 0)
556		return (va);	/* Out of memory */
557	pte = vtopte(va);
558	*head = *pte;
559	if (*head & PG_V)
560		panic("pmap_ptelist_alloc: va with PG_V set!");
561	*pte = 0;
562	return (va);
563}
564
565static void
566pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
567{
568	pt_entry_t *pte;
569
570	if (va & PG_V)
571		panic("pmap_ptelist_free: freeing va with PG_V set!");
572	pte = vtopte(va);
573	*pte = *head;		/* virtual! PG_V is 0 though */
574	*head = va;
575}
576
577static void
578pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
579{
580	int i;
581	vm_offset_t va;
582
583	*head = 0;
584	for (i = npages - 1; i >= 0; i--) {
585		va = (vm_offset_t)base + i * PAGE_SIZE;
586		pmap_ptelist_free(head, va);
587	}
588}
589
590
591/*
592 *	Initialize the pmap module.
593 *	Called by vm_init, to initialize any structures that the pmap
594 *	system needs to map virtual memory.
595 */
596void
597pmap_init(void)
598{
599
600	/*
601	 * Initialize the address space (zone) for the pv entries.  Set a
602	 * high water mark so that the system can recover from excessive
603	 * numbers of pv entries.
604	 */
605	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
606	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
607	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
608	pv_entry_max = roundup(pv_entry_max, _NPCPV);
609	pv_entry_high_water = 9 * (pv_entry_max / 10);
610
611	pv_maxchunks = pv_entry_max / _NPCPV;
612	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
613	    PAGE_SIZE * pv_maxchunks);
614	if (pv_chunkbase == NULL)
615		panic("pmap_init: not enough kvm for pv chunks");
616	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
617#ifdef PAE
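	/*
	 * Under PAE the pdpt loaded into %cr3 must be 32-byte aligned and
	 * reside below 4GB, hence the zone's alignment argument and the
	 * custom contigmalloc-backed allocator.
	 */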
618	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
619	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
620	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
621	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
622#endif
623}
624
625
626SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
627SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
628	"Max number of PV entries");
629SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
630	"Page share factor per proc");
631
632/***************************************************
633 * Low level helper routines.....
634 ***************************************************/
635
636#ifdef SMP
637/*
638 * For SMP, these functions have to use the IPI mechanism for coherence.
639 */
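/*
 * Each routine invalidates the local TLB and then, based on pm_active,
 * either broadcasts a shootdown IPI to all other CPUs or sends a masked
 * IPI to just the CPUs that have the pmap active.
 */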
640void
641pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
642{
643	u_int cpumask;
644	u_int other_cpus;
645
646	if (smp_started) {
647		if (!(read_eflags() & PSL_I))
648			panic("%s: interrupts disabled", __func__);
649		mtx_lock_spin(&smp_ipi_mtx);
650	} else
651		critical_enter();
652	/*
653	 * We need to disable interrupt preemption but MUST NOT have
654	 * interrupts disabled here.
655	 * XXX we may need to hold schedlock to get a coherent pm_active
656	 * XXX critical sections disable interrupts again
657	 */
658	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
659		invlpg(va);
660		smp_invlpg(va);
661	} else {
662		cpumask = PCPU_GET(cpumask);
663		other_cpus = PCPU_GET(other_cpus);
664		if (pmap->pm_active & cpumask)
665			invlpg(va);
666		if (pmap->pm_active & other_cpus)
667			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
668	}
669	if (smp_started)
670		mtx_unlock_spin(&smp_ipi_mtx);
671	else
672		critical_exit();
673}
674
675void
676pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
677{
678	u_int cpumask;
679	u_int other_cpus;
680	vm_offset_t addr;
681
682	if (smp_started) {
683		if (!(read_eflags() & PSL_I))
684			panic("%s: interrupts disabled", __func__);
685		mtx_lock_spin(&smp_ipi_mtx);
686	} else
687		critical_enter();
688	/*
689	 * We need to disable interrupt preemption but MUST NOT have
690	 * interrupts disabled here.
691	 * XXX we may need to hold schedlock to get a coherent pm_active
692	 * XXX critical sections disable interrupts again
693	 */
694	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
695		for (addr = sva; addr < eva; addr += PAGE_SIZE)
696			invlpg(addr);
697		smp_invlpg_range(sva, eva);
698	} else {
699		cpumask = PCPU_GET(cpumask);
700		other_cpus = PCPU_GET(other_cpus);
701		if (pmap->pm_active & cpumask)
702			for (addr = sva; addr < eva; addr += PAGE_SIZE)
703				invlpg(addr);
704		if (pmap->pm_active & other_cpus)
705			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
706			    sva, eva);
707	}
708	if (smp_started)
709		mtx_unlock_spin(&smp_ipi_mtx);
710	else
711		critical_exit();
712}
713
714void
715pmap_invalidate_all(pmap_t pmap)
716{
717	u_int cpumask;
718	u_int other_cpus;
719
720	if (smp_started) {
721		if (!(read_eflags() & PSL_I))
722			panic("%s: interrupts disabled", __func__);
723		mtx_lock_spin(&smp_ipi_mtx);
724	} else
725		critical_enter();
726	/*
727	 * We need to disable interrupt preemption but MUST NOT have
728	 * interrupts disabled here.
729	 * XXX we may need to hold schedlock to get a coherent pm_active
730	 * XXX critical sections disable interrupts again
731	 */
732	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
733		invltlb();
734		smp_invltlb();
735	} else {
736		cpumask = PCPU_GET(cpumask);
737		other_cpus = PCPU_GET(other_cpus);
738		if (pmap->pm_active & cpumask)
739			invltlb();
740		if (pmap->pm_active & other_cpus)
741			smp_masked_invltlb(pmap->pm_active & other_cpus);
742	}
743	if (smp_started)
744		mtx_unlock_spin(&smp_ipi_mtx);
745	else
746		critical_exit();
747}
748
749void
750pmap_invalidate_cache(void)
751{
752
753	if (smp_started) {
754		if (!(read_eflags() & PSL_I))
755			panic("%s: interrupts disabled", __func__);
756		mtx_lock_spin(&smp_ipi_mtx);
757	} else
758		critical_enter();
759	/*
760	 * We need to disable interrupt preemption but MUST NOT have
761	 * interrupts disabled here.
762	 * XXX we may need to hold schedlock to get a coherent pm_active
763	 * XXX critical sections disable interrupts again
764	 */
765	wbinvd();
766	smp_cache_flush();
767	if (smp_started)
768		mtx_unlock_spin(&smp_ipi_mtx);
769	else
770		critical_exit();
771}
772#else /* !SMP */
773/*
774 * Normal, non-SMP, 486+ invalidation functions.
775 * We inline these within pmap.c for speed.
776 */
777PMAP_INLINE void
778pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
779{
780
781	if (pmap == kernel_pmap || pmap->pm_active)
782		invlpg(va);
783}
784
785PMAP_INLINE void
786pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
787{
788	vm_offset_t addr;
789
790	if (pmap == kernel_pmap || pmap->pm_active)
791		for (addr = sva; addr < eva; addr += PAGE_SIZE)
792			invlpg(addr);
793}
794
795PMAP_INLINE void
796pmap_invalidate_all(pmap_t pmap)
797{
798
799	if (pmap == kernel_pmap || pmap->pm_active)
800		invltlb();
801}
802
803PMAP_INLINE void
804pmap_invalidate_cache(void)
805{
806
807	wbinvd();
808}
809#endif /* !SMP */
810
811/*
812 * Are we current address space or kernel?  N.B. We return FALSE when
813 * a pmap's page table is in use because a kernel thread is borrowing
814 * it.  The borrowed page table can change spontaneously, making any
815 * dependence on its continued use subject to a race condition.
816 */
817static __inline int
818pmap_is_current(pmap_t pmap)
819{
820
821	return (pmap == kernel_pmap ||
822		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
823	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
824}
825
826/*
827 * If the given pmap is not the current or kernel pmap, the returned pte must
828 * be released by passing it to pmap_pte_release().
829 */
830pt_entry_t *
831pmap_pte(pmap_t pmap, vm_offset_t va)
832{
833	pd_entry_t newpf;
834	pd_entry_t *pde;
835
836	pde = pmap_pde(pmap, va);
837	if (*pde & PG_PS)
838		return (pde);
839	if (*pde != 0) {
840		/* are we current address space or kernel? */
841		if (pmap_is_current(pmap))
842			return (vtopte(va));
843		mtx_lock(&PMAP2mutex);
844		newpf = *pde & PG_FRAME;
845		if ((*PMAP2 & PG_FRAME) != newpf) {
846			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
847			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
848		}
849		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
850	}
851	return (0);
852}
853
854/*
855 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
856 * being NULL.
857 */
858static __inline void
859pmap_pte_release(pt_entry_t *pte)
860{
861
862	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
863		mtx_unlock(&PMAP2mutex);
864}
865
866static __inline void
867invlcaddr(void *caddr)
868{
869
870	invlpg((u_int)caddr);
871}
872
873/*
874 * Super fast pmap_pte routine best used when scanning
875 * the pv lists.  This eliminates many coarse-grained
876 * invltlb calls.  Note that many of the pv list
877 * scans are across different pmaps.  It is very wasteful
878 * to do an entire invltlb for checking a single mapping.
879 *
880 * If the given pmap is not the current pmap, vm_page_queue_mtx
881 * must be held and curthread pinned to a CPU.
882 */
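/*
 * PMAP1 is a reserved kernel pte whose va is PADDR1; pointing PMAP1 at
 * the target pmap's page table page makes that page temporarily
 * addressable at PADDR1 without switching address spaces.
 */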
883static pt_entry_t *
884pmap_pte_quick(pmap_t pmap, vm_offset_t va)
885{
886	pd_entry_t newpf;
887	pd_entry_t *pde;
888
889	pde = pmap_pde(pmap, va);
890	if (*pde & PG_PS)
891		return (pde);
892	if (*pde != 0) {
893		/* are we current address space or kernel? */
894		if (pmap_is_current(pmap))
895			return (vtopte(va));
896		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
897		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
898		newpf = *pde & PG_FRAME;
899		if ((*PMAP1 & PG_FRAME) != newpf) {
900			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
901#ifdef SMP
902			PMAP1cpu = PCPU_GET(cpuid);
903#endif
904			invlcaddr(PADDR1);
905			PMAP1changed++;
906		} else
907#ifdef SMP
908		if (PMAP1cpu != PCPU_GET(cpuid)) {
909			PMAP1cpu = PCPU_GET(cpuid);
910			invlcaddr(PADDR1);
911			PMAP1changedcpu++;
912		} else
913#endif
914			PMAP1unchanged++;
915		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
916	}
917	return (0);
918}
919
920/*
921 *	Routine:	pmap_extract
922 *	Function:
923 *		Extract the physical page address associated
924 *		with the given map/virtual_address pair.
925 */
926vm_paddr_t
927pmap_extract(pmap_t pmap, vm_offset_t va)
928{
929	vm_paddr_t rtval;
930	pt_entry_t *pte;
931	pd_entry_t pde;
932
933	rtval = 0;
934	PMAP_LOCK(pmap);
935	pde = pmap->pm_pdir[va >> PDRSHIFT];
936	if (pde != 0) {
937		if ((pde & PG_PS) != 0) {
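			/*
			 * 2/4MB page: the pde holds the large-page frame;
			 * add the offset within the large page.
			 */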
938			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
939			PMAP_UNLOCK(pmap);
940			return rtval;
941		}
942		pte = pmap_pte(pmap, va);
943		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
944		pmap_pte_release(pte);
945	}
946	PMAP_UNLOCK(pmap);
947	return (rtval);
948}
949
950/*
951 *	Routine:	pmap_extract_and_hold
952 *	Function:
953 *		Atomically extract and hold the physical page
954 *		with the given pmap and virtual address pair
955 *		if that mapping permits the given protection.
956 */
957vm_page_t
958pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
959{
960	pd_entry_t pde;
961	pt_entry_t pte;
962	vm_page_t m;
963
964	m = NULL;
965	vm_page_lock_queues();
966	PMAP_LOCK(pmap);
967	pde = *pmap_pde(pmap, va);
968	if (pde != 0) {
969		if (pde & PG_PS) {
970			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
971				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
972				    (va & PDRMASK));
973				vm_page_hold(m);
974			}
975		} else {
976			sched_pin();
977			pte = *pmap_pte_quick(pmap, va);
978			if (pte != 0 &&
979			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
980				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
981				vm_page_hold(m);
982			}
983			sched_unpin();
984		}
985	}
986	vm_page_unlock_queues();
987	PMAP_UNLOCK(pmap);
988	return (m);
989}
990
991/***************************************************
992 * Low level mapping routines.....
993 ***************************************************/
994
995/*
996 * Add a wired page to the kva.
997 * Note: not SMP coherent.
998 */
999PMAP_INLINE void
1000pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1001{
1002	pt_entry_t *pte;
1003
1004	pte = vtopte(va);
1005	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1006}
1007
1008/*
1009 * Remove a page from the kernel pagetables.
1010 * Note: not SMP coherent.
1011 */
1012PMAP_INLINE void
1013pmap_kremove(vm_offset_t va)
1014{
1015	pt_entry_t *pte;
1016
1017	pte = vtopte(va);
1018	pte_clear(pte);
1019}
1020
1021/*
1022 *	Used to map a range of physical addresses into kernel
1023 *	virtual address space.
1024 *
1025 *	The value passed in '*virt' is a suggested virtual address for
1026 *	the mapping. Architectures which can support a direct-mapped
1027 *	physical to virtual region can return the appropriate address
1028 *	within that region, leaving '*virt' unchanged. Other
1029 *	architectures should map the pages starting at '*virt' and
1030 *	update '*virt' with the first usable address after the mapped
1031 *	region.
1032 */
1033vm_offset_t
1034pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1035{
1036	vm_offset_t va, sva;
1037
1038	va = sva = *virt;
1039	while (start < end) {
1040		pmap_kenter(va, start);
1041		va += PAGE_SIZE;
1042		start += PAGE_SIZE;
1043	}
1044	pmap_invalidate_range(kernel_pmap, sva, va);
1045	*virt = va;
1046	return (sva);
1047}
1048
1049
1050/*
1051 * Add a list of wired pages to the kva.  This
1052 * routine is only used for temporary
1053 * kernel mappings that do not need to have
1054 * page modification or references recorded.
1055 * Note that old mappings are simply written
1056 * over.  The page *must* be wired.
1057 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1058 */
1059void
1060pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1061{
1062	pt_entry_t *endpte, oldpte, *pte;
1063
1064	oldpte = 0;
1065	pte = vtopte(sva);
1066	endpte = pte + count;
1067	while (pte < endpte) {
1068		oldpte |= *pte;
1069		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
1070		pte++;
1071		ma++;
1072	}
1073	if ((oldpte & PG_V) != 0)
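	/* Only pay for a TLB shootdown if a valid mapping was overwritten. */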
1074		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1075		    PAGE_SIZE);
1076}
1077
1078/*
1079 * This routine tears out page mappings from the
1080 * kernel -- it is meant only for temporary mappings.
1081 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1082 */
1083void
1084pmap_qremove(vm_offset_t sva, int count)
1085{
1086	vm_offset_t va;
1087
1088	va = sva;
1089	while (count-- > 0) {
1090		pmap_kremove(va);
1091		va += PAGE_SIZE;
1092	}
1093	pmap_invalidate_range(kernel_pmap, sva, va);
1094}
1095
1096/***************************************************
1097 * Page table page management routines.....
1098 ***************************************************/
1099
1100/*
1101 * This routine decrements a page table page's wire count; when the
1102 * count drops to zero, the page is unmapped and freed.
1103 */
1104static PMAP_INLINE int
1105pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1106{
1107
1108	--m->wire_count;
1109	if (m->wire_count == 0)
1110		return _pmap_unwire_pte_hold(pmap, m);
1111	else
1112		return 0;
1113}
1114
1115static int
1116_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1117{
1118	vm_offset_t pteva;
1119
1120	/*
1121	 * unmap the page table page
1122	 */
1123	pmap->pm_pdir[m->pindex] = 0;
1124	--pmap->pm_stats.resident_count;
1125
1126	/*
1127	 * Do an invltlb to make the invalidated mapping
1128	 * take effect immediately.
1129	 */
1130	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1131	pmap_invalidate_page(pmap, pteva);
1132
1133	vm_page_free_zero(m);
1134	atomic_subtract_int(&cnt.v_wire_count, 1);
1135	return 1;
1136}
1137
1138/*
1139 * After removing a page table entry, this routine is used to
1140 * conditionally free the page, and manage the hold/wire counts.
1141 */
1142static int
1143pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
1144{
1145	pd_entry_t ptepde;
1146	vm_page_t mpte;
1147
1148	if (va >= VM_MAXUSER_ADDRESS)
1149		return 0;
1150	ptepde = *pmap_pde(pmap, va);
1151	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1152	return pmap_unwire_pte_hold(pmap, mpte);
1153}
1154
1155void
1156pmap_pinit0(struct pmap *pmap)
1158{
1159
1160	PMAP_LOCK_INIT(pmap);
1161	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1162#ifdef PAE
1163	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1164#endif
1165	pmap->pm_active = 0;
1166	PCPU_SET(curpmap, pmap);
1167	TAILQ_INIT(&pmap->pm_pvchunk);
1168	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1169	mtx_lock_spin(&allpmaps_lock);
1170	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1171	mtx_unlock_spin(&allpmaps_lock);
1172}
1173
1174/*
1175 * Initialize a preallocated and zeroed pmap structure,
1176 * such as one in a vmspace structure.
1177 */
1178void
1179pmap_pinit(struct pmap *pmap)
1181{
1182	vm_page_t m, ptdpg[NPGPTD];
1183	vm_paddr_t pa;
1184	static int color;
1185	int i;
1186
1187	PMAP_LOCK_INIT(pmap);
1188
1189	/*
1190	 * No need to allocate page table space yet but we do need a valid
1191	 * page directory table.
1192	 */
1193	if (pmap->pm_pdir == NULL) {
1194		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1195		    NBPTD);
1196#ifdef PAE
1197		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1198		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1199		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1200		    ("pmap_pinit: pdpt misaligned"));
1201		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1202		    ("pmap_pinit: pdpt above 4g"));
1203#endif
1204	}
1205
1206	/*
1207	 * allocate the page directory page(s)
1208	 */
1209	for (i = 0; i < NPGPTD;) {
1210		m = vm_page_alloc(NULL, color++,
1211		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1212		    VM_ALLOC_ZERO);
1213		if (m == NULL)
1214			VM_WAIT;
1215		else {
1216			ptdpg[i++] = m;
1217		}
1218	}
1219
1220	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1221
1222	for (i = 0; i < NPGPTD; i++) {
1223		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1224			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1225	}
1226
1227	mtx_lock_spin(&allpmaps_lock);
1228	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1229	mtx_unlock_spin(&allpmaps_lock);
1230	/* Wire in kernel global address entries. */
1231	/* XXX copies current process, does not fill in MPPTDI */
1232	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1233#ifdef SMP
1234	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1235#endif
1236
1237	/* install self-referential address mapping entry(s) */
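	/*
	 * Pointing a pde at the page directory itself makes all of this
	 * pmap's page table pages visible as a linear array of ptes at a
	 * fixed va while the pmap is active, which is what vtopte() and
	 * the recursive PTmap rely on.
	 */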
1238	for (i = 0; i < NPGPTD; i++) {
1239		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1240		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1241#ifdef PAE
1242		pmap->pm_pdpt[i] = pa | PG_V;
1243#endif
1244	}
1245
1246	pmap->pm_active = 0;
1247	TAILQ_INIT(&pmap->pm_pvchunk);
1248	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1249}
1250
1251/*
1252 * This routine is called if the page table page is not
1253 * mapped correctly.
1254 */
1255static vm_page_t
1256_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1257{
1258	vm_paddr_t ptepa;
1259	vm_page_t m;
1260
1261	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1262	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1263	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1264
1265	/*
1266	 * Allocate a page table page.
1267	 */
1268	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1269	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1270		if (flags & M_WAITOK) {
1271			PMAP_UNLOCK(pmap);
1272			vm_page_unlock_queues();
1273			VM_WAIT;
1274			vm_page_lock_queues();
1275			PMAP_LOCK(pmap);
1276		}
1277
1278		/*
1279		 * Indicate the need to retry.  While waiting, the page table
1280		 * page may have been allocated.
1281		 */
1282		return (NULL);
1283	}
1284	if ((m->flags & PG_ZERO) == 0)
1285		pmap_zero_page(m);
1286
1287	/*
1288	 * Map the pagetable page into the process address space, if
1289	 * it isn't already there.
1290	 */
1291
1292	pmap->pm_stats.resident_count++;
1293
1294	ptepa = VM_PAGE_TO_PHYS(m);
1295	pmap->pm_pdir[ptepindex] =
1296		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1297
1298	return m;
1299}
1300
1301static vm_page_t
1302pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1303{
1304	unsigned ptepindex;
1305	pd_entry_t ptepa;
1306	vm_page_t m;
1307
1308	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1309	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1310	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1311
1312	/*
1313	 * Calculate pagetable page index
1314	 */
1315	ptepindex = va >> PDRSHIFT;
1316retry:
1317	/*
1318	 * Get the page directory entry
1319	 */
1320	ptepa = pmap->pm_pdir[ptepindex];
1321
1322	/*
1323	 * This supports switching from a 4MB page to a
1324	 * normal 4K page.
1325	 */
1326	if (ptepa & PG_PS) {
1327		pmap->pm_pdir[ptepindex] = 0;
1328		ptepa = 0;
1329		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1330		pmap_invalidate_all(kernel_pmap);
1331	}
1332
1333	/*
1334	 * If the page table page is mapped, we just increment the
1335	 * hold count, and activate it.
1336	 */
1337	if (ptepa) {
1338		m = PHYS_TO_VM_PAGE(ptepa);
1339		m->wire_count++;
1340	} else {
1341		/*
1342		 * Here if the pte page isn't mapped, or if it has
1343		 * been deallocated.
1344		 */
1345		m = _pmap_allocpte(pmap, ptepindex, flags);
1346		if (m == NULL && (flags & M_WAITOK))
1347			goto retry;
1348	}
1349	return (m);
1350}
1351
1352
1353/***************************************************
1354 * Pmap allocation/deallocation routines.
1355 ***************************************************/
1356
1357#ifdef SMP
1358/*
1359 * Deal with a SMP shootdown of other users of the pmap that we are
1360 * trying to dispose of.  This can be a bit hairy.
1361 */
1362static u_int *lazymask;
1363static u_int lazyptd;
1364static volatile u_int lazywait;
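/*
 * lazyptd is the physical address of the page directory (or pdpt under
 * PAE) being torn down, lazymask points at that pmap's pm_active mask,
 * and lazywait is set to 1 by the IPI handler to signal completion.
 */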
1365
1366void pmap_lazyfix_action(void);
1367
1368void
1369pmap_lazyfix_action(void)
1370{
1371	u_int mymask = PCPU_GET(cpumask);
1372
1373#ifdef COUNT_IPIS
1374	*ipi_lazypmap_counts[PCPU_GET(cpuid)]++;
1375#endif
1376	if (rcr3() == lazyptd)
1377		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1378	atomic_clear_int(lazymask, mymask);
1379	atomic_store_rel_int(&lazywait, 1);
1380}
1381
1382static void
1383pmap_lazyfix_self(u_int mymask)
1384{
1385
1386	if (rcr3() == lazyptd)
1387		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1388	atomic_clear_int(lazymask, mymask);
1389}
1390
1391
1392static void
1393pmap_lazyfix(pmap_t pmap)
1394{
1395	u_int mymask;
1396	u_int mask;
1397	register u_int spins;
1398
1399	while ((mask = pmap->pm_active) != 0) {
1400		spins = 50000000;
1401		mask = mask & -mask;	/* Find least significant set bit */
1402		mtx_lock_spin(&smp_ipi_mtx);
1403#ifdef PAE
1404		lazyptd = vtophys(pmap->pm_pdpt);
1405#else
1406		lazyptd = vtophys(pmap->pm_pdir);
1407#endif
1408		mymask = PCPU_GET(cpumask);
1409		if (mask == mymask) {
1410			lazymask = &pmap->pm_active;
1411			pmap_lazyfix_self(mymask);
1412		} else {
1413			atomic_store_rel_int((u_int *)&lazymask,
1414			    (u_int)&pmap->pm_active);
1415			atomic_store_rel_int(&lazywait, 0);
1416			ipi_selected(mask, IPI_LAZYPMAP);
1417			while (lazywait == 0) {
1418				ia32_pause();
1419				if (--spins == 0)
1420					break;
1421			}
1422		}
1423		mtx_unlock_spin(&smp_ipi_mtx);
1424		if (spins == 0)
1425			printf("pmap_lazyfix: spun for 50000000\n");
1426	}
1427}
1428
1429#else	/* SMP */
1430
1431/*
1432 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1433 * unlikely to have to even execute this code, including the fact
1434 * that the cleanup is deferred until the parent does a wait(2), which
1435 * means that another userland process has run.
1436 */
1437static void
1438pmap_lazyfix(pmap_t pmap)
1439{
1440	u_int cr3;
1441
1442	cr3 = vtophys(pmap->pm_pdir);
1443	if (cr3 == rcr3()) {
1444		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1445		pmap->pm_active &= ~(PCPU_GET(cpumask));
1446	}
1447}
1448#endif	/* SMP */
1449
1450/*
1451 * Release any resources held by the given physical map.
1452 * Called when a pmap initialized by pmap_pinit is being released.
1453 * Should only be called if the map contains no valid mappings.
1454 */
1455void
1456pmap_release(pmap_t pmap)
1457{
1458	vm_page_t m, ptdpg[NPGPTD];
1459	int i;
1460
1461	KASSERT(pmap->pm_stats.resident_count == 0,
1462	    ("pmap_release: pmap resident count %ld != 0",
1463	    pmap->pm_stats.resident_count));
1464
1465	pmap_lazyfix(pmap);
1466	mtx_lock_spin(&allpmaps_lock);
1467	LIST_REMOVE(pmap, pm_list);
1468	mtx_unlock_spin(&allpmaps_lock);
1469
1470	for (i = 0; i < NPGPTD; i++)
1471		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1472
1473	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1474	    sizeof(*pmap->pm_pdir));
1475#ifdef SMP
1476	pmap->pm_pdir[MPPTDI] = 0;
1477#endif
1478
1479	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1480
1481	vm_page_lock_queues();
1482	for (i = 0; i < NPGPTD; i++) {
1483		m = ptdpg[i];
1484#ifdef PAE
1485		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1486		    ("pmap_release: got wrong ptd page"));
1487#endif
1488		m->wire_count--;
1489		atomic_subtract_int(&cnt.v_wire_count, 1);
1490		vm_page_free_zero(m);
1491	}
1492	vm_page_unlock_queues();
1493	PMAP_LOCK_DESTROY(pmap);
1494}
1495
1496static int
1497kvm_size(SYSCTL_HANDLER_ARGS)
1498{
1499	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1500
1501	return sysctl_handle_long(oidp, &ksize, 0, req);
1502}
1503SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1504    0, 0, kvm_size, "IU", "Size of KVM");
1505
1506static int
1507kvm_free(SYSCTL_HANDLER_ARGS)
1508{
1509	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1510
1511	return sysctl_handle_long(oidp, &kfree, 0, req);
1512}
1513SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1514    0, 0, kvm_free, "IU", "Amount of KVM free");
1515
1516/*
1517 * grow the number of kernel page table entries, if needed
1518 */
1519void
1520pmap_growkernel(vm_offset_t addr)
1521{
1522	struct pmap *pmap;
1523	vm_paddr_t ptppaddr;
1524	vm_page_t nkpg;
1525	pd_entry_t newpdir;
1526	pt_entry_t *pde;
1527
1528	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
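	/*
	 * On first use, walk the bootstrap page directory to find how far
	 * the kernel page tables already extend and count them in nkpt.
	 */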
1529	if (kernel_vm_end == 0) {
1530		kernel_vm_end = KERNBASE;
1531		nkpt = 0;
1532		while (pdir_pde(PTD, kernel_vm_end)) {
1533			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1534			nkpt++;
1535			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1536				kernel_vm_end = kernel_map->max_offset;
1537				break;
1538			}
1539		}
1540	}
1541	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1542	if (addr - 1 >= kernel_map->max_offset)
1543		addr = kernel_map->max_offset;
1544	while (kernel_vm_end < addr) {
1545		if (pdir_pde(PTD, kernel_vm_end)) {
1546			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1547			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1548				kernel_vm_end = kernel_map->max_offset;
1549				break;
1550			}
1551			continue;
1552		}
1553
1554		/*
1555		 * This index is bogus, but out of the way
1556		 */
1557		nkpg = vm_page_alloc(NULL, nkpt,
1558		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1559		if (!nkpg)
1560			panic("pmap_growkernel: no memory to grow kernel");
1561
1562		nkpt++;
1563
1564		pmap_zero_page(nkpg);
1565		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1566		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1567		pdir_pde(PTD, kernel_vm_end) = newpdir;
1568
1569		mtx_lock_spin(&allpmaps_lock);
1570		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1571			pde = pmap_pde(pmap, kernel_vm_end);
1572			pde_store(pde, newpdir);
1573		}
1574		mtx_unlock_spin(&allpmaps_lock);
1575		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1576		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1577			kernel_vm_end = kernel_map->max_offset;
1578			break;
1579		}
1580	}
1581}
1582
1583
1584/***************************************************
1585 * page management routines.
1586 ***************************************************/
1587
1588CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1589CTASSERT(_NPCM == 11);
1590
1591static __inline struct pv_chunk *
1592pv_to_chunk(pv_entry_t pv)
1593{
1594
1595	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1596}
1597
1598#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1599
1600#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
1601#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
1602
1603static uint64_t pc_freemask[11] = {
1604	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1605	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1606	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1607	PC_FREE0_9, PC_FREE10
1608};
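/*
 * Each pv_chunk occupies exactly one page and holds a small header (the
 * owning pmap, list linkage, and the pc_map free bitmap) followed by
 * _NPCPV pv entries (336 on i386).  A set bit in pc_map marks a free
 * slot; a chunk whose pc_map equals pc_freemask is entirely free and
 * may be returned to the system.
 */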
1609
1610SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1611	"Current number of pv entries");
1612
1613#ifdef PV_STATS
1614static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1615
1616SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1617	"Current number of pv entry chunks");
1618SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1619	"Current number of pv entry chunks allocated");
1620SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1621	"Current number of pv entry chunks frees");
1622SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1623	"Number of times tried to get a chunk page but failed.");
1624
1625static long pv_entry_frees, pv_entry_allocs;
1626static int pv_entry_spare;
1627
1628SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1629	"Current number of pv entry frees");
1630SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1631	"Current number of pv entry allocs");
1632SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1633	"Current number of spare pv entries");
1634
1635static int pmap_collect_inactive, pmap_collect_active;
1636
1637SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1638	"Current number times pmap_collect called on inactive queue");
1639SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1640	"Current number times pmap_collect called on active queue");
1641#endif
1642
1643/*
1644 * We are in a serious low memory condition.  Resort to
1645 * drastic measures to free some pages so we can allocate
1646 * another pv entry chunk.  This is normally called to
1647 * unmap inactive pages, and if necessary, active pages.
1648 */
1649static void
1650pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1651{
1652	pmap_t pmap;
1653	pt_entry_t *pte, tpte;
1654	pv_entry_t next_pv, pv;
1655	vm_offset_t va;
1656	vm_page_t m;
1657
1658	sched_pin();
1659	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1660		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1661			continue;
1662		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1663			va = pv->pv_va;
1664			pmap = PV_PMAP(pv);
1665			/* Avoid deadlock and lock recursion. */
1666			if (pmap > locked_pmap)
1667				PMAP_LOCK(pmap);
1668			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1669				continue;
1670			pmap->pm_stats.resident_count--;
1671			pte = pmap_pte_quick(pmap, va);
1672			tpte = pte_load_clear(pte);
1673			KASSERT((tpte & PG_W) == 0,
1674			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
1675			if (tpte & PG_A)
1676				vm_page_flag_set(m, PG_REFERENCED);
1677			if (tpte & PG_M) {
1678				KASSERT((tpte & PG_RW),
1679	("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
1680				    va, (uintmax_t)tpte));
1681				vm_page_dirty(m);
1682			}
1683			pmap_invalidate_page(pmap, va);
1684			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1685			if (TAILQ_EMPTY(&m->md.pv_list))
1686				vm_page_flag_clear(m, PG_WRITEABLE);
1687			m->md.pv_list_count--;
1688			pmap_unuse_pt(pmap, va);
1689			if (pmap != locked_pmap)
1690				PMAP_UNLOCK(pmap);
1691			free_pv_entry(locked_pmap, pv);
1692		}
1693	}
1694	sched_unpin();
1695}
1696
1697
1698/*
1699 * free the pv_entry back to the free list
1700 */
1701static void
1702free_pv_entry(pmap_t pmap, pv_entry_t pv)
1703{
1704	vm_page_t m;
1705	struct pv_chunk *pc;
1706	int idx, field, bit;
1707
1708	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1709	PV_STAT(pv_entry_frees++);
1710	PV_STAT(pv_entry_spare++);
1711	pv_entry_count--;
1712	pc = pv_to_chunk(pv);
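	/* Mark the entry's slot free in its chunk's bitmap. */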
1713	idx = pv - &pc->pc_pventry[0];
1714	field = idx / 32;
1715	bit = idx % 32;
1716	pc->pc_map[field] |= 1ul << bit;
1717	/* move to head of list */
1718	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1719	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1720	for (idx = 0; idx < _NPCM; idx++)
1721		if (pc->pc_map[idx] != pc_freemask[idx])
1722			return;
1723	PV_STAT(pv_entry_spare -= _NPCPV);
1724	PV_STAT(pc_chunk_count--);
1725	PV_STAT(pc_chunk_frees++);
1726	/* entire chunk is free, return it */
1727	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1728	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
1729	pmap_qremove((vm_offset_t)pc, 1);
1730	vm_page_unwire(m, 0);
1731	vm_page_free(m);
1732	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1733}
1734
1735/*
1736 * get a new pv_entry, allocating a block from the system
1737 * when needed.
1738 */
1739static pv_entry_t
1740get_pv_entry(pmap_t pmap, int try)
1741{
1742	static const struct timeval printinterval = { 60, 0 };
1743	static struct timeval lastprint;
1744	static vm_pindex_t colour;
1745	int bit, field, page_req;
1746	pv_entry_t pv;
1747	struct pv_chunk *pc;
1748	vm_page_t m;
1749
1750	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1751	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1752	PV_STAT(pv_entry_allocs++);
1753	pv_entry_count++;
1754	if (pv_entry_count > pv_entry_high_water)
1755		pagedaemon_wakeup();
1756	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1757	if (pc != NULL) {
1758		for (field = 0; field < _NPCM; field++) {
1759			if (pc->pc_map[field]) {
1760				bit = bsfl(pc->pc_map[field]);
1761				break;
1762			}
1763		}
1764		if (field < _NPCM) {
1765			pv = &pc->pc_pventry[field * 32 + bit];
1766			pc->pc_map[field] &= ~(1ul << bit);
1767			/* If this was the last item, move it to tail */
1768			for (field = 0; field < _NPCM; field++)
1769				if (pc->pc_map[field] != 0) {
1770					PV_STAT(pv_entry_spare--);
1771					return (pv);	/* not full, return */
1772				}
1773			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1774			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1775			PV_STAT(pv_entry_spare--);
1776			return (pv);
1777		}
1778	}
1779	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1780	page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM;
1781	m = vm_page_alloc(NULL, colour, page_req |
1782	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1783	if (m == NULL || pc == NULL) {
1784		if (try) {
1785			pv_entry_count--;
1786			PV_STAT(pc_chunk_tryfail++);
1787			if (m) {
1788				vm_page_lock_queues();
1789				vm_page_unwire(m, 0);
1790				vm_page_free(m);
1791				vm_page_unlock_queues();
1792			}
1793			if (pc)
1794				pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1795			return (NULL);
1796		}
1797		/*
1798		 * Reclaim pv entries: At first, destroy mappings to
1799		 * inactive pages.  After that, if a pv chunk entry
1800		 * is still needed, destroy mappings to active pages.
1801		 */
1802		if (ratecheck(&lastprint, &printinterval))
1803			printf("Approaching the limit on PV entries, "
1804			    "consider increasing tunables "
1805			    "vm.pmap.shpgperproc or "
1806			    "vm.pmap.pv_entry_max\n");
1807		PV_STAT(pmap_collect_inactive++);
1808		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
1809		if (m == NULL)
1810			m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM |
1811			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1812		if (pc == NULL)
1813			pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1814		if (m == NULL || pc == NULL) {
1815			PV_STAT(pmap_collect_active++);
1816			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
1817			if (m == NULL)
1818				m = vm_page_alloc(NULL, colour,
1819				    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
1820				    VM_ALLOC_WIRED);
1821			if (pc == NULL)
1822				pc = (struct pv_chunk *)
1823				    pmap_ptelist_alloc(&pv_vafree);
1824			if (m == NULL || pc == NULL)
1825				panic("get_pv_entry: increase vm.pmap.shpgperproc");
1826		}
1827	}
1828	PV_STAT(pc_chunk_count++);
1829	PV_STAT(pc_chunk_allocs++);
1830	colour++;
1831	pmap_qenter((vm_offset_t)pc, &m, 1);
1832	pc->pc_pmap = pmap;
1833	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
1834	for (field = 1; field < _NPCM; field++)
1835		pc->pc_map[field] = pc_freemask[field];
1836	pv = &pc->pc_pventry[0];
1837	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1838	PV_STAT(pv_entry_spare += _NPCPV - 1);
1839	return (pv);
1840}
1841
1842static void
1843pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1844{
1845	pv_entry_t pv;
1846
1847	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1848	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1849	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1850		if (pmap == PV_PMAP(pv) && va == pv->pv_va)
1851			break;
1852	}
1853	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1854	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1855	m->md.pv_list_count--;
1856	if (TAILQ_EMPTY(&m->md.pv_list))
1857		vm_page_flag_clear(m, PG_WRITEABLE);
1858	free_pv_entry(pmap, pv);
1859}
1860
1861/*
1862 * Create a pv entry for page at pa for
1863 * (pmap, va).
1864 */
1865static void
1866pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1867{
1868	pv_entry_t pv;
1869
1870	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1871	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1872	pv = get_pv_entry(pmap, FALSE);
1873	pv->pv_va = va;
1874	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1875	m->md.pv_list_count++;
1876}
1877
1878/*
1879 * Conditionally create a pv entry.
1880 */
1881static boolean_t
1882pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1883{
1884	pv_entry_t pv;
1885
1886	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1887	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1888	if (pv_entry_count < pv_entry_high_water &&
1889	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
1890		pv->pv_va = va;
1891		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1892		m->md.pv_list_count++;
1893		return (TRUE);
1894	} else
1895		return (FALSE);
1896}
1897
1898/*
1899 * pmap_remove_pte: unmap a single page from a process address space.
1900 */
1901static int
1902pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1903{
1904	pt_entry_t oldpte;
1905	vm_page_t m;
1906
1907	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1908	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1909	oldpte = pte_load_clear(ptq);
1910	if (oldpte & PG_W)
1911		pmap->pm_stats.wired_count -= 1;
1912	/*
1913	 * Machines that don't support invlpg also don't support
1914	 * PG_G.
1915	 */
1916	if (oldpte & PG_G)
1917		pmap_invalidate_page(kernel_pmap, va);
1918	pmap->pm_stats.resident_count -= 1;
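	/*
	 * For a managed page, transfer the hardware-maintained modified
	 * (PG_M) and accessed (PG_A) bits from the old pte to the vm_page
	 * before releasing the pv entry, so that dirty and reference
	 * information is not lost.
	 */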
1919	if (oldpte & PG_MANAGED) {
1920		m = PHYS_TO_VM_PAGE(oldpte);
1921		if (oldpte & PG_M) {
1922			KASSERT((oldpte & PG_RW),
1923	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
1924			    va, (uintmax_t)oldpte));
1925			vm_page_dirty(m);
1926		}
1927		if (oldpte & PG_A)
1928			vm_page_flag_set(m, PG_REFERENCED);
1929		pmap_remove_entry(pmap, m, va);
1930	}
1931	return (pmap_unuse_pt(pmap, va));
1932}
1933
1934/*
1935 * Remove a single page from a process address space
1936 */
1937static void
1938pmap_remove_page(pmap_t pmap, vm_offset_t va)
1939{
1940	pt_entry_t *pte;
1941
1942	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1943	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1944	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1945	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1946		return;
1947	pmap_remove_pte(pmap, pte, va);
1948	pmap_invalidate_page(pmap, va);
1949}
1950
1951/*
1952 *	Remove the given range of addresses from the specified map.
1953 *
1954 *	It is assumed that the start and end are properly
1955 *	rounded to the page size.
1956 */
1957void
1958pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1959{
1960	vm_offset_t pdnxt;
1961	pd_entry_t ptpaddr;
1962	pt_entry_t *pte;
1963	int anyvalid;
1964
1965	/*
1966	 * Perform an unsynchronized read.  This is, however, safe.
1967	 */
1968	if (pmap->pm_stats.resident_count == 0)
1969		return;
1970
1971	anyvalid = 0;
1972
1973	vm_page_lock_queues();
1974	sched_pin();
1975	PMAP_LOCK(pmap);
1976
1977	/*
1978	 * Special handling for removing a single page.  This is a very
1979	 * common operation, and it is easy to short-circuit some code
1980	 * for it.
1981	 */
1982	if ((sva + PAGE_SIZE == eva) &&
1983	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1984		pmap_remove_page(pmap, sva);
1985		goto out;
1986	}
1987
1988	for (; sva < eva; sva = pdnxt) {
1989		unsigned pdirindex;
1990
1991		/*
1992		 * Calculate index for next page table.
1993		 */
1994		pdnxt = (sva + NBPDR) & ~PDRMASK;
1995		if (pmap->pm_stats.resident_count == 0)
1996			break;
1997
1998		pdirindex = sva >> PDRSHIFT;
1999		ptpaddr = pmap->pm_pdir[pdirindex];
2000
2001		/*
2002		 * Weed out invalid mappings.  Note: the page directory table
2003		 * is assumed to be allocated and in kernel virtual memory.
2004		 */
2005		if (ptpaddr == 0)
2006			continue;
2007
2008		/*
2009		 * Check for large page.
2010		 */
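		/*
		 * A large (PG_PS) mapping is torn down simply by clearing
		 * its page directory entry; presumably such mappings are
		 * unmanaged (e.g. created by pmap_object_init_pt()), so no
		 * pv list maintenance is required here.
		 */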
2011		if ((ptpaddr & PG_PS) != 0) {
2012			pmap->pm_pdir[pdirindex] = 0;
2013			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2014			anyvalid = 1;
2015			continue;
2016		}
2017
2018		/*
2019		 * Limit our scan to either the end of the va represented
2020		 * by the current page table page, or to the end of the
2021		 * range being removed.
2022		 */
2023		if (pdnxt > eva)
2024			pdnxt = eva;
2025
2026		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2027		    sva += PAGE_SIZE) {
2028			if (*pte == 0)
2029				continue;
2030
2031			/*
2032			 * The TLB entry for a PG_G mapping is invalidated
2033			 * by pmap_remove_pte().
2034			 */
2035			if ((*pte & PG_G) == 0)
2036				anyvalid = 1;
2037			if (pmap_remove_pte(pmap, pte, sva))
2038				break;
2039		}
2040	}
2041out:
2042	sched_unpin();
2043	vm_page_unlock_queues();
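	/*
	 * For a multi-page removal a single invalidate-all is issued
	 * rather than one invalidation per page; global (PG_G) mappings
	 * were already invalidated individually by pmap_remove_pte().
	 */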
2044	if (anyvalid)
2045		pmap_invalidate_all(pmap);
2046	PMAP_UNLOCK(pmap);
2047}
2048
2049/*
2050 *	Routine:	pmap_remove_all
2051 *	Function:
2052 *		Removes this physical page from
2053 *		all physical maps in which it resides.
2054 *		Reflects back modify bits to the pager.
2055 *
2056 *	Notes:
2057 *		Original versions of this routine were very
2058 *		inefficient because they iteratively called
2059 *		pmap_remove (slow...)
2060 */
2061
2062void
2063pmap_remove_all(vm_page_t m)
2064{
2065	register pv_entry_t pv;
2066	pmap_t pmap;
2067	pt_entry_t *pte, tpte;
2068
2069#if defined(PMAP_DIAGNOSTIC)
2070	/*
2071	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
2072	 */
2073	if (m->flags & PG_FICTITIOUS) {
2074		panic("pmap_remove_all: illegal for unmanaged page, pa: 0x%x",
2075		    VM_PAGE_TO_PHYS(m));
2076	}
2077#endif
2078	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2079	sched_pin();
2080	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2081		pmap = PV_PMAP(pv);
2082		PMAP_LOCK(pmap);
2083		pmap->pm_stats.resident_count--;
2084		pte = pmap_pte_quick(pmap, pv->pv_va);
2085		tpte = pte_load_clear(pte);
2086		if (tpte & PG_W)
2087			pmap->pm_stats.wired_count--;
2088		if (tpte & PG_A)
2089			vm_page_flag_set(m, PG_REFERENCED);
2090
2091		/*
2092		 * Update the vm_page_t clean and reference bits.
2093		 */
2094		if (tpte & PG_M) {
2095			KASSERT((tpte & PG_RW),
2096	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
2097			    pv->pv_va, (uintmax_t)tpte));
2098			vm_page_dirty(m);
2099		}
2100		pmap_invalidate_page(pmap, pv->pv_va);
2101		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2102		m->md.pv_list_count--;
2103		pmap_unuse_pt(pmap, pv->pv_va);
2104		PMAP_UNLOCK(pmap);
2105		free_pv_entry(pmap, pv);
2106	}
2107	vm_page_flag_clear(m, PG_WRITEABLE);
2108	sched_unpin();
2109}
2110
2111/*
2112 *	Set the physical protection on the
2113 *	specified range of this map as requested.
2114 */
2115void
2116pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2117{
2118	vm_offset_t pdnxt;
2119	pd_entry_t ptpaddr;
2120	pt_entry_t *pte;
2121	int anychanged;
2122
2123	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2124		pmap_remove(pmap, sva, eva);
2125		return;
2126	}
2127
2128	if (prot & VM_PROT_WRITE)
2129		return;
2130
2131	anychanged = 0;
2132
2133	vm_page_lock_queues();
2134	sched_pin();
2135	PMAP_LOCK(pmap);
2136	for (; sva < eva; sva = pdnxt) {
2137		unsigned obits, pbits, pdirindex;
2138
2139		pdnxt = (sva + NBPDR) & ~PDRMASK;
2140
2141		pdirindex = sva >> PDRSHIFT;
2142		ptpaddr = pmap->pm_pdir[pdirindex];
2143
2144		/*
2145		 * Weed out invalid mappings.  Note: the page directory table
2146		 * is assumed to be allocated and in kernel virtual memory.
2147		 */
2148		if (ptpaddr == 0)
2149			continue;
2150
2151		/*
2152		 * Check for large page.
2153		 */
2154		if ((ptpaddr & PG_PS) != 0) {
2155			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2156			anychanged = 1;
2157			continue;
2158		}
2159
2160		if (pdnxt > eva)
2161			pdnxt = eva;
2162
2163		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2164		    sva += PAGE_SIZE) {
2165			vm_page_t m;
2166
2167retry:
2168			/*
2169			 * Regardless of whether a pte is 32 or 64 bits in
2170			 * size, PG_RW, PG_A, and PG_M are among the least
2171			 * significant 32 bits.
2172			 */
2173			obits = pbits = *(u_int *)pte;
2174			if (pbits & PG_MANAGED) {
2175				m = NULL;
2176				if (pbits & PG_A) {
2177					m = PHYS_TO_VM_PAGE(*pte);
2178					vm_page_flag_set(m, PG_REFERENCED);
2179					pbits &= ~PG_A;
2180				}
2181				if ((pbits & PG_M) != 0) {
2182					if (m == NULL)
2183						m = PHYS_TO_VM_PAGE(*pte);
2184					vm_page_dirty(m);
2185				}
2186			}
2187
2188			pbits &= ~(PG_RW | PG_M);
2189
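			/*
			 * The pte update is done with an atomic
			 * compare-and-swap and retried on failure,
			 * presumably because the processor may set PG_A or
			 * PG_M in the entry concurrently.
			 */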
2190			if (pbits != obits) {
2191				if (!atomic_cmpset_int((u_int *)pte, obits,
2192				    pbits))
2193					goto retry;
2194				if (obits & PG_G)
2195					pmap_invalidate_page(pmap, sva);
2196				else
2197					anychanged = 1;
2198			}
2199		}
2200	}
2201	sched_unpin();
2202	vm_page_unlock_queues();
2203	if (anychanged)
2204		pmap_invalidate_all(pmap);
2205	PMAP_UNLOCK(pmap);
2206}
2207
2208/*
2209 *	Insert the given physical page (m) at
2210 *	the specified virtual address (va) in the
2211 *	target physical map with the protection requested.
2212 *
2213 *	If specified, the page will be wired down, meaning
2214 *	that the related pte cannot be reclaimed.
2215 *
2216 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2217 *	or lose information.  That is, this routine must actually
2218 *	insert this page into the given map NOW.
2219 */
2220void
2221pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2222	   boolean_t wired)
2223{
2224	vm_paddr_t pa;
2225	pd_entry_t *pde;
2226	register pt_entry_t *pte;
2227	vm_paddr_t opa;
2228	pt_entry_t origpte, newpte;
2229	vm_page_t mpte, om;
2230	boolean_t invlva;
2231
2232	va &= PG_FRAME;
2233#ifdef PMAP_DIAGNOSTIC
2234	if (va > VM_MAX_KERNEL_ADDRESS)
2235		panic("pmap_enter: toobig");
2236	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2237		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2238#endif
2239
2240	mpte = NULL;
2241
2242	vm_page_lock_queues();
2243	PMAP_LOCK(pmap);
2244	sched_pin();
2245
2246	/*
2247	 * In the case that a page table page is not
2248	 * resident, we are creating it here.
2249	 */
2250	if (va < VM_MAXUSER_ADDRESS) {
2251		mpte = pmap_allocpte(pmap, va, M_WAITOK);
2252	}
2253#if 0 && defined(PMAP_DIAGNOSTIC)
2254	else {
2255		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2256		origpte = *pdeaddr;
2257		if ((origpte & PG_V) == 0) {
2258			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2259				pmap->pm_pdir[PTDPTDI], origpte, va);
2260		}
2261	}
2262#endif
2263
2264	pde = pmap_pde(pmap, va);
2265	if ((*pde & PG_PS) != 0)
2266		panic("pmap_enter: attempted pmap_enter on 4MB page");
2267	pte = pmap_pte_quick(pmap, va);
2268
2269	/*
2270	 * Page directory table entry is not valid; we need a new PT page.
2271	 */
2272	if (pte == NULL) {
2273		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
2274			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
2275	}
2276
2277	pa = VM_PAGE_TO_PHYS(m);
2278	om = NULL;
2279	origpte = *pte;
2280	opa = origpte & PG_FRAME;
2281
2282	/*
2283	 * Mapping has not changed; this must be a protection or wiring change.
2284	 */
2285	if (origpte && (opa == pa)) {
2286		/*
2287		 * Wiring change, just update stats. We don't worry about
2288		 * wiring PT pages as they remain resident as long as there
2289		 * are valid mappings in them. Hence, if a user page is wired,
2290		 * the PT page will be also.
2291		 */
2292		if (wired && ((origpte & PG_W) == 0))
2293			pmap->pm_stats.wired_count++;
2294		else if (!wired && (origpte & PG_W))
2295			pmap->pm_stats.wired_count--;
2296
2297		/*
2298		 * Remove extra pte reference
2299		 */
2300		if (mpte)
2301			mpte->wire_count--;
2302
2303		/*
2304		 * We might be turning off write access to the page,
2305		 * so we go ahead and sense modify status.
2306		 */
2307		if (origpte & PG_MANAGED) {
2308			om = m;
2309			pa |= PG_MANAGED;
2310		}
2311		goto validate;
2312	}
2313	/*
2314	 * Mapping has changed, invalidate old range and fall through to
2315	 * handle validating new mapping.
2316	 */
2317	if (opa) {
2318		if (origpte & PG_W)
2319			pmap->pm_stats.wired_count--;
2320		if (origpte & PG_MANAGED) {
2321			om = PHYS_TO_VM_PAGE(opa);
2322			pmap_remove_entry(pmap, om, va);
2323		}
2324		if (mpte != NULL) {
2325			mpte->wire_count--;
2326			KASSERT(mpte->wire_count > 0,
2327			    ("pmap_enter: missing reference to page table page,"
2328			     " va: 0x%x", va));
2329		}
2330	} else
2331		pmap->pm_stats.resident_count++;
2332
2333	/*
2334	 * Enter on the PV list if part of our managed memory.
2335	 */
2336	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2337		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2338		    ("pmap_enter: managed mapping within the clean submap"));
2339		pmap_insert_entry(pmap, va, m);
2340		pa |= PG_MANAGED;
2341	}
2342
2343	/*
2344	 * Increment counters
2345	 */
2346	if (wired)
2347		pmap->pm_stats.wired_count++;
2348
2349validate:
2350	/*
2351	 * Now validate mapping with desired protection/wiring.
2352	 */
2353	newpte = (pt_entry_t)(pa | PG_V);
2354	if ((prot & VM_PROT_WRITE) != 0)
2355		newpte |= PG_RW;
2356	if (wired)
2357		newpte |= PG_W;
2358	if (va < VM_MAXUSER_ADDRESS)
2359		newpte |= PG_U;
2360	if (pmap == kernel_pmap)
2361		newpte |= pgeflag;
2362
2363	/*
2364	 * if the mapping or permission bits are different, we need
2365	 * to update the pte.
2366	 */
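	/*
	 * If the old pte was valid, pte_load_store() replaces it
	 * atomically; the PG_A and PG_M bits of the returned value
	 * determine whether the old mapping's reference/dirty state must
	 * be recorded and whether a TLB invalidation (invlva) is needed.
	 */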
2367	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2368		if (origpte & PG_V) {
2369			invlva = FALSE;
2370			origpte = pte_load_store(pte, newpte | PG_A);
2371			if (origpte & PG_A) {
2372				if (origpte & PG_MANAGED)
2373					vm_page_flag_set(om, PG_REFERENCED);
2374				if (opa != VM_PAGE_TO_PHYS(m))
2375					invlva = TRUE;
2376			}
2377			if (origpte & PG_M) {
2378				KASSERT((origpte & PG_RW),
2379	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
2380				    va, (uintmax_t)origpte));
2381				if ((origpte & PG_MANAGED) != 0)
2382					vm_page_dirty(om);
2383				if ((prot & VM_PROT_WRITE) == 0)
2384					invlva = TRUE;
2385			}
2386			if (invlva)
2387				pmap_invalidate_page(pmap, va);
2388		} else
2389			pte_store(pte, newpte | PG_A);
2390	}
2391	sched_unpin();
2392	vm_page_unlock_queues();
2393	PMAP_UNLOCK(pmap);
2394}
2395
2396/*
2397 * Maps a sequence of resident pages belonging to the same object.
2398 * The sequence begins with the given page m_start.  This page is
2399 * mapped at the given virtual address start.  Each subsequent page is
2400 * mapped at a virtual address that is offset from start by the same
2401 * amount as the page is offset from m_start within the object.  The
2402 * last page in the sequence is the page with the largest offset from
2403 * m_start that can be mapped at a virtual address less than the given
2404 * virtual address end.  Not every virtual page between start and end
2405 * is mapped; only those for which a resident page exists with the
2406 * corresponding offset from m_start are mapped.
2407 */
2408void
2409pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2410    vm_page_t m_start, vm_prot_t prot)
2411{
2412	vm_page_t m, mpte;
2413	vm_pindex_t diff, psize;
2414
2415	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
2416	psize = atop(end - start);
2417	mpte = NULL;
2418	m = m_start;
2419	PMAP_LOCK(pmap);
2420	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2421		mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
2422		    prot, mpte);
2423		m = TAILQ_NEXT(m, listq);
2424	}
2425 	PMAP_UNLOCK(pmap);
2426}
2427
2428/*
2429 * This code makes some *MAJOR* assumptions:
2430 * 1. The pmap is the current pmap and it exists.
2431 * 2. Not wired.
2432 * 3. Read access.
2433 * 4. No page table pages.
2434 * but is *MUCH* faster than pmap_enter...
2435 */
2436
2437void
2438pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2439{
2440
2441	PMAP_LOCK(pmap);
2442	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
2443	PMAP_UNLOCK(pmap);
2444}
2445
2446static vm_page_t
2447pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2448    vm_prot_t prot, vm_page_t mpte)
2449{
2450	pt_entry_t *pte;
2451	vm_paddr_t pa;
2452
2453	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2454	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
2455	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2456	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2457	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2458
2459	/*
2460	 * In the case that a page table page is not
2461	 * resident, we are creating it here.
2462	 */
2463	if (va < VM_MAXUSER_ADDRESS) {
2464		unsigned ptepindex;
2465		pd_entry_t ptepa;
2466
2467		/*
2468		 * Calculate pagetable page index
2469		 */
2470		ptepindex = va >> PDRSHIFT;
2471		if (mpte && (mpte->pindex == ptepindex)) {
2472			mpte->wire_count++;
2473		} else {
2474			/*
2475			 * Get the page directory entry
2476			 */
2477			ptepa = pmap->pm_pdir[ptepindex];
2478
2479			/*
2480			 * If the page table page is mapped, we just increment
2481			 * the hold count, and activate it.
2482			 */
2483			if (ptepa) {
2484				if (ptepa & PG_PS)
2485					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2486				mpte = PHYS_TO_VM_PAGE(ptepa);
2487				mpte->wire_count++;
2488			} else {
2489				mpte = _pmap_allocpte(pmap, ptepindex,
2490				    M_NOWAIT);
2491				if (mpte == NULL)
2492					return (mpte);
2493			}
2494		}
2495	} else {
2496		mpte = NULL;
2497	}
2498
2499	/*
2500	 * This call to vtopte makes the assumption that we are
2501	 * entering the page into the current pmap.  In order to support
2502	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2503	 * But that isn't as quick as vtopte.
2504	 */
2505	pte = vtopte(va);
2506	if (*pte) {
2507		if (mpte != NULL) {
2508			pmap_unwire_pte_hold(pmap, mpte);
2509			mpte = NULL;
2510		}
2511		return (mpte);
2512	}
2513
2514	/*
2515	 * Enter on the PV list if part of our managed memory.
2516	 */
2517	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
2518	    !pmap_try_insert_pv_entry(pmap, va, m)) {
2519		if (mpte != NULL) {
2520			pmap_unwire_pte_hold(pmap, mpte);
2521			mpte = NULL;
2522		}
2523		return (mpte);
2524	}
2525
2526	/*
2527	 * Increment counters
2528	 */
2529	pmap->pm_stats.resident_count++;
2530
2531	pa = VM_PAGE_TO_PHYS(m);
2532
2533	/*
2534	 * Now validate mapping with RO protection
2535	 */
2536	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2537		pte_store(pte, pa | PG_V | PG_U);
2538	else
2539		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2540	return (mpte);
2541}
2542
2543/*
2544 * Make a temporary mapping for a physical address.  This is only intended
2545 * to be used for panic dumps.
2546 */
2547void *
2548pmap_kenter_temporary(vm_paddr_t pa, int i)
2549{
2550	vm_offset_t va;
2551
2552	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2553	pmap_kenter(va, pa);
2554	invlpg(va);
2555	return ((void *)crashdumpmap);
2556}
2557
2558/*
2559 * This code maps large physical mmap regions into the
2560 * processor address space.  Note that some shortcuts
2561 * are taken, but the code works.
2562 */
2563void
2564pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2565		    vm_object_t object, vm_pindex_t pindex,
2566		    vm_size_t size)
2567{
2568	vm_page_t p;
2569
2570	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2571	KASSERT(object->type == OBJT_DEVICE,
2572	    ("pmap_object_init_pt: non-device object"));
2573	if (pseflag &&
2574	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2575		int i;
2576		vm_page_t m[1];
2577		unsigned int ptepindex;
2578		int npdes;
2579		pd_entry_t ptepa;
2580
2581		PMAP_LOCK(pmap);
2582		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2583			goto out;
2584		PMAP_UNLOCK(pmap);
2585retry:
2586		p = vm_page_lookup(object, pindex);
2587		if (p != NULL) {
2588			vm_page_lock_queues();
2589			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2590				goto retry;
2591		} else {
2592			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2593			if (p == NULL)
2594				return;
2595			m[0] = p;
2596
2597			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2598				vm_page_lock_queues();
2599				vm_page_free(p);
2600				vm_page_unlock_queues();
2601				return;
2602			}
2603
2604			p = vm_page_lookup(object, pindex);
2605			vm_page_lock_queues();
2606			vm_page_wakeup(p);
2607		}
2608		vm_page_unlock_queues();
2609
2610		ptepa = VM_PAGE_TO_PHYS(p);
2611		if (ptepa & (NBPDR - 1))
2612			return;
2613
2614		p->valid = VM_PAGE_BITS_ALL;
2615
2616		PMAP_LOCK(pmap);
2617		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2618		npdes = size >> PDRSHIFT;
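		/*
		 * Install one large (PG_PS) page directory entry for each
		 * NBPDR-sized piece of the region, mapping it read/write
		 * and user-accessible.
		 */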
2619		for (i = 0; i < npdes; i++) {
2620			pde_store(&pmap->pm_pdir[ptepindex],
2621			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
2622			ptepa += NBPDR;
2623			ptepindex += 1;
2624		}
2625		pmap_invalidate_all(pmap);
2626out:
2627		PMAP_UNLOCK(pmap);
2628	}
2629}
2630
2631/*
2632 *	Routine:	pmap_change_wiring
2633 *	Function:	Change the wiring attribute for a map/virtual-address
2634 *			pair.
2635 *	In/out conditions:
2636 *			The mapping must already exist in the pmap.
2637 */
2638void
2639pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2643{
2644	register pt_entry_t *pte;
2645
2646	PMAP_LOCK(pmap);
2647	pte = pmap_pte(pmap, va);
2648
2649	if (wired && !pmap_pte_w(pte))
2650		pmap->pm_stats.wired_count++;
2651	else if (!wired && pmap_pte_w(pte))
2652		pmap->pm_stats.wired_count--;
2653
2654	/*
2655	 * Wiring is not a hardware characteristic so there is no need to
2656	 * invalidate TLB.
2657	 */
2658	pmap_pte_set_w(pte, wired);
2659	pmap_pte_release(pte);
2660	PMAP_UNLOCK(pmap);
2661}
2662
2663
2664
2665/*
2666 *	Copy the range specified by src_addr/len
2667 *	from the source map to the range dst_addr/len
2668 *	in the destination map.
2669 *
2670 *	This routine is only advisory and need not do anything.
2671 */
2672
2673void
2674pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2675	  vm_offset_t src_addr)
2676{
2677	vm_offset_t addr;
2678	vm_offset_t end_addr = src_addr + len;
2679	vm_offset_t pdnxt;
2680
2681	if (dst_addr != src_addr)
2682		return;
2683
2684	if (!pmap_is_current(src_pmap))
2685		return;
2686
2687	vm_page_lock_queues();
2688	if (dst_pmap < src_pmap) {
2689		PMAP_LOCK(dst_pmap);
2690		PMAP_LOCK(src_pmap);
2691	} else {
2692		PMAP_LOCK(src_pmap);
2693		PMAP_LOCK(dst_pmap);
2694	}
2695	sched_pin();
2696	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2697		pt_entry_t *src_pte, *dst_pte;
2698		vm_page_t dstmpte, srcmpte;
2699		pd_entry_t srcptepaddr;
2700		unsigned ptepindex;
2701
2702		if (addr >= UPT_MIN_ADDRESS)
2703			panic("pmap_copy: invalid to pmap_copy page tables");
2704
2705		pdnxt = (addr + NBPDR) & ~PDRMASK;
2706		ptepindex = addr >> PDRSHIFT;
2707
2708		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2709		if (srcptepaddr == 0)
2710			continue;
2711
2712		if (srcptepaddr & PG_PS) {
2713			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2714				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2715				dst_pmap->pm_stats.resident_count +=
2716				    NBPDR / PAGE_SIZE;
2717			}
2718			continue;
2719		}
2720
2721		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2722		if (srcmpte->wire_count == 0)
2723			panic("pmap_copy: source page table page is unused");
2724
2725		if (pdnxt > end_addr)
2726			pdnxt = end_addr;
2727
2728		src_pte = vtopte(addr);
2729		while (addr < pdnxt) {
2730			pt_entry_t ptetemp;
2731			ptetemp = *src_pte;
2732			/*
2733			 * We only virtual-copy managed pages.
2734			 */
2735			if ((ptetemp & PG_MANAGED) != 0) {
2736				/*
2737				 * We have to check after allocpte for the
2738				 * pte still being around...  allocpte can
2739				 * block.
2740				 */
2741				dstmpte = pmap_allocpte(dst_pmap, addr,
2742				    M_NOWAIT);
2743				if (dstmpte == NULL)
2744					break;
2745				dst_pte = pmap_pte_quick(dst_pmap, addr);
2746				if (*dst_pte == 0 &&
2747				    pmap_try_insert_pv_entry(dst_pmap, addr,
2748				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2749					/*
2750					 * Clear the modified and
2751					 * accessed (referenced) bits
2752					 * during the copy.
2753					 */
2754					*dst_pte = ptetemp & ~(PG_M | PG_A);
2755					dst_pmap->pm_stats.resident_count++;
2756	 			} else
2757					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2758				if (dstmpte->wire_count >= srcmpte->wire_count)
2759					break;
2760			}
2761			addr += PAGE_SIZE;
2762			src_pte++;
2763		}
2764	}
2765	sched_unpin();
2766	vm_page_unlock_queues();
2767	PMAP_UNLOCK(src_pmap);
2768	PMAP_UNLOCK(dst_pmap);
2769}
2770
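/*
 * Zero a page with the best routine available: an SSE2 version on
 * 686-class CPUs with SSE2 (when compiled in), the i686 zeroing routine
 * otherwise, and plain bzero() as the fallback.
 */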
2771static __inline void
2772pagezero(void *page)
2773{
2774#if defined(I686_CPU)
2775	if (cpu_class == CPUCLASS_686) {
2776#if defined(CPU_ENABLE_SSE)
2777		if (cpu_feature & CPUID_SSE2)
2778			sse2_pagezero(page);
2779		else
2780#endif
2781			i686_pagezero(page);
2782	} else
2783#endif
2784		bzero(page, PAGE_SIZE);
2785}
2786
2787/*
2788 *	pmap_zero_page zeros the specified hardware page by mapping
2789 *	the page into KVM and using bzero to clear its contents.
2790 */
2791void
2792pmap_zero_page(vm_page_t m)
2793{
2794	struct sysmaps *sysmaps;
2795
2796	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2797	mtx_lock(&sysmaps->lock);
2798	if (*sysmaps->CMAP2)
2799		panic("pmap_zero_page: CMAP2 busy");
2800	sched_pin();
2801	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2802	invlcaddr(sysmaps->CADDR2);
2803	pagezero(sysmaps->CADDR2);
2804	*sysmaps->CMAP2 = 0;
2805	sched_unpin();
2806	mtx_unlock(&sysmaps->lock);
2807}
2808
2809/*
2810 *	pmap_zero_page_area zeros the specified hardware page by mapping
2811 *	the page into KVM and using bzero to clear its contents.
2812 *
2813 *	off and size may not cover an area beyond a single hardware page.
2814 */
2815void
2816pmap_zero_page_area(vm_page_t m, int off, int size)
2817{
2818	struct sysmaps *sysmaps;
2819
2820	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2821	mtx_lock(&sysmaps->lock);
2822	if (*sysmaps->CMAP2)
2823		panic("pmap_zero_page_area: CMAP2 busy");
2824	sched_pin();
2825	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2826	invlcaddr(sysmaps->CADDR2);
2827	if (off == 0 && size == PAGE_SIZE)
2828		pagezero(sysmaps->CADDR2);
2829	else
2830		bzero((char *)sysmaps->CADDR2 + off, size);
2831	*sysmaps->CMAP2 = 0;
2832	sched_unpin();
2833	mtx_unlock(&sysmaps->lock);
2834}
2835
2836/*
2837 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2838 *	the page into KVM and using bzero to clear its contents.  This
2839 *	is intended to be called from the vm_pagezero process only and
2840 *	outside of Giant.
2841 */
2842void
2843pmap_zero_page_idle(vm_page_t m)
2844{
2845
2846	if (*CMAP3)
2847		panic("pmap_zero_page_idle: CMAP3 busy");
2848	sched_pin();
2849	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2850	invlcaddr(CADDR3);
2851	pagezero(CADDR3);
2852	*CMAP3 = 0;
2853	sched_unpin();
2854}
2855
2856/*
2857 *	pmap_copy_page copies the specified (machine independent)
2858 *	page by mapping the page into virtual memory and using
2859 *	bcopy to copy the page, one machine dependent page at a
2860 *	time.
2861 */
2862void
2863pmap_copy_page(vm_page_t src, vm_page_t dst)
2864{
2865	struct sysmaps *sysmaps;
2866
2867	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2868	mtx_lock(&sysmaps->lock);
2869	if (*sysmaps->CMAP1)
2870		panic("pmap_copy_page: CMAP1 busy");
2871	if (*sysmaps->CMAP2)
2872		panic("pmap_copy_page: CMAP2 busy");
2873	sched_pin();
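	/*
	 * CMAP1 maps the source page read-only and CMAP2 maps the
	 * destination read/write; the per-CPU sysmaps (and the sched_pin()
	 * above) keep these temporary mappings local to this CPU.
	 */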
2874	invlpg((u_int)sysmaps->CADDR1);
2875	invlpg((u_int)sysmaps->CADDR2);
2876	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2877	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2878	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2879	*sysmaps->CMAP1 = 0;
2880	*sysmaps->CMAP2 = 0;
2881	sched_unpin();
2882	mtx_unlock(&sysmaps->lock);
2883}
2884
2885/*
2886 * Returns true if the pmap's pv is one of the first
2887 * 16 pvs linked to from this page.  This count may
2888 * be changed upwards or downwards in the future; it
2889 * is only necessary that true be returned for a small
2890 * subset of pmaps for proper page aging.
2891 */
2892boolean_t
2893pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2896{
2897	pv_entry_t pv;
2898	int loops = 0;
2899
2900	if (m->flags & PG_FICTITIOUS)
2901		return (FALSE);
2902
2903	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2904	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2905		if (PV_PMAP(pv) == pmap) {
2906			return (TRUE);
2907		}
2908		loops++;
2909		if (loops >= 16)
2910			break;
2911	}
2912	return (FALSE);
2913}
2914
2915/*
2916 * Remove all pages from the specified address space;
2917 * this aids process exit speeds.  Also, this code
2918 * is special-cased for the current process only, but
2919 * can have the more generic (and slightly slower)
2920 * mode enabled.  This is much faster than pmap_remove
2921 * in the case of running down an entire address space.
2922 */
2923void
2924pmap_remove_pages(pmap_t pmap)
2925{
2926	pt_entry_t *pte, tpte;
2927	vm_page_t m;
2928	pv_entry_t pv;
2929	struct pv_chunk *pc, *npc;
2930	int field, idx;
2931	int32_t bit;
2932	uint32_t inuse, bitmask;
2933	int allfree;
2934
2935	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2936		printf("warning: pmap_remove_pages called with non-current pmap\n");
2937		return;
2938	}
2939	vm_page_lock_queues();
2940	PMAP_LOCK(pmap);
2941	sched_pin();
2942	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
2943		allfree = 1;
2944		for (field = 0; field < _NPCM; field++) {
2945			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
2946			while (inuse != 0) {
2947				bit = bsfl(inuse);
2948				bitmask = 1UL << bit;
2949				idx = field * 32 + bit;
2950				pv = &pc->pc_pventry[idx];
2951				inuse &= ~bitmask;
2952
2953				pte = vtopte(pv->pv_va);
2954				tpte = *pte;
2955
2956				if (tpte == 0) {
2957					printf(
2958					    "TPTE at %p  IS ZERO @ VA %08x\n",
2959					    pte, pv->pv_va);
2960					panic("bad pte");
2961				}
2962
2963/*
2964 * We cannot remove wired pages from a process' mapping at this time
2965 */
2966				if (tpte & PG_W) {
2967					allfree = 0;
2968					continue;
2969				}
2970
2971				m = PHYS_TO_VM_PAGE(tpte);
2972				KASSERT(m->phys_addr == (tpte & PG_FRAME),
2973				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2974				    m, (uintmax_t)m->phys_addr,
2975				    (uintmax_t)tpte));
2976
2977				KASSERT(m < &vm_page_array[vm_page_array_size],
2978					("pmap_remove_pages: bad tpte %#jx",
2979					(uintmax_t)tpte));
2980
2981				pmap->pm_stats.resident_count--;
2982
2983				pte_clear(pte);
2984
2985				/*
2986				 * Update the vm_page_t clean/reference bits.
2987				 */
2988				if (tpte & PG_M)
2989					vm_page_dirty(m);
2990
2991				/* Mark free */
2992				PV_STAT(pv_entry_frees++);
2993				PV_STAT(pv_entry_spare++);
2994				pv_entry_count--;
2995				pc->pc_map[field] |= bitmask;
2996				m->md.pv_list_count--;
2997				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2998				if (TAILQ_EMPTY(&m->md.pv_list))
2999					vm_page_flag_clear(m, PG_WRITEABLE);
3000
3001				pmap_unuse_pt(pmap, pv->pv_va);
3002			}
3003		}
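		/*
		 * If every entry in this chunk is now free, unmap the
		 * chunk page, release it, and return the chunk's kernel
		 * VA to the pv chunk address pool.
		 */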
3004		if (allfree) {
3005			PV_STAT(pv_entry_spare -= _NPCPV);
3006			PV_STAT(pc_chunk_count--);
3007			PV_STAT(pc_chunk_frees++);
3008			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3009			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
3010			pmap_qremove((vm_offset_t)pc, 1);
3011			vm_page_unwire(m, 0);
3012			vm_page_free(m);
3013			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
3014		}
3015	}
3016	sched_unpin();
3017	vm_page_unlock_queues();
3018	pmap_invalidate_all(pmap);
3019	PMAP_UNLOCK(pmap);
3020}
3021
3022/*
3023 *	pmap_is_modified:
3024 *
3025 *	Return whether or not the specified physical page was modified
3026 *	in any physical maps.
3027 */
3028boolean_t
3029pmap_is_modified(vm_page_t m)
3030{
3031	pv_entry_t pv;
3032	pt_entry_t *pte;
3033	pmap_t pmap;
3034	boolean_t rv;
3035
3036	rv = FALSE;
3037	if (m->flags & PG_FICTITIOUS)
3038		return (rv);
3039
3040	sched_pin();
3041	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3042	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3043		pmap = PV_PMAP(pv);
3044		PMAP_LOCK(pmap);
3045		pte = pmap_pte_quick(pmap, pv->pv_va);
3046		rv = (*pte & PG_M) != 0;
3047		PMAP_UNLOCK(pmap);
3048		if (rv)
3049			break;
3050	}
3051	sched_unpin();
3052	return (rv);
3053}
3054
3055/*
3056 *	pmap_is_prefaultable:
3057 *
3058 *	Return whether or not the specified virtual address is eligible
3059 *	for prefault.
3060 */
3061boolean_t
3062pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3063{
3064	pt_entry_t *pte;
3065	boolean_t rv;
3066
3067	rv = FALSE;
3068	PMAP_LOCK(pmap);
3069	if (*pmap_pde(pmap, addr)) {
3070		pte = vtopte(addr);
3071		rv = *pte == 0;
3072	}
3073	PMAP_UNLOCK(pmap);
3074	return (rv);
3075}
3076
3077/*
3078 *	Clear the given bit in each of the given page's ptes.  The bit is
3079 *	expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
3080 *	size, only a bit within the least significant 32 can be cleared.
3081 */
3082static __inline void
3083pmap_clear_ptes(vm_page_t m, int bit)
3084{
3085	register pv_entry_t pv;
3086	pmap_t pmap;
3087	pt_entry_t pbits, *pte;
3088
3089	if ((m->flags & PG_FICTITIOUS) ||
3090	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
3091		return;
3092
3093	sched_pin();
3094	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3095	/*
3096	 * Loop over all current mappings, setting/clearing as appropriate.
3097	 * If setting RO, do we need to clear the VAC?
3098	 */
3099	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3100		pmap = PV_PMAP(pv);
3101		PMAP_LOCK(pmap);
3102		pte = pmap_pte_quick(pmap, pv->pv_va);
3103retry:
3104		pbits = *pte;
3105		if (pbits & bit) {
3106			if (bit == PG_RW) {
3107				/*
3108				 * Regardless of whether a pte is 32 or 64 bits
3109				 * in size, PG_RW and PG_M are among the least
3110				 * significant 32 bits.
3111				 */
3112				if (!atomic_cmpset_int((u_int *)pte, pbits,
3113				    pbits & ~(PG_RW | PG_M)))
3114					goto retry;
3115				if (pbits & PG_M) {
3116					vm_page_dirty(m);
3117				}
3118			} else {
3119				atomic_clear_int((u_int *)pte, bit);
3120			}
3121			pmap_invalidate_page(pmap, pv->pv_va);
3122		}
3123		PMAP_UNLOCK(pmap);
3124	}
3125	if (bit == PG_RW)
3126		vm_page_flag_clear(m, PG_WRITEABLE);
3127	sched_unpin();
3128}
3129
3130/*
3131 *      pmap_page_protect:
3132 *
3133 *      Lower the permission for all mappings to a given page.
3134 */
3135void
3136pmap_page_protect(vm_page_t m, vm_prot_t prot)
3137{
3138	if ((prot & VM_PROT_WRITE) == 0) {
3139		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3140			pmap_clear_ptes(m, PG_RW);
3141		} else {
3142			pmap_remove_all(m);
3143		}
3144	}
3145}
3146
3147/*
3148 *	pmap_ts_referenced:
3149 *
3150 *	Return a count of reference bits for a page, clearing those bits.
3151 *	It is not necessary for every reference bit to be cleared, but it
3152 *	is necessary that 0 only be returned when there are truly no
3153 *	reference bits set.
3154 *
3155 *	XXX: The exact number of bits to check and clear is a matter that
3156 *	should be tested and standardized at some point in the future for
3157 *	optimal aging of shared pages.
3158 */
3159int
3160pmap_ts_referenced(vm_page_t m)
3161{
3162	register pv_entry_t pv, pvf, pvn;
3163	pmap_t pmap;
3164	pt_entry_t *pte;
3165	pt_entry_t v;
3166	int rtval = 0;
3167
3168	if (m->flags & PG_FICTITIOUS)
3169		return (rtval);
3170
3171	sched_pin();
3172	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3173	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3174
3175		pvf = pv;
3176
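		/*
		 * Each pv examined is moved to the tail of the list so
		 * that repeated calls sample different mappings; the scan
		 * stops early once more than four referenced mappings
		 * have been found.
		 */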
3177		do {
3178			pvn = TAILQ_NEXT(pv, pv_list);
3179
3180			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3181
3182			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3183
3184			pmap = PV_PMAP(pv);
3185			PMAP_LOCK(pmap);
3186			pte = pmap_pte_quick(pmap, pv->pv_va);
3187
3188			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
3189				atomic_clear_int((u_int *)pte, PG_A);
3190				pmap_invalidate_page(pmap, pv->pv_va);
3191
3192				rtval++;
3193				if (rtval > 4) {
3194					PMAP_UNLOCK(pmap);
3195					break;
3196				}
3197			}
3198			PMAP_UNLOCK(pmap);
3199		} while ((pv = pvn) != NULL && pv != pvf);
3200	}
3201	sched_unpin();
3202
3203	return (rtval);
3204}
3205
3206/*
3207 *	Clear the modify bits on the specified physical page.
3208 */
3209void
3210pmap_clear_modify(vm_page_t m)
3211{
3212	pmap_clear_ptes(m, PG_M);
3213}
3214
3215/*
3216 *	pmap_clear_reference:
3217 *
3218 *	Clear the reference bit on the specified physical page.
3219 */
3220void
3221pmap_clear_reference(vm_page_t m)
3222{
3223	pmap_clear_ptes(m, PG_A);
3224}
3225
3226/*
3227 * Miscellaneous support routines follow
3228 */
3229
3230/*
3231 * Map a set of physical memory pages into the kernel virtual
3232 * address space. Return a pointer to where it is mapped. This
3233 * routine is intended to be used for mapping device memory,
3234 * NOT real memory.
3235 */
3236void *
3237pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3240{
3241	vm_offset_t va, tmpva, offset;
3242
3243	offset = pa & PAGE_MASK;
3244	size = roundup(offset + size, PAGE_SIZE);
3245	pa = pa & PG_FRAME;
3246
3247	if (pa < KERNLOAD && pa + size <= KERNLOAD)
3248		va = KERNBASE + pa;
3249	else
3250		va = kmem_alloc_nofault(kernel_map, size);
3251	if (!va)
3252		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3253
3254	for (tmpva = va; size > 0; ) {
3255		pmap_kenter(tmpva, pa);
3256		size -= PAGE_SIZE;
3257		tmpva += PAGE_SIZE;
3258		pa += PAGE_SIZE;
3259	}
3260	pmap_invalidate_range(kernel_pmap, va, tmpva);
3261	return ((void *)(va + offset));
3262}
3263
3264void
3265pmap_unmapdev(vm_offset_t va, vm_size_t size)
3268{
3269	vm_offset_t base, offset, tmpva;
3270
3271	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
3272		return;
3273	base = va & PG_FRAME;
3274	offset = va & PAGE_MASK;
3275	size = roundup(offset + size, PAGE_SIZE);
3276	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
3277		pmap_kremove(tmpva);
3278	pmap_invalidate_range(kernel_pmap, va, tmpva);
3279	kmem_free(kernel_map, base, size);
3280}
3281
3282/*
3283 * perform the pmap work for mincore
3284 */
3285int
3286pmap_mincore(pmap_t pmap, vm_offset_t addr)
3289{
3290	pt_entry_t *ptep, pte;
3291	vm_page_t m;
3292	int val = 0;
3293
3294	PMAP_LOCK(pmap);
3295	ptep = pmap_pte(pmap, addr);
3296	pte = (ptep != NULL) ? *ptep : 0;
3297	pmap_pte_release(ptep);
3298	PMAP_UNLOCK(pmap);
3299
3300	if (pte != 0) {
3301		vm_paddr_t pa;
3302
3303		val = MINCORE_INCORE;
3304		if ((pte & PG_MANAGED) == 0)
3305			return (val);
3306
3307		pa = pte & PG_FRAME;
3308
3309		m = PHYS_TO_VM_PAGE(pa);
3310
3311		/*
3312		 * Modified by us
3313		 */
3314		if (pte & PG_M)
3315			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3316		else {
3317			/*
3318			 * Modified by someone else
3319			 */
3320			vm_page_lock_queues();
3321			if (m->dirty || pmap_is_modified(m))
3322				val |= MINCORE_MODIFIED_OTHER;
3323			vm_page_unlock_queues();
3324		}
3325		/*
3326		 * Referenced by us
3327		 */
3328		if (pte & PG_A)
3329			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3330		else {
3331			/*
3332			 * Referenced by someone else
3333			 */
3334			vm_page_lock_queues();
3335			if ((m->flags & PG_REFERENCED) ||
3336			    pmap_ts_referenced(m)) {
3337				val |= MINCORE_REFERENCED_OTHER;
3338				vm_page_flag_set(m, PG_REFERENCED);
3339			}
3340			vm_page_unlock_queues();
3341		}
3342	}
3343	return (val);
3344}
3345
3346void
3347pmap_activate(struct thread *td)
3348{
3349	pmap_t	pmap, oldpmap;
3350	u_int32_t  cr3;
3351
3352	critical_enter();
3353	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3354	oldpmap = PCPU_GET(curpmap);
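	/*
	 * Move this CPU from the old pmap's active set to the new pmap's,
	 * then switch address spaces by loading %cr3 with the new page
	 * directory (the PDPT when PAE is enabled).
	 */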
3355#if defined(SMP)
3356	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3357	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3358#else
3359	oldpmap->pm_active &= ~1;
3360	pmap->pm_active |= 1;
3361#endif
3362#ifdef PAE
3363	cr3 = vtophys(pmap->pm_pdpt);
3364#else
3365	cr3 = vtophys(pmap->pm_pdir);
3366#endif
3367	/*
3368	 * pmap_activate is for the current thread on the current cpu
3369	 */
3370	td->td_pcb->pcb_cr3 = cr3;
3371	load_cr3(cr3);
3372	PCPU_SET(curpmap, pmap);
3373	critical_exit();
3374}
3375
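/*
 * Suggest a placement for a mapping of the given device object: for
 * objects at least NBPDR in size the hint is rounded up to a large-page
 * boundary, presumably so that pmap_object_init_pt() can later map the
 * region with PG_PS page directory entries.
 */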
3376vm_offset_t
3377pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3378{
3379
3380	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3381		return (addr);
3382	}
3383
3384	addr = (addr + PDRMASK) & ~PDRMASK;
3385	return (addr);
3386}
3387
3388
3389#if defined(PMAP_DEBUG)
3390int pmap_pid_dump(int pid)
3391{
3392	pmap_t pmap;
3393	struct proc *p;
3394	int npte = 0;
3395	int index;
3396
3397	sx_slock(&allproc_lock);
3398	LIST_FOREACH(p, &allproc, p_list) {
3399		if (p->p_pid != pid)
3400			continue;
3401
3402		if (p->p_vmspace) {
3403			int i,j;
3404			index = 0;
3405			pmap = vmspace_pmap(p->p_vmspace);
3406			for (i = 0; i < NPDEPTD; i++) {
3407				pd_entry_t *pde;
3408				pt_entry_t *pte;
3409				vm_offset_t base = i << PDRSHIFT;
3410
3411				pde = &pmap->pm_pdir[i];
3412				if (pde && pmap_pde_v(pde)) {
3413					for (j = 0; j < NPTEPG; j++) {
3414						vm_offset_t va = base + (j << PAGE_SHIFT);
3415						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3416							if (index) {
3417								index = 0;
3418								printf("\n");
3419							}
3420							sx_sunlock(&allproc_lock);
3421							return npte;
3422						}
3423						pte = pmap_pte(pmap, va);
3424						if (pte && pmap_pte_v(pte)) {
3425							pt_entry_t pa;
3426							vm_page_t m;
3427							pa = *pte;
3428							m = PHYS_TO_VM_PAGE(pa);
3429							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3430								va, pa, m->hold_count, m->wire_count, m->flags);
3431							npte++;
3432							index++;
3433							if (index >= 2) {
3434								index = 0;
3435								printf("\n");
3436							} else {
3437								printf(" ");
3438							}
3439						}
3440					}
3441				}
3442			}
3443		}
3444	}
3445	sx_sunlock(&allproc_lock);
3446	return npte;
3447}
3448#endif
3449
3450#if defined(DEBUG)
3451
3452static void	pads(pmap_t pm);
3453void		pmap_pvdump(vm_offset_t pa);
3454
3455/* print address space of pmap */
3456static void
3457pads(pmap_t pm)
3459{
3460	int i, j;
3461	vm_paddr_t va;
3462	pt_entry_t *ptep;
3463
3464	if (pm == kernel_pmap)
3465		return;
3466	for (i = 0; i < NPDEPTD; i++)
3467		if (pm->pm_pdir[i])
3468			for (j = 0; j < NPTEPG; j++) {
3469				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3470				if (pm == kernel_pmap && va < KERNBASE)
3471					continue;
3472				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3473					continue;
3474				ptep = pmap_pte(pm, va);
3475				if (pmap_pte_v(ptep))
3476					printf("%x:%x ", va, *ptep);
3477			}
3478
3479}
3480
3481void
3482pmap_pvdump(vm_paddr_t pa)
3484{
3485	pv_entry_t pv;
3486	pmap_t pmap;
3487	vm_page_t m;
3488
3489	printf("pa %x", pa);
3490	m = PHYS_TO_VM_PAGE(pa);
3491	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3492		pmap = PV_PMAP(pv);
3493		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
3494		pads(pmap);
3495	}
3496	printf(" ");
3497}
3498#endif
3499