1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 168930 2007-04-21 14:17:30Z ups $");
81
82/*
83 *	Manages physical address maps.
84 *
85 *	In addition to hardware address maps, this
86 *	module is called upon to provide software-use-only
87 *	maps which may or may not be stored in the same
88 *	form as hardware maps.  These pseudo-maps are
89 *	used to store intermediate results from copy
90 *	operations to and from address spaces.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or protection-reduction
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include "opt_msgbuf.h"
109#include "opt_pmap.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/kernel.h>
114#include <sys/lock.h>
115#include <sys/malloc.h>
116#include <sys/mman.h>
117#include <sys/msgbuf.h>
118#include <sys/mutex.h>
119#include <sys/proc.h>
120#include <sys/sx.h>
121#include <sys/vmmeter.h>
122#include <sys/sched.h>
123#include <sys/sysctl.h>
124#ifdef SMP
125#include <sys/smp.h>
126#endif
127
128#include <vm/vm.h>
129#include <vm/vm_param.h>
130#include <vm/vm_kern.h>
131#include <vm/vm_page.h>
132#include <vm/vm_map.h>
133#include <vm/vm_object.h>
134#include <vm/vm_extern.h>
135#include <vm/vm_pageout.h>
136#include <vm/vm_pager.h>
137#include <vm/uma.h>
138
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148#ifndef PMAP_SHPGPERPROC
149#define PMAP_SHPGPERPROC 200
150#endif
151
152#if defined(DIAGNOSTIC)
153#define PMAP_DIAGNOSTIC
154#endif
155
156#if !defined(PMAP_DIAGNOSTIC)
157#define PMAP_INLINE __inline
158#else
159#define PMAP_INLINE
160#endif
161
162#define PV_STATS
163#ifdef PV_STATS
164#define PV_STAT(x)	do { x ; } while (0)
165#else
166#define PV_STAT(x)	do { } while (0)
167#endif
168
169struct pmap kernel_pmap_store;
170
171vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
172vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
173
174static int nkpt;
175static int ndmpdp;
176static vm_paddr_t dmaplimit;
177vm_offset_t kernel_vm_end;
178pt_entry_t pg_nx;
179
180static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
181static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
182u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
183u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
184
185static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
186static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
187
188/*
189 * Data for the pv entry allocation mechanism
190 */
191static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
192static int shpgperproc = PMAP_SHPGPERPROC;
193
194/*
195 * All those kernel PT submaps that BSD is so fond of
196 */
197pt_entry_t *CMAP1 = 0;
198caddr_t CADDR1 = 0;
199struct msgbuf *msgbufp = 0;
200
201/*
202 * Crashdump maps.
203 */
204static caddr_t crashdumpmap;
205
206static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
207static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
208
209static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
210    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
211static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
212		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
213static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde);
214static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
215		vm_offset_t va);
216static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
217static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
218    vm_page_t m);
219
220static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
221static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
222
223static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
224static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
225                vm_page_t* free);
226static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
227static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
228
229CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
230CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
231
232/*
233 * Move the kernel virtual free pointer to the next
234 * 2MB.  This is used to help improve performance
235 * by using a large (2MB) page for much of the kernel
236 * (.text, .data, .bss)
237 */
238static vm_offset_t
239pmap_kmem_choose(vm_offset_t addr)
240{
241	vm_offset_t newaddr = addr;
242
243	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
244	return newaddr;
245}
246
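/*
 * For example, with NBPDR == 2MB (0x200000), an addr of
 * 0xffffffff80211000 is rounded up to 0xffffffff80400000 so that the
 * following kernel VA can be backed by 2MB pages.
 */
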
247/********************/
248/* Inline functions */
249/********************/
250
251/* Return a non-clipped PD index for a given VA */
252static __inline vm_pindex_t
253pmap_pde_pindex(vm_offset_t va)
254{
255	return va >> PDRSHIFT;
256}
257
258
259/* Return various clipped indexes for a given VA */
260static __inline vm_pindex_t
261pmap_pte_index(vm_offset_t va)
262{
263
264	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
265}
266
267static __inline vm_pindex_t
268pmap_pde_index(vm_offset_t va)
269{
270
271	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
272}
273
274static __inline vm_pindex_t
275pmap_pdpe_index(vm_offset_t va)
276{
277
278	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
279}
280
281static __inline vm_pindex_t
282pmap_pml4e_index(vm_offset_t va)
283{
284
285	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
286}
287
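/*
 * Taken together, these helpers decompose a canonical amd64 virtual
 * address into its four 9-bit paging-structure indexes:
 *
 *	bits 47-39	PML4 index	(pmap_pml4e_index)
 *	bits 38-30	PDP index	(pmap_pdpe_index)
 *	bits 29-21	PD index	(pmap_pde_index)
 *	bits 20-12	PT index	(pmap_pte_index)
 *	bits 11-0	byte offset within the 4KB page
 *
 * For example, va 0x00007fffffe00000 yields PML4 index 255, PDP index
 * 511, PD index 511 and PT index 0.
 */
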
288/* Return a pointer to the PML4 slot that corresponds to a VA */
289static __inline pml4_entry_t *
290pmap_pml4e(pmap_t pmap, vm_offset_t va)
291{
292
293	if (!pmap)
294		return NULL;
295	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
296}
297
298/* Return a pointer to the PDP slot that corresponds to a VA */
299static __inline pdp_entry_t *
300pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
301{
302	pdp_entry_t *pdpe;
303
304	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
305	return (&pdpe[pmap_pdpe_index(va)]);
306}
307
308/* Return a pointer to the PDP slot that corresponds to a VA */
309static __inline pdp_entry_t *
310pmap_pdpe(pmap_t pmap, vm_offset_t va)
311{
312	pml4_entry_t *pml4e;
313
314	pml4e = pmap_pml4e(pmap, va);
315	if (pml4e == NULL || (*pml4e & PG_V) == 0)
316		return NULL;
317	return (pmap_pml4e_to_pdpe(pml4e, va));
318}
319
320/* Return a pointer to the PD slot that corresponds to a VA */
321static __inline pd_entry_t *
322pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
323{
324	pd_entry_t *pde;
325
326	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
327	return (&pde[pmap_pde_index(va)]);
328}
329
330/* Return a pointer to the PD slot that corresponds to a VA */
331static __inline pd_entry_t *
332pmap_pde(pmap_t pmap, vm_offset_t va)
333{
334	pdp_entry_t *pdpe;
335
336	pdpe = pmap_pdpe(pmap, va);
337	if (pdpe == NULL || (*pdpe & PG_V) == 0)
338		 return NULL;
339	return (pmap_pdpe_to_pde(pdpe, va));
340}
341
342/* Return a pointer to the PT slot that corresponds to a VA */
343static __inline pt_entry_t *
344pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
345{
346	pt_entry_t *pte;
347
348	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
349	return (&pte[pmap_pte_index(va)]);
350}
351
352/* Return a pointer to the PT slot that corresponds to a VA */
353static __inline pt_entry_t *
354pmap_pte(pmap_t pmap, vm_offset_t va)
355{
356	pd_entry_t *pde;
357
358	pde = pmap_pde(pmap, va);
359	if (pde == NULL || (*pde & PG_V) == 0)
360		return NULL;
361	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
362		return ((pt_entry_t *)pde);
363	return (pmap_pde_to_pte(pde, va));
364}
365
366
367static __inline pt_entry_t *
368pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
369{
370	pd_entry_t *pde;
371
372	pde = pmap_pde(pmap, va);
373	if (pde == NULL || (*pde & PG_V) == 0)
374		return NULL;
375	*ptepde = *pde;
376	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
377		return ((pt_entry_t *)pde);
378	return (pmap_pde_to_pte(pde, va));
379}
380
381
382PMAP_INLINE pt_entry_t *
383vtopte(vm_offset_t va)
384{
385	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
386
387	return (PTmap + ((va >> PAGE_SHIFT) & mask));
388}
389
390static __inline pd_entry_t *
391vtopde(vm_offset_t va)
392{
393	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
394
395	return (PDmap + ((va >> PDRSHIFT) & mask));
396}
397
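/*
 * vtopte() and vtopde() exploit the recursive mapping installed at PML4
 * slot PML4PML4I: because that slot points back at the PML4 page itself,
 * every page table page shows up inside a linear window of PTEs based at
 * PTmap (and of PDEs based at PDmap).  The masks simply clip the shifted
 * VA to the 4 * 9 = 36 (respectively 3 * 9 = 27) index bits that select
 * an entry within that window.
 */
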
398static u_int64_t
399allocpages(vm_paddr_t *firstaddr, int n)
400{
401	u_int64_t ret;
402
403	ret = *firstaddr;
404	bzero((void *)ret, n * PAGE_SIZE);
405	*firstaddr += n * PAGE_SIZE;
406	return (ret);
407}
408
409static void
410create_pagetables(vm_paddr_t *firstaddr)
411{
412	int i;
413
414	/* Allocate pages */
415	KPTphys = allocpages(firstaddr, NKPT);
416	KPML4phys = allocpages(firstaddr, 1);
417	KPDPphys = allocpages(firstaddr, NKPML4E);
418	KPDphys = allocpages(firstaddr, NKPDPE);
419
420	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
421	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
422		ndmpdp = 4;
423	DMPDPphys = allocpages(firstaddr, NDMPML4E);
424	DMPDphys = allocpages(firstaddr, ndmpdp);
425	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
426
427	/* Fill in the underlying page table pages */
428	/* Map from zero to physfree with 4KB pages */
429	/* XXX not fully used, underneath 2M pages */
430	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
431		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
432		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
433	}
434
435	/* Now map the page tables at their location within PTmap */
436	for (i = 0; i < NKPT; i++) {
437		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
438		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
439	}
440
441	/* Map from zero to end of allocations under 2M pages */
442	/* This replaces some of the KPTphys entries above */
443	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
444		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
445		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
446	}
447
448	/* And connect up the PD to the PDP */
449	for (i = 0; i < NKPDPE; i++) {
450		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
451		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
452	}
453
454
455	/* Now set up the direct map space using 2MB pages */
456	for (i = 0; i < NPDEPG * ndmpdp; i++) {
457		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
458		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
459	}
460
461	/* And the direct map space's PDP */
462	for (i = 0; i < ndmpdp; i++) {
463		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
464		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
465	}
466
467	/* And recursively map PML4 to itself in order to get PTmap */
468	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
469	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
470
471	/* Connect the Direct Map slot up to the PML4 */
472	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
473	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
474
475	/* Connect the KVA slot up to the PML4 */
476	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
477	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
478}
479
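/*
 * The bootstrap tables built above give the kernel three views of memory:
 * the kernel itself mapped with 2MB pages through the KPML4I slot,
 * physical memory (a minimum of 4GB) mapped with 2MB pages through the
 * direct map slot DMPML4I, and the recursive mapping at PML4PML4I that
 * exposes the page table pages themselves via PTmap/PDmap.
 */
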
480/*
481 *	Bootstrap the system enough to run with virtual memory.
482 *
483 *	On amd64 this is called after mapping has already been enabled
484 *	and just syncs the pmap module with what has already been done.
485 *	[We can't call it easily with mapping off since the kernel is not
486 *	mapped with PA == VA, hence we would have to relocate every address
487 *	from the linked base (virtual) address "KERNBASE" to the actual
488 *	(physical) address starting relative to 0]
489 */
490void
491pmap_bootstrap(vm_paddr_t *firstaddr)
492{
493	vm_offset_t va;
494	pt_entry_t *pte, *unused;
495
496	/*
497	 * Create an initial set of page tables to run the kernel in.
498	 */
499	create_pagetables(firstaddr);
500
501	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
502	virtual_avail = pmap_kmem_choose(virtual_avail);
503
504	virtual_end = VM_MAX_KERNEL_ADDRESS;
505
506
507	/* XXX do %cr0 as well */
508	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
509	load_cr3(KPML4phys);
510
511	/*
512	 * Initialize the kernel pmap (which is statically allocated).
513	 */
514	PMAP_LOCK_INIT(kernel_pmap);
515	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
516	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
517	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
518	nkpt = NKPT;
519
520	/*
521	 * Reserve some special page table entries/VA space for temporary
522	 * mapping of pages.
523	 */
524#define	SYSMAP(c, p, v, n)	\
525	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
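	/*
	 * SYSMAP(c, p, v, n) carves n pages of KVA out of 'va': it returns
	 * the chosen address in 'v' (cast to type 'c') and a pointer to the
	 * first of its PTEs in 'p', then advances both 'va' and 'pte'.
	 */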
526
527	va = virtual_avail;
528	pte = vtopte(va);
529
530	/*
531	 * CMAP1 is only used for the memory test.
532	 */
533	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
534
535	/*
536	 * Crashdump maps.
537	 */
538	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
539
540	/*
541	 * msgbufp is used to map the system message buffer.
542	 */
543	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
544
545	virtual_avail = va;
546
547	*CMAP1 = 0;
548
549	invltlb();
550
551	/* Initialize the PAT MSR. */
552	pmap_init_pat();
553}
554
555/*
556 * Setup the PAT MSR.
557 */
558void
559pmap_init_pat(void)
560{
561	uint64_t pat_msr;
562
563	/* Bail if this CPU doesn't implement PAT. */
564	if (!(cpu_feature & CPUID_PAT))
565		panic("no PAT??");
566
567#ifdef PAT_WORKS
568	/*
569	 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
570	 * Program 4 and 5 as WP and WC.
571	 * Leave 6 and 7 as UC and UC-.
572	 */
573	pat_msr = rdmsr(MSR_PAT);
574	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
575	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
576	    PAT_VALUE(5, PAT_WRITE_COMBINING);
577#else
578	/*
579	 * Due to some Intel errata, we can only safely use the lower 4
580	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
581	 * of UC-.
582	 *
583	 *   Intel Pentium III Processor Specification Update
584	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
585	 * or Mode C Paging)
586	 *
587	 *   Intel Pentium IV  Processor Specification Update
588	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
589	 */
590	pat_msr = rdmsr(MSR_PAT);
591	pat_msr &= ~PAT_MASK(2);
592	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
593#endif
594	wrmsr(MSR_PAT, pat_msr);
595}
596
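/*
 * With the power-on PAT defaults (WB, WT, UC-, UC, repeated), the
 * non-PAT_WORKS path above leaves the four usable entries as:
 *
 *	PAT index 0	write-back (WB)
 *	PAT index 1	write-through (WT)
 *	PAT index 2	write-combining (WC, replacing UC-)
 *	PAT index 3	uncacheable (UC)
 *
 * pmap_cache_bits() below selects among these entries.
 */
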
597/*
598 *	Initialize a vm_page's machine-dependent fields.
599 */
600void
601pmap_page_init(vm_page_t m)
602{
603
604	TAILQ_INIT(&m->md.pv_list);
605	m->md.pv_list_count = 0;
606}
607
608/*
609 *	Initialize the pmap module.
610 *	Called by vm_init, to initialize any structures that the pmap
611 *	system needs to map virtual memory.
612 */
613void
614pmap_init(void)
615{
616
617	/*
618	 * Initialize the address space (zone) for the pv entries.  Set a
619	 * high water mark so that the system can recover from excessive
620	 * numbers of pv entries.
621	 */
622	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
623	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
624	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
625	pv_entry_high_water = 9 * (pv_entry_max / 10);
626}
627
628SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
629static int
630pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
631{
632	int error;
633
634	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
635	if (error == 0 && req->newptr) {
636		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
637		pv_entry_high_water = 9 * (pv_entry_max / 10);
638	}
639	return (error);
640}
641SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
642    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
643
644static int
645pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
646{
647	int error;
648
649	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
650	if (error == 0 && req->newptr) {
651		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
652		pv_entry_high_water = 9 * (pv_entry_max / 10);
653	}
654	return (error);
655}
656SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
657    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
658
659
660/***************************************************
661 * Low level helper routines.....
662 ***************************************************/
663
664/*
665 * Determine the appropriate bits to set in a PTE or PDE for a specified
666 * caching mode.
667 */
668static int
669pmap_cache_bits(int mode, boolean_t is_pde)
670{
671	int pat_flag, pat_index, cache_bits;
672
673	/* The PAT bit is different for PTE's and PDE's. */
674	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
675
676	/* If we don't support PAT, map extended modes to older ones. */
677	if (!(cpu_feature & CPUID_PAT)) {
678		switch (mode) {
679		case PAT_UNCACHEABLE:
680		case PAT_WRITE_THROUGH:
681		case PAT_WRITE_BACK:
682			break;
683		case PAT_UNCACHED:
684		case PAT_WRITE_COMBINING:
685		case PAT_WRITE_PROTECTED:
686			mode = PAT_UNCACHEABLE;
687			break;
688		}
689	}
690
691	/* Map the caching mode to a PAT index. */
692	switch (mode) {
693#ifdef PAT_WORKS
694	case PAT_UNCACHEABLE:
695		pat_index = 3;
696		break;
697	case PAT_WRITE_THROUGH:
698		pat_index = 1;
699		break;
700	case PAT_WRITE_BACK:
701		pat_index = 0;
702		break;
703	case PAT_UNCACHED:
704		pat_index = 2;
705		break;
706	case PAT_WRITE_COMBINING:
707		pat_index = 5;
708		break;
709	case PAT_WRITE_PROTECTED:
710		pat_index = 4;
711		break;
712#else
713	case PAT_UNCACHED:
714	case PAT_UNCACHEABLE:
715	case PAT_WRITE_PROTECTED:
716		pat_index = 3;
717		break;
718	case PAT_WRITE_THROUGH:
719		pat_index = 1;
720		break;
721	case PAT_WRITE_BACK:
722		pat_index = 0;
723		break;
724	case PAT_WRITE_COMBINING:
725		pat_index = 2;
726		break;
727#endif
728	default:
729		panic("Unknown caching mode %d\n", mode);
730	}
731
732	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
733	cache_bits = 0;
734	if (pat_index & 0x4)
735		cache_bits |= pat_flag;
736	if (pat_index & 0x2)
737		cache_bits |= PG_NC_PCD;
738	if (pat_index & 0x1)
739		cache_bits |= PG_NC_PWT;
740	return (cache_bits);
741}
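
/*
 * For example, in the non-PAT_WORKS case PAT_WRITE_COMBINING maps to PAT
 * index 2, so pmap_cache_bits(PAT_WRITE_COMBINING, 0) returns PG_NC_PCD
 * (bit 1 of the index) with PG_NC_PWT and the PTE's PAT bit clear.
 */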
742#ifdef SMP
743/*
744 * For SMP, these functions have to use the IPI mechanism for coherence.
745 */
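/*
 * Each routine flushes the local TLB directly (invlpg/invltlb) when the
 * pmap is active on this CPU and notifies the other CPUs with a
 * shootdown IPI: a broadcast for the kernel pmap, or a masked IPI aimed
 * only at the CPUs that currently have the pmap active.
 */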
746void
747pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
748{
749	u_int cpumask;
750	u_int other_cpus;
751
752	sched_pin();
753	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
754		invlpg(va);
755		smp_invlpg(va);
756	} else {
757		cpumask = PCPU_GET(cpumask);
758		other_cpus = PCPU_GET(other_cpus);
759		if (pmap->pm_active & cpumask)
760			invlpg(va);
761		if (pmap->pm_active & other_cpus)
762			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
763	}
764	sched_unpin();
765}
766
767void
768pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
769{
770	u_int cpumask;
771	u_int other_cpus;
772	vm_offset_t addr;
773
774	sched_pin();
775	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
776		for (addr = sva; addr < eva; addr += PAGE_SIZE)
777			invlpg(addr);
778		smp_invlpg_range(sva, eva);
779	} else {
780		cpumask = PCPU_GET(cpumask);
781		other_cpus = PCPU_GET(other_cpus);
782		if (pmap->pm_active & cpumask)
783			for (addr = sva; addr < eva; addr += PAGE_SIZE)
784				invlpg(addr);
785		if (pmap->pm_active & other_cpus)
786			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
787			    sva, eva);
788	}
789	sched_unpin();
790}
791
792void
793pmap_invalidate_all(pmap_t pmap)
794{
795	u_int cpumask;
796	u_int other_cpus;
797
798	sched_pin();
799	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
800		invltlb();
801		smp_invltlb();
802	} else {
803		cpumask = PCPU_GET(cpumask);
804		other_cpus = PCPU_GET(other_cpus);
805		if (pmap->pm_active & cpumask)
806			invltlb();
807		if (pmap->pm_active & other_cpus)
808			smp_masked_invltlb(pmap->pm_active & other_cpus);
809	}
810	sched_unpin();
811}
812
813void
814pmap_invalidate_cache(void)
815{
816
817	sched_pin();
818	wbinvd();
819	smp_cache_flush();
820	sched_unpin();
821}
822#else /* !SMP */
823/*
824 * Normal, non-SMP, invalidation functions.
825 * We inline these within pmap.c for speed.
826 */
827PMAP_INLINE void
828pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
829{
830
831	if (pmap == kernel_pmap || pmap->pm_active)
832		invlpg(va);
833}
834
835PMAP_INLINE void
836pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
837{
838	vm_offset_t addr;
839
840	if (pmap == kernel_pmap || pmap->pm_active)
841		for (addr = sva; addr < eva; addr += PAGE_SIZE)
842			invlpg(addr);
843}
844
845PMAP_INLINE void
846pmap_invalidate_all(pmap_t pmap)
847{
848
849	if (pmap == kernel_pmap || pmap->pm_active)
850		invltlb();
851}
852
853PMAP_INLINE void
854pmap_invalidate_cache(void)
855{
856
857	wbinvd();
858}
859#endif /* !SMP */
860
861/*
862 * Is this pmap the current address space or the kernel pmap?
863 */
864static __inline int
865pmap_is_current(pmap_t pmap)
866{
867	return (pmap == kernel_pmap ||
868	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
869}
870
871/*
872 *	Routine:	pmap_extract
873 *	Function:
874 *		Extract the physical page address associated
875 *		with the given map/virtual_address pair.
876 */
877vm_paddr_t
878pmap_extract(pmap_t pmap, vm_offset_t va)
879{
880	vm_paddr_t rtval;
881	pt_entry_t *pte;
882	pd_entry_t pde, *pdep;
883
884	rtval = 0;
885	PMAP_LOCK(pmap);
886	pdep = pmap_pde(pmap, va);
887	if (pdep != NULL) {
888		pde = *pdep;
889		if (pde) {
890			if ((pde & PG_PS) != 0) {
891				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
892				PMAP_UNLOCK(pmap);
893				return rtval;
894			}
895			pte = pmap_pde_to_pte(pdep, va);
896			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
897		}
898	}
899	PMAP_UNLOCK(pmap);
900	return (rtval);
901}
902
903/*
904 *	Routine:	pmap_extract_and_hold
905 *	Function:
906 *		Atomically extract and hold the physical page
907 *		with the given pmap and virtual address pair
908 *		if that mapping permits the given protection.
909 */
910vm_page_t
911pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
912{
913	pd_entry_t pde, *pdep;
914	pt_entry_t pte;
915	vm_page_t m;
916
917	m = NULL;
918	vm_page_lock_queues();
919	PMAP_LOCK(pmap);
920	pdep = pmap_pde(pmap, va);
921	if (pdep != NULL && (pde = *pdep)) {
922		if (pde & PG_PS) {
923			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
924				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
925				    (va & PDRMASK));
926				vm_page_hold(m);
927			}
928		} else {
929			pte = *pmap_pde_to_pte(pdep, va);
930			if ((pte & PG_V) &&
931			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
932				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
933				vm_page_hold(m);
934			}
935		}
936	}
937	vm_page_unlock_queues();
938	PMAP_UNLOCK(pmap);
939	return (m);
940}
941
942vm_paddr_t
943pmap_kextract(vm_offset_t va)
944{
945	pd_entry_t *pde;
946	vm_paddr_t pa;
947
948	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
949		pa = DMAP_TO_PHYS(va);
950	} else {
951		pde = vtopde(va);
952		if (*pde & PG_PS) {
953			pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
954		} else {
955			pa = *vtopte(va);
956			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
957		}
958	}
959	return pa;
960}
961
962/***************************************************
963 * Low level mapping routines.....
964 ***************************************************/
965
966/*
967 * Add a wired page to the kva.
968 * Note: not SMP coherent.
969 */
970PMAP_INLINE void
971pmap_kenter(vm_offset_t va, vm_paddr_t pa)
972{
973	pt_entry_t *pte;
974
975	pte = vtopte(va);
976	pte_store(pte, pa | PG_RW | PG_V | PG_G);
977}
978
979PMAP_INLINE void
980pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
981{
982	pt_entry_t *pte;
983
984	pte = vtopte(va);
985	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
986}
987
988/*
989 * Remove a page from the kernel pagetables.
990 * Note: not SMP coherent.
991 */
992PMAP_INLINE void
993pmap_kremove(vm_offset_t va)
994{
995	pt_entry_t *pte;
996
997	pte = vtopte(va);
998	pte_clear(pte);
999}
1000
1001/*
1002 *	Used to map a range of physical addresses into kernel
1003 *	virtual address space.
1004 *
1005 *	The value passed in '*virt' is a suggested virtual address for
1006 *	the mapping. Architectures which can support a direct-mapped
1007 *	physical to virtual region can return the appropriate address
1008 *	within that region, leaving '*virt' unchanged. Other
1009 *	architectures should map the pages starting at '*virt' and
1010 *	update '*virt' with the first usable address after the mapped
1011 *	region.
1012 */
1013vm_offset_t
1014pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1015{
1016	return PHYS_TO_DMAP(start);
1017}
1018
1019
1020/*
1021 * Add a list of wired pages to the kva.
1022 * This routine is only used for temporary
1023 * kernel mappings that do not need to have
1024 * page modification or references recorded.
1025 * Note that old mappings are simply written
1026 * over.  The page *must* be wired.
1027 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1028 */
1029void
1030pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1031{
1032	pt_entry_t *endpte, oldpte, *pte;
1033
1034	oldpte = 0;
1035	pte = vtopte(sva);
1036	endpte = pte + count;
1037	while (pte < endpte) {
1038		oldpte |= *pte;
1039		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V);
1040		pte++;
1041		ma++;
1042	}
1043	if ((oldpte & PG_V) != 0)
1044		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1045		    PAGE_SIZE);
1046}
1047
1048/*
1049 * This routine tears out page mappings from the
1050 * kernel -- it is meant only for temporary mappings.
1051 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1052 */
1053void
1054pmap_qremove(vm_offset_t sva, int count)
1055{
1056	vm_offset_t va;
1057
1058	va = sva;
1059	while (count-- > 0) {
1060		pmap_kremove(va);
1061		va += PAGE_SIZE;
1062	}
1063	pmap_invalidate_range(kernel_pmap, sva, va);
1064}
1065
1066/***************************************************
1067 * Page table page management routines.....
1068 ***************************************************/
1069static PMAP_INLINE void
1070pmap_free_zero_pages(vm_page_t free)
1071{
1072	vm_page_t m;
1073
1074	while (free != NULL) {
1075		m = free;
1076		free = m->right;
1077		vm_page_free_zero(m);
1078	}
1079}
1080
1081/*
1082 * This routine decrements a page table page's wire count; when the
1083 * count drops to zero, the page is unmapped and queued to be freed.
1084 */
1085static PMAP_INLINE int
1086pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1087{
1088
1089	--m->wire_count;
1090	if (m->wire_count == 0)
1091		return _pmap_unwire_pte_hold(pmap, va, m, free);
1092	else
1093		return 0;
1094}
1095
1096static int
1097_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
1098    vm_page_t *free)
1099{
1100	vm_offset_t pteva;
1101
1102	/*
1103	 * unmap the page table page
1104	 */
1105	if (m->pindex >= (NUPDE + NUPDPE)) {
1106		/* PDP page */
1107		pml4_entry_t *pml4;
1108		pml4 = pmap_pml4e(pmap, va);
1109		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
1110		*pml4 = 0;
1111	} else if (m->pindex >= NUPDE) {
1112		/* PD page */
1113		pdp_entry_t *pdp;
1114		pdp = pmap_pdpe(pmap, va);
1115		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
1116		*pdp = 0;
1117	} else {
1118		/* PTE page */
1119		pd_entry_t *pd;
1120		pd = pmap_pde(pmap, va);
1121		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
1122		*pd = 0;
1123	}
1124	--pmap->pm_stats.resident_count;
1125	if (m->pindex < NUPDE) {
1126		/* We just released a PT, unhold the matching PD */
1127		vm_page_t pdpg;
1128
1129		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1130		pmap_unwire_pte_hold(pmap, va, pdpg, free);
1131	}
1132	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1133		/* We just released a PD, unhold the matching PDP */
1134		vm_page_t pdppg;
1135
1136		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1137		pmap_unwire_pte_hold(pmap, va, pdppg, free);
1138	}
1139
1140	/*
1141	 * Do an invltlb to make the invalidated mapping
1142	 * take effect immediately.
1143	 */
1144	pmap_invalidate_page(pmap, pteva);
1145
1146	/*
1147	 * Put page on a list so that it is released after
1148	 * *ALL* TLB shootdown is done
1149	 */
1150	m->right = *free;
1151	*free = m;
1152
1153	atomic_subtract_int(&cnt.v_wire_count, 1);
1154	return 1;
1155}
1156
1157/*
1158 * After removing a page table entry, this routine is used to
1159 * conditionally free the page, and manage the hold/wire counts.
1160 */
1161static int
1162pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1163{
1164	vm_page_t mpte;
1165
1166	if (va >= VM_MAXUSER_ADDRESS)
1167		return 0;
1168	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1169	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1170	return pmap_unwire_pte_hold(pmap, va, mpte, free);
1171}
1172
1173void
1174pmap_pinit0(pmap_t pmap)
1175{
1176
1177	PMAP_LOCK_INIT(pmap);
1178	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1179	pmap->pm_active = 0;
1180	TAILQ_INIT(&pmap->pm_pvchunk);
1181	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1182}
1183
1184/*
1185 * Initialize a preallocated and zeroed pmap structure,
1186 * such as one in a vmspace structure.
1187 */
1188void
1189pmap_pinit(pmap_t pmap)
1190{
1191	vm_page_t pml4pg;
1192	static vm_pindex_t color;
1193
1194	PMAP_LOCK_INIT(pmap);
1195
1196	/*
1197	 * allocate the top-level (PML4) page table page
1198	 */
1199	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1200	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1201		VM_WAIT;
1202
1203	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1204
1205	if ((pml4pg->flags & PG_ZERO) == 0)
1206		pagezero(pmap->pm_pml4);
1207
1208	/* Wire in kernel global address entries. */
1209	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1210	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1211
1212	/* install self-referential address mapping entry(s) */
1213	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1214
1215	pmap->pm_active = 0;
1216	TAILQ_INIT(&pmap->pm_pvchunk);
1217	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1218}
1219
1220/*
1221 * this routine is called if the page table page is not
1222 * mapped correctly.
1223 *
1224 * Note: If a page allocation fails at page table level two or three,
1225 * one or two pages may be held during the wait, only to be released
1226 * afterwards.  This conservative approach is easily argued to avoid
1227 * race conditions.
1228 */
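/*
 * Page table pages live in a single pindex namespace: indexes
 * 0 .. NUPDE-1 name PT pages (one per 2MB of user VA), NUPDE ..
 * NUPDE+NUPDPE-1 name PD pages, and NUPDE+NUPDPE and above name PDP
 * pages.  Both _pmap_allocpte() and _pmap_unwire_pte_hold() branch on
 * these ranges.
 */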
1229static vm_page_t
1230_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1231{
1232	vm_page_t m, pdppg, pdpg;
1233
1234	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1235	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1236	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1237
1238	/*
1239	 * Allocate a page table page.
1240	 */
1241	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1242	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1243		if (flags & M_WAITOK) {
1244			PMAP_UNLOCK(pmap);
1245			vm_page_unlock_queues();
1246			VM_WAIT;
1247			vm_page_lock_queues();
1248			PMAP_LOCK(pmap);
1249		}
1250
1251		/*
1252		 * Indicate the need to retry.  While waiting, the page table
1253		 * page may have been allocated.
1254		 */
1255		return (NULL);
1256	}
1257	if ((m->flags & PG_ZERO) == 0)
1258		pmap_zero_page(m);
1259
1260	/*
1261	 * Map the pagetable page into the process address space, if
1262	 * it isn't already there.
1263	 */
1264
1265	pmap->pm_stats.resident_count++;
1266
1267	if (ptepindex >= (NUPDE + NUPDPE)) {
1268		pml4_entry_t *pml4;
1269		vm_pindex_t pml4index;
1270
1271		/* Wire up a new PDPE page */
1272		pml4index = ptepindex - (NUPDE + NUPDPE);
1273		pml4 = &pmap->pm_pml4[pml4index];
1274		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1275
1276	} else if (ptepindex >= NUPDE) {
1277		vm_pindex_t pml4index;
1278		vm_pindex_t pdpindex;
1279		pml4_entry_t *pml4;
1280		pdp_entry_t *pdp;
1281
1282		/* Wire up a new PDE page */
1283		pdpindex = ptepindex - NUPDE;
1284		pml4index = pdpindex >> NPML4EPGSHIFT;
1285
1286		pml4 = &pmap->pm_pml4[pml4index];
1287		if ((*pml4 & PG_V) == 0) {
1288			/* Have to allocate a new pdp, recurse */
1289			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1290			    flags) == NULL) {
1291				--m->wire_count;
1292				vm_page_free(m);
1293				return (NULL);
1294			}
1295		} else {
1296			/* Add reference to pdp page */
1297			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1298			pdppg->wire_count++;
1299		}
1300		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1301
1302		/* Now find the pdp page */
1303		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1304		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1305
1306	} else {
1307		vm_pindex_t pml4index;
1308		vm_pindex_t pdpindex;
1309		pml4_entry_t *pml4;
1310		pdp_entry_t *pdp;
1311		pd_entry_t *pd;
1312
1313		/* Wire up a new PTE page */
1314		pdpindex = ptepindex >> NPDPEPGSHIFT;
1315		pml4index = pdpindex >> NPML4EPGSHIFT;
1316
1317		/* First, find the pdp and check that it's valid. */
1318		pml4 = &pmap->pm_pml4[pml4index];
1319		if ((*pml4 & PG_V) == 0) {
1320			/* Have to allocate a new pd, recurse */
1321			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1322			    flags) == NULL) {
1323				--m->wire_count;
1324				vm_page_free(m);
1325				return (NULL);
1326			}
1327			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1328			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1329		} else {
1330			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1331			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1332			if ((*pdp & PG_V) == 0) {
1333				/* Have to allocate a new pd, recurse */
1334				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1335				    flags) == NULL) {
1336					--m->wire_count;
1337					vm_page_free(m);
1338					return (NULL);
1339				}
1340			} else {
1341				/* Add reference to the pd page */
1342				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1343				pdpg->wire_count++;
1344			}
1345		}
1346		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1347
1348		/* Now we know where the page directory page is */
1349		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1350		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1351	}
1352
1353	return m;
1354}
1355
1356static vm_page_t
1357pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1358{
1359	vm_pindex_t pdpindex, ptepindex;
1360	pdp_entry_t *pdpe;
1361	vm_page_t pdpg;
1362
1363	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1364	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1365	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1366retry:
1367	pdpe = pmap_pdpe(pmap, va);
1368	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1369		/* Add a reference to the pd page. */
1370		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1371		pdpg->wire_count++;
1372	} else {
1373		/* Allocate a pd page. */
1374		ptepindex = pmap_pde_pindex(va);
1375		pdpindex = ptepindex >> NPDPEPGSHIFT;
1376		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1377		if (pdpg == NULL && (flags & M_WAITOK))
1378			goto retry;
1379	}
1380	return (pdpg);
1381}
1382
1383static vm_page_t
1384pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1385{
1386	vm_pindex_t ptepindex;
1387	pd_entry_t *pd;
1388	vm_page_t m, free;
1389
1390	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1391	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1392	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1393
1394	/*
1395	 * Calculate pagetable page index
1396	 */
1397	ptepindex = pmap_pde_pindex(va);
1398retry:
1399	/*
1400	 * Get the page directory entry
1401	 */
1402	pd = pmap_pde(pmap, va);
1403
1404	/*
1405	 * This supports switching from a 2MB page to a
1406	 * normal 4K page.
1407	 */
1408	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1409		*pd = 0;
1410		pd = 0;
1411		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1412		free = NULL;
1413		pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va), &free);
1414		pmap_invalidate_all(kernel_pmap);
1415		pmap_free_zero_pages(free);
1416	}
1417
1418	/*
1419	 * If the page table page is mapped, we just increment the
1420	 * hold count, and activate it.
1421	 */
1422	if (pd != 0 && (*pd & PG_V) != 0) {
1423		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1424		m->wire_count++;
1425	} else {
1426		/*
1427		 * Here if the pte page isn't mapped, or if it has been
1428		 * deallocated.
1429		 */
1430		m = _pmap_allocpte(pmap, ptepindex, flags);
1431		if (m == NULL && (flags & M_WAITOK))
1432			goto retry;
1433	}
1434	return (m);
1435}
1436
1437
1438/***************************************************
1439 * Pmap allocation/deallocation routines.
1440 ***************************************************/
1441
1442/*
1443 * Release any resources held by the given physical map.
1444 * Called when a pmap initialized by pmap_pinit is being released.
1445 * Should only be called if the map contains no valid mappings.
1446 */
1447void
1448pmap_release(pmap_t pmap)
1449{
1450	vm_page_t m;
1451
1452	KASSERT(pmap->pm_stats.resident_count == 0,
1453	    ("pmap_release: pmap resident count %ld != 0",
1454	    pmap->pm_stats.resident_count));
1455
1456	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1457
1458	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
1459	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
1460	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
1461
1462	m->wire_count--;
1463	atomic_subtract_int(&cnt.v_wire_count, 1);
1464	vm_page_free_zero(m);
1465	PMAP_LOCK_DESTROY(pmap);
1466}
1467
1468static int
1469kvm_size(SYSCTL_HANDLER_ARGS)
1470{
1471	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1472
1473	return sysctl_handle_long(oidp, &ksize, 0, req);
1474}
1475SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1476    0, 0, kvm_size, "LU", "Size of KVM");
1477
1478static int
1479kvm_free(SYSCTL_HANDLER_ARGS)
1480{
1481	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1482
1483	return sysctl_handle_long(oidp, &kfree, 0, req);
1484}
1485SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1486    0, 0, kvm_free, "LU", "Amount of KVM free");
1487
1488/*
1489 * grow the number of kernel page table entries, if needed
1490 */
1491void
1492pmap_growkernel(vm_offset_t addr)
1493{
1494	vm_paddr_t paddr;
1495	vm_page_t nkpg;
1496	pd_entry_t *pde, newpdir;
1497	pdp_entry_t newpdp;
1498
1499	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1500	if (kernel_vm_end == 0) {
1501		kernel_vm_end = KERNBASE;
1502		nkpt = 0;
1503		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1504			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1505			nkpt++;
1506			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1507				kernel_vm_end = kernel_map->max_offset;
1508				break;
1509			}
1510		}
1511	}
1512	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1513	if (addr - 1 >= kernel_map->max_offset)
1514		addr = kernel_map->max_offset;
1515	while (kernel_vm_end < addr) {
1516		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1517		if (pde == NULL) {
1518			/* We need a new PDP entry */
1519			nkpg = vm_page_alloc(NULL, nkpt,
1520			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1521			if (!nkpg)
1522				panic("pmap_growkernel: no memory to grow kernel");
1523			pmap_zero_page(nkpg);
1524			paddr = VM_PAGE_TO_PHYS(nkpg);
1525			newpdp = (pdp_entry_t)
1526				(paddr | PG_V | PG_RW | PG_A | PG_M);
1527			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1528			continue; /* try again */
1529		}
1530		if ((*pde & PG_V) != 0) {
1531			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1532			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1533				kernel_vm_end = kernel_map->max_offset;
1534				break;
1535			}
1536			continue;
1537		}
1538
1539		/*
1540		 * This index is bogus, but out of the way
1541		 */
1542		nkpg = vm_page_alloc(NULL, nkpt,
1543		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1544		if (!nkpg)
1545			panic("pmap_growkernel: no memory to grow kernel");
1546
1547		nkpt++;
1548
1549		pmap_zero_page(nkpg);
1550		paddr = VM_PAGE_TO_PHYS(nkpg);
1551		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1552		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1553
1554		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1555		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1556			kernel_vm_end = kernel_map->max_offset;
1557			break;
1558		}
1559	}
1560}
1561
1562
1563/***************************************************
1564 * page management routines.
1565 ***************************************************/
1566
1567CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1568CTASSERT(_NPCM == 3);
1569CTASSERT(_NPCPV == 168);
1570
1571static __inline struct pv_chunk *
1572pv_to_chunk(pv_entry_t pv)
1573{
1574
1575	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1576}
1577
1578#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1579
1580#define	PC_FREE0	0xfffffffffffffffful
1581#define	PC_FREE1	0xfffffffffffffffful
1582#define	PC_FREE2	0x000000fffffffffful
1583
1584static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1585
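/*
 * A pv_chunk occupies exactly one page: the header (pmap pointer, list
 * linkage and the three-word pc_map bitmap) plus 168 pv entries (a VA
 * and TAILQ linkage, 24 bytes each) fit in 4KB, which is where
 * _NPCPV == 168 comes from.  The three 64-bit pc_map words provide 192
 * bits, so PC_FREE2 masks off the top 24 bits of the last word, leaving
 * exactly one free bit per pv entry.
 */
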
1586SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1587	"Current number of pv entries");
1588
1589#ifdef PV_STATS
1590static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1591
1592SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1593	"Current number of pv entry chunks");
1594SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1595	"Current number of pv entry chunks allocated");
1596SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1597	"Current number of pv entry chunk frees");
1598SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1599	"Number of times tried to get a chunk page but failed.");
1600
1601static long pv_entry_frees, pv_entry_allocs;
1602static int pv_entry_spare;
1603
1604SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1605	"Current number of pv entry frees");
1606SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1607	"Current number of pv entry allocs");
1608SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1609	"Current number of spare pv entries");
1610
1611static int pmap_collect_inactive, pmap_collect_active;
1612
1613SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1614	"Current number of times pmap_collect called on inactive queue");
1615SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1616	"Current number of times pmap_collect called on active queue");
1617#endif
1618
1619/*
1620 * We are in a serious low memory condition.  Resort to
1621 * drastic measures to free some pages so we can allocate
1622 * another pv entry chunk.  This is normally called to
1623 * unmap inactive pages, and if necessary, active pages.
1624 */
1625static void
1626pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1627{
1628	pd_entry_t ptepde;
1629	pmap_t pmap;
1630	pt_entry_t *pte, tpte;
1631	pv_entry_t next_pv, pv;
1632	vm_offset_t va;
1633	vm_page_t m, free;
1634
1635	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1636		if (m->hold_count || m->busy)
1637			continue;
1638		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1639			va = pv->pv_va;
1640			pmap = PV_PMAP(pv);
1641			/* Avoid deadlock and lock recursion. */
1642			if (pmap > locked_pmap)
1643				PMAP_LOCK(pmap);
1644			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1645				continue;
1646			pmap->pm_stats.resident_count--;
1647			pte = pmap_pte_pde(pmap, va, &ptepde);
1648			tpte = pte_load_clear(pte);
1649			KASSERT((tpte & PG_W) == 0,
1650			    ("pmap_collect: wired pte %#lx", tpte));
1651			if (tpte & PG_A)
1652				vm_page_flag_set(m, PG_REFERENCED);
1653			if (tpte & PG_M) {
1654				KASSERT((tpte & PG_RW),
1655	("pmap_collect: modified page not writable: va: %#lx, pte: %#lx",
1656				    va, tpte));
1657				vm_page_dirty(m);
1658			}
1659			free = NULL;
1660			pmap_unuse_pt(pmap, va, ptepde, &free);
1661			pmap_invalidate_page(pmap, va);
1662			pmap_free_zero_pages(free);
1663			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1664			if (TAILQ_EMPTY(&m->md.pv_list))
1665				vm_page_flag_clear(m, PG_WRITEABLE);
1666			m->md.pv_list_count--;
1667			free_pv_entry(pmap, pv);
1668			if (pmap != locked_pmap)
1669				PMAP_UNLOCK(pmap);
1670		}
1671	}
1672}
1673
1674
1675/*
1676 * free the pv_entry back to the free list
1677 */
1678static void
1679free_pv_entry(pmap_t pmap, pv_entry_t pv)
1680{
1681	vm_page_t m;
1682	struct pv_chunk *pc;
1683	int idx, field, bit;
1684
1685	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1686	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1687	PV_STAT(pv_entry_frees++);
1688	PV_STAT(pv_entry_spare++);
1689	pv_entry_count--;
1690	pc = pv_to_chunk(pv);
1691	idx = pv - &pc->pc_pventry[0];
1692	field = idx / 64;
1693	bit = idx % 64;
1694	pc->pc_map[field] |= 1ul << bit;
1695	/* move to head of list */
1696	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1697	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1698	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1699	    pc->pc_map[2] != PC_FREE2)
1700		return;
1701	PV_STAT(pv_entry_spare -= _NPCPV);
1702	PV_STAT(pc_chunk_count--);
1703	PV_STAT(pc_chunk_frees++);
1704	/* entire chunk is free, return it */
1705	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1706	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1707	dump_drop_page(m->phys_addr);
1708	vm_page_free(m);
1709}
1710
1711/*
1712 * get a new pv_entry, allocating a block from the system
1713 * when needed.
1714 */
1715static pv_entry_t
1716get_pv_entry(pmap_t pmap, int try)
1717{
1718	static const struct timeval printinterval = { 60, 0 };
1719	static struct timeval lastprint;
1720	static vm_pindex_t colour;
1721	int bit, field, page_req;
1722	pv_entry_t pv;
1723	struct pv_chunk *pc;
1724	vm_page_t m;
1725
1726	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1727	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1728	PV_STAT(pv_entry_allocs++);
1729	pv_entry_count++;
1730	if (pv_entry_count > pv_entry_high_water)
1731		pagedaemon_wakeup();
1732	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1733	if (pc != NULL) {
1734		for (field = 0; field < _NPCM; field++) {
1735			if (pc->pc_map[field]) {
1736				bit = bsfq(pc->pc_map[field]);
1737				break;
1738			}
1739		}
1740		if (field < _NPCM) {
1741			pv = &pc->pc_pventry[field * 64 + bit];
1742			pc->pc_map[field] &= ~(1ul << bit);
1743			/* If this was the last item, move it to tail */
1744			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1745			    pc->pc_map[2] == 0) {
1746				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1747				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1748			}
1749			PV_STAT(pv_entry_spare--);
1750			return (pv);
1751		}
1752	}
1753	/* No free items, allocate another chunk */
1754	page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM;
1755	m = vm_page_alloc(NULL, colour, page_req | VM_ALLOC_NOOBJ);
1756	if (m == NULL) {
1757		if (try) {
1758			pv_entry_count--;
1759			PV_STAT(pc_chunk_tryfail++);
1760			return (NULL);
1761		}
1762		/*
1763		 * Reclaim pv entries: At first, destroy mappings to inactive
1764		 * pages.  After that, if a pv chunk entry is still needed,
1765		 * destroy mappings to active pages.
1766		 */
1767		if (ratecheck(&lastprint, &printinterval))
1768			printf("Approaching the limit on PV entries, consider "
1769			    "increasing sysctl vm.pmap.shpgperproc or "
1770			    "vm.pmap.pv_entry_max\n");
1771		PV_STAT(pmap_collect_inactive++);
1772		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
1773		m = vm_page_alloc(NULL, colour,
1774		    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
1775		if (m == NULL) {
1776			PV_STAT(pmap_collect_active++);
1777			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
1778			m = vm_page_alloc(NULL, colour,
1779			    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
1780			if (m == NULL)
1781				panic("get_pv_entry: increase vm.pmap.shpgperproc");
1782		}
1783	}
1784	PV_STAT(pc_chunk_count++);
1785	PV_STAT(pc_chunk_allocs++);
1786	colour++;
1787	dump_add_page(m->phys_addr);
1788	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1789	pc->pc_pmap = pmap;
1790	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1791	pc->pc_map[1] = PC_FREE1;
1792	pc->pc_map[2] = PC_FREE2;
1793	pv = &pc->pc_pventry[0];
1794	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1795	PV_STAT(pv_entry_spare += _NPCPV - 1);
1796	return (pv);
1797}
1798
1799static void
1800pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1801{
1802	pv_entry_t pv;
1803
1804	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1805	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1806	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1807		if (pmap == PV_PMAP(pv) && va == pv->pv_va)
1808			break;
1809	}
1810	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1811	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1812	m->md.pv_list_count--;
1813	if (TAILQ_EMPTY(&m->md.pv_list))
1814		vm_page_flag_clear(m, PG_WRITEABLE);
1815	free_pv_entry(pmap, pv);
1816}
1817
1818/*
1819 * Create a pv entry for page at pa for
1820 * (pmap, va).
1821 */
1822static void
1823pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1824{
1825	pv_entry_t pv;
1826
1827	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1828	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1829	pv = get_pv_entry(pmap, FALSE);
1830	pv->pv_va = va;
1831	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1832	m->md.pv_list_count++;
1833}
1834
1835/*
1836 * Conditionally create a pv entry.
1837 */
1838static boolean_t
1839pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1840{
1841	pv_entry_t pv;
1842
1843	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1844	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1845	if (pv_entry_count < pv_entry_high_water &&
1846	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
1847		pv->pv_va = va;
1848		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1849		m->md.pv_list_count++;
1850		return (TRUE);
1851	} else
1852		return (FALSE);
1853}
1854
1855/*
1856 * pmap_remove_pte: do the things to unmap a page in a process
1857 */
1858static int
1859pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
1860    pd_entry_t ptepde, vm_page_t *free)
1861{
1862	pt_entry_t oldpte;
1863	vm_page_t m;
1864
1865	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1866	oldpte = pte_load_clear(ptq);
1867	if (oldpte & PG_W)
1868		pmap->pm_stats.wired_count -= 1;
1869	/*
1870	 * Machines that don't support invlpg also don't support
1871	 * PG_G.
1872	 */
1873	if (oldpte & PG_G)
1874		pmap_invalidate_page(kernel_pmap, va);
1875	pmap->pm_stats.resident_count -= 1;
1876	if (oldpte & PG_MANAGED) {
1877		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
1878		if (oldpte & PG_M) {
1879			KASSERT((oldpte & PG_RW),
1880	("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx",
1881			    va, oldpte));
1882			vm_page_dirty(m);
1883		}
1884		if (oldpte & PG_A)
1885			vm_page_flag_set(m, PG_REFERENCED);
1886		pmap_remove_entry(pmap, m, va);
1887	}
1888	return (pmap_unuse_pt(pmap, va, ptepde, free));
1889}
1890
1891/*
1892 * Remove a single page from a process address space
1893 */
1894static void
1895pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde)
1896{
1897	pt_entry_t *pte;
1898	vm_page_t free = NULL;
1899
1900	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1901	if ((*pde & PG_V) == 0)
1902		return;
1903	pte = pmap_pde_to_pte(pde, va);
1904	if ((*pte & PG_V) == 0)
1905		return;
1906	pmap_remove_pte(pmap, pte, va, *pde, &free);
1907	pmap_invalidate_page(pmap, va);
1908	pmap_free_zero_pages(free);
1909}
1910
1911/*
1912 *	Remove the given range of addresses from the specified map.
1913 *
1914 *	It is assumed that the start and end are properly
1915 *	rounded to the page size.
1916 */
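/*
 * Illustrative caller (hypothetical, for exposition only): to unmap a
 * single page-aligned mapping one would invoke, e.g.,
 *
 *	pmap_remove(pmap, va, va + PAGE_SIZE);
 *
 * which takes the single-page short cut below; larger ranges walk the
 * PML4/PDP/PD/PT hierarchy one 2MB slot at a time.
 */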
1917void
1918pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1919{
1920	vm_offset_t va_next;
1921	pml4_entry_t *pml4e;
1922	pdp_entry_t *pdpe;
1923	pd_entry_t ptpaddr, *pde;
1924	pt_entry_t *pte;
1925	vm_page_t free = NULL;
1926	int anyvalid;
1927
1928	/*
1929	 * Perform an unsynchronized read.  This is, however, safe.
1930	 */
1931	if (pmap->pm_stats.resident_count == 0)
1932		return;
1933
1934	anyvalid = 0;
1935
1936	vm_page_lock_queues();
1937	PMAP_LOCK(pmap);
1938
1939	/*
1940	 * Special handling for removing a single page: a very
1941	 * common operation for which we can short-circuit some
1942	 * code.
1943	 */
1944	if (sva + PAGE_SIZE == eva) {
1945		pde = pmap_pde(pmap, sva);
1946		if (pde && (*pde & PG_PS) == 0) {
1947			pmap_remove_page(pmap, sva, pde);
1948			goto out;
1949		}
1950	}
1951
1952	for (; sva < eva; sva = va_next) {
1953
1954		if (pmap->pm_stats.resident_count == 0)
1955			break;
1956
1957		pml4e = pmap_pml4e(pmap, sva);
1958		if ((*pml4e & PG_V) == 0) {
1959			va_next = (sva + NBPML4) & ~PML4MASK;
1960			continue;
1961		}
1962
1963		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
1964		if ((*pdpe & PG_V) == 0) {
1965			va_next = (sva + NBPDP) & ~PDPMASK;
1966			continue;
1967		}
1968
1969		/*
1970		 * Calculate index for next page table.
1971		 */
1972		va_next = (sva + NBPDR) & ~PDRMASK;
1973
1974		pde = pmap_pdpe_to_pde(pdpe, sva);
1975		ptpaddr = *pde;
1976
1977		/*
1978		 * Weed out invalid mappings.
1979		 */
1980		if (ptpaddr == 0)
1981			continue;
1982
1983		/*
1984		 * Check for large page.
1985		 */
1986		if ((ptpaddr & PG_PS) != 0) {
1987			*pde = 0;
1988			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1989			pmap_unuse_pt(pmap, sva, *pdpe, &free);
1990			anyvalid = 1;
1991			continue;
1992		}
1993
1994		/*
1995		 * Limit our scan to either the end of the va represented
1996		 * by the current page table page, or to the end of the
1997		 * range being removed.
1998		 */
1999		if (va_next > eva)
2000			va_next = eva;
2001
2002		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2003		    sva += PAGE_SIZE) {
2004			if (*pte == 0)
2005				continue;
2006
2007			/*
2008			 * The TLB entry for a PG_G mapping is invalidated
2009			 * by pmap_remove_pte().
2010			 */
2011			if ((*pte & PG_G) == 0)
2012				anyvalid = 1;
2013			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
2014				break;
2015		}
2016	}
2017out:
2018	if (anyvalid) {
2019		pmap_invalidate_all(pmap);
2020		pmap_free_zero_pages(free);
2021	}
2022	vm_page_unlock_queues();
2023	PMAP_UNLOCK(pmap);
2024}
2025
2026/*
2027 *	Routine:	pmap_remove_all
2028 *	Function:
2029 *		Removes this physical page from
2030 *		all physical maps in which it resides.
2031 *		Reflects back modify bits to the pager.
2032 *
2033 *	Notes:
2034 *		Original versions of this routine were very
2035 *		inefficient because they iteratively called
2036 *		pmap_remove (slow...)
2037 */
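/*
 * Illustrative use (hypothetical, for exposition only): the VM system
 * calls this before reclaiming a managed page, e.g.
 *
 *	pmap_remove_all(m);
 *	... m now has no mappings and PG_WRITEABLE has been cleared ...
 *
 * The page queues lock must be held by the caller, as asserted below.
 */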
2038
2039void
2040pmap_remove_all(vm_page_t m)
2041{
2042	pv_entry_t pv;
2043	pmap_t pmap;
2044	pt_entry_t *pte, tpte;
2045	pd_entry_t ptepde;
2046	vm_page_t free;
2047
2048#if defined(PMAP_DIAGNOSTIC)
2049	/*
2050	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
2051	 */
2052	if (m->flags & PG_FICTITIOUS) {
2053		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx",
2054		    VM_PAGE_TO_PHYS(m));
2055	}
2056#endif
2057	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2058	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2059		pmap = PV_PMAP(pv);
2060		PMAP_LOCK(pmap);
2061		pmap->pm_stats.resident_count--;
2062		pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde);
2063		tpte = pte_load_clear(pte);
2064		if (tpte & PG_W)
2065			pmap->pm_stats.wired_count--;
2066		if (tpte & PG_A)
2067			vm_page_flag_set(m, PG_REFERENCED);
2068
2069		/*
2070		 * Update the vm_page_t clean and reference bits.
2071		 */
2072		if (tpte & PG_M) {
2073			KASSERT((tpte & PG_RW),
2074	("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx",
2075			    pv->pv_va, tpte));
2076			vm_page_dirty(m);
2077		}
2078		free = NULL;
2079		pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
2080		pmap_invalidate_page(pmap, pv->pv_va);
2081		pmap_free_zero_pages(free);
2082		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2083		m->md.pv_list_count--;
2084		free_pv_entry(pmap, pv);
2085		PMAP_UNLOCK(pmap);
2086	}
2087	vm_page_flag_clear(m, PG_WRITEABLE);
2088}
2089
2090/*
2091 *	Set the physical protection on the
2092 *	specified range of this map as requested.
2093 */
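/*
 * Illustrative call (hypothetical): write-protecting a page-aligned
 * range, as the VM system does when downgrading mappings, looks like
 *
 *	pmap_protect(pmap, sva, eva, VM_PROT_READ);
 *
 * A request that removes read access reduces to pmap_remove(), and a
 * request that retains both write and execute access is a no-op, as
 * the early returns below show.
 */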
2094void
2095pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2096{
2097	vm_offset_t va_next;
2098	pml4_entry_t *pml4e;
2099	pdp_entry_t *pdpe;
2100	pd_entry_t ptpaddr, *pde;
2101	pt_entry_t *pte;
2102	int anychanged;
2103
2104	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2105		pmap_remove(pmap, sva, eva);
2106		return;
2107	}
2108
2109	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2110	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2111		return;
2112
2113	anychanged = 0;
2114
2115	vm_page_lock_queues();
2116	PMAP_LOCK(pmap);
2117	for (; sva < eva; sva = va_next) {
2118
2119		pml4e = pmap_pml4e(pmap, sva);
2120		if ((*pml4e & PG_V) == 0) {
2121			va_next = (sva + NBPML4) & ~PML4MASK;
2122			continue;
2123		}
2124
2125		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2126		if ((*pdpe & PG_V) == 0) {
2127			va_next = (sva + NBPDP) & ~PDPMASK;
2128			continue;
2129		}
2130
2131		va_next = (sva + NBPDR) & ~PDRMASK;
2132
2133		pde = pmap_pdpe_to_pde(pdpe, sva);
2134		ptpaddr = *pde;
2135
2136		/*
2137		 * Weed out invalid mappings.
2138		 */
2139		if (ptpaddr == 0)
2140			continue;
2141
2142		/*
2143		 * Check for large page.
2144		 */
2145		if ((ptpaddr & PG_PS) != 0) {
2146			if ((prot & VM_PROT_WRITE) == 0)
2147				*pde &= ~(PG_M|PG_RW);
2148			if ((prot & VM_PROT_EXECUTE) == 0)
2149				*pde |= pg_nx;
2150			anychanged = 1;
2151			continue;
2152		}
2153
2154		if (va_next > eva)
2155			va_next = eva;
2156
2157		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2158		    sva += PAGE_SIZE) {
2159			pt_entry_t obits, pbits;
2160			vm_page_t m;
2161
2162retry:
2163			obits = pbits = *pte;
2164			if ((pbits & PG_V) == 0)
2165				continue;
2166			if (pbits & PG_MANAGED) {
2167				m = NULL;
2168				if (pbits & PG_A) {
2169					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2170					vm_page_flag_set(m, PG_REFERENCED);
2171					pbits &= ~PG_A;
2172				}
2173				if ((pbits & PG_M) != 0) {
2174					if (m == NULL)
2175						m = PHYS_TO_VM_PAGE(pbits &
2176						    PG_FRAME);
2177					vm_page_dirty(m);
2178				}
2179			}
2180
2181			if ((prot & VM_PROT_WRITE) == 0)
2182				pbits &= ~(PG_RW | PG_M);
2183			if ((prot & VM_PROT_EXECUTE) == 0)
2184				pbits |= pg_nx;
2185
2186			if (pbits != obits) {
2187				if (!atomic_cmpset_long(pte, obits, pbits))
2188					goto retry;
2189				if (obits & PG_G)
2190					pmap_invalidate_page(pmap, sva);
2191				else
2192					anychanged = 1;
2193			}
2194		}
2195	}
2196	if (anychanged)
2197		pmap_invalidate_all(pmap);
2198	vm_page_unlock_queues();
2199	PMAP_UNLOCK(pmap);
2200}
2201
2202/*
2203 *	Insert the given physical page (p) at
2204 *	the specified virtual address (v) in the
2205 *	target physical map with the protection requested.
2206 *
2207 *	If specified, the page will be wired down, meaning
2208 *	that the related pte can not be reclaimed.
2209 *
2210 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2211 *	or lose information.  That is, this routine must actually
2212 *	insert this page into the given map NOW.
2213 */
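/*
 * Illustrative call (hypothetical): the fault handler enters a resident
 * page m at a page-aligned va roughly as
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * with the final argument TRUE only for wired (e.g. mlock'ed) mappings.
 */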
2214void
2215pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2216	   boolean_t wired)
2217{
2218	vm_paddr_t pa;
2219	pd_entry_t *pde;
2220	pt_entry_t *pte;
2221	vm_paddr_t opa;
2222	pt_entry_t origpte, newpte;
2223	vm_page_t mpte, om;
2224	boolean_t invlva;
2225
2226	va = trunc_page(va);
2227#ifdef PMAP_DIAGNOSTIC
2228	if (va > VM_MAX_KERNEL_ADDRESS)
2229		panic("pmap_enter: toobig");
2230	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2231		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
2232#endif
2233
2234	mpte = NULL;
2235
2236	vm_page_lock_queues();
2237	PMAP_LOCK(pmap);
2238
2239	/*
2240	 * In the case that a page table page is not
2241	 * resident, we are creating it here.
2242	 */
2243	if (va < VM_MAXUSER_ADDRESS) {
2244		mpte = pmap_allocpte(pmap, va, M_WAITOK);
2245	}
2246#if 0 && defined(PMAP_DIAGNOSTIC)
2247	else {
2248		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2249		origpte = *pdeaddr;
2250		if ((origpte & PG_V) == 0) {
2251			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
2252				origpte, va);
2253		}
2254	}
2255#endif
2256
2257	pde = pmap_pde(pmap, va);
2258	if (pde != NULL) {
2259		if ((*pde & PG_PS) != 0)
2260			panic("pmap_enter: attempted pmap_enter on 2MB page");
2261		pte = pmap_pde_to_pte(pde, va);
2262	} else
2263		pte = NULL;
2264
2265	/*
2266	 * The page directory entry is not valid; we need a new PT page.
2267	 */
2268	if (pte == NULL)
2269		panic("pmap_enter: invalid page directory va=%#lx\n", va);
2270
2271	pa = VM_PAGE_TO_PHYS(m);
2272	om = NULL;
2273	origpte = *pte;
2274	opa = origpte & PG_FRAME;
2275
2276	/*
2277	 * Mapping has not changed, must be protection or wiring change.
2278	 */
2279	if (origpte && (opa == pa)) {
2280		/*
2281		 * Wiring change, just update stats. We don't worry about
2282		 * wiring PT pages as they remain resident as long as there
2283		 * are valid mappings in them. Hence, if a user page is wired,
2284		 * the PT page will be also.
2285		 */
2286		if (wired && ((origpte & PG_W) == 0))
2287			pmap->pm_stats.wired_count++;
2288		else if (!wired && (origpte & PG_W))
2289			pmap->pm_stats.wired_count--;
2290
2291		/*
2292		 * Remove extra pte reference
2293		 */
2294		if (mpte)
2295			mpte->wire_count--;
2296
2297		/*
2298		 * We might be turning off write access to the page,
2299		 * so we go ahead and sense modify status.
2300		 */
2301		if (origpte & PG_MANAGED) {
2302			om = m;
2303			pa |= PG_MANAGED;
2304		}
2305		goto validate;
2306	}
2307	/*
2308	 * Mapping has changed, invalidate old range and fall through to
2309	 * handle validating new mapping.
2310	 */
2311	if (opa) {
2312		if (origpte & PG_W)
2313			pmap->pm_stats.wired_count--;
2314		if (origpte & PG_MANAGED) {
2315			om = PHYS_TO_VM_PAGE(opa);
2316			pmap_remove_entry(pmap, om, va);
2317		}
2318		if (mpte != NULL) {
2319			mpte->wire_count--;
2320			KASSERT(mpte->wire_count > 0,
2321			    ("pmap_enter: missing reference to page table page,"
2322			     " va: 0x%lx", va));
2323		}
2324	} else
2325		pmap->pm_stats.resident_count++;
2326
2327	/*
2328	 * Enter on the PV list if part of our managed memory.
2329	 */
2330	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2331		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2332		    ("pmap_enter: managed mapping within the clean submap"));
2333		pmap_insert_entry(pmap, va, m);
2334		pa |= PG_MANAGED;
2335	}
2336
2337	/*
2338	 * Increment counters
2339	 */
2340	if (wired)
2341		pmap->pm_stats.wired_count++;
2342
2343validate:
2344	/*
2345	 * Now validate mapping with desired protection/wiring.
2346	 */
2347	newpte = (pt_entry_t)(pa | PG_V);
2348	if ((prot & VM_PROT_WRITE) != 0) {
2349		newpte |= PG_RW;
2350		vm_page_flag_set(m, PG_WRITEABLE);
2351	}
2352	if ((prot & VM_PROT_EXECUTE) == 0)
2353		newpte |= pg_nx;
2354	if (wired)
2355		newpte |= PG_W;
2356	if (va < VM_MAXUSER_ADDRESS)
2357		newpte |= PG_U;
2358	if (pmap == kernel_pmap)
2359		newpte |= PG_G;
2360
2361	/*
2362	 * if the mapping or permission bits are different, we need
2363	 * to update the pte.
2364	 */
2365	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2366		if (origpte & PG_V) {
2367			invlva = FALSE;
2368			origpte = pte_load_store(pte, newpte | PG_A);
2369			if (origpte & PG_A) {
2370				if (origpte & PG_MANAGED)
2371					vm_page_flag_set(om, PG_REFERENCED);
2372				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
2373				    PG_NX) == 0 && (newpte & PG_NX)))
2374					invlva = TRUE;
2375			}
2376			if (origpte & PG_M) {
2377				KASSERT((origpte & PG_RW),
2378	("pmap_enter: modified page not writable: va: %#lx, pte: %#lx",
2379				    va, origpte));
2380				if ((origpte & PG_MANAGED) != 0)
2381					vm_page_dirty(om);
2382				if ((newpte & PG_RW) == 0)
2383					invlva = TRUE;
2384			}
2385			if (invlva)
2386				pmap_invalidate_page(pmap, va);
2387		} else
2388			pte_store(pte, newpte | PG_A);
2389	}
2390	vm_page_unlock_queues();
2391	PMAP_UNLOCK(pmap);
2392}
2393
2394/*
2395 * Maps a sequence of resident pages belonging to the same object.
2396 * The sequence begins with the given page m_start.  This page is
2397 * mapped at the given virtual address start.  Each subsequent page is
2398 * mapped at a virtual address that is offset from start by the same
2399 * amount as the page is offset from m_start within the object.  The
2400 * last page in the sequence is the page with the largest offset from
2401 * m_start that can be mapped at a virtual address less than the given
2402 * virtual address end.  Not every virtual page between start and end
2403 * is mapped; only those for which a resident page exists with the
2404 * corresponding offset from m_start are mapped.
2405 */
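/*
 * Illustrative call (hypothetical): with the object locked, a caller can
 * pre-map a run of resident pages beginning at m_start with
 *
 *	pmap_enter_object(pmap, start, start + ptoa(npages), m_start,
 *	    VM_PROT_READ);
 *
 * where npages is a hypothetical page count; only pages actually
 * resident in the object within that range are entered.
 */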
2406void
2407pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2408    vm_page_t m_start, vm_prot_t prot)
2409{
2410	vm_page_t m, mpte;
2411	vm_pindex_t diff, psize;
2412
2413	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
2414	psize = atop(end - start);
2415	mpte = NULL;
2416	m = m_start;
2417	PMAP_LOCK(pmap);
2418	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2419		mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
2420		    prot, mpte);
2421		m = TAILQ_NEXT(m, listq);
2422	}
2423 	PMAP_UNLOCK(pmap);
2424}
2425
2426/*
2427 * this code makes some *MAJOR* assumptions:
2428 * 1. Current pmap & pmap exists.
2429 * 2. Not wired.
2430 * 3. Read access.
2431 * 4. No page table pages.
2432 * but is *MUCH* faster than pmap_enter...
2433 */
2434
2435void
2436pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2437{
2438
2439	PMAP_LOCK(pmap);
2440	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
2441	PMAP_UNLOCK(pmap);
2442}
2443
2444static vm_page_t
2445pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2446    vm_prot_t prot, vm_page_t mpte)
2447{
2448	vm_page_t free;
2449	pt_entry_t *pte;
2450	vm_paddr_t pa;
2451
2452	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2453	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
2454	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2455	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2456	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2457
2458	/*
2459	 * In the case that a page table page is not
2460	 * resident, we are creating it here.
2461	 */
2462	if (va < VM_MAXUSER_ADDRESS) {
2463		vm_pindex_t ptepindex;
2464		pd_entry_t *ptepa;
2465
2466		/*
2467		 * Calculate pagetable page index
2468		 */
2469		ptepindex = pmap_pde_pindex(va);
2470		if (mpte && (mpte->pindex == ptepindex)) {
2471			mpte->wire_count++;
2472		} else {
2473			/*
2474			 * Get the page directory entry
2475			 */
2476			ptepa = pmap_pde(pmap, va);
2477
2478			/*
2479			 * If the page table page is mapped, we just increment
2480			 * the hold count, and activate it.
2481			 */
2482			if (ptepa && (*ptepa & PG_V) != 0) {
2483				if (*ptepa & PG_PS)
2484					panic("pmap_enter_quick: unexpected mapping into 2MB page");
2485				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
2486				mpte->wire_count++;
2487			} else {
2488				mpte = _pmap_allocpte(pmap, ptepindex,
2489				    M_NOWAIT);
2490				if (mpte == NULL)
2491					return (mpte);
2492			}
2493		}
2494	} else {
2495		mpte = NULL;
2496	}
2497
2498	/*
2499	 * This call to vtopte makes the assumption that we are
2500	 * entering the page into the current pmap.  In order to support
2501	 * quick entry into any pmap, one would likely use pmap_pte.
2502	 * But that isn't as quick as vtopte.
2503	 */
2504	pte = vtopte(va);
2505	if (*pte) {
2506		if (mpte != NULL) {
2507			mpte->wire_count--;
2508			mpte = NULL;
2509		}
2510		return (mpte);
2511	}
2512
2513	/*
2514	 * Enter on the PV list if part of our managed memory.
2515	 */
2516	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
2517	    !pmap_try_insert_pv_entry(pmap, va, m)) {
2518		if (mpte != NULL) {
2519			free = NULL;
2520			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
2521				pmap_invalidate_page(pmap, va);
2522				pmap_free_zero_pages(free);
2523			}
2524			mpte = NULL;
2525		}
2526		return (mpte);
2527	}
2528
2529	/*
2530	 * Increment counters
2531	 */
2532	pmap->pm_stats.resident_count++;
2533
2534	pa = VM_PAGE_TO_PHYS(m);
2535	if ((prot & VM_PROT_EXECUTE) == 0)
2536		pa |= pg_nx;
2537
2538	/*
2539	 * Now validate mapping with RO protection
2540	 */
2541	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2542		pte_store(pte, pa | PG_V | PG_U);
2543	else
2544		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2545	return (mpte);
2546}
2547
2548/*
2549 * Make a temporary mapping for a physical address.  This is only intended
2550 * to be used for panic dumps.
2551 */
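/*
 * Illustrative use (hypothetical): a dump routine copies out one frame with
 *
 *	va = pmap_kenter_temporary(pa, 0);
 *	... read PAGE_SIZE bytes from va ...
 *
 * Successive calls with increasing i map successive pages beyond
 * crashdumpmap, but the returned pointer is always crashdumpmap itself.
 */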
2552void *
2553pmap_kenter_temporary(vm_paddr_t pa, int i)
2554{
2555	vm_offset_t va;
2556
2557	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2558	pmap_kenter(va, pa);
2559	invlpg(va);
2560	return ((void *)crashdumpmap);
2561}
2562
2563/*
2564 * This code maps large physical mmap regions into the
2565 * processor address space.  Note that some shortcuts
2566 * are taken, but the code works.
2567 */
2568void
2569pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2570		    vm_object_t object, vm_pindex_t pindex,
2571		    vm_size_t size)
2572{
2573	vm_offset_t va;
2574	vm_page_t p, pdpg;
2575
2576	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2577	KASSERT(object->type == OBJT_DEVICE,
2578	    ("pmap_object_init_pt: non-device object"));
2579	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2580		vm_page_t m[1];
2581		pd_entry_t ptepa, *pde;
2582
2583		PMAP_LOCK(pmap);
2584		pde = pmap_pde(pmap, addr);
2585		if (pde != 0 && (*pde & PG_V) != 0)
2586			goto out;
2587		PMAP_UNLOCK(pmap);
2588retry:
2589		p = vm_page_lookup(object, pindex);
2590		if (p != NULL) {
2591			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2592				goto retry;
2593		} else {
2594			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2595			if (p == NULL)
2596				return;
2597			m[0] = p;
2598
2599			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2600				vm_page_lock_queues();
2601				vm_page_free(p);
2602				vm_page_unlock_queues();
2603				return;
2604			}
2605
2606			p = vm_page_lookup(object, pindex);
2607			vm_page_lock_queues();
2608			vm_page_wakeup(p);
2609			vm_page_unlock_queues();
2610		}
2611
2612		ptepa = VM_PAGE_TO_PHYS(p);
2613		if (ptepa & (NBPDR - 1))
2614			return;
2615
2616		p->valid = VM_PAGE_BITS_ALL;
2617
2618		PMAP_LOCK(pmap);
2619		for (va = addr; va < addr + size; va += NBPDR) {
2620			while ((pdpg =
2621			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
2622				PMAP_UNLOCK(pmap);
2623				vm_page_lock_queues();
2624				vm_page_busy(p);
2625				vm_page_unlock_queues();
2626				VM_OBJECT_UNLOCK(object);
2627				VM_WAIT;
2628				VM_OBJECT_LOCK(object);
2629				vm_page_lock_queues();
2630				vm_page_wakeup(p);
2631				vm_page_unlock_queues();
2632				PMAP_LOCK(pmap);
2633			}
2634			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
2635			pde = &pde[pmap_pde_index(va)];
2636			if ((*pde & PG_V) == 0) {
2637				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
2638				    PG_U | PG_RW | PG_V);
2639				pmap->pm_stats.resident_count +=
2640				    NBPDR / PAGE_SIZE;
2641			} else {
2642				pdpg->wire_count--;
2643				KASSERT(pdpg->wire_count > 0,
2644				    ("pmap_object_init_pt: missing reference "
2645				     "to page directory page, va: 0x%lx", va));
2646			}
2647			ptepa += NBPDR;
2648		}
2649		pmap_invalidate_all(pmap);
2650out:
2651		PMAP_UNLOCK(pmap);
2652	}
2653}
2654
2655/*
2656 *	Routine:	pmap_change_wiring
2657 *	Function:	Change the wiring attribute for a map/virtual-address
2658 *			pair.
2659 *	In/out conditions:
2660 *			The mapping must already exist in the pmap.
2661 */
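/*
 * Illustrative call (hypothetical): once the mapping for va exists,
 * a caller wires or unwires it with
 *
 *	pmap_change_wiring(pmap, va, TRUE);	... wire ...
 *	pmap_change_wiring(pmap, va, FALSE);	... unwire ...
 *
 * Only pm_stats.wired_count and the PG_W bit change; no TLB shootdown
 * is required.
 */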
2662void
2663pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2664{
2665	pt_entry_t *pte;
2666
2667	/*
2668	 * Wiring is not a hardware characteristic so there is no need to
2669	 * invalidate TLB.
2670	 */
2671	PMAP_LOCK(pmap);
2672	pte = pmap_pte(pmap, va);
2673	if (wired && (*pte & PG_W) == 0) {
2674		pmap->pm_stats.wired_count++;
2675		atomic_set_long(pte, PG_W);
2676	} else if (!wired && (*pte & PG_W) != 0) {
2677		pmap->pm_stats.wired_count--;
2678		atomic_clear_long(pte, PG_W);
2679	}
2680	PMAP_UNLOCK(pmap);
2681}
2682
2683
2684
2685/*
2686 *	Copy the range specified by src_addr/len
2687 *	from the source map to the range dst_addr/len
2688 *	in the destination map.
2689 *
2690 *	This routine is only advisory and need not do anything.
2691 */
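/*
 * Illustrative call (hypothetical): during fork(), the VM system may try
 *
 *	pmap_copy(dst_pmap, src_pmap, addr, len, addr);
 *
 * Because the routine is advisory, it simply returns if dst_addr and
 * src_addr differ or if src_pmap is not the current pmap.
 */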
2692
2693void
2694pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2695	  vm_offset_t src_addr)
2696{
2697	vm_page_t   free;
2698	vm_offset_t addr;
2699	vm_offset_t end_addr = src_addr + len;
2700	vm_offset_t va_next;
2701
2702	if (dst_addr != src_addr)
2703		return;
2704
2705	if (!pmap_is_current(src_pmap))
2706		return;
2707
2708	vm_page_lock_queues();
2709	if (dst_pmap < src_pmap) {
2710		PMAP_LOCK(dst_pmap);
2711		PMAP_LOCK(src_pmap);
2712	} else {
2713		PMAP_LOCK(src_pmap);
2714		PMAP_LOCK(dst_pmap);
2715	}
2716	for (addr = src_addr; addr < end_addr; addr = va_next) {
2717		pt_entry_t *src_pte, *dst_pte;
2718		vm_page_t dstmpde, dstmpte, srcmpte;
2719		pml4_entry_t *pml4e;
2720		pdp_entry_t *pdpe;
2721		pd_entry_t srcptepaddr, *pde;
2722
2723		if (addr >= UPT_MIN_ADDRESS)
2724			panic("pmap_copy: invalid to pmap_copy page tables");
2725
2726		pml4e = pmap_pml4e(src_pmap, addr);
2727		if ((*pml4e & PG_V) == 0) {
2728			va_next = (addr + NBPML4) & ~PML4MASK;
2729			continue;
2730		}
2731
2732		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
2733		if ((*pdpe & PG_V) == 0) {
2734			va_next = (addr + NBPDP) & ~PDPMASK;
2735			continue;
2736		}
2737
2738		va_next = (addr + NBPDR) & ~PDRMASK;
2739
2740		pde = pmap_pdpe_to_pde(pdpe, addr);
2741		srcptepaddr = *pde;
2742		if (srcptepaddr == 0)
2743			continue;
2744
2745		if (srcptepaddr & PG_PS) {
2746			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
2747			if (dstmpde == NULL)
2748				break;
2749			pde = (pd_entry_t *)
2750			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
2751			pde = &pde[pmap_pde_index(addr)];
2752			if (*pde == 0) {
2753				*pde = srcptepaddr & ~PG_W;
2754				dst_pmap->pm_stats.resident_count +=
2755				    NBPDR / PAGE_SIZE;
2756			} else
2757				dstmpde->wire_count--;
2758			continue;
2759		}
2760
2761		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
2762		if (srcmpte->wire_count == 0)
2763			panic("pmap_copy: source page table page is unused");
2764
2765		if (va_next > end_addr)
2766			va_next = end_addr;
2767
2768		src_pte = vtopte(addr);
2769		while (addr < va_next) {
2770			pt_entry_t ptetemp;
2771			ptetemp = *src_pte;
2772			/*
2773			 * We only virtual-copy managed pages.
2774			 */
2775			if ((ptetemp & PG_MANAGED) != 0) {
2776				dstmpte = pmap_allocpte(dst_pmap, addr,
2777				    M_NOWAIT);
2778				if (dstmpte == NULL)
2779					break;
2780				dst_pte = (pt_entry_t *)
2781				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2782				dst_pte = &dst_pte[pmap_pte_index(addr)];
2783				if (*dst_pte == 0 &&
2784				    pmap_try_insert_pv_entry(dst_pmap, addr,
2785				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2786					/*
2787					 * Clear the wired, modified, and
2788					 * accessed (referenced) bits
2789					 * during the copy.
2790					 */
2791					*dst_pte = ptetemp & ~(PG_W | PG_M |
2792					    PG_A);
2793					dst_pmap->pm_stats.resident_count++;
2794	 			} else {
2795					free = NULL;
2796					if (pmap_unwire_pte_hold(dst_pmap,
2797					    addr, dstmpte, &free)) {
2798					    	pmap_invalidate_page(dst_pmap,
2799					 	    addr);
2800				    	    	pmap_free_zero_pages(free);
2801					}
2802				}
2803				if (dstmpte->wire_count >= srcmpte->wire_count)
2804					break;
2805			}
2806			addr += PAGE_SIZE;
2807			src_pte++;
2808		}
2809	}
2810	vm_page_unlock_queues();
2811	PMAP_UNLOCK(src_pmap);
2812	PMAP_UNLOCK(dst_pmap);
2813}
2814
2815/*
2816 *	pmap_zero_page zeros the specified hardware page through the
2817 *	direct map, using pagezero() to clear its contents.
2818 */
2819void
2820pmap_zero_page(vm_page_t m)
2821{
2822	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2823
2824	pagezero((void *)va);
2825}
2826
2827/*
2828 *	pmap_zero_page_area zeros the requested portion of the specified
2829 *	hardware page through the direct map.
2830 *
2831 *	off and size may not cover an area beyond a single hardware page.
2832 */
2833void
2834pmap_zero_page_area(vm_page_t m, int off, int size)
2835{
2836	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2837
2838	if (off == 0 && size == PAGE_SIZE)
2839		pagezero((void *)va);
2840	else
2841		bzero((char *)va + off, size);
2842}
2843
2844/*
2845 *	pmap_zero_page_idle zeros the specified hardware page through the
2846 *	direct map, using pagezero() to clear its contents.  This
2847 *	is intended to be called from the vm_pagezero process only and
2848 *	outside of Giant.
2849 */
2850void
2851pmap_zero_page_idle(vm_page_t m)
2852{
2853	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2854
2855	pagezero((void *)va);
2856}
2857
2858/*
2859 *	pmap_copy_page copies the specified (machine independent)
2860 *	page through the direct map, using pagecopy() to copy
2861 *	its contents one machine dependent page at a
2862 *	time.
2863 */
2864void
2865pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2866{
2867	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2868	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2869
2870	pagecopy((void *)src, (void *)dst);
2871}
2872
2873/*
2874 * Returns true if the pmap's pv is one of the first
2875 * 16 pvs linked to from this page.  This count may
2876 * be changed upwards or downwards in the future; it
2877 * is only necessary that true be returned for a small
2878 * subset of pmaps for proper page aging.
2879 */
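/*
 * Illustrative use (hypothetical): the pageout code asks
 *
 *	if (pmap_page_exists_quick(pmap, m))
 *		... pmap probably maps m ...
 *
 * A FALSE return is not authoritative once more than 16 pv entries are
 * linked to the page.
 */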
2880boolean_t
2881pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2882{
2883	pv_entry_t pv;
2884	int loops = 0;
2885
2886	if (m->flags & PG_FICTITIOUS)
2887		return FALSE;
2888
2889	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2890	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2891		if (PV_PMAP(pv) == pmap) {
2892			return TRUE;
2893		}
2894		loops++;
2895		if (loops >= 16)
2896			break;
2897	}
2898	return (FALSE);
2899}
2900
2901/*
2902 * Remove all pages from the specified address space;
2903 * this aids process exit speeds.  Also, this code
2904 * is special-cased for the current process only, but
2905 * can have the more generic (and slightly slower)
2906 * mode enabled.  This is much faster than pmap_remove
2907 * in the case of running down an entire address space.
2908 */
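/*
 * Illustrative use (hypothetical): at process exit the VM system runs
 *
 *	pmap_remove_pages(vmspace_pmap(p->p_vmspace));
 *
 * where p is the exiting process; the routine walks the pv chunks
 * directly instead of the page tables, and wired mappings are skipped,
 * as noted in the loop below.
 */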
2909void
2910pmap_remove_pages(pmap_t pmap)
2911{
2912	pt_entry_t *pte, tpte;
2913	vm_page_t m, free = NULL;
2914	pv_entry_t pv;
2915	struct pv_chunk *pc, *npc;
2916	int field, idx;
2917	int64_t bit;
2918	uint64_t inuse, bitmask;
2919	int allfree;
2920
2921	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2922		printf("warning: pmap_remove_pages called with non-current pmap\n");
2923		return;
2924	}
2925	vm_page_lock_queues();
2926	PMAP_LOCK(pmap);
2927	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
2928		allfree = 1;
2929		for (field = 0; field < _NPCM; field++) {
2930			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
2931			while (inuse != 0) {
2932				bit = bsfq(inuse);
2933				bitmask = 1UL << bit;
2934				idx = field * 64 + bit;
2935				pv = &pc->pc_pventry[idx];
2936				inuse &= ~bitmask;
2937
2938				pte = vtopte(pv->pv_va);
2939				tpte = *pte;
2940
2941				if (tpte == 0) {
2942					printf(
2943					    "TPTE at %p  IS ZERO @ VA %08lx\n",
2944					    pte, pv->pv_va);
2945					panic("bad pte");
2946				}
2947
2948/*
2949 * We cannot remove wired pages from a process' mapping at this time
2950 */
2951				if (tpte & PG_W) {
2952					allfree = 0;
2953					continue;
2954				}
2955
2956				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2957				KASSERT(m->phys_addr == (tpte & PG_FRAME),
2958				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2959				    m, (uintmax_t)m->phys_addr,
2960				    (uintmax_t)tpte));
2961
2962				KASSERT(m < &vm_page_array[vm_page_array_size],
2963					("pmap_remove_pages: bad tpte %#jx",
2964					(uintmax_t)tpte));
2965
2966				pmap->pm_stats.resident_count--;
2967
2968				pte_clear(pte);
2969
2970				/*
2971				 * Update the vm_page_t clean/reference bits.
2972				 */
2973				if (tpte & PG_M)
2974					vm_page_dirty(m);
2975
2976				/* Mark free */
2977				PV_STAT(pv_entry_frees++);
2978				PV_STAT(pv_entry_spare++);
2979				pv_entry_count--;
2980				pc->pc_map[field] |= bitmask;
2981				m->md.pv_list_count--;
2982				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2983				if (TAILQ_EMPTY(&m->md.pv_list))
2984					vm_page_flag_clear(m, PG_WRITEABLE);
2985				pmap_unuse_pt(pmap, pv->pv_va,
2986				    *vtopde(pv->pv_va), &free);
2987			}
2988		}
2989		if (allfree) {
2990			PV_STAT(pv_entry_spare -= _NPCPV);
2991			PV_STAT(pc_chunk_count--);
2992			PV_STAT(pc_chunk_frees++);
2993			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2994			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2995			dump_drop_page(m->phys_addr);
2996			vm_page_free(m);
2997		}
2998	}
2999	pmap_invalidate_all(pmap);
3000	pmap_free_zero_pages(free);
3001	vm_page_unlock_queues();
3002	PMAP_UNLOCK(pmap);
3003}
3004
3005/*
3006 *	pmap_is_modified:
3007 *
3008 *	Return whether or not the specified physical page was modified
3009 *	in any physical maps.
3010 */
3011boolean_t
3012pmap_is_modified(vm_page_t m)
3013{
3014	pv_entry_t pv;
3015	pt_entry_t *pte;
3016	pmap_t pmap;
3017	boolean_t rv;
3018
3019	rv = FALSE;
3020	if (m->flags & PG_FICTITIOUS)
3021		return (rv);
3022
3023	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3024	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3025		pmap = PV_PMAP(pv);
3026		PMAP_LOCK(pmap);
3027		pte = pmap_pte(pmap, pv->pv_va);
3028		rv = (*pte & PG_M) != 0;
3029		PMAP_UNLOCK(pmap);
3030		if (rv)
3031			break;
3032	}
3033	return (rv);
3034}
3035
3036/*
3037 *	pmap_is_prefaultable:
3038 *
3039 *	Return whether or not the specified virtual address is eligible
3040 *	for prefault.
3041 */
3042boolean_t
3043pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3044{
3045	pd_entry_t *pde;
3046	pt_entry_t *pte;
3047	boolean_t rv;
3048
3049	rv = FALSE;
3050	PMAP_LOCK(pmap);
3051	pde = pmap_pde(pmap, addr);
3052	if (pde != NULL && (*pde & PG_V)) {
3053		pte = vtopte(addr);
3054		rv = (*pte & PG_V) == 0;
3055	}
3056	PMAP_UNLOCK(pmap);
3057	return (rv);
3058}
3059
3060/*
3061 * Clear the write and modified bits in each of the given page's mappings.
3062 */
3063void
3064pmap_remove_write(vm_page_t m)
3065{
3066	pv_entry_t pv;
3067	pmap_t pmap;
3068	pt_entry_t oldpte, *pte;
3069
3070	if ((m->flags & PG_FICTITIOUS) != 0 ||
3071	    (m->flags & PG_WRITEABLE) == 0)
3072		return;
3073	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3074	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3075		pmap = PV_PMAP(pv);
3076		PMAP_LOCK(pmap);
3077		pte = pmap_pte(pmap, pv->pv_va);
3078retry:
3079		oldpte = *pte;
3080		if (oldpte & PG_RW) {
3081			if (!atomic_cmpset_long(pte, oldpte, oldpte &
3082			    ~(PG_RW | PG_M)))
3083				goto retry;
3084			if ((oldpte & PG_M) != 0)
3085				vm_page_dirty(m);
3086			pmap_invalidate_page(pmap, pv->pv_va);
3087		}
3088		PMAP_UNLOCK(pmap);
3089	}
3090	vm_page_flag_clear(m, PG_WRITEABLE);
3091}
3092
3093/*
3094 *	pmap_ts_referenced:
3095 *
3096 *	Return a count of reference bits for a page, clearing those bits.
3097 *	It is not necessary for every reference bit to be cleared, but it
3098 *	is necessary that 0 only be returned when there are truly no
3099 *	reference bits set.
3100 *
3101 *	XXX: The exact number of bits to check and clear is a matter that
3102 *	should be tested and standardized at some point in the future for
3103 *	optimal aging of shared pages.
3104 */
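/*
 * Illustrative use (hypothetical): the pageout code ages a page with
 *
 *	act_count += pmap_ts_referenced(m);
 *
 * where act_count is a hypothetical activity counter; a nonzero return
 * means at least one mapping had PG_A set.
 */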
3105int
3106pmap_ts_referenced(vm_page_t m)
3107{
3108	pv_entry_t pv, pvf, pvn;
3109	pmap_t pmap;
3110	pt_entry_t *pte;
3111	int rtval = 0;
3112
3113	if (m->flags & PG_FICTITIOUS)
3114		return (rtval);
3115	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3116	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3117		pvf = pv;
3118		do {
3119			pvn = TAILQ_NEXT(pv, pv_list);
3120			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3121			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3122			pmap = PV_PMAP(pv);
3123			PMAP_LOCK(pmap);
3124			pte = pmap_pte(pmap, pv->pv_va);
3125			if ((*pte & PG_A) != 0) {
3126				atomic_clear_long(pte, PG_A);
3127				pmap_invalidate_page(pmap, pv->pv_va);
3128				rtval++;
3129				if (rtval > 4)
3130					pvn = NULL;
3131			}
3132			PMAP_UNLOCK(pmap);
3133		} while ((pv = pvn) != NULL && pv != pvf);
3134	}
3135	return (rtval);
3136}
3137
3138/*
3139 *	Clear the modify bits on the specified physical page.
3140 */
3141void
3142pmap_clear_modify(vm_page_t m)
3143{
3144	pv_entry_t pv;
3145	pmap_t pmap;
3146	pt_entry_t *pte;
3147
3148	if ((m->flags & PG_FICTITIOUS) != 0)
3149		return;
3150	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3151	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3152		pmap = PV_PMAP(pv);
3153		PMAP_LOCK(pmap);
3154		pte = pmap_pte(pmap, pv->pv_va);
3155		if (*pte & PG_M) {
3156			atomic_clear_long(pte, PG_M);
3157			pmap_invalidate_page(pmap, pv->pv_va);
3158		}
3159		PMAP_UNLOCK(pmap);
3160	}
3161}
3162
3163/*
3164 *	pmap_clear_reference:
3165 *
3166 *	Clear the reference bit on the specified physical page.
3167 */
3168void
3169pmap_clear_reference(vm_page_t m)
3170{
3171	pv_entry_t pv;
3172	pmap_t pmap;
3173	pt_entry_t *pte;
3174
3175	if ((m->flags & PG_FICTITIOUS) != 0)
3176		return;
3177	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3178	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3179		pmap = PV_PMAP(pv);
3180		PMAP_LOCK(pmap);
3181		pte = pmap_pte(pmap, pv->pv_va);
3182		if (*pte & PG_A) {
3183			atomic_clear_long(pte, PG_A);
3184			pmap_invalidate_page(pmap, pv->pv_va);
3185		}
3186		PMAP_UNLOCK(pmap);
3187	}
3188}
3189
3190/*
3191 * Miscellaneous support routines follow
3192 */
3193
3194/* Adjust the cache mode for a 4KB page mapped via a PTE. */
3195static __inline void
3196pmap_pte_attr(vm_offset_t va, int mode)
3197{
3198	pt_entry_t *pte;
3199	u_int opte, npte;
3200
3201	pte = vtopte(va);
3202
3203	/*
3204	 * The cache mode bits are all in the low 32-bits of the
3205	 * PTE, so we can just spin on updating the low 32-bits.
3206	 */
3207	do {
3208		opte = *(u_int *)pte;
3209		npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
3210		npte |= pmap_cache_bits(mode, 0);
3211	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
3212}
3213
3214/* Adjust the cache mode for a 2MB page mapped via a PDE. */
3215static __inline void
3216pmap_pde_attr(vm_offset_t va, int mode)
3217{
3218	pd_entry_t *pde;
3219	u_int opde, npde;
3220
3221	pde = pmap_pde(kernel_pmap, va);
3222
3223	/*
3224	 * The cache mode bits are all in the low 32-bits of the
3225	 * PDE, so we can just spin on updating the low 32-bits.
3226	 */
3227	do {
3228		opde = *(u_int *)pde;
3229		npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT);
3230		npde |= pmap_cache_bits(mode, 1);
3231	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
3232}
3233
3234/*
3235 * Map a set of physical memory pages into the kernel virtual
3236 * address space. Return a pointer to where it is mapped. This
3237 * routine is intended to be used for mapping device memory,
3238 * NOT real memory.
3239 */
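/*
 * Illustrative use (hypothetical): a driver maps and later unmaps a
 * device register window with
 *
 *	regs = pmap_mapdev(pa, size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, size);
 *
 * pmap_mapbios() is the write-back variant used for firmware tables.
 */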
3240void *
3241pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
3242{
3243	vm_offset_t va, tmpva, offset;
3244
3245	/*
3246	 * If the region fits within the direct map window and uses the WB
3247	 * caching mode, use the direct map.
3248	 */
3249	if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK)
3250		return ((void *)PHYS_TO_DMAP(pa));
3251	offset = pa & PAGE_MASK;
3252	size = roundup(offset + size, PAGE_SIZE);
3253	va = kmem_alloc_nofault(kernel_map, size);
3254	if (!va)
3255		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3256	pa = trunc_page(pa);
3257	for (tmpva = va; size > 0; ) {
3258		pmap_kenter_attr(tmpva, pa, mode);
3259		size -= PAGE_SIZE;
3260		tmpva += PAGE_SIZE;
3261		pa += PAGE_SIZE;
3262	}
3263	pmap_invalidate_range(kernel_pmap, va, tmpva);
3264	pmap_invalidate_cache();
3265	return ((void *)(va + offset));
3266}
3267
3268void *
3269pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3270{
3271
3272	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
3273}
3274
3275void *
3276pmap_mapbios(vm_paddr_t pa, vm_size_t size)
3277{
3278
3279	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
3280}
3281
3282void
3283pmap_unmapdev(vm_offset_t va, vm_size_t size)
3284{
3285	vm_offset_t base, offset, tmpva;
3286
3287	/* If we gave a direct map region in pmap_mapdev, do nothing */
3288	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
3289		return;
3290	base = trunc_page(va);
3291	offset = va & PAGE_MASK;
3292	size = roundup(offset + size, PAGE_SIZE);
3293	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
3294		pmap_kremove(tmpva);
3295	pmap_invalidate_range(kernel_pmap, va, tmpva);
3296	kmem_free(kernel_map, base, size);
3297}
3298
3299int
3300pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
3301{
3305	vm_offset_t base, offset, tmpva;
3306	pd_entry_t *pde;
3307	pt_entry_t *pte;
3308
3309	base = trunc_page(va);
3310	offset = va & PAGE_MASK;
3311	size = roundup(offset + size, PAGE_SIZE);
3312
3313	/* Only supported on kernel virtual addresses. */
3314	if (base <= VM_MAXUSER_ADDRESS)
3315		return (EINVAL);
3316
3317	/*
3318	 * XXX: We have to support tearing 2MB pages down into 4k pages if
3319	 * needed here.
3320	 */
3321	/* Pages that aren't mapped aren't supported. */
3322	for (tmpva = base; tmpva < (base + size); ) {
3323		pde = pmap_pde(kernel_pmap, tmpva);
3324		if (*pde == 0)
3325			return (EINVAL);
3326		if (*pde & PG_PS) {
3327			/* Handle 2MB pages that are completely contained. */
3328			if (size >= NBPDR) {
3329				tmpva += NBPDR;
3330				continue;
3331			}
3332			return (EINVAL);
3333		}
3334		pte = vtopte(tmpva);
3335		if (*pte == 0)
3336			return (EINVAL);
3337		tmpva += PAGE_SIZE;
3338	}
3339
3340	/*
3341	 * Ok, all the pages exist, so run through them updating their
3342	 * cache mode.
3343	 */
3344	for (tmpva = base; size > 0; ) {
3345		pde = pmap_pde(kernel_pmap, tmpva);
3346		if (*pde & PG_PS) {
3347			pmap_pde_attr(tmpva, mode);
3348			tmpva += NBPDR;
3349			size -= NBPDR;
3350		} else {
3351			pmap_pte_attr(tmpva, mode);
3352			tmpva += PAGE_SIZE;
3353			size -= PAGE_SIZE;
3354		}
3355	}
3356
3357	/*
3358	 * Flush CPU caches so that no data remains cached with the old
3359	 * attributes.
3360	 */
3361	pmap_invalidate_range(kernel_pmap, base, tmpva);
3362	pmap_invalidate_cache();
3363	return (0);
3364}
3365
3366/*
3367 * perform the pmap work for mincore
3368 */
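/*
 * Illustrative use (hypothetical): the mincore() system call queries one
 * page at a time, e.g.
 *
 *	val = pmap_mincore(pmap, addr);
 *	if (val & MINCORE_INCORE)
 *		... the page is resident ...
 *
 * MINCORE_MODIFIED/MINCORE_REFERENCED describe this mapping only; the
 * *_OTHER flags cover other mappings of the same page.
 */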
3369int
3370pmap_mincore(pmap_t pmap, vm_offset_t addr)
3371{
3372	pt_entry_t *ptep, pte;
3373	vm_page_t m;
3374	int val = 0;
3375
3376	PMAP_LOCK(pmap);
3377	ptep = pmap_pte(pmap, addr);
3378	pte = (ptep != NULL) ? *ptep : 0;
3379	PMAP_UNLOCK(pmap);
3380
3381	if (pte != 0) {
3382		vm_paddr_t pa;
3383
3384		val = MINCORE_INCORE;
3385		if ((pte & PG_MANAGED) == 0)
3386			return (val);
3387
3388		pa = pte & PG_FRAME;
3389
3390		m = PHYS_TO_VM_PAGE(pa);
3391
3392		/*
3393		 * Modified by us
3394		 */
3395		if (pte & PG_M)
3396			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3397		else {
3398			/*
3399			 * Modified by someone else
3400			 */
3401			vm_page_lock_queues();
3402			if (m->dirty || pmap_is_modified(m))
3403				val |= MINCORE_MODIFIED_OTHER;
3404			vm_page_unlock_queues();
3405		}
3406		/*
3407		 * Referenced by us
3408		 */
3409		if (pte & PG_A)
3410			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3411		else {
3412			/*
3413			 * Referenced by someone else
3414			 */
3415			vm_page_lock_queues();
3416			if ((m->flags & PG_REFERENCED) ||
3417			    pmap_ts_referenced(m)) {
3418				val |= MINCORE_REFERENCED_OTHER;
3419				vm_page_flag_set(m, PG_REFERENCED);
3420			}
3421			vm_page_unlock_queues();
3422		}
3423	}
3424	return (val);
3425}
3426
3427void
3428pmap_activate(struct thread *td)
3429{
3430	pmap_t	pmap, oldpmap;
3431	u_int64_t  cr3;
3432
3433	critical_enter();
3434	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3435	oldpmap = PCPU_GET(curpmap);
3436#ifdef SMP
3437	if (oldpmap)	/* XXX FIXME */
3438		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3439	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3440#else
3441	if (oldpmap)	/* XXX FIXME */
3442		oldpmap->pm_active &= ~PCPU_GET(cpumask);
3443	pmap->pm_active |= PCPU_GET(cpumask);
3444#endif
3445	cr3 = vtophys(pmap->pm_pml4);
3446	td->td_pcb->pcb_cr3 = cr3;
3447	load_cr3(cr3);
3448	critical_exit();
3449}
3450
3451vm_offset_t
3452pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3453{
3454
3455	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3456		return (addr);
3457	}
3458
3459	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3460	return (addr);
3461}
3462