1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 *    must display the following acknowledgement:
23 *	This product includes software developed by the University of
24 *	California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 *	@(#)pmap.c	7.7 (Berkeley)	5/12/91
42 */
43/*-
44 * Copyright (c) 2003 Networks Associates Technology, Inc.
45 * All rights reserved.
46 *
47 * This software was developed for the FreeBSD Project by Jake Burkholder,
48 * Safeport Network Services, and Network Associates Laboratories, the
49 * Security Research Division of Network Associates, Inc. under
50 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
51 * CHATS research program.
52 *
53 * Redistribution and use in source and binary forms, with or without
54 * modification, are permitted provided that the following conditions
55 * are met:
56 * 1. Redistributions of source code must retain the above copyright
57 *    notice, this list of conditions and the following disclaimer.
58 * 2. Redistributions in binary form must reproduce the above copyright
59 *    notice, this list of conditions and the following disclaimer in the
60 *    documentation and/or other materials provided with the distribution.
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
72 * SUCH DAMAGE.
73 */
74
75#include <sys/cdefs.h>
76__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 118641 2003-08-08 01:52:03Z alc $");
77
78/*
79 *	Manages physical address maps.
80 *
81 *	In addition to hardware address maps, this
82 *	module is called upon to provide software-use-only
83 *	maps which may or may not be stored in the same
84 *	form as hardware maps.  These pseudo-maps are
85 *	used to store intermediate results from copy
86 *	operations to and from address spaces.
87 *
88 *	Since the information managed by this module is
89 *	also stored by the logical address mapping module,
90 *	this module may throw away valid virtual-to-physical
91 *	mappings at almost any time.  However, invalidations
92 *	of virtual-to-physical mappings must be done as
93 *	requested.
94 *
95 *	In order to cope with hardware architectures which
96 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or reduced-protection
98 *	operations until such time as they are actually
99 *	necessary.  This module is given full information as
100 *	to which processors are currently using which maps,
101 *	and to when physical maps must be made correct.
102 */
103
104#include "opt_msgbuf.h"
105#include "opt_kstack_pages.h"
106
107#include <sys/param.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/lock.h>
111#include <sys/mman.h>
112#include <sys/msgbuf.h>
113#include <sys/mutex.h>
114#include <sys/proc.h>
115#include <sys/sx.h>
116#include <sys/user.h>
117#include <sys/vmmeter.h>
118#include <sys/sysctl.h>
119
120#include <vm/vm.h>
121#include <vm/vm_param.h>
122#include <vm/vm_kern.h>
123#include <vm/vm_page.h>
124#include <vm/vm_map.h>
125#include <vm/vm_object.h>
126#include <vm/vm_extern.h>
127#include <vm/vm_pageout.h>
128#include <vm/vm_pager.h>
129#include <vm/uma.h>
130#include <vm/uma_int.h>
131
132#include <machine/cpu.h>
133#include <machine/cputypes.h>
134#include <machine/md_var.h>
135#include <machine/specialreg.h>
136
137#define PMAP_KEEP_PDIRS
138#ifndef PMAP_SHPGPERPROC
139#define PMAP_SHPGPERPROC 200
140#endif
141
142#if defined(DIAGNOSTIC)
143#define PMAP_DIAGNOSTIC
144#endif
145
146#define MINPV 2048
147
148#if !defined(PMAP_DIAGNOSTIC)
149#define PMAP_INLINE __inline
150#else
151#define PMAP_INLINE
152#endif
153
154/*
 * Given a map and a machine-independent protection code,
 * convert to an amd64 protection code.
157 */
158#define pte_prot(m, p)	(protection_codes[p])
159static pt_entry_t protection_codes[8];
160
161struct pmap kernel_pmap_store;
162LIST_HEAD(pmaplist, pmap);
163static struct pmaplist allpmaps;
164static struct mtx allpmaps_lock;
165
166vm_paddr_t avail_start;		/* PA of first available physical page */
167vm_paddr_t avail_end;		/* PA of last available physical page */
168vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
169vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
170static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
171
172static int nkpt;
173static int ndmpdp;
174static vm_paddr_t dmaplimit;
175vm_offset_t kernel_vm_end;
176
177static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
178static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
179static u_int64_t	KPDPphys;	/* phys addr of kernel level 3 */
180u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
181
182static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
183static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
184
185/*
186 * Data for the pv entry allocation mechanism
187 */
188static uma_zone_t pvzone;
189static struct vm_object pvzone_obj;
190static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
191int pmap_pagedaemon_waken;
192
193/*
194 * All those kernel PT submaps that BSD is so fond of
195 */
196pt_entry_t *CMAP1 = 0;
197static pt_entry_t *ptmmap;
198caddr_t CADDR1 = 0, ptvmmap = 0;
199static pt_entry_t *msgbufmap;
200struct msgbuf *msgbufp = 0;
201
202/*
203 * Crashdump maps.
204 */
205static pt_entry_t *pt_crashdumpmap;
206static caddr_t crashdumpmap;
207
208static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
209static pv_entry_t get_pv_entry(void);
210static void	amd64_protection_init(void);
211static void	pmap_changebit(vm_page_t m, int bit, boolean_t setem)
212    __always_inline;
213
214static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
215static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
216static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
217					vm_offset_t va);
218static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
219		vm_page_t mpte, vm_page_t m);
220
221static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
222
223static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex);
224static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex);
225static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
226static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
227static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
228
229CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
230CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
231
232/*
233 * Move the kernel virtual free pointer to the next
234 * 2MB.  This is used to help improve performance
235 * by using a large (2MB) page for much of the kernel
236 * (.text, .data, .bss)
237 */
238static vm_offset_t
239pmap_kmem_choose(vm_offset_t addr)
240{
241	vm_offset_t newaddr = addr;
242
243	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
244	return newaddr;
245}
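/*
 * For example (assuming NBPDR == 2MB, as it is on amd64):
 *
 *	pmap_kmem_choose(0xffffffff80234567) == 0xffffffff80400000
 *
 * i.e. the address is rounded up to the next 2MB boundary so that the
 * region above it can be covered with 2MB pages.
 */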
246
247/********************/
248/* Inline functions */
249/********************/
250
251/* Return a non-clipped PD index for a given VA */
252static __inline vm_pindex_t
253pmap_pde_pindex(vm_offset_t va)
254{
255	return va >> PDRSHIFT;
256}
257
258
259/* Return various clipped indexes for a given VA */
260static __inline vm_pindex_t
261pmap_pte_index(vm_offset_t va)
262{
263
264	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
265}
266
267static __inline vm_pindex_t
268pmap_pde_index(vm_offset_t va)
269{
270
271	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
272}
273
274static __inline vm_pindex_t
275pmap_pdpe_index(vm_offset_t va)
276{
277
278	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
279}
280
281static __inline vm_pindex_t
282pmap_pml4e_index(vm_offset_t va)
283{
284
285	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
286}
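/*
 * With 4-level paging and 4KB pages, a canonical virtual address is
 * decomposed as follows (bit ranges inclusive):
 *
 *	bits 47-39	PML4 index	(pmap_pml4e_index)
 *	bits 38-30	PDP index	(pmap_pdpe_index)
 *	bits 29-21	PD index	(pmap_pde_index)
 *	bits 20-12	PT index	(pmap_pte_index)
 *	bits 11-0	offset within the 4KB page
 */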
287
288/* Return a pointer to the PML4 slot that corresponds to a VA */
289static __inline pml4_entry_t *
290pmap_pml4e(pmap_t pmap, vm_offset_t va)
291{
292
293	if (!pmap)
294		return NULL;
295	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
296}
297
298/* Return a pointer to the PDP slot that corresponds to a VA */
299static __inline pdp_entry_t *
300pmap_pdpe(pmap_t pmap, vm_offset_t va)
301{
302	pml4_entry_t *pml4e;
303	pdp_entry_t *pdpe;
304
305	pml4e = pmap_pml4e(pmap, va);
306	if (pml4e == NULL || (*pml4e & PG_V) == 0)
307		return NULL;
308	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
309	return (&pdpe[pmap_pdpe_index(va)]);
310}
311
312/* Return a pointer to the PD slot that corresponds to a VA */
313static __inline pd_entry_t *
314pmap_pde(pmap_t pmap, vm_offset_t va)
315{
316	pdp_entry_t *pdpe;
317	pd_entry_t *pde;
318
319	pdpe = pmap_pdpe(pmap, va);
320	if (pdpe == NULL || (*pdpe & PG_V) == 0)
321		 return NULL;
322	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
323	return (&pde[pmap_pde_index(va)]);
324}
325
326/* Return a pointer to the PT slot that corresponds to a VA */
327static __inline pt_entry_t *
328pmap_pte(pmap_t pmap, vm_offset_t va)
329{
330	pd_entry_t *pde;
331	pt_entry_t *pte;
332
333	pde = pmap_pde(pmap, va);
334	if (pde == NULL || (*pde & PG_V) == 0)
335		return NULL;
336	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
337		return ((pt_entry_t *)pde);
338	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
339	return (&pte[pmap_pte_index(va)]);
340}
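/*
 * Typical lookup pattern (a sketch; pmap_kextract() below does the
 * equivalent walk for the kernel pmap):
 *
 *	pt_entry_t *pte = pmap_pte(pmap, va);
 *	if (pte != NULL && (*pte & PG_V) != 0)
 *		pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 */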
341
342
343PMAP_INLINE pt_entry_t *
344vtopte(vm_offset_t va)
345{
346	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
347
348	return (PTmap + (amd64_btop(va) & mask));
349}
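/*
 * vtopte() relies on the recursive PML4 slot: PTmap is the virtual
 * window through which every PTE of the current address space is
 * visible, so the PTE for va lives at PTmap[page number of va], with
 * the page number masked to the 4 * 9 = 36 index bits used by the four
 * paging levels.
 */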
350
351static u_int64_t
352allocpages(int n)
353{
354	u_int64_t ret;
355
356	ret = avail_start;
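	/*
	 * avail_start is a physical address, but it can be used as a
	 * pointer here because low physical memory is still mapped 1:1
	 * by the boot-time page tables at this point in the bootstrap.
	 */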
357	bzero((void *)ret, n * PAGE_SIZE);
358	avail_start += n * PAGE_SIZE;
359	return (ret);
360}
361
362static void
363create_pagetables(void)
364{
365	int i;
366
367	/* Allocate pages */
368	KPTphys = allocpages(NKPT);
369	KPML4phys = allocpages(1);
370	KPDPphys = allocpages(NKPML4E);
371	KPDphys = allocpages(NKPDPE);
372
373	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
374	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
375		ndmpdp = 4;
376	DMPDPphys = allocpages(NDMPML4E);
377	DMPDphys = allocpages(ndmpdp);
378	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
379
380	/* Fill in the underlying page table pages */
	/* Map (read/write) from zero to physfree */
382	/* XXX not fully used, underneath 2M pages */
383	for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
384		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
385		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V;
386	}
387
388	/* Now map the page tables at their location within PTmap */
389	for (i = 0; i < NKPT; i++) {
390		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
391		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
392	}
393
394	/* Map from zero to end of allocations under 2M pages */
395	/* This replaces some of the KPTphys entries above */
396	for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
397		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
398		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS;
399	}
400
401	/* And connect up the PD to the PDP */
402	for (i = 0; i < NKPDPE; i++) {
403		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
404		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
405	}
406
407
408	/* Now set up the direct map space using 2MB pages */
409	for (i = 0; i < NPDEPG * ndmpdp; i++) {
410		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
411		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS;
412	}
413
414	/* And the direct map space's PDP */
415	for (i = 0; i < ndmpdp; i++) {
416		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
417		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
418	}
419
420	/* And recursively map PML4 to itself in order to get PTmap */
421	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
422	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
423
424	/* Connect the Direct Map slot up to the PML4 */
425	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
426	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
427
428	/* Connect the KVA slot up to the PML4 */
429	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
430	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
431}
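/*
 * For example, on a machine with 8GB of physical memory ndmpdp is 8,
 * so the direct map uses 8 PD pages (8 * 512 = 4096 2MB mappings
 * covering 8GB) plus one PDP page, and dmaplimit ends up at 8GB.
 * Machines with less than 4GB still get the 4GB minimum direct map.
 */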
432
433/*
434 *	Bootstrap the system enough to run with virtual memory.
435 *
436 *	On amd64 this is called after mapping has already been enabled
437 *	and just syncs the pmap module with what has already been done.
438 *	[We can't call it easily with mapping off since the kernel is not
439 *	mapped with PA == VA, hence we would have to relocate every address
440 *	from the linked base (virtual) address "KERNBASE" to the actual
441 *	(physical) address starting relative to 0]
442 */
443void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
447	vm_offset_t va;
448	pt_entry_t *pte;
449
450	avail_start = *firstaddr;
451
452	/*
453	 * Create an initial set of page tables to run the kernel in.
454	 */
455	create_pagetables();
456	*firstaddr = avail_start;
457
458	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
459	virtual_avail = pmap_kmem_choose(virtual_avail);
460
461	virtual_end = VM_MAX_KERNEL_ADDRESS;
462
463
464	/* XXX do %cr0 as well */
465	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
466	load_cr3(KPML4phys);
467
468	/*
469	 * Initialize protection array.
470	 */
471	amd64_protection_init();
472
473	/*
474	 * Initialize the kernel pmap (which is statically allocated).
475	 */
	kernel_pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
477	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
478	TAILQ_INIT(&kernel_pmap->pm_pvlist);
479	LIST_INIT(&allpmaps);
480	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
481	mtx_lock_spin(&allpmaps_lock);
482	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
483	mtx_unlock_spin(&allpmaps_lock);
484	nkpt = NKPT;
485
486	/*
487	 * Reserve some special page table entries/VA space for temporary
488	 * mapping of pages.
489	 */
490#define	SYSMAP(c, p, v, n)	\
491	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
492
493	va = virtual_avail;
494	pte = vtopte(va);
495
496	/*
497	 * CMAP1 is only used for the memory test.
498	 */
499	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
500
501	/*
502	 * Crashdump maps.
503	 */
504	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
505
506	/*
507	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
508	 * XXX ptmmap is not used.
509	 */
510	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
511
512	/*
513	 * msgbufp is used to map the system message buffer.
514	 * XXX msgbufmap is not used.
515	 */
516	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
517	       atop(round_page(MSGBUF_SIZE)))
518
519	virtual_avail = va;
520
521	*CMAP1 = 0;
522
523	invltlb();
524}
525
526static void *
527pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
528{
529	*flags = UMA_SLAB_PRIV;
530	return (void *)kmem_alloc(kernel_map, bytes);
531}
532
533void *
534uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
535{
536	static vm_pindex_t colour;
537	vm_page_t m;
538	int pflags;
539	void *va;
540
541	*flags = UMA_SLAB_PRIV;
542
543	if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
544		pflags = VM_ALLOC_INTERRUPT;
545	else
546		pflags = VM_ALLOC_SYSTEM;
547
548	if (wait & M_ZERO)
549		pflags |= VM_ALLOC_ZERO;
550
551	for (;;) {
552		m = vm_page_alloc(NULL, colour++, pflags | VM_ALLOC_NOOBJ);
553		if (m == NULL) {
554			if (wait & M_NOWAIT)
555				return (NULL);
556			else
557				VM_WAIT;
558		} else
559			break;
560	}
561
562	va = (void *)PHYS_TO_DMAP(m->phys_addr);
563	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
564		pagezero(va);
565	return (va);
566}
567
568void
569uma_small_free(void *mem, int size, u_int8_t flags)
570{
571	vm_page_t m;
572
573	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem));
574	vm_page_lock_queues();
575	vm_page_free(m);
576	vm_page_unlock_queues();
577}
578
579/*
580 *	Initialize the pmap module.
581 *	Called by vm_init, to initialize any structures that the pmap
582 *	system needs to map virtual memory.
 *	pmap_init has been enhanced to support, in a fairly consistent
 *	way, discontiguous physical memory.
585 */
586void
pmap_init(vm_paddr_t phys_start, vm_paddr_t phys_end)
{
590	int i;
591	int initial_pvs;
592
593	/*
594	 * Allocate memory for random pmap data structures.  Includes the
595	 * pv_head_table.
596	 */
597
598	for(i = 0; i < vm_page_array_size; i++) {
599		vm_page_t m;
600
601		m = &vm_page_array[i];
602		TAILQ_INIT(&m->md.pv_list);
603		m->md.pv_list_count = 0;
604	}
605
606	/*
607	 * init the pv free list
608	 */
609	initial_pvs = vm_page_array_size;
610	if (initial_pvs < MINPV)
611		initial_pvs = MINPV;
612	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
613	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
614	uma_zone_set_allocf(pvzone, pmap_pv_allocf);
615	uma_prealloc(pvzone, initial_pvs);
616
617	/*
618	 * Now it is safe to enable pv_table recording.
619	 */
620	pmap_initialized = TRUE;
621}
622
623/*
624 * Initialize the address space (zone) for the pv_entries.  Set a
625 * high water mark so that the system can recover from excessive
626 * numbers of pv entries.
627 */
628void
pmap_init2(void)
630{
631	int shpgperproc = PMAP_SHPGPERPROC;
632
633	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
634	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
635	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
636	pv_entry_high_water = 9 * (pv_entry_max / 10);
637	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
638}
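/*
 * For example, with the default PMAP_SHPGPERPROC of 200 and a maxproc
 * of 1000, pv_entry_max is roughly 200 * 1000 + vm_page_array_size,
 * and the pagedaemon is woken once 90% of that limit is in use.
 */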
639
640
641/***************************************************
642 * Low level helper routines.....
643 ***************************************************/
644
645#if defined(PMAP_DIAGNOSTIC)
646
647/*
 * This code checks for pages that are marked modified but not
 * writable, which should be an invalid condition.
650 */
static int
pmap_nw_modified(pt_entry_t ptea)
{
	pt_entry_t pte;

	pte = ptea;

	if ((pte & (PG_M|PG_RW)) == PG_M)
		return 1;
	else
		return 0;
}
663#endif
664
665
666/*
 * This routine determines whether the modified bit should be tracked
 * for the given va; addresses inside the kernel's clean submap are
 * excluded.
669 */
670static PMAP_INLINE int
671pmap_track_modified(vm_offset_t va)
672{
673	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
674		return 1;
675	else
676		return 0;
677}
678
679/*
680 * Normal invalidation functions.
681 * We inline these within pmap.c for speed.
682 */
683PMAP_INLINE void
684pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
685{
686
687	if (pmap == kernel_pmap || pmap->pm_active)
688		invlpg(va);
689}
690
691PMAP_INLINE void
692pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
693{
694	vm_offset_t addr;
695
696	if (pmap == kernel_pmap || pmap->pm_active)
697		for (addr = sva; addr < eva; addr += PAGE_SIZE)
698			invlpg(addr);
699}
700
701PMAP_INLINE void
702pmap_invalidate_all(pmap_t pmap)
703{
704
705	if (pmap == kernel_pmap || pmap->pm_active)
706		invltlb();
707}
708
709/*
 * Is the given pmap the current address space or the kernel pmap?
 * (The recursive PML4 slot holds the pmap's own PML4 page, so comparing
 * page frames with the active recursive entry answers this.)
711 */
712static __inline int
713pmap_is_current(pmap_t pmap)
714{
715	return (pmap == kernel_pmap ||
716	    (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
717}
718
719/*
720 *	Routine:	pmap_extract
721 *	Function:
722 *		Extract the physical page address associated
723 *		with the given map/virtual_address pair.
724 */
725vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
730	vm_paddr_t rtval;
731	pt_entry_t *pte;
732	pd_entry_t pde, *pdep;
733
734	if (pmap == 0)
735		return 0;
736	pdep = pmap_pde(pmap, va);
737	if (pdep) {
738		pde = *pdep;
739		if (pde) {
740			if ((pde & PG_PS) != 0) {
741				rtval = (pde & ~PDRMASK) | (va & PDRMASK);
742				return rtval;
743			}
744			pte = pmap_pte(pmap, va);
745			rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
746			return rtval;
747		}
748	}
749	return 0;
750
751}
752
753vm_paddr_t
754pmap_kextract(vm_offset_t va)
755{
756	pd_entry_t *pde;
757	vm_paddr_t pa;
758
759	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
760		pa = DMAP_TO_PHYS(va);
761	} else {
762		pde = pmap_pde(kernel_pmap, va);
763		if (*pde & PG_PS) {
764			pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
765		} else {
766			pa = *vtopte(va);
767			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
768		}
769	}
770	return pa;
771}
772
773/***************************************************
774 * Low level mapping routines.....
775 ***************************************************/
776
777/*
778 * Add a wired page to the kva.
779 * Note: not SMP coherent.
780 */
781PMAP_INLINE void
782pmap_kenter(vm_offset_t va, vm_paddr_t pa)
783{
784	pt_entry_t *pte;
785
786	pte = vtopte(va);
787	pte_store(pte, pa | PG_RW | PG_V | PG_G);
788}
789
790/*
791 * Remove a page from the kernel pagetables.
792 * Note: not SMP coherent.
793 */
794PMAP_INLINE void
795pmap_kremove(vm_offset_t va)
796{
797	pt_entry_t *pte;
798
799	pte = vtopte(va);
800	pte_clear(pte);
801}
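/*
 * Note that pmap_kenter() and pmap_kremove() update only the PTE; a
 * caller that may be replacing a live mapping is expected to follow up
 * with pmap_invalidate_page() or pmap_invalidate_range(), as
 * pmap_qenter() and pmap_qremove() below do.
 */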
802
803/*
804 *	Used to map a range of physical addresses into kernel
805 *	virtual address space.
806 *
807 *	The value passed in '*virt' is a suggested virtual address for
808 *	the mapping. Architectures which can support a direct-mapped
809 *	physical to virtual region can return the appropriate address
810 *	within that region, leaving '*virt' unchanged. Other
811 *	architectures should map the pages starting at '*virt' and
812 *	update '*virt' with the first usable address after the mapped
813 *	region.
814 */
815vm_offset_t
816pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
817{
818	return PHYS_TO_DMAP(start);
819}
820
821
822/*
823 * Add a list of wired pages to the kva
824 * this routine is only used for temporary
825 * kernel mappings that do not need to have
826 * page modification or references recorded.
827 * Note that old mappings are simply written
828 * over.  The page *must* be wired.
829 * Note: SMP coherent.  Uses a ranged shootdown IPI.
830 */
831void
832pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
833{
834	vm_offset_t va;
835
836	va = sva;
837	while (count-- > 0) {
838		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
839		va += PAGE_SIZE;
840		m++;
841	}
842	pmap_invalidate_range(kernel_pmap, sva, va);
843}
844
845/*
846 * This routine tears out page mappings from the
847 * kernel -- it is meant only for temporary mappings.
848 * Note: SMP coherent.  Uses a ranged shootdown IPI.
849 */
850void
851pmap_qremove(vm_offset_t sva, int count)
852{
853	vm_offset_t va;
854
855	va = sva;
856	while (count-- > 0) {
857		pmap_kremove(va);
858		va += PAGE_SIZE;
859	}
860	pmap_invalidate_range(kernel_pmap, sva, va);
861}
862
863static vm_page_t
864pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
865{
866	vm_page_t m;
867
868retry:
869	m = vm_page_lookup(object, pindex);
870	if (m != NULL) {
871		vm_page_lock_queues();
872		if (vm_page_sleep_if_busy(m, FALSE, "pplookp"))
873			goto retry;
874		vm_page_unlock_queues();
875	}
876	return m;
877}
878
879/***************************************************
880 * Page table page management routines.....
881 ***************************************************/
882
883/*
884 * This routine unholds page table pages, and if the hold count
885 * drops to zero, then it decrements the wire count.
886 */
887static int
888_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
889{
890
891	while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
892		vm_page_lock_queues();
893
894	if (m->hold_count == 0) {
895		vm_offset_t pteva;
896
897		/*
898		 * unmap the page table page
899		 */
900		if (m->pindex >= (NUPDE + NUPDPE)) {
901			/* PDP page */
902			pml4_entry_t *pml4;
903			pml4 = pmap_pml4e(pmap, va);
904			pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
905			*pml4 = 0;
906		} else if (m->pindex >= NUPDE) {
907			/* PD page */
908			pdp_entry_t *pdp;
909			pdp = pmap_pdpe(pmap, va);
910			pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
911			*pdp = 0;
912		} else {
913			/* PTE page */
914			pd_entry_t *pd;
915			pd = pmap_pde(pmap, va);
916			pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
917			*pd = 0;
918		}
919		--pmap->pm_stats.resident_count;
920		if (m->pindex < NUPDE) {
921			/* We just released a PT, unhold the matching PD */
922			vm_page_t pdpg;
923
924			pdpg = vm_page_lookup(pmap->pm_pteobj, NUPDE +
925			    ((va >> PDPSHIFT) & (NUPDPE - 1)));
926			while (vm_page_sleep_if_busy(pdpg, FALSE, "pulook"))
927				vm_page_lock_queues();
928			vm_page_unhold(pdpg);
929			if (pdpg->hold_count == 0)
930				_pmap_unwire_pte_hold(pmap, va, pdpg);
931		}
932		if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
933			/* We just released a PD, unhold the matching PDP */
934			vm_page_t pdppg;
935
936			pdppg = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE +
937			    ((va >> PML4SHIFT) & (NUPML4E - 1)));
938			while (vm_page_sleep_if_busy(pdppg, FALSE, "pulooK"))
939				vm_page_lock_queues();
940			vm_page_unhold(pdppg);
941			if (pdppg->hold_count == 0)
942				_pmap_unwire_pte_hold(pmap, va, pdppg);
943		}
944		if (pmap_is_current(pmap)) {
945			/*
946			 * Do an invltlb to make the invalidated mapping
947			 * take effect immediately.
948			 */
949			pmap_invalidate_page(pmap, pteva);
950		}
951
952		/*
953		 * If the page is finally unwired, simply free it.
954		 */
955		--m->wire_count;
956		if (m->wire_count == 0) {
957			vm_page_busy(m);
958			vm_page_free_zero(m);
959			atomic_subtract_int(&cnt.v_wire_count, 1);
960		}
961		return 1;
962	}
963	return 0;
964}
965
966static PMAP_INLINE int
967pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
968{
969	vm_page_unhold(m);
970	if (m->hold_count == 0)
971		return _pmap_unwire_pte_hold(pmap, va, m);
972	else
973		return 0;
974}
975
976/*
977 * After removing a page table entry, this routine is used to
978 * conditionally free the page, and manage the hold/wire counts.
979 */
980static int
981pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
982{
983	vm_pindex_t ptepindex;
984
985	if (va >= VM_MAXUSER_ADDRESS)
986		return 0;
987
988	if (mpte == NULL) {
989		ptepindex = pmap_pde_pindex(va);
990		if (pmap->pm_pteobj->root &&
991		    pmap->pm_pteobj->root->pindex == ptepindex) {
992			mpte = pmap->pm_pteobj->root;
993		} else {
994			while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL &&
995			    vm_page_sleep_if_busy(mpte, FALSE, "pulook"))
996				vm_page_lock_queues();
997		}
998	}
999
1000	return pmap_unwire_pte_hold(pmap, va, mpte);
1001}
1002
1003void
pmap_pinit0(struct pmap *pmap)
{
1007
1008	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1009	pmap->pm_active = 0;
1010	TAILQ_INIT(&pmap->pm_pvlist);
1011	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1012	mtx_lock_spin(&allpmaps_lock);
1013	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1014	mtx_unlock_spin(&allpmaps_lock);
1015}
1016
1017/*
1018 * Initialize a preallocated and zeroed pmap structure,
1019 * such as one in a vmspace structure.
1020 */
1021void
pmap_pinit(struct pmap *pmap)
{
1025	vm_page_t pml4pg;
1026
1027	/*
1028	 * allocate object for the ptes
1029	 */
1030	if (pmap->pm_pteobj == NULL)
1031		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + NUPML4E + 1);
1032
1033	/*
	 * allocate the PML4 page
1035	 */
1036	pml4pg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + NUPML4E,
1037	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1038	vm_page_lock_queues();
1039	vm_page_flag_clear(pml4pg, PG_BUSY);
1040	pml4pg->valid = VM_PAGE_BITS_ALL;
1041	vm_page_unlock_queues();
1042
1043	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1044
1045	if ((pml4pg->flags & PG_ZERO) == 0)
1046		bzero(pmap->pm_pml4, PAGE_SIZE);
1047
1048	mtx_lock_spin(&allpmaps_lock);
1049	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1050	mtx_unlock_spin(&allpmaps_lock);
1051
1052	/* Wire in kernel global address entries. */
1053	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1054	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1055
1056	/* install self-referential address mapping entry(s) */
1057	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1058
1059	pmap->pm_active = 0;
1060	TAILQ_INIT(&pmap->pm_pvlist);
1061	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1062}
1063
1064/*
1065 * Wire in kernel global address entries.  To avoid a race condition
1066 * between pmap initialization and pmap_growkernel, this procedure
1067 * should be called after the vmspace is attached to the process
1068 * but before this pmap is activated.
1069 */
1070void
pmap_pinit2(struct pmap *pmap)
{
1074	/* XXX: Remove this stub when no longer called */
1075}
1076
1077/*
 * This routine is called if the needed page table page is not mapped.
 * Page table pages live in pm_pteobj at these indexes: 0..NUPDE-1 for
 * PT pages, NUPDE..NUPDE+NUPDPE-1 for PD pages, and NUPDE+NUPDPE and
 * above for PDP pages.
1080 */
1081static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
{
1086	vm_page_t m, pdppg, pdpg;
1087
1088	/*
1089	 * Find or fabricate a new pagetable page
1090	 */
1091	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1092	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1093
1094	KASSERT(m->queue == PQ_NONE,
1095		("_pmap_allocpte: %p->queue != PQ_NONE", m));
1096
1097	/*
1098	 * Increment the hold count for the page table page
1099	 * (denoting a new mapping.)
1100	 */
1101	m->hold_count++;
1102
1103	/*
1104	 * Map the pagetable page into the process address space, if
1105	 * it isn't already there.
1106	 */
1107
1108	pmap->pm_stats.resident_count++;
1109
1110	if (ptepindex >= (NUPDE + NUPDPE)) {
1111		pml4_entry_t *pml4;
1112		vm_pindex_t pml4index;
1113
1114		/* Wire up a new PDPE page */
1115		pml4index = ptepindex - (NUPDE + NUPDPE);
1116		pml4 = &pmap->pm_pml4[pml4index];
1117		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1118
1119	} else if (ptepindex >= NUPDE) {
1120		vm_pindex_t pml4index;
1121		vm_pindex_t pdpindex;
1122		pml4_entry_t *pml4;
1123		pdp_entry_t *pdp;
1124
1125		/* Wire up a new PDE page */
1126		pdpindex = ptepindex - NUPDE;
1127		pml4index = pdpindex >> NPML4EPGSHIFT;
1128
1129		pml4 = &pmap->pm_pml4[pml4index];
1130		if ((*pml4 & PG_V) == 0) {
1131			/* Have to allocate a new pdp, recurse */
1132			_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index);
1133		} else {
1134			/* Add reference to pdp page */
1135			pdppg = pmap_page_lookup(pmap->pm_pteobj,
1136			    NUPDE + NUPDPE + pml4index);
1137			pdppg->hold_count++;
1138		}
1139		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1140
1141		/* Now find the pdp page */
1142		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1143		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1144
1145	} else {
1146		vm_pindex_t pml4index;
1147		vm_pindex_t pdpindex;
1148		pml4_entry_t *pml4;
1149		pdp_entry_t *pdp;
1150		pd_entry_t *pd;
1151
1152		/* Wire up a new PTE page */
1153		pdpindex = ptepindex >> NPDPEPGSHIFT;
1154		pml4index = pdpindex >> NPML4EPGSHIFT;
1155
		/* First, find the pdp and check that it is valid. */
1157		pml4 = &pmap->pm_pml4[pml4index];
1158		if ((*pml4 & PG_V) == 0) {
1159			/* Have to allocate a new pd, recurse */
1160			_pmap_allocpte(pmap, NUPDE + pdpindex);
1161			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1162			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1163		} else {
1164			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1165			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1166			if ((*pdp & PG_V) == 0) {
1167				/* Have to allocate a new pd, recurse */
1168				_pmap_allocpte(pmap, NUPDE + pdpindex);
1169			} else {
1170				/* Add reference to the pd page */
1171				pdpg = pmap_page_lookup(pmap->pm_pteobj,
1172				    NUPDE + pdpindex);
1173				pdpg->hold_count++;
1174			}
1175		}
1176		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1177
1178		/* Now we know where the page directory page is */
1179		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1180		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1181	}
1182
1183	/*
	 * Zero the page table page if the allocator did not already
	 * hand us a zeroed page.
1186	 */
1187	if ((m->flags & PG_ZERO) == 0)
1188		pmap_zero_page(m);
1189	vm_page_lock_queues();
1190	m->valid = VM_PAGE_BITS_ALL;
1191	vm_page_flag_clear(m, PG_ZERO);
1192	vm_page_wakeup(m);
1193	vm_page_unlock_queues();
1194
1195	return m;
1196}
1197
1198static vm_page_t
1199pmap_allocpte(pmap_t pmap, vm_offset_t va)
1200{
1201	vm_pindex_t ptepindex;
1202	pd_entry_t *pd;
1203	vm_page_t m;
1204
1205	/*
1206	 * Calculate pagetable page index
1207	 */
1208	ptepindex = pmap_pde_pindex(va);
1209
1210	/*
1211	 * Get the page directory entry
1212	 */
1213	pd = pmap_pde(pmap, va);
1214
1215	/*
1216	 * This supports switching from a 2MB page to a
1217	 * normal 4K page.
1218	 */
1219	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1220		*pd = 0;
1221		pd = 0;
1222		pmap_invalidate_all(kernel_pmap);
1223	}
1224
1225	/*
1226	 * If the page table page is mapped, we just increment the
1227	 * hold count, and activate it.
1228	 */
1229	if (pd != 0 && (*pd & PG_V) != 0) {
1230		/*
1231		 * In order to get the page table page, try the
1232		 * hint first.
1233		 */
1234		if (pmap->pm_pteobj->root &&
1235			(pmap->pm_pteobj->root->pindex == ptepindex)) {
1236			m = pmap->pm_pteobj->root;
1237		} else {
1238			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1239		}
1240		m->hold_count++;
1241		return m;
1242	}
1243	/*
1244	 * Here if the pte page isn't mapped, or if it has been deallocated.
1245	 */
1246	m = _pmap_allocpte(pmap, ptepindex);
1247	return m;
1248}
1249
1250
1251/***************************************************
1252 * Pmap allocation/deallocation routines.
1253 ***************************************************/
1254
1255/*
1256 * Release any resources held by the given physical map.
1257 * Called when a pmap initialized by pmap_pinit is being released.
1258 * Should only be called if the map contains no valid mappings.
1259 */
1260void
1261pmap_release(pmap_t pmap)
1262{
1263	vm_object_t object;
1264	vm_page_t m;
1265
1266	object = pmap->pm_pteobj;
1267
1268	KASSERT(object->ref_count == 1,
1269	    ("pmap_release: pteobj reference count %d != 1",
1270	    object->ref_count));
1271	KASSERT(pmap->pm_stats.resident_count == 0,
1272	    ("pmap_release: pmap resident count %ld != 0",
1273	    pmap->pm_stats.resident_count));
1274
1275	mtx_lock_spin(&allpmaps_lock);
1276	LIST_REMOVE(pmap, pm_list);
1277	mtx_unlock_spin(&allpmaps_lock);
1278
1279	vm_page_lock_queues();
1280	while ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1281		m->wire_count--;
1282		atomic_subtract_int(&cnt.v_wire_count, 1);
1283		vm_page_busy(m);
1284		vm_page_free(m);
1285	}
1286	KASSERT(TAILQ_EMPTY(&object->memq),
1287	    ("pmap_release: leaking page table pages"));
1288	vm_page_unlock_queues();
1289}
1290
1291static int
1292kvm_size(SYSCTL_HANDLER_ARGS)
1293{
1294	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1295
1296	return sysctl_handle_long(oidp, &ksize, 0, req);
1297}
1298SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1299    0, 0, kvm_size, "IU", "Size of KVM");
1300
1301static int
1302kvm_free(SYSCTL_HANDLER_ARGS)
1303{
1304	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1305
1306	return sysctl_handle_long(oidp, &kfree, 0, req);
1307}
1308SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1309    0, 0, kvm_free, "IU", "Amount of KVM free");
1310
1311/*
1312 * grow the number of kernel page table entries, if needed
1313 */
1314void
1315pmap_growkernel(vm_offset_t addr)
1316{
1317	int s;
1318	vm_paddr_t paddr;
1319	vm_page_t nkpg;
1320	pd_entry_t *pde, newpdir;
1321	pdp_entry_t newpdp;
1322
1323	s = splhigh();
1324	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1325	if (kernel_vm_end == 0) {
1326		kernel_vm_end = KERNBASE;
1327		nkpt = 0;
1328		while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1329			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1330			nkpt++;
1331		}
1332	}
1333	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1334	while (kernel_vm_end < addr) {
1335		pde = pmap_pde(kernel_pmap, kernel_vm_end);
1336		if (pde == NULL) {
1337			/* We need a new PDP entry */
1338			nkpg = vm_page_alloc(NULL, nkpt,
1339			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1340			if (!nkpg)
1341				panic("pmap_growkernel: no memory to grow kernel");
1342			pmap_zero_page(nkpg);
1343			paddr = VM_PAGE_TO_PHYS(nkpg);
1344			newpdp = (pdp_entry_t)
1345				(paddr | PG_V | PG_RW | PG_A | PG_M);
1346			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1347			continue; /* try again */
1348		}
1349		if ((*pde & PG_V) != 0) {
1350			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1351			continue;
1352		}
1353
1354		/*
1355		 * This index is bogus, but out of the way
1356		 */
1357		nkpg = vm_page_alloc(NULL, nkpt,
1358		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1359		if (!nkpg)
1360			panic("pmap_growkernel: no memory to grow kernel");
1361
1362		nkpt++;
1363
1364		pmap_zero_page(nkpg);
1365		paddr = VM_PAGE_TO_PHYS(nkpg);
1366		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1367		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1368
1369		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1370	}
1371	splx(s);
1372}
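/*
 * Each iteration above adds one page table page, extending the mapped
 * kernel VA range by NPTEPG * PAGE_SIZE (2MB); a new page directory
 * page (installed via a new PDP entry) is only needed once every
 * 512 * 2MB = 1GB of growth.
 */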
1373
1374
1375/***************************************************
1376 * page management routines.
1377 ***************************************************/
1378
1379/*
1380 * free the pv_entry back to the free list
1381 */
1382static PMAP_INLINE void
1383free_pv_entry(pv_entry_t pv)
1384{
1385	pv_entry_count--;
1386	uma_zfree(pvzone, pv);
1387}
1388
1389/*
1390 * get a new pv_entry, allocating a block from the system
1391 * when needed.
1392 * the memory allocation is performed bypassing the malloc code
1393 * because of the possibility of allocations at interrupt time.
1394 */
1395static pv_entry_t
1396get_pv_entry(void)
1397{
1398	pv_entry_count++;
1399	if (pv_entry_high_water &&
1400		(pv_entry_count > pv_entry_high_water) &&
1401		(pmap_pagedaemon_waken == 0)) {
1402		pmap_pagedaemon_waken = 1;
1403		wakeup (&vm_pages_needed);
1404	}
1405	return uma_zalloc(pvzone, M_NOWAIT);
1406}
1407
1408/*
 * Remove the pv entry for the given (pmap, va) pair, searching either
 * the page's pv list or the pmap's pv list (whichever is expected to
 * be shorter), and free the now unused entry.
1413 */
1414
1415static int
1416pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1417{
1418	pv_entry_t pv;
1419	int rtval;
1420	int s;
1421
1422	s = splvm();
1423	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1424	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1425		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1426			if (pmap == pv->pv_pmap && va == pv->pv_va)
1427				break;
1428		}
1429	} else {
1430		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1431			if (va == pv->pv_va)
1432				break;
1433		}
1434	}
1435
1436	rtval = 0;
1437	if (pv) {
1438		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1439		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1440		m->md.pv_list_count--;
1441		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1442			vm_page_flag_clear(m, PG_WRITEABLE);
1443
1444		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1445		free_pv_entry(pv);
1446	}
1447
1448	splx(s);
1449	return rtval;
1450}
1451
1452/*
1453 * Create a pv entry for page at pa for
1454 * (pmap, va).
1455 */
1456static void
1457pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1458{
1459
1460	int s;
1461	pv_entry_t pv;
1462
1463	s = splvm();
1464	pv = get_pv_entry();
1465	pv->pv_va = va;
1466	pv->pv_pmap = pmap;
1467	pv->pv_ptem = mpte;
1468
1469	vm_page_lock_queues();
1470	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1471	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1472	m->md.pv_list_count++;
1473
1474	vm_page_unlock_queues();
1475	splx(s);
1476}
1477
1478/*
1479 * pmap_remove_pte: do the things to unmap a page in a process
1480 */
1481static int
1482pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1483{
1484	pt_entry_t oldpte;
1485	vm_page_t m;
1486
1487	oldpte = pte_load_clear(ptq);
1488	if (oldpte & PG_W)
1489		pmap->pm_stats.wired_count -= 1;
1490	/*
	 * Machines that don't support invlpg also don't support
1492	 * PG_G.
1493	 */
1494	if (oldpte & PG_G)
1495		pmap_invalidate_page(kernel_pmap, va);
1496	pmap->pm_stats.resident_count -= 1;
1497	if (oldpte & PG_MANAGED) {
1498		m = PHYS_TO_VM_PAGE(oldpte);
1499		if (oldpte & PG_M) {
1500#if defined(PMAP_DIAGNOSTIC)
1501			if (pmap_nw_modified((pt_entry_t) oldpte)) {
1502				printf(
	"pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1504				    va, oldpte);
1505			}
1506#endif
1507			if (pmap_track_modified(va))
1508				vm_page_dirty(m);
1509		}
1510		if (oldpte & PG_A)
1511			vm_page_flag_set(m, PG_REFERENCED);
1512		return pmap_remove_entry(pmap, m, va);
1513	} else {
1514		return pmap_unuse_pt(pmap, va, NULL);
1515	}
1516
1517	return 0;
1518}
1519
1520/*
1521 * Remove a single page from a process address space
1522 */
1523static void
1524pmap_remove_page(pmap_t pmap, vm_offset_t va)
1525{
1526	pt_entry_t *pte;
1527
1528	pte = pmap_pte(pmap, va);
1529	if (pte == NULL || (*pte & PG_V) == 0)
1530		return;
1531	pmap_remove_pte(pmap, pte, va);
1532	pmap_invalidate_page(pmap, va);
1533}
1534
1535/*
1536 *	Remove the given range of addresses from the specified map.
1537 *
1538 *	It is assumed that the start and end are properly
1539 *	rounded to the page size.
1540 */
1541void
1542pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1543{
1544	vm_offset_t pdnxt;
1545	pd_entry_t ptpaddr, *pde;
1546	pt_entry_t *pte;
1547	int anyvalid;
1548
1549	if (pmap == NULL)
1550		return;
1551
1552	if (pmap->pm_stats.resident_count == 0)
1553		return;
1554
1555	/*
	 * Special handling for removing a single page: a very common
	 * operation for which it is easy to short-circuit some code.
1559	 */
1560	if (sva + PAGE_SIZE == eva) {
1561		pde = pmap_pde(pmap, sva);
1562		if (pde && (*pde & PG_PS) == 0) {
1563			pmap_remove_page(pmap, sva);
1564			return;
1565		}
1566	}
1567
1568	anyvalid = 0;
1569
1570	for (; sva < eva; sva = pdnxt) {
1571
1572		if (pmap->pm_stats.resident_count == 0)
1573			break;
1574
1575		/*
1576		 * Calculate index for next page table.
1577		 */
1578		pdnxt = (sva + NBPDR) & ~PDRMASK;
1579
1580		pde = pmap_pde(pmap, sva);
1581		if (pde == 0)
1582			continue;
1583		ptpaddr = *pde;
1584
1585		/*
1586		 * Weed out invalid mappings. Note: we assume that the page
1587		 * directory table is always allocated, and in kernel virtual.
1588		 */
1589		if (ptpaddr == 0)
1590			continue;
1591
1592		/*
1593		 * Check for large page.
1594		 */
1595		if ((ptpaddr & PG_PS) != 0) {
1596			*pde = 0;
1597			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1598			anyvalid = 1;
1599			continue;
1600		}
1601
1602		/*
1603		 * Limit our scan to either the end of the va represented
1604		 * by the current page table page, or to the end of the
1605		 * range being removed.
1606		 */
1607		if (pdnxt > eva)
1608			pdnxt = eva;
1609
1610		for (; sva != pdnxt; sva += PAGE_SIZE) {
1611			pte = pmap_pte(pmap, sva);
1612			if (pte == NULL || *pte == 0)
1613				continue;
1614			anyvalid = 1;
1615			if (pmap_remove_pte(pmap, pte, sva))
1616				break;
1617		}
1618	}
1619
1620	if (anyvalid)
1621		pmap_invalidate_all(pmap);
1622}
1623
1624/*
1625 *	Routine:	pmap_remove_all
1626 *	Function:
1627 *		Removes this physical page from
1628 *		all physical maps in which it resides.
1629 *		Reflects back modify bits to the pager.
1630 *
1631 *	Notes:
1632 *		Original versions of this routine were very
1633 *		inefficient because they iteratively called
1634 *		pmap_remove (slow...)
1635 */
1636
1637void
1638pmap_remove_all(vm_page_t m)
1639{
1640	register pv_entry_t pv;
1641	pt_entry_t *pte, tpte;
1642	int s;
1643
1644#if defined(PMAP_DIAGNOSTIC)
1645	/*
1646	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1647	 */
1648	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
		panic("pmap_remove_all: illegal for unmanaged page, pa: 0x%lx",
1650		    VM_PAGE_TO_PHYS(m));
1651	}
1652#endif
1653	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1654	s = splvm();
1655	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1656		pv->pv_pmap->pm_stats.resident_count--;
1657		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
1658		tpte = pte_load_clear(pte);
1659		if (tpte & PG_W)
1660			pv->pv_pmap->pm_stats.wired_count--;
1661		if (tpte & PG_A)
1662			vm_page_flag_set(m, PG_REFERENCED);
1663
1664		/*
1665		 * Update the vm_page_t clean and reference bits.
1666		 */
1667		if (tpte & PG_M) {
1668#if defined(PMAP_DIAGNOSTIC)
1669			if (pmap_nw_modified((pt_entry_t) tpte)) {
1670				printf(
	"pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1672				    pv->pv_va, tpte);
1673			}
1674#endif
1675			if (pmap_track_modified(pv->pv_va))
1676				vm_page_dirty(m);
1677		}
1678		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1679		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1680		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1681		m->md.pv_list_count--;
1682		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1683		free_pv_entry(pv);
1684	}
1685	vm_page_flag_clear(m, PG_WRITEABLE);
1686	splx(s);
1687}
1688
1689/*
1690 *	Set the physical protection on the
1691 *	specified range of this map as requested.
1692 */
1693void
1694pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1695{
1696	vm_offset_t pdnxt;
1697	pd_entry_t ptpaddr, *pde;
1698	int anychanged;
1699
1700	if (pmap == NULL)
1701		return;
1702
1703	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1704		pmap_remove(pmap, sva, eva);
1705		return;
1706	}
1707
1708	if (prot & VM_PROT_WRITE)
1709		return;
1710
1711	anychanged = 0;
1712
1713	for (; sva < eva; sva = pdnxt) {
1714
1715		pdnxt = (sva + NBPDR) & ~PDRMASK;
1716
1717		pde = pmap_pde(pmap, sva);
1718		if (pde == NULL)
1719			continue;
1720		ptpaddr = *pde;
1721
1722		/*
1723		 * Weed out invalid mappings. Note: we assume that the page
1724		 * directory table is always allocated, and in kernel virtual.
1725		 */
1726		if (ptpaddr == 0)
1727			continue;
1728
1729		/*
1730		 * Check for large page.
1731		 */
1732		if ((ptpaddr & PG_PS) != 0) {
1733			*pde &= ~(PG_M|PG_RW);
1734			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1735			anychanged = 1;
1736			continue;
1737		}
1738
1739		if (pdnxt > eva)
1740			pdnxt = eva;
1741
1742		for (; sva != pdnxt; sva += PAGE_SIZE) {
1743			pt_entry_t pbits;
1744			pt_entry_t *pte;
1745			vm_page_t m;
1746
1747			pte = pmap_pte(pmap, sva);
1748			if (pte == NULL)
1749				continue;
1750			pbits = *pte;
1751			if (pbits & PG_MANAGED) {
1752				m = NULL;
1753				if (pbits & PG_A) {
1754					m = PHYS_TO_VM_PAGE(pbits);
1755					vm_page_flag_set(m, PG_REFERENCED);
1756					pbits &= ~PG_A;
1757				}
1758				if ((pbits & PG_M) != 0 &&
1759				    pmap_track_modified(sva)) {
1760					if (m == NULL)
1761						m = PHYS_TO_VM_PAGE(pbits);
1762					vm_page_dirty(m);
1763					pbits &= ~PG_M;
1764				}
1765			}
1766
1767			pbits &= ~PG_RW;
1768
1769			if (pbits != *pte) {
1770				pte_store(pte, pbits);
1771				anychanged = 1;
1772			}
1773		}
1774	}
1775	if (anychanged)
1776		pmap_invalidate_all(pmap);
1777}
1778
1779/*
1780 *	Insert the given physical page (p) at
1781 *	the specified virtual address (v) in the
1782 *	target physical map with the protection requested.
1783 *
1784 *	If specified, the page will be wired down, meaning
1785 *	that the related pte can not be reclaimed.
1786 *
1787 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1788 *	or lose information.  That is, this routine must actually
1789 *	insert this page into the given map NOW.
1790 */
1791void
1792pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1793	   boolean_t wired)
1794{
1795	vm_paddr_t pa;
1796	register pt_entry_t *pte;
1797	vm_paddr_t opa;
1798	pt_entry_t origpte, newpte;
1799	vm_page_t mpte;
1800
1801	if (pmap == NULL)
1802		return;
1803
1804	va &= PG_FRAME;
1805#ifdef PMAP_DIAGNOSTIC
1806	if (va > VM_MAX_KERNEL_ADDRESS)
1807		panic("pmap_enter: toobig");
1808	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
1810#endif
1811
1812	mpte = NULL;
1813	/*
1814	 * In the case that a page table page is not
1815	 * resident, we are creating it here.
1816	 */
1817	if (va < VM_MAXUSER_ADDRESS) {
1818		mpte = pmap_allocpte(pmap, va);
1819	}
1820#if 0 && defined(PMAP_DIAGNOSTIC)
1821	else {
1822		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1823		origpte = *pdeaddr;
1824		if ((origpte & PG_V) == 0) {
1825			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
1826				origpte, va);
1827		}
1828	}
1829#endif
1830
1831	pte = pmap_pte(pmap, va);
1832
1833	/*
1834	 * Page Directory table entry not valid, we need a new PT page
1835	 */
1836	if (pte == NULL)
1837		panic("pmap_enter: invalid page directory va=%#lx\n", va);
1838
1839	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
1840	origpte = *pte;
1841	opa = origpte & PG_FRAME;
1842
1843	if (origpte & PG_PS)
1844		panic("pmap_enter: attempted pmap_enter on 2MB page");
1845
1846	/*
1847	 * Mapping has not changed, must be protection or wiring change.
1848	 */
1849	if (origpte && (opa == pa)) {
1850		/*
1851		 * Wiring change, just update stats. We don't worry about
1852		 * wiring PT pages as they remain resident as long as there
1853		 * are valid mappings in them. Hence, if a user page is wired,
1854		 * the PT page will be also.
1855		 */
1856		if (wired && ((origpte & PG_W) == 0))
1857			pmap->pm_stats.wired_count++;
1858		else if (!wired && (origpte & PG_W))
1859			pmap->pm_stats.wired_count--;
1860
1861#if defined(PMAP_DIAGNOSTIC)
1862		if (pmap_nw_modified((pt_entry_t) origpte)) {
1863			printf(
	"pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1865			    va, origpte);
1866		}
1867#endif
1868
1869		/*
1870		 * Remove extra pte reference
1871		 */
1872		if (mpte)
1873			mpte->hold_count--;
1874
1875		if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
1876			if ((origpte & PG_RW) == 0) {
1877				pte_store(pte, origpte | PG_RW);
1878				pmap_invalidate_page(pmap, va);
1879			}
1880			return;
1881		}
1882
1883		/*
1884		 * We might be turning off write access to the page,
1885		 * so we go ahead and sense modify status.
1886		 */
1887		if (origpte & PG_MANAGED) {
1888			if ((origpte & PG_M) && pmap_track_modified(va)) {
1889				vm_page_t om;
1890				om = PHYS_TO_VM_PAGE(opa);
1891				vm_page_dirty(om);
1892			}
1893			pa |= PG_MANAGED;
1894		}
1895		goto validate;
1896	}
1897	/*
1898	 * Mapping has changed, invalidate old range and fall through to
1899	 * handle validating new mapping.
1900	 */
1901	if (opa) {
1902		int err;
1903		vm_page_lock_queues();
1904		err = pmap_remove_pte(pmap, pte, va);
1905		vm_page_unlock_queues();
1906		if (err)
1907			panic("pmap_enter: pte vanished, va: 0x%lx", va);
1908	}
1909
1910	/*
1911	 * Enter on the PV list if part of our managed memory. Note that we
1912	 * raise IPL while manipulating pv_table since pmap_enter can be
1913	 * called at interrupt time.
1914	 */
1915	if (pmap_initialized &&
1916	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1917		pmap_insert_entry(pmap, va, mpte, m);
1918		pa |= PG_MANAGED;
1919	}
1920
1921	/*
1922	 * Increment counters
1923	 */
1924	pmap->pm_stats.resident_count++;
1925	if (wired)
1926		pmap->pm_stats.wired_count++;
1927
1928validate:
1929	/*
1930	 * Now validate mapping with desired protection/wiring.
1931	 */
1932	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);
1933
1934	if (wired)
1935		newpte |= PG_W;
1936	if (va < VM_MAXUSER_ADDRESS)
1937		newpte |= PG_U;
1938	if (pmap == kernel_pmap)
1939		newpte |= PG_G;
1940
1941	/*
1942	 * if the mapping or permission bits are different, we need
1943	 * to update the pte.
1944	 */
1945	if ((origpte & ~(PG_M|PG_A)) != newpte) {
1946		pte_store(pte, newpte | PG_A);
1947		/*if (origpte)*/ {
1948			pmap_invalidate_page(pmap, va);
1949		}
1950	}
1951}
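/*
 * A sketch of the typical caller: the page fault handler enters the
 * faulted page roughly like this (names illustrative):
 *
 *	pmap_enter(vmspace_pmap(curproc->p_vmspace), va, m, prot, wired);
 */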
1952
1953/*
1954 * this code makes some *MAJOR* assumptions:
1955 * 1. Current pmap & pmap exists.
1956 * 2. Not wired.
1957 * 3. Read access.
1958 * 4. No page table pages.
1959 * 5. Tlbflush is deferred to calling procedure.
1960 * 6. Page IS managed.
1961 * but is *MUCH* faster than pmap_enter...
1962 */
1963
1964vm_page_t
1965pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
1966{
1967	pt_entry_t *pte;
1968	vm_paddr_t pa;
1969
1970	/*
1971	 * In the case that a page table page is not
1972	 * resident, we are creating it here.
1973	 */
1974	if (va < VM_MAXUSER_ADDRESS) {
1975		vm_pindex_t ptepindex;
1976		pd_entry_t *ptepa;
1977
1978		/*
1979		 * Calculate pagetable page index
1980		 */
1981		ptepindex = pmap_pde_pindex(va);
1982		if (mpte && (mpte->pindex == ptepindex)) {
1983			mpte->hold_count++;
1984		} else {
1985retry:
1986			/*
1987			 * Get the page directory entry
1988			 */
1989			ptepa = pmap_pde(pmap, va);
1990
1991			/*
1992			 * If the page table page is mapped, we just increment
1993			 * the hold count, and activate it.
1994			 */
1995			if (ptepa && (*ptepa & PG_V) != 0) {
1996				if (*ptepa & PG_PS)
1997					panic("pmap_enter_quick: unexpected mapping into 2MB page");
1998				if (pmap->pm_pteobj->root &&
1999					(pmap->pm_pteobj->root->pindex == ptepindex)) {
2000					mpte = pmap->pm_pteobj->root;
2001				} else {
2002					mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2003				}
2004				if (mpte == NULL)
2005					goto retry;
2006				mpte->hold_count++;
2007			} else {
2008				mpte = _pmap_allocpte(pmap, ptepindex);
2009			}
2010		}
2011	} else {
2012		mpte = NULL;
2013	}
2014
2015	/*
2016	 * This call to vtopte makes the assumption that we are
2017	 * entering the page into the current pmap.  In order to support
2018	 * quick entry into any pmap, one would likely use pmap_pte.
2019	 * But that isn't as quick as vtopte.
2020	 */
2021	pte = vtopte(va);
2022	if (*pte) {
2023		if (mpte != NULL) {
2024			vm_page_lock_queues();
2025			pmap_unwire_pte_hold(pmap, va, mpte);
2026			vm_page_unlock_queues();
2027		}
2028		return (NULL);
2029	}
2030
2031	/*
2032	 * Enter on the PV list if part of our managed memory. Note that we
2033	 * raise IPL while manipulating pv_table since pmap_enter can be
2034	 * called at interrupt time.
2035	 */
2036	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2037		pmap_insert_entry(pmap, va, mpte, m);
2038
2039	/*
2040	 * Increment counters
2041	 */
2042	pmap->pm_stats.resident_count++;
2043
2044	pa = VM_PAGE_TO_PHYS(m);
2045
2046	/*
2047	 * Now validate mapping with RO protection
2048	 */
2049	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2050		pte_store(pte, pa | PG_V | PG_U);
2051	else
2052		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2053
2054	return mpte;
2055}
2056
2057/*
2058 * Make a temporary mapping for a physical address.  This is only intended
2059 * to be used for panic dumps.
2060 */
2061void *
2062pmap_kenter_temporary(vm_offset_t pa, int i)
2063{
2064	vm_offset_t va;
2065
2066	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2067	pmap_kenter(va, pa);
2068	invlpg(va);
2069	return ((void *)crashdumpmap);
2070}
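
/*
 * Usage sketch (hypothetical, for illustration only): a dump routine
 * walking physical memory might map each page just long enough to copy
 * it out, reusing slot 0 of the crashdump map on every iteration:
 *
 *	char *p;
 *
 *	p = pmap_kenter_temporary(pa, 0);
 *	... write PAGE_SIZE bytes starting at p to the dump device ...
 *	pa += PAGE_SIZE;
 */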
2071
2072/*
2073 * This code maps large physical mmap regions into the
2074 * processor address space.  Note that some shortcuts
2075 * are taken, but the code works.
2076 */
2077void
2078pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2079		    vm_object_t object, vm_pindex_t pindex,
2080		    vm_size_t size)
2081{
2082	vm_page_t p;
2083
2084	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2085	KASSERT(object->type == OBJT_DEVICE,
2086	    ("pmap_object_init_pt: non-device object"));
2087	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2088		int i;
2089		vm_page_t m[1];
2090		int npdes;
2091		pd_entry_t ptepa, *pde;
2092
2093		pde = pmap_pde(pmap, addr);
2094		if (pde != 0 && (*pde & PG_V) != 0)
2095			return;
2096retry:
2097		p = vm_page_lookup(object, pindex);
2098		if (p != NULL) {
2099			vm_page_lock_queues();
2100			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2101				goto retry;
2102		} else {
2103			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2104			if (p == NULL)
2105				return;
2106			m[0] = p;
2107
2108			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2109				vm_page_lock_queues();
2110				vm_page_free(p);
2111				vm_page_unlock_queues();
2112				return;
2113			}
2114
2115			p = vm_page_lookup(object, pindex);
2116			vm_page_lock_queues();
2117			vm_page_wakeup(p);
2118		}
2119		vm_page_unlock_queues();
2120
2121		ptepa = VM_PAGE_TO_PHYS(p);
2122		if (ptepa & (NBPDR - 1))
2123			return;
2124
2125		p->valid = VM_PAGE_BITS_ALL;
2126
2127		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2128		npdes = size >> PDRSHIFT;
2129		for (i = 0; i < npdes; i++) {
2130			pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS);
2131			ptepa += NBPDR;
2132			pde++;
2133		}
2134		pmap_invalidate_all(pmap);
2135	}
2136}
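
/*
 * Worked example (illustrative only): for a hypothetical 16MB device
 * object mapped at a 2MB-aligned address, the loop above installs
 *
 *	npdes = (16 * 1024 * 1024) >> PDRSHIFT = 8
 *
 * page directory entries, each covering NBPDR (2MB) of the region with
 * a single PG_PS superpage mapping.
 */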
2137
2138/*
2139 * pmap_prefault provides a quick way of clustering
2140 * page faults into a process's address space.  It is a "cousin"
2141 * of pmap_object_init_pt, except it runs at page fault time instead
2142 * of mmap time.
2143 */
2144#define PFBAK 4
2145#define PFFOR 4
2146#define PAGEORDER_SIZE (PFBAK+PFFOR)
2147
2148static int pmap_prefault_pageorder[] = {
2149	-1 * PAGE_SIZE, 1 * PAGE_SIZE,
2150	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
2151	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
2152	-4 * PAGE_SIZE, 4 * PAGE_SIZE
2153};
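
/*
 * Illustrative note (not part of the original code): for a fault at
 * address "addra", the loop below visits candidate addresses in the
 * order given by the table above, e.g.
 *
 *	addra - 4096, addra + 4096, addra - 8192, addra + 8192, ...
 *
 * out to PFBAK pages behind and PFFOR pages ahead of the fault, skipping
 * any address that falls outside [starta, entry->end).
 */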
2154
2155void
2156pmap_prefault(pmap, addra, entry)
2157	pmap_t pmap;
2158	vm_offset_t addra;
2159	vm_map_entry_t entry;
2160{
2161	int i;
2162	vm_offset_t starta;
2163	vm_offset_t addr;
2164	vm_pindex_t pindex;
2165	vm_page_t m, mpte;
2166	vm_object_t object;
2167	pd_entry_t *pde;
2168
2169	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2170		return;
2171
2172	object = entry->object.vm_object;
2173
2174	starta = addra - PFBAK * PAGE_SIZE;
2175	if (starta < entry->start) {
2176		starta = entry->start;
2177	} else if (starta > addra) {
2178		starta = 0;
2179	}
2180
2181	mpte = NULL;
2182	for (i = 0; i < PAGEORDER_SIZE; i++) {
2183		vm_object_t backing_object, lobject;
2184		pt_entry_t *pte;
2185
2186		addr = addra + pmap_prefault_pageorder[i];
2187		if (addr > addra + (PFFOR * PAGE_SIZE))
2188			addr = 0;
2189
2190		if (addr < starta || addr >= entry->end)
2191			continue;
2192
2193		pde = pmap_pde(pmap, addr);
2194		if (pde == NULL || (*pde & PG_V) == 0)
2195			continue;
2196
2197		pte = vtopte(addr);
2198		if ((*pte & PG_V) == 0)
2199			continue;
2200
2201		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2202		lobject = object;
2203		VM_OBJECT_LOCK(lobject);
2204		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
2205		    lobject->type == OBJT_DEFAULT &&
2206		    (backing_object = lobject->backing_object) != NULL) {
2207			if (lobject->backing_object_offset & PAGE_MASK)
2208				break;
2209			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
2210			VM_OBJECT_LOCK(backing_object);
2211			VM_OBJECT_UNLOCK(lobject);
2212			lobject = backing_object;
2213		}
2214		VM_OBJECT_UNLOCK(lobject);
2215		/*
2216		 * give up when a page is not in memory
2217		 */
2218		if (m == NULL)
2219			break;
2220		vm_page_lock_queues();
2221		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2222			(m->busy == 0) &&
2223		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2224
2225			if ((m->queue - m->pc) == PQ_CACHE) {
2226				vm_page_deactivate(m);
2227			}
2228			vm_page_busy(m);
2229			vm_page_unlock_queues();
2230			mpte = pmap_enter_quick(pmap, addr, m, mpte);
2231			vm_page_lock_queues();
2232			vm_page_wakeup(m);
2233		}
2234		vm_page_unlock_queues();
2235	}
2236}
2237
2238/*
2239 *	Routine:	pmap_change_wiring
2240 *	Function:	Change the wiring attribute for a map/virtual-address
2241 *			pair.
2242 *	In/out conditions:
2243 *			The mapping must already exist in the pmap.
2244 */
2245void
2246pmap_change_wiring(pmap, va, wired)
2247	register pmap_t pmap;
2248	vm_offset_t va;
2249	boolean_t wired;
2250{
2251	register pt_entry_t *pte;
2252
2253	if (pmap == NULL)
2254		return;
2255
2256	/*
2257	 * Wiring is not a hardware characteristic so there is no need to
2258	 * invalidate TLB.
2259	 */
2260	pte = pmap_pte(pmap, va);
2261	if (wired && (*pte & PG_W) == 0) {
2262		pmap->pm_stats.wired_count++;
2263		*pte |= PG_W;
2264	} else if (!wired && (*pte & PG_W) != 0) {
2265		pmap->pm_stats.wired_count--;
2266		*pte &= ~PG_W;
2267	}
2268}
2269
2270
2271
2272/*
2273 *	Copy the range specified by src_addr/len
2274 *	from the source map to the range dst_addr/len
2275 *	in the destination map.
2276 *
2277 *	This routine is only advisory and need not do anything.
2278 */
2279
2280void
2281pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2282	  vm_offset_t src_addr)
2283{
2284	vm_offset_t addr;
2285	vm_offset_t end_addr = src_addr + len;
2286	vm_offset_t pdnxt;
2287	vm_page_t m;
2288
2289	if (dst_addr != src_addr)
2290		return;
2291
2292	if (!pmap_is_current(src_pmap))
2293		return;
2294
2295	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2296		pt_entry_t *src_pte, *dst_pte;
2297		vm_page_t dstmpte, srcmpte;
2298		pd_entry_t srcptepaddr, *pde;
2299		vm_pindex_t ptepindex;
2300
2301		if (addr >= UPT_MIN_ADDRESS)
2302			panic("pmap_copy: invalid to pmap_copy page tables");
2303
2304		/*
2305		 * Don't let optional prefaulting of pages make us go
2306		 * way below the low water mark of free pages or way
2307		 * above high water mark of used pv entries.
2308		 */
2309		if (cnt.v_free_count < cnt.v_free_reserved ||
2310		    pv_entry_count > pv_entry_high_water)
2311			break;
2312
2313		pdnxt = (addr + NBPDR) & ~PDRMASK;
2314		ptepindex = pmap_pde_pindex(addr);
2315
2316		pde = pmap_pde(src_pmap, addr);
2317		if (pde)
2318			srcptepaddr = *pde;
2319		else
2320			continue;
2321		if (srcptepaddr == 0)
2322			continue;
2323
2324		if (srcptepaddr & PG_PS) {
2325			pde = pmap_pde(dst_pmap, addr);
2326			if (pde == 0) {
2327				/*
2328				 * XXX should do an allocpte here to
2329				 * instantiate the pde
2330				 */
2331				continue;
2332			}
2333			if (*pde == 0) {
2334				*pde = srcptepaddr;
2335				dst_pmap->pm_stats.resident_count +=
2336				    NBPDR / PAGE_SIZE;
2337			}
2338			continue;
2339		}
2340
2341		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2342		if ((srcmpte == NULL) ||
2343		    (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2344			continue;
2345
2346		if (pdnxt > end_addr)
2347			pdnxt = end_addr;
2348
2349		src_pte = vtopte(addr);
2350		while (addr < pdnxt) {
2351			pt_entry_t ptetemp;
2352			ptetemp = *src_pte;
2353			/*
2354			 * we only virtual copy managed pages
2355			 */
2356			if ((ptetemp & PG_MANAGED) != 0) {
2357				/*
2358				 * We have to check after allocpte for the
2359				 * pte still being around...  allocpte can
2360				 * block.
2361				 */
2362				dstmpte = pmap_allocpte(dst_pmap, addr);
2363				dst_pte = pmap_pte(dst_pmap, addr);
2364				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2365					/*
2366					 * Clear the modified and
2367					 * accessed (referenced) bits
2368					 * during the copy.
2369					 */
2370					m = PHYS_TO_VM_PAGE(ptetemp);
2371					*dst_pte = ptetemp & ~(PG_M | PG_A);
2372					dst_pmap->pm_stats.resident_count++;
2373					pmap_insert_entry(dst_pmap, addr,
2374						dstmpte, m);
2375	 			} else {
2376					vm_page_lock_queues();
2377					pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2378					vm_page_unlock_queues();
2379				}
2380				if (dstmpte->hold_count >= srcmpte->hold_count)
2381					break;
2382			}
2383			addr += PAGE_SIZE;
2384			src_pte++;
2385		}
2386	}
2387}
2388
2389/*
2390 *	pmap_zero_page zeros the specified hardware page through the
2391 *	direct map, using pagezero to clear its contents.
2392 */
2393void
2394pmap_zero_page(vm_page_t m)
2395{
2396	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2397
2398	pagezero((void *)va);
2399}
2400
2401/*
2402 *	pmap_zero_page_area zeros the specified area of a hardware page
2403 *	through the direct map, using pagezero or bzero to clear its contents.
2404 *
2405 *	off and size may not cover an area beyond a single hardware page.
2406 */
2407void
2408pmap_zero_page_area(vm_page_t m, int off, int size)
2409{
2410	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2411
2412	if (off == 0 && size == PAGE_SIZE)
2413		pagezero((void *)va);
2414	else
2415		bzero((char *)va + off, size);
2416}
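
/*
 * Usage sketch (hypothetical): a caller that has filled only the first
 * "valid" bytes of a page might clear the remainder with
 *
 *	pmap_zero_page_area(m, valid, PAGE_SIZE - valid);
 *
 * which takes the bzero() path above whenever the area is smaller than
 * a full page.
 */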
2417
2418/*
2419 *	pmap_zero_page_idle zeros the specified hardware page through the
2420 *	direct map, using pagezero to clear its contents.  This
2421 *	is intended to be called from the vm_pagezero process only and
2422 *	outside of Giant.
2423 */
2424void
2425pmap_zero_page_idle(vm_page_t m)
2426{
2427	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2428
2429	pagezero((void *)va);
2430}
2431
2432/*
2433 *	pmap_copy_page copies the specified (machine independent)
2434 *	page by accessing both pages through the direct map and
2435 *	using bcopy to copy the contents, one machine dependent
2436 *	page at a time.
2437 */
2438void
2439pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2440{
2441	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2442	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2443
2444	bcopy((void *)src, (void *)dst, PAGE_SIZE);
2445}
2446
2447/*
2448 * Returns true if the pmap's pv is one of the first
2449 * 16 pvs linked to from this page.  This count may
2450 * be changed upwards or downwards in the future; it
2451 * is only necessary that true be returned for a small
2452 * subset of pmaps for proper page aging.
2453 */
2454boolean_t
2455pmap_page_exists_quick(pmap, m)
2456	pmap_t pmap;
2457	vm_page_t m;
2458{
2459	pv_entry_t pv;
2460	int loops = 0;
2461	int s;
2462
2463	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2464		return FALSE;
2465
2466	s = splvm();
2467	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2468	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2469		if (pv->pv_pmap == pmap) {
2470			splx(s);
2471			return TRUE;
2472		}
2473		loops++;
2474		if (loops >= 16)
2475			break;
2476	}
2477	splx(s);
2478	return (FALSE);
2479}
2480
2481#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2482/*
2483 * Remove all pages from the specified address space;
2484 * this aids process exit speed.  Also, this code is
2485 * special cased for the current process only, but the
2486 * more generic (and slightly slower) mode can be
2487 * enabled.  This is much faster than pmap_remove in
2488 * the case of running down an entire address space.
2489 */
2490void
2491pmap_remove_pages(pmap, sva, eva)
2492	pmap_t pmap;
2493	vm_offset_t sva, eva;
2494{
2495	pt_entry_t *pte, tpte;
2496	vm_page_t m;
2497	pv_entry_t pv, npv;
2498	int s;
2499
2500#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2501	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2502		printf("warning: pmap_remove_pages called with non-current pmap\n");
2503		return;
2504	}
2505#endif
2506	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2507	s = splvm();
2508	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2509
2510		if (pv->pv_va >= eva || pv->pv_va < sva) {
2511			npv = TAILQ_NEXT(pv, pv_plist);
2512			continue;
2513		}
2514
2515#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2516		pte = vtopte(pv->pv_va);
2517#else
2518		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2519#endif
2520		tpte = *pte;
2521
2522		if (tpte == 0) {
2523			printf("TPTE at %p  IS ZERO @ VA %08lx\n",
2524							pte, pv->pv_va);
2525			panic("bad pte");
2526		}
2527
2528/*
2529 * We cannot remove wired pages from a process' mapping at this time
2530 */
2531		if (tpte & PG_W) {
2532			npv = TAILQ_NEXT(pv, pv_plist);
2533			continue;
2534		}
2535
2536		m = PHYS_TO_VM_PAGE(tpte);
2537		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2538		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2539		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2540
2541		KASSERT(m < &vm_page_array[vm_page_array_size],
2542			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2543
2544		pv->pv_pmap->pm_stats.resident_count--;
2545
2546		pte_clear(pte);
2547
2548		/*
2549		 * Update the vm_page_t clean and reference bits.
2550		 */
2551		if (tpte & PG_M) {
2552			vm_page_dirty(m);
2553		}
2554
2555		npv = TAILQ_NEXT(pv, pv_plist);
2556		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2557
2558		m->md.pv_list_count--;
2559		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2560		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2561			vm_page_flag_clear(m, PG_WRITEABLE);
2562		}
2563
2564		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2565		free_pv_entry(pv);
2566	}
2567	splx(s);
2568	pmap_invalidate_all(pmap);
2569}
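
/*
 * Usage sketch (illustrative; the exact call site is elsewhere in the
 * kernel): tearing down a user address space at exit time typically
 * amounts to a single call covering the whole user range, e.g.
 *
 *	pmap_remove_pages(pmap, 0, VM_MAXUSER_ADDRESS);
 *
 * with the page queues lock held, as asserted above.
 */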
2570
2571/*
2572 *	pmap_is_modified:
2573 *
2574 *	Return whether or not the specified physical page was modified
2575 *	in any physical maps.
2576 */
2577boolean_t
2578pmap_is_modified(vm_page_t m)
2579{
2580	pv_entry_t pv;
2581	pt_entry_t *pte;
2582	int s;
2583
2584	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2585		return FALSE;
2586
2587	s = splvm();
2588	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2589	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2590		/*
2591		 * Modification is not tracked for mappings in the
2592		 * kernel's clean submap, so those entries are
2593		 * skipped here.
2594		 */
2595		if (!pmap_track_modified(pv->pv_va))
2596			continue;
2597#if defined(PMAP_DIAGNOSTIC)
2598		if (!pv->pv_pmap) {
2599			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2600			continue;
2601		}
2602#endif
2603		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2604		if (*pte & PG_M) {
2605			splx(s);
2606			return TRUE;
2607		}
2608	}
2609	splx(s);
2610	return (FALSE);
2611}
2612
2613/*
2614 * this routine is used to modify bits in ptes
2615 */
2616static __inline void
2617pmap_changebit(vm_page_t m, int bit, boolean_t setem)
2618{
2619	register pv_entry_t pv;
2620	register pt_entry_t *pte;
2621	int s;
2622
2623	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2624	    (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2625		return;
2626
2627	s = splvm();
2628	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2629	/*
2630	 * Loop over all current mappings, setting/clearing as appropriate.
2631	 * (If setting RO, do we need to clear the VAC?)
2632	 */
2633	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2634		/*
2635		 * don't write protect pager mappings
2636		 */
2637		if (!setem && (bit == PG_RW)) {
2638			if (!pmap_track_modified(pv->pv_va))
2639				continue;
2640		}
2641
2642#if defined(PMAP_DIAGNOSTIC)
2643		if (!pv->pv_pmap) {
2644			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2645			continue;
2646		}
2647#endif
2648
2649		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2650
2651		if (setem) {
2652			*pte |= bit;
2653			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2654		} else {
2655			pt_entry_t pbits = *pte;
2656			if (pbits & bit) {
2657				if (bit == PG_RW) {
2658					if (pbits & PG_M) {
2659						vm_page_dirty(m);
2660					}
2661					pte_store(pte, pbits & ~(PG_M|PG_RW));
2662				} else {
2663					pte_store(pte, pbits & ~bit);
2664				}
2665				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2666			}
2667		}
2668	}
2669	if (!setem && bit == PG_RW)
2670		vm_page_flag_clear(m, PG_WRITEABLE);
2671	splx(s);
2672}
2673
2674/*
2675 *	pmap_page_protect:
2676 *
2677 *	Lower the permission for all mappings to a given page.
2678 */
2679void
2680pmap_page_protect(vm_page_t m, vm_prot_t prot)
2681{
2682	if ((prot & VM_PROT_WRITE) == 0) {
2683		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2684			pmap_changebit(m, PG_RW, FALSE);
2685		} else {
2686			pmap_remove_all(m);
2687		}
2688	}
2689}
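
/*
 * Usage sketch (illustrative only): write-protecting a page before it
 * is laundered, versus removing every mapping of it, looks like
 *
 *	pmap_page_protect(m, VM_PROT_READ);	(downgrades to read-only)
 *	pmap_page_protect(m, VM_PROT_NONE);	(removes all mappings)
 *
 * the former taking the pmap_changebit(m, PG_RW, FALSE) path and the
 * latter falling through to pmap_remove_all(m).
 */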
2690
2691/*
2692 *	pmap_ts_referenced:
2693 *
2694 *	Return a count of reference bits for a page, clearing those bits.
2695 *	It is not necessary for every reference bit to be cleared, but it
2696 *	is necessary that 0 only be returned when there are truly no
2697 *	reference bits set.
2698 *
2699 *	XXX: The exact number of bits to check and clear is a matter that
2700 *	should be tested and standardized at some point in the future for
2701 *	optimal aging of shared pages.
2702 */
2703int
2704pmap_ts_referenced(vm_page_t m)
2705{
2706	register pv_entry_t pv, pvf, pvn;
2707	pt_entry_t *pte;
2708	pt_entry_t v;
2709	int s;
2710	int rtval = 0;
2711
2712	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2713		return (rtval);
2714
2715	s = splvm();
2716	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2717	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2718
2719		pvf = pv;
2720
2721		do {
2722			pvn = TAILQ_NEXT(pv, pv_list);
2723
2724			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2725
2726			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2727
2728			if (!pmap_track_modified(pv->pv_va))
2729				continue;
2730
2731			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2732
2733			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2734				pte_store(pte, v & ~PG_A);
2735				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2736
2737				rtval++;
2738				if (rtval > 4) {
2739					break;
2740				}
2741			}
2742		} while ((pv = pvn) != NULL && pv != pvf);
2743	}
2744	splx(s);
2745
2746	return (rtval);
2747}
2748
2749/*
2750 *	Clear the modify bits on the specified physical page.
2751 */
2752void
2753pmap_clear_modify(vm_page_t m)
2754{
2755	pmap_changebit(m, PG_M, FALSE);
2756}
2757
2758/*
2759 *	pmap_clear_reference:
2760 *
2761 *	Clear the reference bit on the specified physical page.
2762 */
2763void
2764pmap_clear_reference(vm_page_t m)
2765{
2766	pmap_changebit(m, PG_A, FALSE);
2767}
2768
2769/*
2770 * Miscellaneous support routines follow
2771 */
2772
2773static void
2774amd64_protection_init()
2775{
2776	register long *kp, prot;
2777
2778#if 0
2779#define PG_NX (1ul << 63)
2780#else
2781#define PG_NX 0
2782#endif
2783
2784	kp = protection_codes;
2785	for (prot = 0; prot < 8; prot++) {
2786		switch (prot) {
2787		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
2788		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
2789			*kp++ = PG_NX;
2790			break;
2791		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
2792		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
2793			*kp++ = 0;
2794			break;
2795		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
2796		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
2797			*kp++ = PG_RW | PG_NX;
2798			break;
2799		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
2800		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
2801			*kp++ = PG_RW;
2802			break;
2803		}
2804	}
2805}
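
/*
 * Worked example (illustrative): with PG_NX currently defined as 0
 * above, the table built here yields, for instance,
 *
 *	protection_codes[VM_PROT_READ]                 == 0
 *	protection_codes[VM_PROT_READ | VM_PROT_WRITE] == PG_RW
 *
 * and pte_prot() is assumed to consult this table when pmap_enter()
 * assembles a new PTE.
 */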
2806
2807/*
2808 * Map a set of physical memory pages into the kernel virtual
2809 * address space. Return a pointer to where it is mapped. This
2810 * routine is intended to be used for mapping device memory,
2811 * NOT real memory.
2812 */
2813void *
2814pmap_mapdev(pa, size)
2815	vm_paddr_t pa;
2816	vm_size_t size;
2817{
2818	vm_offset_t va, tmpva, offset;
2819
2820	/* If this fits within the direct map window, use it */
2821	if (pa < dmaplimit && (pa + size) < dmaplimit)
2822		return ((void *)PHYS_TO_DMAP(pa));
2823	offset = pa & PAGE_MASK;
2824	size = roundup(offset + size, PAGE_SIZE);
2825	va = kmem_alloc_nofault(kernel_map, size);
2826	if (!va)
2827		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2828	pa = pa & PG_FRAME;
2829	for (tmpva = va; size > 0; ) {
2830		pmap_kenter(tmpva, pa);
2831		size -= PAGE_SIZE;
2832		tmpva += PAGE_SIZE;
2833		pa += PAGE_SIZE;
2834	}
2835	pmap_invalidate_range(kernel_pmap, va, tmpva);
2836	return ((void *)(va + offset));
2837}
2838
2839void
2840pmap_unmapdev(va, size)
2841	vm_offset_t va;
2842	vm_size_t size;
2843{
2844	vm_offset_t base, offset, tmpva;
2845	pt_entry_t *pte;
2846
2847	/* If we gave a direct map region in pmap_mapdev, do nothing */
2848	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2849		return;
2850	base = va & PG_FRAME;
2851	offset = va & PAGE_MASK;
2852	size = roundup(offset + size, PAGE_SIZE);
2853	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
2854		pte = vtopte(tmpva);
2855		pte_clear(pte);
2856	}
2857	pmap_invalidate_range(kernel_pmap, va, tmpva);
2858	kmem_free(kernel_map, base, size);
2859}
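
/*
 * Usage sketch (hypothetical addresses, for illustration only): a driver
 * that needs a temporary mapping of a device register window pairs the
 * two routines above:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(0xfed00000UL, PAGE_SIZE);
 *	... access the registers through "regs" ...
 *	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 *
 * When the physical address falls inside the direct map window, both
 * calls are effectively free: pmap_mapdev() returns a direct map address
 * and pmap_unmapdev() recognizes it and does nothing.
 */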
2860
2861/*
2862 * perform the pmap work for mincore
2863 */
2864int
2865pmap_mincore(pmap, addr)
2866	pmap_t pmap;
2867	vm_offset_t addr;
2868{
2869	pt_entry_t *ptep, pte;
2870	vm_page_t m;
2871	int val = 0;
2872
2873	ptep = pmap_pte(pmap, addr);
2874	if (ptep == 0) {
2875		return 0;
2876	}
2877
2878	if ((pte = *ptep) != 0) {
2879		vm_paddr_t pa;
2880
2881		val = MINCORE_INCORE;
2882		if ((pte & PG_MANAGED) == 0)
2883			return val;
2884
2885		pa = pte & PG_FRAME;
2886
2887		m = PHYS_TO_VM_PAGE(pa);
2888
2889		/*
2890		 * Modified by us
2891		 */
2892		if (pte & PG_M)
2893			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2894		else {
2895			/*
2896			 * Modified by someone else
2897			 */
2898			vm_page_lock_queues();
2899			if (m->dirty || pmap_is_modified(m))
2900				val |= MINCORE_MODIFIED_OTHER;
2901			vm_page_unlock_queues();
2902		}
2903		/*
2904		 * Referenced by us
2905		 */
2906		if (pte & PG_A)
2907			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2908		else {
2909			/*
2910			 * Referenced by someone else
2911			 */
2912			vm_page_lock_queues();
2913			if ((m->flags & PG_REFERENCED) ||
2914			    pmap_ts_referenced(m)) {
2915				val |= MINCORE_REFERENCED_OTHER;
2916				vm_page_flag_set(m, PG_REFERENCED);
2917			}
2918			vm_page_unlock_queues();
2919		}
2920	}
2921	return val;
2922}
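
/*
 * Worked example (illustrative): for a resident, managed page whose PTE
 * has both PG_M and PG_A set, the value returned above is
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	    MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *
 * An unmanaged mapping reports only MINCORE_INCORE.
 */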
2923
2924void
2925pmap_activate(struct thread *td)
2926{
2927	struct proc *p = td->td_proc;
2928	pmap_t	pmap;
2929	u_int64_t  cr3;
2930
2931	critical_enter();
2932	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2933	pmap->pm_active |= PCPU_GET(cpumask);
2934	cr3 = vtophys(pmap->pm_pml4);
2935	/* XXXKSE this is wrong.
2936	 * pmap_activate is for the current thread on the current cpu
2937	 */
2938	if (p->p_flag & P_SA) {
2939		/* Make sure all other cr3 entries are updated. */
2940		/* what if they are running?  XXXKSE (maybe abort them) */
2941		FOREACH_THREAD_IN_PROC(p, td) {
2942			td->td_pcb->pcb_cr3 = cr3;
2943		}
2944	} else {
2945		td->td_pcb->pcb_cr3 = cr3;
2946	}
2947	load_cr3(cr3);
2948	critical_exit();
2949}
2950
2951vm_offset_t
2952pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2953{
2954
2955	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
2956		return addr;
2957	}
2958
2959	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
2960	return addr;
2961}
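
/*
 * Worked example (illustrative): for a hypothetical device object of at
 * least NBPDR bytes, a requested address of 0x400123456 is rounded up to
 * the next 2MB boundary, 0x400200000, so that pmap_object_init_pt() can
 * later use PG_PS superpage mappings for the region.
 */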
2962