/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 132365 2004-07-18 21:19:10Z alc $");

/*
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_msgbuf.h"
#include "opt_kstack_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
#if defined(CPU_DISABLE_SSE)
#undef CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

#if !defined(PMAP_DIAGNOSTIC)
#define PMAP_INLINE __inline
#else
#define PMAP_INLINE
#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
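/*
 * Illustrative note (added comment, hypothetical address): with the
 * non-PAE PDRSHIFT of 22, a virtual address such as 0xc0401234 selects
 * page directory index 0xc0401234 >> 22 == 0x301, so
 * pmap_pde(kernel_pmap, 0xc0401234) evaluates to
 * &kernel_pmap->pm_pdir[0x301].  Under PAE the shift value differs, but
 * the indexing idea is the same.
 */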

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;
#ifdef SMP
static struct mtx lazypmap_lock;
#endif

vm_paddr_t avail_end;	/* PA of last available physical page */
vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt;
vm_offset_t kernel_vm_end;
extern u_int32_t KERNend;

#ifdef PAE
static uma_zone_t pdptzone;
#endif

/*
 * Data for the pv entry allocation mechanism
 */
static uma_zone_t pvzone;
static struct vm_object pvzone_obj;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
int pmap_pagedaemon_waken;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
static pt_entry_t *CMAP2, *CMAP3;
caddr_t CADDR1 = 0, ptvmmap = 0;
static caddr_t CADDR2, CADDR3;
static struct mtx CMAPCADDR12_lock;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

#ifdef SMP
extern pt_entry_t *SMPpt;
#endif
static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
	   &PMAP1changedcpu, 0,
	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
	   &PMAP1changed, 0,
	   "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
	   &PMAP1unchanged, 0,
	   "Number of times pmap_pte_quick didn't change PMAP1");

static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
static pv_entry_t get_pv_entry(void);
static void	pmap_clear_ptes(vm_page_t m, int bit);

static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
					vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);

static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static int pmap_unuse_pt(pmap_t, vm_offset_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * Move the kernel virtual free pointer to the next
 * 4MB.  This is used to help improve performance
 * by using a large (4MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

#ifndef DISABLE_PSE
	if (cpu_feature & CPUID_PSE)
		newaddr = (addr + PDRMASK) & ~PDRMASK;
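	/*
	 * Worked example of the round-up above (added comment, hypothetical
	 * address): with 4MB pages PDRMASK is 0x3fffff, so an addr of
	 * 0xc0123456 becomes (0xc0123456 + 0x3fffff) & ~0x3fffff ==
	 * 0xc0400000, i.e. the next 4MB boundary.
	 */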
#endif
	return newaddr;
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On the i386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(firstaddr, loadaddr)
	vm_paddr_t firstaddr;
	vm_paddr_t loadaddr;
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	int i;

	/*
	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
	 * large. It should instead be correctly calculated in locore.s and
	 * not based on 'first' (which is a physical address, not a virtual
	 * address, for the start of unused physical memory). The kernel
	 * page tables are NOT double mapped and thus should not be included
	 * in this calculation.
	 */
	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#ifdef PAE
	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvlist);
	LIST_INIT(&allpmaps);
#ifdef SMP
	mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN);
#endif
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
	nkpt = NKPT;

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
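	/*
	 * Illustrative expansion (comment added for clarity, not original
	 * text): the call "SYSMAP(caddr_t, CMAP1, CADDR1, 1)" below expands
	 * to
	 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
	 * i.e. it hands out one page of KVA starting at 'va' and remembers
	 * the PTE that maps it.
	 */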

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
	*CMAP3 = 0;

	mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF);

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))

	/*
	 * ptemap is used for pmap_pte_quick
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);

	virtual_avail = va;

	*CMAP1 = *CMAP2 = 0;
	for (i = 0; i < NKPT; i++)
		PTD[i] = 0;

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
void
pmap_set_pg(void)
{
	pd_entry_t pdir;
	pt_entry_t *pte;
	vm_offset_t va, endva;
	int i;

	if (pgeflag == 0)
		return;

	i = KERNLOAD/NBPDR;
	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va < endva) {
			pdir = kernel_pmap->pm_pdir[KPTDI+i];
			pdir |= pgeflag;
			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
			invltlb();	/* Play it safe, invltlb() every time */
			i++;
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += PAGE_SIZE;
		}
	}
}

#ifdef PAE
static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{
	*flags = UMA_SLAB_PRIV;
	return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0));
}
#endif

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 *	pmap_init has been enhanced to support, in a fairly consistent
 *	way, discontiguous physical memory.
 */
void
pmap_init(void)
{
	int i;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */

	for(i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	/*
	 * init the pv free list
	 */
	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_prealloc(pvzone, MINPV);

#ifdef PAE
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2()
{
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
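	/*
	 * Worked example of the sizing above (added comment, hypothetical
	 * numbers): with the default PMAP_SHPGPERPROC of 200, a maxproc of
	 * 1000 and a vm_page_array_size of 250000, pv_entry_max is
	 * 200 * 1000 + 250000 = 450000 and pv_entry_high_water is
	 * 9 * (450000 / 10) = 405000.
	 */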
	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

#if defined(PMAP_DIAGNOSTIC)

/*
 * This code checks for non-writeable/modified pages.
 * This should be an invalid condition.
 */
static int
pmap_nw_modified(pt_entry_t ptea)
{
	int pte;

	pte = (int) ptea;

	if ((pte & (PG_M|PG_RW)) == PG_M)
		return 1;
	else
		return 0;
}
#endif


/*
 * this routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static PMAP_INLINE int
pmap_track_modified(vm_offset_t va)
{
	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
		return 1;
	else
		return 0;
}

#ifdef I386_CPU
/*
 * i386 only has "invalidate everything" and no SMP to worry about.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}
#else /* !I386_CPU */
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	u_int cpumask;
	u_int other_cpus;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_tlb_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invlpg(va);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_tlb_mtx);
	else
		critical_exit();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	u_int cpumask;
	u_int other_cpus;
	vm_offset_t addr;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_tlb_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
			    sva, eva);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_tlb_mtx);
	else
		critical_exit();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	u_int cpumask;
	u_int other_cpus;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_tlb_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invltlb();
		smp_invltlb();
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invltlb();
		if (pmap->pm_active & other_cpus)
			smp_masked_invltlb(pmap->pm_active & other_cpus);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_tlb_mtx);
	else
		critical_exit();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || pmap->pm_active)
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}
#endif /* !SMP */
#endif /* !I386_CPU */

/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap ||
		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}

/*
 * If the given pmap is not the current pmap, Giant must be held.
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		GIANT_REQUIRED;
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

static __inline void
invlcaddr(void *caddr)
{
#ifdef I386_CPU
	invltlb();
#else
	invlpg((u_int)caddr);
#endif
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, vm_page_queue_mtx
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	if (pmap == NULL)
		return (rtval);
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0) {
			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
			PMAP_UNLOCK(pmap);
			return rtval;
		}
		pte = pmap_pte(pmap, va);
		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte;
	vm_page_t m;

	m = NULL;
	if (pmap == NULL)
		return (m);
	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			sched_pin();
			pte = *pmap_pte_quick(pmap, va);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
			sched_unpin();
		}
	}
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
	return (m);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;

	va = sva = *virt;
	while (start < end) {
		pmap_kenter(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
		va += PAGE_SIZE;
		m++;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 */
static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{

	while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
		vm_page_lock_queues();

	if (m->hold_count == 0) {
		vm_offset_t pteva;
		/*
		 * unmap the page table page
		 */
		pmap->pm_pdir[m->pindex] = 0;
		--pmap->pm_stats.resident_count;
		/*
		 * We never unwire a kernel page table page, making a
		 * check for the kernel_pmap unnecessary.
		 */
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) {
			/*
			 * Do an invltlb to make the invalidated mapping
			 * take effect immediately.
			 */
			pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
			pmap_invalidate_page(pmap, pteva);
		}

		/*
		 * If the page is finally unwired, simply free it.
		 */
		--m->wire_count;
		if (m->wire_count == 0) {
			vm_page_busy(m);
			vm_page_free_zero(m);
			atomic_subtract_int(&cnt.v_wire_count, 1);
		}
		return 1;
	}
	return 0;
}

static PMAP_INLINE int
pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	vm_page_unhold(m);
	if (m->hold_count == 0)
		return _pmap_unwire_pte_hold(pmap, m);
	else
		return 0;
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t ptepde;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return 0;
	ptepde = *pmap_pde(pmap, va);
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return pmap_unwire_pte_hold(pmap, mpte);
}

void
pmap_pinit0(pmap)
	struct pmap *pmap;
{

	PMAP_LOCK_INIT(pmap);
	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
#ifdef PAE
	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
#endif
	pmap->pm_active = 0;
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap)
	register struct pmap *pmap;
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	static int color;
	int i;

	PMAP_LOCK_INIT(pmap);

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
		    NBPTD);
#ifdef PAE
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
	}

	/*
	 * allocate the page directory page(s)
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else {
			ptdpg[i++] = m;
		}
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++) {
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
	}

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
	/* Wire in kernel global address entries. */
	/* XXX copies current process, does not fill in MPPTDI */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
#ifdef SMP
	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
#endif

	/* install self-referential address mapping entry(s) */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#ifdef PAE
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	pmap->pm_active = 0;
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * this routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap, ptepindex)
	pmap_t	pmap;
	unsigned ptepindex;
{
	vm_paddr_t ptepa;
	vm_page_t m;

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		VM_WAIT;
		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	KASSERT(m->queue == PQ_NONE,
		("_pmap_allocpte: %p->queue != PQ_NONE", m));

	/*
	 * Increment the hold count for the page table page
	 * (denoting a new mapping.)
	 */
	m->hold_count++;

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	pmap->pm_stats.resident_count++;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] =
		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);

	return m;
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va)
{
	unsigned ptepindex;
	pd_entry_t ptepa;
	vm_page_t m;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;
retry:
	/*
	 * Get the page directory entry
	 */
	ptepa = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & PG_PS) {
		pmap->pm_pdir[ptepindex] = 0;
		ptepa = 0;
		pmap_invalidate_all(kernel_pmap);
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		m = PHYS_TO_VM_PAGE(ptepa);
		m->hold_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex);
		if (m == NULL)
			goto retry;
	}
	return (m);
}


/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

#ifdef SMP
/*
 * Deal with an SMP shootdown of other users of the pmap that we are
 * trying to dispose of.  This can be a bit hairy.
 */
static u_int *lazymask;
static u_int lazyptd;
static volatile u_int lazywait;

void pmap_lazyfix_action(void);

void
pmap_lazyfix_action(void)
{
	u_int mymask = PCPU_GET(cpumask);

	if (rcr3() == lazyptd)
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
	atomic_clear_int(lazymask, mymask);
	atomic_store_rel_int(&lazywait, 1);
}

static void
pmap_lazyfix_self(u_int mymask)
{

	if (rcr3() == lazyptd)
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
	atomic_clear_int(lazymask, mymask);
}


static void
pmap_lazyfix(pmap_t pmap)
{
	u_int mymask = PCPU_GET(cpumask);
	u_int mask;
	register u_int spins;

	while ((mask = pmap->pm_active) != 0) {
		spins = 50000000;
		mask = mask & -mask;	/* Find least significant set bit */
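		/*
		 * Added note: on two's complement machines mask & -mask
		 * isolates the lowest set bit, e.g. 0x6 & -0x6 == 0x2, so
		 * each loop iteration targets exactly one remaining CPU.
		 */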
		mtx_lock_spin(&lazypmap_lock);
#ifdef PAE
		lazyptd = vtophys(pmap->pm_pdpt);
#else
		lazyptd = vtophys(pmap->pm_pdir);
#endif
		if (mask == mymask) {
			lazymask = &pmap->pm_active;
			pmap_lazyfix_self(mymask);
		} else {
			atomic_store_rel_int((u_int *)&lazymask,
			    (u_int)&pmap->pm_active);
			atomic_store_rel_int(&lazywait, 0);
			ipi_selected(mask, IPI_LAZYPMAP);
			while (lazywait == 0) {
				ia32_pause();
				if (--spins == 0)
					break;
			}
		}
		mtx_unlock_spin(&lazypmap_lock);
		if (spins == 0)
			printf("pmap_lazyfix: spun for 50000000\n");
	}
}

#else	/* SMP */

/*
 * Cleaning up on uniprocessor is easy.  For various reasons, we're
 * unlikely to have to even execute this code, including the fact
 * that the cleanup is deferred until the parent does a wait(2), which
 * means that another userland process has run.
 */
static void
pmap_lazyfix(pmap_t pmap)
{
	u_int cr3;

	cr3 = vtophys(pmap->pm_pdir);
	if (cr3 == rcr3()) {
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
		pmap->pm_active &= ~(PCPU_GET(cpumask));
	}
}
#endif	/* SMP */

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));

	pmap_lazyfix(pmap);
	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	for (i = 0; i < NPGPTD; i++)
		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);

	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
	    sizeof(*pmap->pm_pdir));
#ifdef SMP
	pmap->pm_pdir[MPPTDI] = 0;
#endif

	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);

	vm_page_lock_queues();
	for (i = 0; i < NPGPTD; i++) {
		m = ptdpg[i];
#ifdef PAE
		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
		    ("pmap_release: got wrong ptd page"));
#endif
		m->wire_count--;
		atomic_subtract_int(&cnt.v_wire_count, 1);
		vm_page_free_zero(m);
	}
	vm_page_unlock_queues();
	PMAP_LOCK_DESTROY(pmap);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "IU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "IU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	struct pmap *pmap;
	vm_paddr_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t newpdir;
	pt_entry_t *pde;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	if (kernel_vm_end == 0) {
		kernel_vm_end = KERNBASE;
		nkpt = 0;
		while (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
			nkpt++;
		}
	}
	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
	while (kernel_vm_end < addr) {
		if (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
			continue;
		}

		/*
		 * This index is bogus, but out of the way
		 */
		nkpg = vm_page_alloc(NULL, nkpt,
		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
		if (!nkpg)
			panic("pmap_growkernel: no memory to grow kernel");

		nkpt++;

		pmap_zero_page(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		pdir_pde(PTD, kernel_vm_end) = newpdir;

		mtx_lock_spin(&allpmaps_lock);
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, kernel_vm_end);
			pde_store(pde, newpdir);
		}
		mtx_unlock_spin(&allpmaps_lock);
		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
	}
}


/***************************************************
 * page management routines.
 ***************************************************/

/*
 * free the pv_entry back to the free list
 */
static PMAP_INLINE void
free_pv_entry(pv_entry_t pv)
{
	pv_entry_count--;
	uma_zfree(pvzone, pv);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 * the memory allocation is performed bypassing the malloc code
 * because of the possibility of allocations at interrupt time.
 */
static pv_entry_t
get_pv_entry(void)
{
	pv_entry_count++;
	if (pv_entry_high_water &&
		(pv_entry_count > pv_entry_high_water) &&
		(pmap_pagedaemon_waken == 0)) {
		pmap_pagedaemon_waken = 1;
		wakeup (&vm_pages_needed);
	}
	return uma_zalloc(pvzone, M_NOWAIT);
}


static int
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	pv_entry_t pv;
	int rtval;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
			if (pmap == pv->pv_pmap && va == pv->pv_va)
				break;
		}
	} else {
		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
			if (va == pv->pv_va)
				break;
		}
	}

	rtval = 0;
	if (pv) {
		rtval = pmap_unuse_pt(pmap, va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		m->md.pv_list_count--;
		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
			vm_page_flag_clear(m, PG_WRITEABLE);

		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
		free_pv_entry(pv);
	}

	return rtval;
}

/*
 * Create a pv entry for page at pa for
 * (pmap, va).
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	pv = get_pv_entry();
	pv->pv_va = va;
	pv->pv_pmap = pmap;

	vm_page_lock_queues();
	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count++;
	vm_page_unlock_queues();
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
{
	pt_entry_t oldpte;
	vm_page_t m;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if (oldpte & PG_G)
		pmap_invalidate_page(kernel_pmap, va);
	pmap->pm_stats.resident_count -= 1;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte);
		if (oldpte & PG_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t) oldpte)) {
				printf(
	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
				    va, oldpte);
			}
#endif
			if (pmap_track_modified(va))
				vm_page_dirty(m);
		}
		if (oldpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);
		return pmap_remove_entry(pmap, m, va);
	} else {
		return pmap_unuse_pt(pmap, va);
	}
}

/*
 * Remove a single page from a process address space
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
		return;
	pmap_remove_pte(pmap, pte, va);
	pmap_invalidate_page(pmap, va);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	int anyvalid;

	if (pmap == NULL)
		return;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	vm_page_lock_queues();
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if ((sva + PAGE_SIZE == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva);
		goto out;
	}

	anyvalid = 0;

	for (; sva < eva; sva = pdnxt) {
		unsigned pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			pmap->pm_pdir[pdirindex] = 0;
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			anyvalid = 1;
			continue;
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eva)
			pdnxt = eva;

		for (; sva != pdnxt; sva += PAGE_SIZE) {
			if ((pte = pmap_pte_quick(pmap, sva)) == NULL ||
			    *pte == 0)
				continue;
			anyvalid = 1;
			if (pmap_remove_pte(pmap, pte, sva))
				break;
		}
	}

	if (anyvalid)
		pmap_invalidate_all(pmap);
out:
	sched_unpin();
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
}
1717/*
1718 *	Routine:	pmap_remove_all
1719 *	Function:
1720 *		Removes this physical page from
1721 *		all physical maps in which it resides.
1722 *		Reflects back modify bits to the pager.
1723 *
1724 *	Notes:
1725 *		Original versions of this routine were very
1726 *		inefficient because they iteratively called
1727 *		pmap_remove (slow...)
1728 */
1729
1730void
1731pmap_remove_all(vm_page_t m)
1732{
1733	register pv_entry_t pv;
1734	pt_entry_t *pte, tpte;
1735
1736#if defined(PMAP_DIAGNOSTIC)
1737	/*
1738	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1739	 */
1740	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1741		panic("pmap_remove_all: illegal for unmanaged page, pa: 0x%x",
1742		    VM_PAGE_TO_PHYS(m));
1743	}
1744#endif
1745	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1746	sched_pin();
1747	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1748		PMAP_LOCK(pv->pv_pmap);
1749		pv->pv_pmap->pm_stats.resident_count--;
1750		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1751		tpte = pte_load_clear(pte);
1752		if (tpte & PG_W)
1753			pv->pv_pmap->pm_stats.wired_count--;
1754		if (tpte & PG_A)
1755			vm_page_flag_set(m, PG_REFERENCED);
1756
1757		/*
1758		 * Update the vm_page_t clean and reference bits.
1759		 */
1760		if (tpte & PG_M) {
1761#if defined(PMAP_DIAGNOSTIC)
1762			if (pmap_nw_modified((pt_entry_t) tpte)) {
1763				printf(
1764	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1765				    pv->pv_va, tpte);
1766			}
1767#endif
1768			if (pmap_track_modified(pv->pv_va))
1769				vm_page_dirty(m);
1770		}
1771		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1772		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1773		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1774		m->md.pv_list_count--;
1775		pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
1776		PMAP_UNLOCK(pv->pv_pmap);
1777		free_pv_entry(pv);
1778	}
1779	vm_page_flag_clear(m, PG_WRITEABLE);
1780	sched_unpin();
1781}
1782
1783/*
1784 *	Set the physical protection on the
1785 *	specified range of this map as requested.
1786 */
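/*
 * Behavioural sketch, derived from the checks below: if VM_PROT_READ is
 * not requested the mappings are removed; if VM_PROT_WRITE is still
 * requested there is nothing to change; otherwise the range is
 * write-protected, e.g.
 *
 *	pmap_protect(pmap, sva, eva, VM_PROT_READ);
 */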
1787void
1788pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1789{
1790	vm_offset_t pdnxt;
1791	pd_entry_t ptpaddr;
1792	int anychanged;
1793
1794	if (pmap == NULL)
1795		return;
1796
1797	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1798		pmap_remove(pmap, sva, eva);
1799		return;
1800	}
1801
1802	if (prot & VM_PROT_WRITE)
1803		return;
1804
1805	anychanged = 0;
1806
1807	vm_page_lock_queues();
1808	sched_pin();
1809	PMAP_LOCK(pmap);
1810	for (; sva < eva; sva = pdnxt) {
1811		unsigned pdirindex;
1812
1813		pdnxt = (sva + NBPDR) & ~PDRMASK;
1814
1815		pdirindex = sva >> PDRSHIFT;
1816		ptpaddr = pmap->pm_pdir[pdirindex];
1817
1818		/*
1819		 * Weed out invalid mappings. Note: we assume that the page
1820		 * directory table is always allocated, and in kernel virtual memory.
1821		 */
1822		if (ptpaddr == 0)
1823			continue;
1824
1825		/*
1826		 * Check for large page.
1827		 */
1828		if ((ptpaddr & PG_PS) != 0) {
1829			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1830			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1831			anychanged = 1;
1832			continue;
1833		}
1834
1835		if (pdnxt > eva)
1836			pdnxt = eva;
1837
1838		for (; sva != pdnxt; sva += PAGE_SIZE) {
1839			pt_entry_t pbits;
1840			pt_entry_t *pte;
1841			vm_page_t m;
1842
1843			if ((pte = pmap_pte_quick(pmap, sva)) == NULL)
1844				continue;
1845			pbits = *pte;
1846			if (pbits & PG_MANAGED) {
1847				m = NULL;
1848				if (pbits & PG_A) {
1849					m = PHYS_TO_VM_PAGE(pbits);
1850					vm_page_flag_set(m, PG_REFERENCED);
1851					pbits &= ~PG_A;
1852				}
1853				if ((pbits & PG_M) != 0 &&
1854				    pmap_track_modified(sva)) {
1855					if (m == NULL)
1856						m = PHYS_TO_VM_PAGE(pbits);
1857					vm_page_dirty(m);
1858					pbits &= ~PG_M;
1859				}
1860			}
1861
1862			pbits &= ~PG_RW;
1863
1864			if (pbits != *pte) {
1865				pte_store(pte, pbits);
1866				anychanged = 1;
1867			}
1868		}
1869	}
1870	if (anychanged)
1871		pmap_invalidate_all(pmap);
1872	sched_unpin();
1873	vm_page_unlock_queues();
1874	PMAP_UNLOCK(pmap);
1875}
1876
1877/*
1878 *	Insert the given physical page (p) at
1879 *	the specified virtual address (v) in the
1880 *	target physical map with the protection requested.
1881 *
1882 *	If specified, the page will be wired down, meaning
1883 *	that the related pte cannot be reclaimed.
1884 *
1885 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1886 *	or lose information.  That is, this routine must actually
1887 *	insert this page into the given map NOW.
1888 */
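/*
 * Illustrative call (an assumed caller, not from this file): the fault
 * handler enters the now-resident page with the protection it faulted
 * for, roughly
 *
 *	pmap_enter(pmap, va, m, fault_prot, FALSE);
 *
 * where fault_prot is a placeholder for the faulting protection.
 */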
1889void
1890pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1891	   boolean_t wired)
1892{
1893	vm_paddr_t pa;
1894	register pt_entry_t *pte;
1895	vm_paddr_t opa;
1896	pt_entry_t origpte, newpte;
1897	vm_page_t mpte;
1898
1899	if (pmap == NULL)
1900		return;
1901
1902	va &= PG_FRAME;
1903#ifdef PMAP_DIAGNOSTIC
1904	if (va > VM_MAX_KERNEL_ADDRESS)
1905		panic("pmap_enter: toobig");
1906	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1907		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1908#endif
1909
1910	mpte = NULL;
1911	/*
1912	 * In the case that a page table page is not
1913	 * resident, we are creating it here.
1914	 */
1915	if (va < VM_MAXUSER_ADDRESS) {
1916		mpte = pmap_allocpte(pmap, va);
1917	}
1918#if 0 && defined(PMAP_DIAGNOSTIC)
1919	else {
1920		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1921		origpte = *pdeaddr;
1922		if ((origpte & PG_V) == 0) {
1923			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
1924				pmap->pm_pdir[PTDPTDI], origpte, va);
1925		}
1926	}
1927#endif
1928
1929	pte = pmap_pte(pmap, va);
1930
1931	/*
1932	 * Page directory table entry is not valid; we need a new PT page.
1933	 */
1934	if (pte == NULL) {
1935		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
1936			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
1937	}
1938
1939	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
1940	origpte = *pte;
1941	opa = origpte & PG_FRAME;
1942
1943	if (origpte & PG_PS) {
1944		/*
1945		 * Yes, I know this will truncate upper address bits for PAE,
1946		 * but I'm actually more interested in the lower bits
1947		 */
1948		printf("pmap_enter: va %p, pte %p, origpte %p\n",
1949		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
1950		panic("pmap_enter: attempted pmap_enter on 4MB page");
1951	}
1952
1953	/*
1954	 * Mapping has not changed, must be protection or wiring change.
1955	 */
1956	if (origpte && (opa == pa)) {
1957		/*
1958		 * Wiring change, just update stats. We don't worry about
1959		 * wiring PT pages as they remain resident as long as there
1960		 * are valid mappings in them. Hence, if a user page is wired,
1961		 * the PT page will be also.
1962		 */
1963		if (wired && ((origpte & PG_W) == 0))
1964			pmap->pm_stats.wired_count++;
1965		else if (!wired && (origpte & PG_W))
1966			pmap->pm_stats.wired_count--;
1967
1968#if defined(PMAP_DIAGNOSTIC)
1969		if (pmap_nw_modified((pt_entry_t) origpte)) {
1970			printf(
1971	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
1972			    va, origpte);
1973		}
1974#endif
1975
1976		/*
1977		 * Remove extra pte reference
1978		 */
1979		if (mpte)
1980			mpte->hold_count--;
1981
1982		/*
1983		 * We might be turning off write access to the page,
1984		 * so we go ahead and sense modify status.
1985		 */
1986		if (origpte & PG_MANAGED) {
1987			if ((origpte & PG_M) && pmap_track_modified(va)) {
1988				vm_page_t om;
1989				om = PHYS_TO_VM_PAGE(opa);
1990				vm_page_dirty(om);
1991			}
1992			pa |= PG_MANAGED;
1993		}
1994		goto validate;
1995	}
1996	/*
1997	 * Mapping has changed, invalidate old range and fall through to
1998	 * handle validating new mapping.
1999	 */
2000	if (opa) {
2001		int err;
2002		vm_page_lock_queues();
2003		PMAP_LOCK(pmap);
2004		err = pmap_remove_pte(pmap, pte, va);
2005		PMAP_UNLOCK(pmap);
2006		vm_page_unlock_queues();
2007		if (err)
2008			panic("pmap_enter: pte vanished, va: 0x%x", va);
2009	}
2010
2011	/*
2012	 * Enter on the PV list if part of our managed memory. Note that we
2013	 * raise IPL while manipulating pv_table since pmap_enter can be
2014	 * called at interrupt time.
2015	 */
2016	if (pmap_initialized &&
2017	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2018		pmap_insert_entry(pmap, va, m);
2019		pa |= PG_MANAGED;
2020	}
2021
2022	/*
2023	 * Increment counters
2024	 */
2025	pmap->pm_stats.resident_count++;
2026	if (wired)
2027		pmap->pm_stats.wired_count++;
2028
2029validate:
2030	/*
2031	 * Now validate mapping with desired protection/wiring.
2032	 */
2033	newpte = (pt_entry_t)(pa | PG_V);
2034	if ((prot & VM_PROT_WRITE) != 0)
2035		newpte |= PG_RW;
2036	if (wired)
2037		newpte |= PG_W;
2038	if (va < VM_MAXUSER_ADDRESS)
2039		newpte |= PG_U;
2040	if (pmap == kernel_pmap)
2041		newpte |= pgeflag;
2042
2043	/*
2044	 * if the mapping or permission bits are different, we need
2045	 * to update the pte.
2046	 */
2047	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2048		pte_store(pte, newpte | PG_A);
2049		/*if (origpte)*/ {
2050			pmap_invalidate_page(pmap, va);
2051		}
2052	}
2053}
2054
2055/*
2056 * This code makes some *MAJOR* assumptions:
2057 * 1. The pmap is the current pmap and it exists.
2058 * 2. Not wired.
2059 * 3. Read access.
2060 * 4. No page table pages.
2061 * 5. The TLB flush is deferred to the calling procedure.
2062 * 6. Page IS managed.
2063 * but is *MUCH* faster than pmap_enter...
2064 */
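/*
 * Illustrative use (an assumption about callers, not from this file):
 * prefaulting code enters already-resident pages read-only and recycles
 * the returned page table page across iterations, e.g.
 *
 *	mpte = NULL;
 *	...
 *	mpte = pmap_enter_quick(pmap, addr, m, mpte);
 */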
2065
2066vm_page_t
2067pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2068{
2069	pt_entry_t *pte;
2070	vm_paddr_t pa;
2071
2072	/*
2073	 * In the case that a page table page is not
2074	 * resident, we are creating it here.
2075	 */
2076	if (va < VM_MAXUSER_ADDRESS) {
2077		unsigned ptepindex;
2078		pd_entry_t ptepa;
2079
2080		/*
2081		 * Calculate pagetable page index
2082		 */
2083		ptepindex = va >> PDRSHIFT;
2084		if (mpte && (mpte->pindex == ptepindex)) {
2085			mpte->hold_count++;
2086		} else {
2087retry:
2088			/*
2089			 * Get the page directory entry
2090			 */
2091			ptepa = pmap->pm_pdir[ptepindex];
2092
2093			/*
2094			 * If the page table page is mapped, we just increment
2095			 * the hold count, and activate it.
2096			 */
2097			if (ptepa) {
2098				if (ptepa & PG_PS)
2099					panic("pmap_enter_quick: unexpected mapping into 4MB page");
2100				mpte = PHYS_TO_VM_PAGE(ptepa);
2101				mpte->hold_count++;
2102			} else {
2103				mpte = _pmap_allocpte(pmap, ptepindex);
2104				if (mpte == NULL)
2105					goto retry;
2106			}
2107		}
2108	} else {
2109		mpte = NULL;
2110	}
2111
2112	/*
2113	 * This call to vtopte makes the assumption that we are
2114	 * entering the page into the current pmap.  In order to support
2115	 * quick entry into any pmap, one would likely use pmap_pte_quick.
2116	 * But that isn't as quick as vtopte.
2117	 */
2118	pte = vtopte(va);
2119	if (*pte) {
2120		if (mpte != NULL) {
2121			vm_page_lock_queues();
2122			pmap_unwire_pte_hold(pmap, mpte);
2123			vm_page_unlock_queues();
2124		}
2125		return 0;
2126	}
2127
2128	/*
2129	 * Enter on the PV list if part of our managed memory. Note that we
2130	 * raise IPL while manipulating pv_table since pmap_enter can be
2131	 * called at interrupt time.
2132	 */
2133	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2134		pmap_insert_entry(pmap, va, m);
2135
2136	/*
2137	 * Increment counters
2138	 */
2139	pmap->pm_stats.resident_count++;
2140
2141	pa = VM_PAGE_TO_PHYS(m);
2142
2143	/*
2144	 * Now validate mapping with RO protection
2145	 */
2146	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2147		pte_store(pte, pa | PG_V | PG_U);
2148	else
2149		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2150
2151	return mpte;
2152}
2153
2154/*
2155 * Make a temporary mapping for a physical address.  This is only intended
2156 * to be used for panic dumps.
2157 */
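/*
 * Illustrative use (an assumption about the dump path, not from this
 * file): successive pages are mapped at consecutive slots of
 * crashdumpmap, e.g.
 *
 *	va = pmap_kenter_temporary(pa + i * PAGE_SIZE, i);
 */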
2158void *
2159pmap_kenter_temporary(vm_paddr_t pa, int i)
2160{
2161	vm_offset_t va;
2162
2163	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2164	pmap_kenter(va, pa);
2165#ifndef I386_CPU
2166	invlpg(va);
2167#else
2168	invltlb();
2169#endif
2170	return ((void *)crashdumpmap);
2171}
2172
2173/*
2174 * This code maps large physical mmap regions into the
2175 * processor address space.  Note that some shortcuts
2176 * are taken, but the code works.
2177 */
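/*
 * Note: as the checks below show, this is only used for OBJT_DEVICE
 * objects (see the KASSERT), and nothing is mapped unless PSE (4MB
 * pages) is available and both addr and size are NBPDR (4MB) aligned.
 */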
2178void
2179pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2180		    vm_object_t object, vm_pindex_t pindex,
2181		    vm_size_t size)
2182{
2183	vm_page_t p;
2184
2185	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2186	KASSERT(object->type == OBJT_DEVICE,
2187	    ("pmap_object_init_pt: non-device object"));
2188	if (pseflag &&
2189	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2190		int i;
2191		vm_page_t m[1];
2192		unsigned int ptepindex;
2193		int npdes;
2194		pd_entry_t ptepa;
2195
2196		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2197			return;
2198retry:
2199		p = vm_page_lookup(object, pindex);
2200		if (p != NULL) {
2201			vm_page_lock_queues();
2202			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2203				goto retry;
2204		} else {
2205			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2206			if (p == NULL)
2207				return;
2208			m[0] = p;
2209
2210			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2211				vm_page_lock_queues();
2212				vm_page_free(p);
2213				vm_page_unlock_queues();
2214				return;
2215			}
2216
2217			p = vm_page_lookup(object, pindex);
2218			vm_page_lock_queues();
2219			vm_page_wakeup(p);
2220		}
2221		vm_page_unlock_queues();
2222
2223		ptepa = VM_PAGE_TO_PHYS(p);
2224		if (ptepa & (NBPDR - 1))
2225			return;
2226
2227		p->valid = VM_PAGE_BITS_ALL;
2228
2229		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2230		npdes = size >> PDRSHIFT;
2231		for(i = 0; i < npdes; i++) {
2232			pde_store(&pmap->pm_pdir[ptepindex],
2233			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
2234			ptepa += NBPDR;
2235			ptepindex += 1;
2236		}
2237		pmap_invalidate_all(pmap);
2238	}
2239}
2240
2241/*
2242 *	Routine:	pmap_change_wiring
2243 *	Function:	Change the wiring attribute for a map/virtual-address
2244 *			pair.
2245 *	In/out conditions:
2246 *			The mapping must already exist in the pmap.
2247 */
2248void
2249pmap_change_wiring(pmap, va, wired)
2250	register pmap_t pmap;
2251	vm_offset_t va;
2252	boolean_t wired;
2253{
2254	register pt_entry_t *pte;
2255
2256	if (pmap == NULL)
2257		return;
2258
2259	PMAP_LOCK(pmap);
2260	pte = pmap_pte(pmap, va);
2261
2262	if (wired && !pmap_pte_w(pte))
2263		pmap->pm_stats.wired_count++;
2264	else if (!wired && pmap_pte_w(pte))
2265		pmap->pm_stats.wired_count--;
2266
2267	/*
2268	 * Wiring is not a hardware characteristic so there is no need to
2269	 * invalidate TLB.
2270	 */
2271	pmap_pte_set_w(pte, wired);
2272	PMAP_UNLOCK(pmap);
2273}
2274
2275
2276
2277/*
2278 *	Copy the range specified by src_addr/len
2279 *	from the source map to the range dst_addr/len
2280 *	in the destination map.
2281 *
2282 *	This routine is only advisory and need not do anything.
2283 */
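/*
 * Note: as the early returns below show, this implementation only
 * copies when dst_addr == src_addr and the source pmap is the current
 * one; anything else is silently skipped, which is permitted because
 * the routine is advisory.
 */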
2284
2285void
2286pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2287	  vm_offset_t src_addr)
2288{
2289	vm_offset_t addr;
2290	vm_offset_t end_addr = src_addr + len;
2291	vm_offset_t pdnxt;
2292	vm_page_t m;
2293
2294	if (dst_addr != src_addr)
2295		return;
2296
2297	if (!pmap_is_current(src_pmap))
2298		return;
2299
2300	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2301		pt_entry_t *src_pte, *dst_pte;
2302		vm_page_t dstmpte, srcmpte;
2303		pd_entry_t srcptepaddr;
2304		unsigned ptepindex;
2305
2306		if (addr >= UPT_MIN_ADDRESS)
2307			panic("pmap_copy: invalid to pmap_copy page tables\n");
2308
2309		/*
2310		 * Don't let optional prefaulting of pages make us go
2311		 * way below the low water mark of free pages or way
2312		 * above the high water mark of used pv entries.
2313		 */
2314		if (cnt.v_free_count < cnt.v_free_reserved ||
2315		    pv_entry_count > pv_entry_high_water)
2316			break;
2317
2318		pdnxt = (addr + NBPDR) & ~PDRMASK;
2319		ptepindex = addr >> PDRSHIFT;
2320
2321		srcptepaddr = src_pmap->pm_pdir[ptepindex];
2322		if (srcptepaddr == 0)
2323			continue;
2324
2325		if (srcptepaddr & PG_PS) {
2326			if (dst_pmap->pm_pdir[ptepindex] == 0) {
2327				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2328				dst_pmap->pm_stats.resident_count +=
2329				    NBPDR / PAGE_SIZE;
2330			}
2331			continue;
2332		}
2333
2334		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2335		if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY))
2336			continue;
2337
2338		if (pdnxt > end_addr)
2339			pdnxt = end_addr;
2340
2341		src_pte = vtopte(addr);
2342		while (addr < pdnxt) {
2343			pt_entry_t ptetemp;
2344			ptetemp = *src_pte;
2345			/*
2346			 * We only copy mappings of managed pages.
2347			 */
2348			if ((ptetemp & PG_MANAGED) != 0) {
2349				/*
2350				 * We have to check after allocpte for the
2351				 * pte still being around...  allocpte can
2352				 * block.
2353				 */
2354				dstmpte = pmap_allocpte(dst_pmap, addr);
2355				dst_pte = pmap_pte(dst_pmap, addr);
2356				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2357					/*
2358					 * Clear the modified and
2359					 * accessed (referenced) bits
2360					 * during the copy.
2361					 */
2362					m = PHYS_TO_VM_PAGE(ptetemp);
2363					*dst_pte = ptetemp & ~(PG_M | PG_A);
2364					dst_pmap->pm_stats.resident_count++;
2365					pmap_insert_entry(dst_pmap, addr, m);
2366	 			} else {
2367					vm_page_lock_queues();
2368					pmap_unwire_pte_hold(dst_pmap, dstmpte);
2369					vm_page_unlock_queues();
2370				}
2371				if (dstmpte->hold_count >= srcmpte->hold_count)
2372					break;
2373			}
2374			addr += PAGE_SIZE;
2375			src_pte++;
2376		}
2377	}
2378}
2379
2380static __inline void
2381pagezero(void *page)
2382{
2383#if defined(I686_CPU)
2384	if (cpu_class == CPUCLASS_686) {
2385#if defined(CPU_ENABLE_SSE)
2386		if (cpu_feature & CPUID_SSE2)
2387			sse2_pagezero(page);
2388		else
2389#endif
2390			i686_pagezero(page);
2391	} else
2392#endif
2393		bzero(page, PAGE_SIZE);
2394}
2395
2396/*
2397 *	pmap_zero_page zeros the specified hardware page by mapping
2398 *	the page into KVM and using bzero to clear its contents.
2399 */
2400void
2401pmap_zero_page(vm_page_t m)
2402{
2403
2404	mtx_lock(&CMAPCADDR12_lock);
2405	if (*CMAP2)
2406		panic("pmap_zero_page: CMAP2 busy");
2407	sched_pin();
2408	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2409	invlcaddr(CADDR2);
2410	pagezero(CADDR2);
2411	*CMAP2 = 0;
2412	sched_unpin();
2413	mtx_unlock(&CMAPCADDR12_lock);
2414}
2415
2416/*
2417 *	pmap_zero_page_area zeros the specified area of a hardware page by
2418 *	mapping the page into KVM and using bzero to clear its contents.
2419 *
2420 *	off and size may not cover an area beyond a single hardware page.
2421 */
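/*
 * For example, clearing only the second half of a page (an illustrative
 * call, not from this file):
 *
 *	pmap_zero_page_area(m, PAGE_SIZE / 2, PAGE_SIZE / 2);
 */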
2422void
2423pmap_zero_page_area(vm_page_t m, int off, int size)
2424{
2425
2426	mtx_lock(&CMAPCADDR12_lock);
2427	if (*CMAP2)
2428		panic("pmap_zero_page_area: CMAP2 busy");
2429	sched_pin();
2430	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2431	invlcaddr(CADDR2);
2432	if (off == 0 && size == PAGE_SIZE)
2433		pagezero(CADDR2);
2434	else
2435		bzero((char *)CADDR2 + off, size);
2436	*CMAP2 = 0;
2437	sched_unpin();
2438	mtx_unlock(&CMAPCADDR12_lock);
2439}
2440
2441/*
2442 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2443 *	the page into KVM and using bzero to clear its contents.  This
2444 *	is intended to be called from the vm_pagezero process only and
2445 *	outside of Giant.
2446 */
2447void
2448pmap_zero_page_idle(vm_page_t m)
2449{
2450
2451	if (*CMAP3)
2452		panic("pmap_zero_page_idle: CMAP3 busy");
2453	sched_pin();
2454	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2455	invlcaddr(CADDR3);
2456	pagezero(CADDR3);
2457	*CMAP3 = 0;
2458	sched_unpin();
2459}
2460
2461/*
2462 *	pmap_copy_page copies the specified (machine independent)
2463 *	page by mapping the page into virtual memory and using
2464 *	bcopy to copy the page, one machine dependent page at a
2465 *	time.
2466 */
2467void
2468pmap_copy_page(vm_page_t src, vm_page_t dst)
2469{
2470
2471	mtx_lock(&CMAPCADDR12_lock);
2472	if (*CMAP1)
2473		panic("pmap_copy_page: CMAP1 busy");
2474	if (*CMAP2)
2475		panic("pmap_copy_page: CMAP2 busy");
2476	sched_pin();
2477#ifdef I386_CPU
2478	invltlb();
2479#else
2480	invlpg((u_int)CADDR1);
2481	invlpg((u_int)CADDR2);
2482#endif
2483	*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2484	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2485	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2486	*CMAP1 = 0;
2487	*CMAP2 = 0;
2488	sched_unpin();
2489	mtx_unlock(&CMAPCADDR12_lock);
2490}
2491
2492/*
2493 * Returns true if the pmap's pv is one of the first
2494 * 16 pvs linked to from this page.  This count may
2495 * be changed upwards or downwards in the future; it
2496 * is only necessary that true be returned for a small
2497 * subset of pmaps for proper page aging.
2498 */
2499boolean_t
2500pmap_page_exists_quick(pmap, m)
2501	pmap_t pmap;
2502	vm_page_t m;
2503{
2504	pv_entry_t pv;
2505	int loops = 0;
2506
2507	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2508		return FALSE;
2509
2510	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2511	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2512		if (pv->pv_pmap == pmap) {
2513			return TRUE;
2514		}
2515		loops++;
2516		if (loops >= 16)
2517			break;
2518	}
2519	return (FALSE);
2520}
2521
2522#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2523/*
2524 * Remove all pages from the specified address space;
2525 * this aids process exit speed.  Also, this code
2526 * is special cased for current process only, but
2527 * can have the more generic (and slightly slower)
2528 * mode enabled.  This is much faster than pmap_remove
2529 * in the case of running down an entire address space.
2530 */
2531void
2532pmap_remove_pages(pmap, sva, eva)
2533	pmap_t pmap;
2534	vm_offset_t sva, eva;
2535{
2536	pt_entry_t *pte, tpte;
2537	vm_page_t m;
2538	pv_entry_t pv, npv;
2539
2540#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2541	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2542		printf("warning: pmap_remove_pages called with non-current pmap\n");
2543		return;
2544	}
2545#endif
2546	vm_page_lock_queues();
2547	PMAP_LOCK(pmap);
2548	sched_pin();
2549	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2550
2551		if (pv->pv_va >= eva || pv->pv_va < sva) {
2552			npv = TAILQ_NEXT(pv, pv_plist);
2553			continue;
2554		}
2555
2556#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2557		pte = vtopte(pv->pv_va);
2558#else
2559		pte = pmap_pte_quick(pmap, pv->pv_va);
2560#endif
2561		tpte = *pte;
2562
2563		if (tpte == 0) {
2564			printf("TPTE at %p  IS ZERO @ VA %08x\n",
2565							pte, pv->pv_va);
2566			panic("bad pte");
2567		}
2568
2569/*
2570 * We cannot remove wired pages from a process' mapping at this time
2571 */
2572		if (tpte & PG_W) {
2573			npv = TAILQ_NEXT(pv, pv_plist);
2574			continue;
2575		}
2576
2577		m = PHYS_TO_VM_PAGE(tpte);
2578		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2579		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2580		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2581
2582		KASSERT(m < &vm_page_array[vm_page_array_size],
2583			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2584
2585		pmap->pm_stats.resident_count--;
2586
2587		pte_clear(pte);
2588
2589		/*
2590		 * Update the vm_page_t clean and reference bits.
2591		 */
2592		if (tpte & PG_M) {
2593			vm_page_dirty(m);
2594		}
2595
2596		npv = TAILQ_NEXT(pv, pv_plist);
2597		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2598
2599		m->md.pv_list_count--;
2600		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2601		if (TAILQ_EMPTY(&m->md.pv_list))
2602			vm_page_flag_clear(m, PG_WRITEABLE);
2603
2604		pmap_unuse_pt(pmap, pv->pv_va);
2605		free_pv_entry(pv);
2606	}
2607	sched_unpin();
2608	pmap_invalidate_all(pmap);
2609	PMAP_UNLOCK(pmap);
2610	vm_page_unlock_queues();
2611}
2612
2613/*
2614 *	pmap_is_modified:
2615 *
2616 *	Return whether or not the specified physical page was modified
2617 *	in any physical maps.
2618 */
2619boolean_t
2620pmap_is_modified(vm_page_t m)
2621{
2622	pv_entry_t pv;
2623	pt_entry_t *pte;
2624	boolean_t rv;
2625
2626	rv = FALSE;
2627	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2628		return (rv);
2629
2630	sched_pin();
2631	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2632	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2633		/*
2634		 * If the bit being tested is the modified bit, then
2635		 * mappings within the clean submap are never regarded
2636		 * as modified, so skip them.
2637		 */
2638		if (!pmap_track_modified(pv->pv_va))
2639			continue;
2640#if defined(PMAP_DIAGNOSTIC)
2641		if (!pv->pv_pmap) {
2642			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2643			continue;
2644		}
2645#endif
2646		PMAP_LOCK(pv->pv_pmap);
2647		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2648		rv = (*pte & PG_M) != 0;
2649		PMAP_UNLOCK(pv->pv_pmap);
2650		if (rv)
2651			break;
2652	}
2653	sched_unpin();
2654	return (rv);
2655}
2656
2657/*
2658 *	pmap_is_prefaultable:
2659 *
2660 *	Return whether or not the specified virtual address is eligible
2661 *	for prefault.
2662 */
2663boolean_t
2664pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2665{
2666	pt_entry_t *pte;
2667	boolean_t rv;
2668
2669	rv = FALSE;
2670	PMAP_LOCK(pmap);
2671	if (*pmap_pde(pmap, addr)) {
2672		pte = vtopte(addr);
2673		rv = *pte == 0;
2674	}
2675	PMAP_UNLOCK(pmap);
2676	return (rv);
2677}
2678
2679/*
2680 *	Clear the given bit in each of the given page's ptes.
2681 */
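/*
 * For example, pmap_clear_modify(), pmap_clear_reference() and
 * pmap_page_protect() below are thin wrappers around this routine,
 * passing PG_M, PG_A and PG_RW respectively.
 */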
2682static __inline void
2683pmap_clear_ptes(vm_page_t m, int bit)
2684{
2685	register pv_entry_t pv;
2686	pt_entry_t pbits, *pte;
2687
2688	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2689	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2690		return;
2691
2692	sched_pin();
2693	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2694	/*
2695	 * Loop over all current mappings, setting/clearing as appropriate.
2696	 * If setting RO, do we need to clear the VAC?
2697	 */
2698	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2699		/*
2700		 * don't write protect pager mappings
2701		 */
2702		if (bit == PG_RW) {
2703			if (!pmap_track_modified(pv->pv_va))
2704				continue;
2705		}
2706
2707#if defined(PMAP_DIAGNOSTIC)
2708		if (!pv->pv_pmap) {
2709			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2710			continue;
2711		}
2712#endif
2713
2714		PMAP_LOCK(pv->pv_pmap);
2715		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2716		pbits = *pte;
2717		if (pbits & bit) {
2718			if (bit == PG_RW) {
2719				if (pbits & PG_M) {
2720					vm_page_dirty(m);
2721				}
2722				pte_store(pte, pbits & ~(PG_M|PG_RW));
2723			} else {
2724				pte_store(pte, pbits & ~bit);
2725			}
2726			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2727		}
2728		PMAP_UNLOCK(pv->pv_pmap);
2729	}
2730	if (bit == PG_RW)
2731		vm_page_flag_clear(m, PG_WRITEABLE);
2732	sched_unpin();
2733}
2734
2735/*
2736 *      pmap_page_protect:
2737 *
2738 *      Lower the permission for all mappings to a given page.
2739 */
2740void
2741pmap_page_protect(vm_page_t m, vm_prot_t prot)
2742{
2743	if ((prot & VM_PROT_WRITE) == 0) {
2744		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2745			pmap_clear_ptes(m, PG_RW);
2746		} else {
2747			pmap_remove_all(m);
2748		}
2749	}
2750}
2751
2752/*
2753 *	pmap_ts_referenced:
2754 *
2755 *	Return a count of reference bits for a page, clearing those bits.
2756 *	It is not necessary for every reference bit to be cleared, but it
2757 *	is necessary that 0 only be returned when there are truly no
2758 *	reference bits set.
2759 *
2760 *	XXX: The exact number of bits to check and clear is a matter that
2761 *	should be tested and standardized at some point in the future for
2762 *	optimal aging of shared pages.
2763 */
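/*
 * Illustrative use (an assumption about the caller, not from this
 * file): the page daemon uses the returned count as an aging hint,
 * roughly
 *
 *	actcount = pmap_ts_referenced(m);
 */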
2764int
2765pmap_ts_referenced(vm_page_t m)
2766{
2767	register pv_entry_t pv, pvf, pvn;
2768	pt_entry_t *pte;
2769	pt_entry_t v;
2770	int rtval = 0;
2771
2772	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2773		return (rtval);
2774
2775	sched_pin();
2776	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2777	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2778
2779		pvf = pv;
2780
2781		do {
2782			pvn = TAILQ_NEXT(pv, pv_list);
2783
2784			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2785
2786			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2787
2788			if (!pmap_track_modified(pv->pv_va))
2789				continue;
2790
2791			PMAP_LOCK(pv->pv_pmap);
2792			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2793
2794			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2795				atomic_clear_int((u_int *)pte, PG_A);
2796				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2797
2798				rtval++;
2799				if (rtval > 4) {
2800					PMAP_UNLOCK(pv->pv_pmap);
2801					break;
2802				}
2803			}
2804			PMAP_UNLOCK(pv->pv_pmap);
2805		} while ((pv = pvn) != NULL && pv != pvf);
2806	}
2807	sched_unpin();
2808
2809	return (rtval);
2810}
2811
2812/*
2813 *	Clear the modify bits on the specified physical page.
2814 */
2815void
2816pmap_clear_modify(vm_page_t m)
2817{
2818	pmap_clear_ptes(m, PG_M);
2819}
2820
2821/*
2822 *	pmap_clear_reference:
2823 *
2824 *	Clear the reference bit on the specified physical page.
2825 */
2826void
2827pmap_clear_reference(vm_page_t m)
2828{
2829	pmap_clear_ptes(m, PG_A);
2830}
2831
2832/*
2833 * Miscellaneous support routines follow
2834 */
2835
2836/*
2837 * Map a set of physical memory pages into the kernel virtual
2838 * address space. Return a pointer to where it is mapped. This
2839 * routine is intended to be used for mapping device memory,
2840 * NOT real memory.
2841 */
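/*
 * Illustrative use (an assumed driver, not from this file):
 *
 *	regs = pmap_mapdev(pa, size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, size);
 */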
2842void *
2843pmap_mapdev(pa, size)
2844	vm_paddr_t pa;
2845	vm_size_t size;
2846{
2847	vm_offset_t va, tmpva, offset;
2848
2849	offset = pa & PAGE_MASK;
2850	size = roundup(offset + size, PAGE_SIZE);
2851	pa = pa & PG_FRAME;
2852
2853	if (pa < KERNLOAD && pa + size <= KERNLOAD)
2854		va = KERNBASE + pa;
2855	else
2856		va = kmem_alloc_nofault(kernel_map, size);
2857	if (!va)
2858		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2859
2860	for (tmpva = va; size > 0; ) {
2861		pmap_kenter(tmpva, pa);
2862		size -= PAGE_SIZE;
2863		tmpva += PAGE_SIZE;
2864		pa += PAGE_SIZE;
2865	}
2866	pmap_invalidate_range(kernel_pmap, va, tmpva);
2867	return ((void *)(va + offset));
2868}
2869
2870void
2871pmap_unmapdev(va, size)
2872	vm_offset_t va;
2873	vm_size_t size;
2874{
2875	vm_offset_t base, offset, tmpva;
2876
2877	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2878		return;
2879	base = va & PG_FRAME;
2880	offset = va & PAGE_MASK;
2881	size = roundup(offset + size, PAGE_SIZE);
2882	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2883		pmap_kremove(tmpva);
2884	pmap_invalidate_range(kernel_pmap, va, tmpva);
2885	kmem_free(kernel_map, base, size);
2886}
2887
2888/*
2889 * perform the pmap work for mincore
2890 */
2891int
2892pmap_mincore(pmap, addr)
2893	pmap_t pmap;
2894	vm_offset_t addr;
2895{
2896	pt_entry_t *ptep, pte;
2897	vm_page_t m;
2898	int val = 0;
2899
2900	PMAP_LOCK(pmap);
2901	ptep = pmap_pte(pmap, addr);
2902	pte = (ptep != NULL) ? *ptep : 0;
2903	PMAP_UNLOCK(pmap);
2904
2905	if (pte != 0) {
2906		vm_paddr_t pa;
2907
2908		val = MINCORE_INCORE;
2909		if ((pte & PG_MANAGED) == 0)
2910			return val;
2911
2912		pa = pte & PG_FRAME;
2913
2914		m = PHYS_TO_VM_PAGE(pa);
2915
2916		/*
2917		 * Modified by us
2918		 */
2919		if (pte & PG_M)
2920			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2921		else {
2922			/*
2923			 * Modified by someone else
2924			 */
2925			vm_page_lock_queues();
2926			if (m->dirty || pmap_is_modified(m))
2927				val |= MINCORE_MODIFIED_OTHER;
2928			vm_page_unlock_queues();
2929		}
2930		/*
2931		 * Referenced by us
2932		 */
2933		if (pte & PG_A)
2934			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2935		else {
2936			/*
2937			 * Referenced by someone else
2938			 */
2939			vm_page_lock_queues();
2940			if ((m->flags & PG_REFERENCED) ||
2941			    pmap_ts_referenced(m)) {
2942				val |= MINCORE_REFERENCED_OTHER;
2943				vm_page_flag_set(m, PG_REFERENCED);
2944			}
2945			vm_page_unlock_queues();
2946		}
2947	}
2948	return val;
2949}
2950
2951void
2952pmap_activate(struct thread *td)
2953{
2954	struct proc *p = td->td_proc;
2955	pmap_t	pmap, oldpmap;
2956	u_int32_t  cr3;
2957
2958	critical_enter();
2959	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2960	oldpmap = PCPU_GET(curpmap);
2961#if defined(SMP)
2962	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2963	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2964#else
2965	oldpmap->pm_active &= ~1;
2966	pmap->pm_active |= 1;
2967#endif
2968#ifdef PAE
2969	cr3 = vtophys(pmap->pm_pdpt);
2970#else
2971	cr3 = vtophys(pmap->pm_pdir);
2972#endif
2973	/* XXXKSE this is wrong.
2974	 * pmap_activate is for the current thread on the current cpu
2975	 */
2976	if (p->p_flag & P_SA) {
2977		/* Make sure all other cr3 entries are updated. */
2978		/* what if they are running?  XXXKSE (maybe abort them) */
2979		FOREACH_THREAD_IN_PROC(p, td) {
2980			td->td_pcb->pcb_cr3 = cr3;
2981		}
2982	} else {
2983		td->td_pcb->pcb_cr3 = cr3;
2984	}
2985	load_cr3(cr3);
2986	PCPU_SET(curpmap, pmap);
2987	critical_exit();
2988}
2989
2990vm_offset_t
2991pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2992{
2993
2994	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
2995		return addr;
2996	}
2997
2998	addr = (addr + PDRMASK) & ~PDRMASK;
2999	return addr;
3000}
3001
3002
3003#if defined(PMAP_DEBUG)
3004pmap_pid_dump(int pid)
3005{
3006	pmap_t pmap;
3007	struct proc *p;
3008	int npte = 0;
3009	int index;
3010
3011	sx_slock(&allproc_lock);
3012	LIST_FOREACH(p, &allproc, p_list) {
3013		if (p->p_pid != pid)
3014			continue;
3015
3016		if (p->p_vmspace) {
3017			int i,j;
3018			index = 0;
3019			pmap = vmspace_pmap(p->p_vmspace);
3020			for (i = 0; i < NPDEPTD; i++) {
3021				pd_entry_t *pde;
3022				pt_entry_t *pte;
3023				vm_offset_t base = i << PDRSHIFT;
3024
3025				pde = &pmap->pm_pdir[i];
3026				if (pde && pmap_pde_v(pde)) {
3027					for (j = 0; j < NPTEPG; j++) {
3028						vm_offset_t va = base + (j << PAGE_SHIFT);
3029						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3030							if (index) {
3031								index = 0;
3032								printf("\n");
3033							}
3034							sx_sunlock(&allproc_lock);
3035							return npte;
3036						}
3037						pte = pmap_pte(pmap, va);
3038						if (pte && pmap_pte_v(pte)) {
3039							pt_entry_t pa;
3040							vm_page_t m;
3041							pa = *pte;
3042							m = PHYS_TO_VM_PAGE(pa);
3043							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3044								va, pa, m->hold_count, m->wire_count, m->flags);
3045							npte++;
3046							index++;
3047							if (index >= 2) {
3048								index = 0;
3049								printf("\n");
3050							} else {
3051								printf(" ");
3052							}
3053						}
3054					}
3055				}
3056			}
3057		}
3058	}
3059	sx_sunlock(&allproc_lock);
3060	return npte;
3061}
3062#endif
3063
3064#if defined(DEBUG)
3065
3066static void	pads(pmap_t pm);
3067void		pmap_pvdump(vm_offset_t pa);
3068
3069/* print address space of pmap */
3070static void
3071pads(pm)
3072	pmap_t pm;
3073{
3074	int i, j;
3075	vm_paddr_t va;
3076	pt_entry_t *ptep;
3077
3078	if (pm == kernel_pmap)
3079		return;
3080	for (i = 0; i < NPDEPTD; i++)
3081		if (pm->pm_pdir[i])
3082			for (j = 0; j < NPTEPG; j++) {
3083				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3084				if (pm == kernel_pmap && va < KERNBASE)
3085					continue;
3086				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3087					continue;
3088				ptep = pmap_pte(pm, va);
3089				if (pmap_pte_v(ptep))
3090					printf("%x:%x ", va, *ptep);
3091			};
3092
3093}
3094
3095void
3096pmap_pvdump(pa)
3097	vm_paddr_t pa;
3098{
3099	pv_entry_t pv;
3100	vm_page_t m;
3101
3102	printf("pa %x", pa);
3103	m = PHYS_TO_VM_PAGE(pa);
3104	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3105		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3106		pads(pv->pv_pmap);
3107	}
3108	printf(" ");
3109}
3110#endif
3111