pmap.c revision 255811
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: stable/9/sys/i386/i386/pmap.c 255811 2013-09-23 07:53:58Z kib $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidate or reduced protection
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_apic.h"
107#include "opt_cpu.h"
108#include "opt_pmap.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/rwlock.h>
123#include <sys/sf_buf.h>
124#include <sys/sx.h>
125#include <sys/vmmeter.h>
126#include <sys/sched.h>
127#include <sys/sysctl.h>
128#ifdef SMP
129#include <sys/smp.h>
130#else
131#include <sys/cpuset.h>
132#endif
133
134#include <vm/vm.h>
135#include <vm/vm_param.h>
136#include <vm/vm_kern.h>
137#include <vm/vm_page.h>
138#include <vm/vm_map.h>
139#include <vm/vm_object.h>
140#include <vm/vm_extern.h>
141#include <vm/vm_pageout.h>
142#include <vm/vm_pager.h>
143#include <vm/vm_reserv.h>
144#include <vm/uma.h>
145
146#ifdef DEV_APIC
147#include <sys/bus.h>
148#include <machine/intr_machdep.h>
149#include <machine/apicvar.h>
150#endif
151#include <machine/cpu.h>
152#include <machine/cputypes.h>
153#include <machine/md_var.h>
154#include <machine/pcb.h>
155#include <machine/specialreg.h>
156#ifdef SMP
157#include <machine/smp.h>
158#endif
159
160#ifdef XBOX
161#include <machine/xbox.h>
162#endif
163
164#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
165#define CPU_ENABLE_SSE
166#endif
167
168#ifndef PMAP_SHPGPERPROC
169#define PMAP_SHPGPERPROC 200
170#endif
171
172#if !defined(DIAGNOSTIC)
173#ifdef __GNUC_GNU_INLINE__
174#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
175#else
176#define PMAP_INLINE	extern inline
177#endif
178#else
179#define PMAP_INLINE
180#endif
181
182#ifdef PV_STATS
183#define PV_STAT(x)	do { x ; } while (0)
184#else
185#define PV_STAT(x)	do { } while (0)
186#endif
187
188#define	pa_index(pa)	((pa) >> PDRSHIFT)
189#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
190
191/*
192 * Get PDEs and PTEs for user/kernel address space
193 */
194#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
195#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
196
197#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
198#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
199#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
200#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
201#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
202
203#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
204    atomic_clear_int((u_int *)(pte), PG_W))
205#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
206
207struct pmap kernel_pmap_store;
208LIST_HEAD(pmaplist, pmap);
209static struct pmaplist allpmaps;
210static struct mtx allpmaps_lock;
211
212vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
213vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
214int pgeflag = 0;		/* PG_G or-in */
215int pseflag = 0;		/* PG_PS or-in */
216
217static int nkpt = NKPT;
218vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
219extern u_int32_t KERNend;
220extern u_int32_t KPTphys;
221
222#ifdef PAE
223pt_entry_t pg_nx;
224static uma_zone_t pdptzone;
225#endif
226
227static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
228
229static int pat_works = 1;
230SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
231    "Is page attribute table fully functional?");
232
233static int pg_ps_enabled = 1;
234SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
235    "Are large page mappings enabled?");
236
237#define	PAT_INDEX_SIZE	8
238static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
239
240/*
241 * Isolate the global pv list lock from data and other locks to prevent false
242 * sharing within the cache.
243 */
244static struct {
245	struct rwlock	lock;
246	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
247} pvh_global __aligned(CACHE_LINE_SIZE);
248
249#define	pvh_global_lock	pvh_global.lock
250
251/*
252 * Data for the pv entry allocation mechanism
253 */
254static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
255static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
256static struct md_page *pv_table;
257static int shpgperproc = PMAP_SHPGPERPROC;
258
259struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
260int pv_maxchunks;			/* How many chunks we have KVA for */
261vm_offset_t pv_vafree;			/* freelist stored in the PTE */
262
263/*
264 * All those kernel PT submaps that BSD is so fond of
265 */
266struct sysmaps {
267	struct	mtx lock;
268	pt_entry_t *CMAP1;
269	pt_entry_t *CMAP2;
270	caddr_t	CADDR1;
271	caddr_t	CADDR2;
272};
273static struct sysmaps sysmaps_pcpu[MAXCPU];
274pt_entry_t *CMAP1 = 0;
275static pt_entry_t *CMAP3;
276static pd_entry_t *KPTD;
277caddr_t CADDR1 = 0, ptvmmap = 0;
278static caddr_t CADDR3;
279struct msgbuf *msgbufp = 0;
280
281/*
282 * Crashdump maps.
283 */
284static caddr_t crashdumpmap;
285
286static pt_entry_t *PMAP1 = 0, *PMAP2;
287static pt_entry_t *PADDR1 = 0, *PADDR2;
288#ifdef SMP
289static int PMAP1cpu;
290static int PMAP1changedcpu;
291SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
292	   &PMAP1changedcpu, 0,
293	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
294#endif
295static int PMAP1changed;
296SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
297	   &PMAP1changed, 0,
298	   "Number of times pmap_pte_quick changed PMAP1");
299static int PMAP1unchanged;
300SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
301	   &PMAP1unchanged, 0,
302	   "Number of times pmap_pte_quick didn't change PMAP1");
303static struct mtx PMAP2mutex;
304
305static void	free_pv_chunk(struct pv_chunk *pc);
306static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
307static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
308static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
309static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
310static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
311static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
312static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
313		    vm_offset_t va);
314static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
315
316static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
317static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
318    vm_prot_t prot);
319static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
320    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
321static void pmap_flush_page(vm_page_t m);
322static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
323static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
324static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
325static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
326static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
327static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
328static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
329static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
330static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
331static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
332    vm_prot_t prot);
333static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
334static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
335    vm_page_t *free);
336static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
337    vm_page_t *free);
338static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
339static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
340    vm_page_t *free);
341static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
342					vm_offset_t va);
343static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
344static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
345    vm_page_t m);
346static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
347    pd_entry_t newpde);
348static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
349
350static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
351
352static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
353static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
354static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
355static void pmap_pte_release(pt_entry_t *pte);
356static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
357#ifdef PAE
358static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
359#endif
360static void pmap_set_pg(void);
361
362static __inline void pagezero(void *page);
363
364CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
365CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
366
367/*
368 * If you get an error here, then you set KVA_PAGES wrong! See the
369 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
370 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
371 */
372CTASSERT(KERNBASE % (1 << 24) == 0);
373
374/*
375 *	Bootstrap the system enough to run with virtual memory.
376 *
377 *	On the i386 this is called after mapping has already been enabled
378 *	and just syncs the pmap module with what has already been done.
379 *	[We can't call it easily with mapping off since the kernel is not
380 *	mapped with PA == VA, hence we would have to relocate every address
381 *	from the linked base (virtual) address "KERNBASE" to the actual
382 *	(physical) address starting relative to 0]
383 */
384void
385pmap_bootstrap(vm_paddr_t firstaddr)
386{
387	vm_offset_t va;
388	pt_entry_t *pte, *unused;
389	struct sysmaps *sysmaps;
390	int i;
391
392	/*
393	 * Initialize the first available kernel virtual address.  However,
394	 * using "firstaddr" may waste a few pages of the kernel virtual
395	 * address space, because locore may not have mapped every physical
396	 * page that it allocated.  Preferably, locore would provide a first
397	 * unused virtual address in addition to "firstaddr".
398	 */
399	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
400
401	virtual_end = VM_MAX_KERNEL_ADDRESS;
402
403	/*
404	 * Initialize the kernel pmap (which is statically allocated).
405	 */
406	PMAP_LOCK_INIT(kernel_pmap);
407	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
408#ifdef PAE
409	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
410#endif
411	kernel_pmap->pm_root = NULL;
412	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
413	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
414
415 	/*
416	 * Initialize the global pv list lock.
417	 */
418	rw_init(&pvh_global_lock, "pmap pv global");
419
420	LIST_INIT(&allpmaps);
421
422	/*
423	 * Request a spin mutex so that changes to allpmaps cannot be
424	 * preempted by smp_rendezvous_cpus().  Otherwise,
425	 * pmap_update_pde_kernel() could access allpmaps while it is
426	 * being changed.
427	 */
428	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
429	mtx_lock_spin(&allpmaps_lock);
430	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
431	mtx_unlock_spin(&allpmaps_lock);
432
433	/*
434	 * Reserve some special page table entries/VA space for temporary
435	 * mapping of pages.
436	 */
437#define	SYSMAP(c, p, v, n)	\
438	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
439
440	va = virtual_avail;
441	pte = vtopte(va);
442
443	/*
444	 * CMAP1/CMAP2 are used for zeroing and copying pages.
445	 * CMAP3 is used for the idle process page zeroing.
446	 */
447	for (i = 0; i < MAXCPU; i++) {
448		sysmaps = &sysmaps_pcpu[i];
449		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
450		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
451		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
452	}
453	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
454	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
455
456	/*
457	 * Crashdump maps.
458	 */
459	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
460
461	/*
462	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
463	 */
464	SYSMAP(caddr_t, unused, ptvmmap, 1)
465
466	/*
467	 * msgbufp is used to map the system message buffer.
468	 */
469	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
470
471	/*
472	 * KPTmap is used by pmap_kextract().
473	 *
474	 * KPTmap is first initialized by locore.  However, that initial
475	 * KPTmap can only support NKPT page table pages.  Here, a larger
476	 * KPTmap is created that can support KVA_PAGES page table pages.
477	 */
478	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
479
480	for (i = 0; i < NKPT; i++)
481		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
482
483	/*
484	 * Adjust the start of the KPTD and KPTmap so that the implementation
485	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
486	 */
487	KPTD -= KPTDI;
488	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
489
490	/*
491	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
492	 * respectively.
493	 */
494	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
495	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
496
497	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
498
499	virtual_avail = va;
500
501	/*
502	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
503	 * physical memory region that is used by the ACPI wakeup code.  This
504	 * mapping must not have PG_G set.
505	 */
506#ifdef XBOX
507	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
508	 * an early stadium, we cannot yet neatly map video memory ... :-(
509	 * Better fixes are very welcome! */
510	if (!arch_i386_is_xbox)
511#endif
512	for (i = 1; i < NKPT; i++)
513		PTD[i] = 0;
514
515	/* Initialize the PAT MSR if present. */
516	pmap_init_pat();
517
518	/* Turn on PG_G on kernel page(s) */
519	pmap_set_pg();
520}
521
522/*
523 * Setup the PAT MSR.
524 */
525void
526pmap_init_pat(void)
527{
528	int pat_table[PAT_INDEX_SIZE];
529	uint64_t pat_msr;
530	u_long cr0, cr4;
531	int i;
532
533	/* Set default PAT index table. */
534	for (i = 0; i < PAT_INDEX_SIZE; i++)
535		pat_table[i] = -1;
536	pat_table[PAT_WRITE_BACK] = 0;
537	pat_table[PAT_WRITE_THROUGH] = 1;
538	pat_table[PAT_UNCACHEABLE] = 3;
539	pat_table[PAT_WRITE_COMBINING] = 3;
540	pat_table[PAT_WRITE_PROTECTED] = 3;
541	pat_table[PAT_UNCACHED] = 3;
542
543	/* Bail if this CPU doesn't implement PAT. */
544	if ((cpu_feature & CPUID_PAT) == 0) {
545		for (i = 0; i < PAT_INDEX_SIZE; i++)
546			pat_index[i] = pat_table[i];
547		pat_works = 0;
548		return;
549	}
550
551	/*
552	 * Due to some Intel errata, we can only safely use the lower 4
553	 * PAT entries.
554	 *
555	 *   Intel Pentium III Processor Specification Update
556	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
557	 * or Mode C Paging)
558	 *
559	 *   Intel Pentium IV  Processor Specification Update
560	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
561	 */
562	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
563	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
564		pat_works = 0;
565
566	/* Initialize default PAT entries. */
567	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
568	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
569	    PAT_VALUE(2, PAT_UNCACHED) |
570	    PAT_VALUE(3, PAT_UNCACHEABLE) |
571	    PAT_VALUE(4, PAT_WRITE_BACK) |
572	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
573	    PAT_VALUE(6, PAT_UNCACHED) |
574	    PAT_VALUE(7, PAT_UNCACHEABLE);
575
576	if (pat_works) {
577		/*
578		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
579		 * Program 5 and 6 as WP and WC.
580		 * Leave 4 and 7 as WB and UC.
581		 */
582		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
583		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
584		    PAT_VALUE(6, PAT_WRITE_COMBINING);
585		pat_table[PAT_UNCACHED] = 2;
586		pat_table[PAT_WRITE_PROTECTED] = 5;
587		pat_table[PAT_WRITE_COMBINING] = 6;
588	} else {
589		/*
590		 * Just replace PAT Index 2 with WC instead of UC-.
591		 */
592		pat_msr &= ~PAT_MASK(2);
593		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
594		pat_table[PAT_WRITE_COMBINING] = 2;
595	}
596
597	/* Disable PGE. */
598	cr4 = rcr4();
599	load_cr4(cr4 & ~CR4_PGE);
600
601	/* Disable caches (CD = 1, NW = 0). */
602	cr0 = rcr0();
603	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
604
605	/* Flushes caches and TLBs. */
606	wbinvd();
607	invltlb();
608
609	/* Update PAT and index table. */
610	wrmsr(MSR_PAT, pat_msr);
611	for (i = 0; i < PAT_INDEX_SIZE; i++)
612		pat_index[i] = pat_table[i];
613
614	/* Flush caches and TLBs again. */
615	wbinvd();
616	invltlb();
617
618	/* Restore caches and PGE. */
619	load_cr0(cr0);
620	load_cr4(cr4);
621}
622
623/*
624 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
625 */
626static void
627pmap_set_pg(void)
628{
629	pt_entry_t *pte;
630	vm_offset_t va, endva;
631
632	if (pgeflag == 0)
633		return;
634
635	endva = KERNBASE + KERNend;
636
637	if (pseflag) {
638		va = KERNBASE + KERNLOAD;
639		while (va  < endva) {
640			pdir_pde(PTD, va) |= pgeflag;
641			invltlb();	/* Play it safe, invltlb() every time */
642			va += NBPDR;
643		}
644	} else {
645		va = (vm_offset_t)btext;
646		while (va < endva) {
647			pte = vtopte(va);
648			if (*pte)
649				*pte |= pgeflag;
650			invltlb();	/* Play it safe, invltlb() every time */
651			va += PAGE_SIZE;
652		}
653	}
654}
655
656/*
657 * Initialize a vm_page's machine-dependent fields.
658 */
659void
660pmap_page_init(vm_page_t m)
661{
662
663	TAILQ_INIT(&m->md.pv_list);
664	m->md.pat_mode = PAT_WRITE_BACK;
665}
666
667#ifdef PAE
668static void *
669pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
670{
671
672	/* Inform UMA that this allocator uses kernel_map/object. */
673	*flags = UMA_SLAB_KERNEL;
674	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
675	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
676}
677#endif
678
679/*
680 * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
681 * Requirements:
682 *  - Must deal with pages in order to ensure that none of the PG_* bits
683 *    are ever set, PG_V in particular.
684 *  - Assumes we can write to ptes without pte_store() atomic ops, even
685 *    on PAE systems.  This should be ok.
686 *  - Assumes nothing will ever test these addresses for 0 to indicate
687 *    no mapping instead of correctly checking PG_V.
688 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
689 * Because PG_V is never set, there can be no mappings to invalidate.
690 */
691static vm_offset_t
692pmap_ptelist_alloc(vm_offset_t *head)
693{
694	pt_entry_t *pte;
695	vm_offset_t va;
696
697	va = *head;
698	if (va == 0)
699		return (va);	/* Out of memory */
700	pte = vtopte(va);
701	*head = *pte;
702	if (*head & PG_V)
703		panic("pmap_ptelist_alloc: va with PG_V set!");
704	*pte = 0;
705	return (va);
706}
707
708static void
709pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
710{
711	pt_entry_t *pte;
712
713	if (va & PG_V)
714		panic("pmap_ptelist_free: freeing va with PG_V set!");
715	pte = vtopte(va);
716	*pte = *head;		/* virtual! PG_V is 0 though */
717	*head = va;
718}
719
720static void
721pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
722{
723	int i;
724	vm_offset_t va;
725
726	*head = 0;
727	for (i = npages - 1; i >= 0; i--) {
728		va = (vm_offset_t)base + i * PAGE_SIZE;
729		pmap_ptelist_free(head, va);
730	}
731}
732
733
734/*
735 *	Initialize the pmap module.
736 *	Called by vm_init, to initialize any structures that the pmap
737 *	system needs to map virtual memory.
738 */
739void
740pmap_init(void)
741{
742	vm_page_t mpte;
743	vm_size_t s;
744	int i, pv_npg;
745
746	/*
747	 * Initialize the vm page array entries for the kernel pmap's
748	 * page table pages.
749	 */
750	for (i = 0; i < NKPT; i++) {
751		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
752		KASSERT(mpte >= vm_page_array &&
753		    mpte < &vm_page_array[vm_page_array_size],
754		    ("pmap_init: page table page is out of range"));
755		mpte->pindex = i + KPTDI;
756		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
757	}
758
759	/*
760	 * Initialize the address space (zone) for the pv entries.  Set a
761	 * high water mark so that the system can recover from excessive
762	 * numbers of pv entries.
763	 */
764	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
765	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
766	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
767	pv_entry_max = roundup(pv_entry_max, _NPCPV);
768	pv_entry_high_water = 9 * (pv_entry_max / 10);
769
770	/*
771	 * If the kernel is running in a virtual machine on an AMD Family 10h
772	 * processor, then it must assume that MCA is enabled by the virtual
773	 * machine monitor.
774	 */
775	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
776	    CPUID_TO_FAMILY(cpu_id) == 0x10)
777		workaround_erratum383 = 1;
778
779	/*
780	 * Are large page mappings supported and enabled?
781	 */
782	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
783	if (pseflag == 0)
784		pg_ps_enabled = 0;
785	else if (pg_ps_enabled) {
786		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
787		    ("pmap_init: can't assign to pagesizes[1]"));
788		pagesizes[1] = NBPDR;
789	}
790
791	/*
792	 * Calculate the size of the pv head table for superpages.
793	 */
794	for (i = 0; phys_avail[i + 1]; i += 2);
795	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
796
797	/*
798	 * Allocate memory for the pv head table for superpages.
799	 */
800	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
801	s = round_page(s);
802	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
803	for (i = 0; i < pv_npg; i++)
804		TAILQ_INIT(&pv_table[i].pv_list);
805
806	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
807	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
808	    PAGE_SIZE * pv_maxchunks);
809	if (pv_chunkbase == NULL)
810		panic("pmap_init: not enough kvm for pv chunks");
811	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
812#ifdef PAE
813	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
814	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
815	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
816	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
817#endif
818}
819
820
821SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
822	"Max number of PV entries");
823SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
824	"Page share factor per proc");
825
826static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
827    "2/4MB page mapping counters");
828
829static u_long pmap_pde_demotions;
830SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
831    &pmap_pde_demotions, 0, "2/4MB page demotions");
832
833static u_long pmap_pde_mappings;
834SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
835    &pmap_pde_mappings, 0, "2/4MB page mappings");
836
837static u_long pmap_pde_p_failures;
838SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
839    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
840
841static u_long pmap_pde_promotions;
842SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
843    &pmap_pde_promotions, 0, "2/4MB page promotions");
844
845/***************************************************
846 * Low level helper routines.....
847 ***************************************************/
848
849/*
850 * Determine the appropriate bits to set in a PTE or PDE for a specified
851 * caching mode.
852 */
853int
854pmap_cache_bits(int mode, boolean_t is_pde)
855{
856	int cache_bits, pat_flag, pat_idx;
857
858	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
859		panic("Unknown caching mode %d\n", mode);
860
861	/* The PAT bit is different for PTE's and PDE's. */
862	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
863
864	/* Map the caching mode to a PAT index. */
865	pat_idx = pat_index[mode];
866
867	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
868	cache_bits = 0;
869	if (pat_idx & 0x4)
870		cache_bits |= pat_flag;
871	if (pat_idx & 0x2)
872		cache_bits |= PG_NC_PCD;
873	if (pat_idx & 0x1)
874		cache_bits |= PG_NC_PWT;
875	return (cache_bits);
876}
877
878/*
879 * The caller is responsible for maintaining TLB consistency.
880 */
881static void
882pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
883{
884	pd_entry_t *pde;
885	pmap_t pmap;
886	boolean_t PTD_updated;
887
888	PTD_updated = FALSE;
889	mtx_lock_spin(&allpmaps_lock);
890	LIST_FOREACH(pmap, &allpmaps, pm_list) {
891		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
892		    PG_FRAME))
893			PTD_updated = TRUE;
894		pde = pmap_pde(pmap, va);
895		pde_store(pde, newpde);
896	}
897	mtx_unlock_spin(&allpmaps_lock);
898	KASSERT(PTD_updated,
899	    ("pmap_kenter_pde: current page table is not in allpmaps"));
900}
901
902/*
903 * After changing the page size for the specified virtual address in the page
904 * table, flush the corresponding entries from the processor's TLB.  Only the
905 * calling processor's TLB is affected.
906 *
907 * The calling thread must be pinned to a processor.
908 */
909static void
910pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
911{
912	u_long cr4;
913
914	if ((newpde & PG_PS) == 0)
915		/* Demotion: flush a specific 2MB page mapping. */
916		invlpg(va);
917	else if ((newpde & PG_G) == 0)
918		/*
919		 * Promotion: flush every 4KB page mapping from the TLB
920		 * because there are too many to flush individually.
921		 */
922		invltlb();
923	else {
924		/*
925		 * Promotion: flush every 4KB page mapping from the TLB,
926		 * including any global (PG_G) mappings.
927		 */
928		cr4 = rcr4();
929		load_cr4(cr4 & ~CR4_PGE);
930		/*
931		 * Although preemption at this point could be detrimental to
932		 * performance, it would not lead to an error.  PG_G is simply
933		 * ignored if CR4.PGE is clear.  Moreover, in case this block
934		 * is re-entered, the load_cr4() either above or below will
935		 * modify CR4.PGE flushing the TLB.
936		 */
937		load_cr4(cr4 | CR4_PGE);
938	}
939}
940#ifdef SMP
941/*
942 * For SMP, these functions have to use the IPI mechanism for coherence.
943 *
944 * N.B.: Before calling any of the following TLB invalidation functions,
945 * the calling processor must ensure that all stores updating a non-
946 * kernel page table are globally performed.  Otherwise, another
947 * processor could cache an old, pre-update entry without being
948 * invalidated.  This can happen one of two ways: (1) The pmap becomes
949 * active on another processor after its pm_active field is checked by
950 * one of the following functions but before a store updating the page
951 * table is globally performed. (2) The pmap becomes active on another
952 * processor before its pm_active field is checked but due to
953 * speculative loads one of the following functions stills reads the
954 * pmap as inactive on the other processor.
955 *
956 * The kernel page table is exempt because its pm_active field is
957 * immutable.  The kernel page table is always active on every
958 * processor.
959 */
960void
961pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
962{
963	cpuset_t other_cpus;
964	u_int cpuid;
965
966	sched_pin();
967	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
968		invlpg(va);
969		smp_invlpg(va);
970	} else {
971		cpuid = PCPU_GET(cpuid);
972		other_cpus = all_cpus;
973		CPU_CLR(cpuid, &other_cpus);
974		if (CPU_ISSET(cpuid, &pmap->pm_active))
975			invlpg(va);
976		CPU_AND(&other_cpus, &pmap->pm_active);
977		if (!CPU_EMPTY(&other_cpus))
978			smp_masked_invlpg(other_cpus, va);
979	}
980	sched_unpin();
981}
982
983void
984pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
985{
986	cpuset_t other_cpus;
987	vm_offset_t addr;
988	u_int cpuid;
989
990	sched_pin();
991	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
992		for (addr = sva; addr < eva; addr += PAGE_SIZE)
993			invlpg(addr);
994		smp_invlpg_range(sva, eva);
995	} else {
996		cpuid = PCPU_GET(cpuid);
997		other_cpus = all_cpus;
998		CPU_CLR(cpuid, &other_cpus);
999		if (CPU_ISSET(cpuid, &pmap->pm_active))
1000			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1001				invlpg(addr);
1002		CPU_AND(&other_cpus, &pmap->pm_active);
1003		if (!CPU_EMPTY(&other_cpus))
1004			smp_masked_invlpg_range(other_cpus, sva, eva);
1005	}
1006	sched_unpin();
1007}
1008
1009void
1010pmap_invalidate_all(pmap_t pmap)
1011{
1012	cpuset_t other_cpus;
1013	u_int cpuid;
1014
1015	sched_pin();
1016	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1017		invltlb();
1018		smp_invltlb();
1019	} else {
1020		cpuid = PCPU_GET(cpuid);
1021		other_cpus = all_cpus;
1022		CPU_CLR(cpuid, &other_cpus);
1023		if (CPU_ISSET(cpuid, &pmap->pm_active))
1024			invltlb();
1025		CPU_AND(&other_cpus, &pmap->pm_active);
1026		if (!CPU_EMPTY(&other_cpus))
1027			smp_masked_invltlb(other_cpus);
1028	}
1029	sched_unpin();
1030}
1031
1032void
1033pmap_invalidate_cache(void)
1034{
1035
1036	sched_pin();
1037	wbinvd();
1038	smp_cache_flush();
1039	sched_unpin();
1040}
1041
1042struct pde_action {
1043	cpuset_t invalidate;	/* processors that invalidate their TLB */
1044	vm_offset_t va;
1045	pd_entry_t *pde;
1046	pd_entry_t newpde;
1047	u_int store;		/* processor that updates the PDE */
1048};
1049
1050static void
1051pmap_update_pde_kernel(void *arg)
1052{
1053	struct pde_action *act = arg;
1054	pd_entry_t *pde;
1055	pmap_t pmap;
1056
1057	if (act->store == PCPU_GET(cpuid)) {
1058
1059		/*
1060		 * Elsewhere, this operation requires allpmaps_lock for
1061		 * synchronization.  Here, it does not because it is being
1062		 * performed in the context of an all_cpus rendezvous.
1063		 */
1064		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1065			pde = pmap_pde(pmap, act->va);
1066			pde_store(pde, act->newpde);
1067		}
1068	}
1069}
1070
1071static void
1072pmap_update_pde_user(void *arg)
1073{
1074	struct pde_action *act = arg;
1075
1076	if (act->store == PCPU_GET(cpuid))
1077		pde_store(act->pde, act->newpde);
1078}
1079
1080static void
1081pmap_update_pde_teardown(void *arg)
1082{
1083	struct pde_action *act = arg;
1084
1085	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1086		pmap_update_pde_invalidate(act->va, act->newpde);
1087}
1088
1089/*
1090 * Change the page size for the specified virtual address in a way that
1091 * prevents any possibility of the TLB ever having two entries that map the
1092 * same virtual address using different page sizes.  This is the recommended
1093 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1094 * machine check exception for a TLB state that is improperly diagnosed as a
1095 * hardware error.
1096 */
1097static void
1098pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1099{
1100	struct pde_action act;
1101	cpuset_t active, other_cpus;
1102	u_int cpuid;
1103
1104	sched_pin();
1105	cpuid = PCPU_GET(cpuid);
1106	other_cpus = all_cpus;
1107	CPU_CLR(cpuid, &other_cpus);
1108	if (pmap == kernel_pmap)
1109		active = all_cpus;
1110	else
1111		active = pmap->pm_active;
1112	if (CPU_OVERLAP(&active, &other_cpus)) {
1113		act.store = cpuid;
1114		act.invalidate = active;
1115		act.va = va;
1116		act.pde = pde;
1117		act.newpde = newpde;
1118		CPU_SET(cpuid, &active);
1119		smp_rendezvous_cpus(active,
1120		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1121		    pmap_update_pde_kernel : pmap_update_pde_user,
1122		    pmap_update_pde_teardown, &act);
1123	} else {
1124		if (pmap == kernel_pmap)
1125			pmap_kenter_pde(va, newpde);
1126		else
1127			pde_store(pde, newpde);
1128		if (CPU_ISSET(cpuid, &active))
1129			pmap_update_pde_invalidate(va, newpde);
1130	}
1131	sched_unpin();
1132}
1133#else /* !SMP */
1134/*
1135 * Normal, non-SMP, 486+ invalidation functions.
1136 * We inline these within pmap.c for speed.
1137 */
1138PMAP_INLINE void
1139pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1140{
1141
1142	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1143		invlpg(va);
1144}
1145
1146PMAP_INLINE void
1147pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1148{
1149	vm_offset_t addr;
1150
1151	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1152		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1153			invlpg(addr);
1154}
1155
1156PMAP_INLINE void
1157pmap_invalidate_all(pmap_t pmap)
1158{
1159
1160	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1161		invltlb();
1162}
1163
1164PMAP_INLINE void
1165pmap_invalidate_cache(void)
1166{
1167
1168	wbinvd();
1169}
1170
1171static void
1172pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1173{
1174
1175	if (pmap == kernel_pmap)
1176		pmap_kenter_pde(va, newpde);
1177	else
1178		pde_store(pde, newpde);
1179	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1180		pmap_update_pde_invalidate(va, newpde);
1181}
1182#endif /* !SMP */
1183
1184#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1185
1186void
1187pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1188{
1189
1190	KASSERT((sva & PAGE_MASK) == 0,
1191	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1192	KASSERT((eva & PAGE_MASK) == 0,
1193	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1194
1195	if (cpu_feature & CPUID_SS)
1196		; /* If "Self Snoop" is supported, do nothing. */
1197	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1198	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1199
1200#ifdef DEV_APIC
1201		/*
1202		 * XXX: Some CPUs fault, hang, or trash the local APIC
1203		 * registers if we use CLFLUSH on the local APIC
1204		 * range.  The local APIC is always uncached, so we
1205		 * don't need to flush for that range anyway.
1206		 */
1207		if (pmap_kextract(sva) == lapic_paddr)
1208			return;
1209#endif
1210		/*
1211		 * Otherwise, do per-cache line flush.  Use the mfence
1212		 * instruction to insure that previous stores are
1213		 * included in the write-back.  The processor
1214		 * propagates flush to other processors in the cache
1215		 * coherence domain.
1216		 */
1217		mfence();
1218		for (; sva < eva; sva += cpu_clflush_line_size)
1219			clflush(sva);
1220		mfence();
1221	} else {
1222
1223		/*
1224		 * No targeted cache flush methods are supported by CPU,
1225		 * or the supplied range is bigger than 2MB.
1226		 * Globally invalidate cache.
1227		 */
1228		pmap_invalidate_cache();
1229	}
1230}
1231
1232void
1233pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1234{
1235	int i;
1236
1237	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1238	    (cpu_feature & CPUID_CLFSH) == 0) {
1239		pmap_invalidate_cache();
1240	} else {
1241		for (i = 0; i < count; i++)
1242			pmap_flush_page(pages[i]);
1243	}
1244}
1245
1246/*
1247 * Are we current address space or kernel?  N.B. We return FALSE when
1248 * a pmap's page table is in use because a kernel thread is borrowing
1249 * it.  The borrowed page table can change spontaneously, making any
1250 * dependence on its continued use subject to a race condition.
1251 */
1252static __inline int
1253pmap_is_current(pmap_t pmap)
1254{
1255
1256	return (pmap == kernel_pmap ||
1257	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1258	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1259}
1260
1261/*
1262 * If the given pmap is not the current or kernel pmap, the returned pte must
1263 * be released by passing it to pmap_pte_release().
1264 */
1265pt_entry_t *
1266pmap_pte(pmap_t pmap, vm_offset_t va)
1267{
1268	pd_entry_t newpf;
1269	pd_entry_t *pde;
1270
1271	pde = pmap_pde(pmap, va);
1272	if (*pde & PG_PS)
1273		return (pde);
1274	if (*pde != 0) {
1275		/* are we current address space or kernel? */
1276		if (pmap_is_current(pmap))
1277			return (vtopte(va));
1278		mtx_lock(&PMAP2mutex);
1279		newpf = *pde & PG_FRAME;
1280		if ((*PMAP2 & PG_FRAME) != newpf) {
1281			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1282			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1283		}
1284		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1285	}
1286	return (NULL);
1287}
1288
1289/*
1290 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1291 * being NULL.
1292 */
1293static __inline void
1294pmap_pte_release(pt_entry_t *pte)
1295{
1296
1297	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1298		mtx_unlock(&PMAP2mutex);
1299}
1300
1301static __inline void
1302invlcaddr(void *caddr)
1303{
1304
1305	invlpg((u_int)caddr);
1306}
1307
1308/*
1309 * Super fast pmap_pte routine best used when scanning
1310 * the pv lists.  This eliminates many coarse-grained
1311 * invltlb calls.  Note that many of the pv list
1312 * scans are across different pmaps.  It is very wasteful
1313 * to do an entire invltlb for checking a single mapping.
1314 *
1315 * If the given pmap is not the current pmap, pvh_global_lock
1316 * must be held and curthread pinned to a CPU.
1317 */
1318static pt_entry_t *
1319pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1320{
1321	pd_entry_t newpf;
1322	pd_entry_t *pde;
1323
1324	pde = pmap_pde(pmap, va);
1325	if (*pde & PG_PS)
1326		return (pde);
1327	if (*pde != 0) {
1328		/* are we current address space or kernel? */
1329		if (pmap_is_current(pmap))
1330			return (vtopte(va));
1331		rw_assert(&pvh_global_lock, RA_WLOCKED);
1332		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1333		newpf = *pde & PG_FRAME;
1334		if ((*PMAP1 & PG_FRAME) != newpf) {
1335			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1336#ifdef SMP
1337			PMAP1cpu = PCPU_GET(cpuid);
1338#endif
1339			invlcaddr(PADDR1);
1340			PMAP1changed++;
1341		} else
1342#ifdef SMP
1343		if (PMAP1cpu != PCPU_GET(cpuid)) {
1344			PMAP1cpu = PCPU_GET(cpuid);
1345			invlcaddr(PADDR1);
1346			PMAP1changedcpu++;
1347		} else
1348#endif
1349			PMAP1unchanged++;
1350		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1351	}
1352	return (0);
1353}
1354
1355/*
1356 *	Routine:	pmap_extract
1357 *	Function:
1358 *		Extract the physical page address associated
1359 *		with the given map/virtual_address pair.
1360 */
1361vm_paddr_t
1362pmap_extract(pmap_t pmap, vm_offset_t va)
1363{
1364	vm_paddr_t rtval;
1365	pt_entry_t *pte;
1366	pd_entry_t pde;
1367
1368	rtval = 0;
1369	PMAP_LOCK(pmap);
1370	pde = pmap->pm_pdir[va >> PDRSHIFT];
1371	if (pde != 0) {
1372		if ((pde & PG_PS) != 0)
1373			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1374		else {
1375			pte = pmap_pte(pmap, va);
1376			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1377			pmap_pte_release(pte);
1378		}
1379	}
1380	PMAP_UNLOCK(pmap);
1381	return (rtval);
1382}
1383
1384/*
1385 *	Routine:	pmap_extract_and_hold
1386 *	Function:
1387 *		Atomically extract and hold the physical page
1388 *		with the given pmap and virtual address pair
1389 *		if that mapping permits the given protection.
1390 */
1391vm_page_t
1392pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1393{
1394	pd_entry_t pde;
1395	pt_entry_t pte, *ptep;
1396	vm_page_t m;
1397	vm_paddr_t pa;
1398
1399	pa = 0;
1400	m = NULL;
1401	PMAP_LOCK(pmap);
1402retry:
1403	pde = *pmap_pde(pmap, va);
1404	if (pde != 0) {
1405		if (pde & PG_PS) {
1406			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1407				if (vm_page_pa_tryrelock(pmap, (pde &
1408				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1409					goto retry;
1410				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1411				    (va & PDRMASK));
1412				vm_page_hold(m);
1413			}
1414		} else {
1415			ptep = pmap_pte(pmap, va);
1416			pte = *ptep;
1417			pmap_pte_release(ptep);
1418			if (pte != 0 &&
1419			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1420				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1421				    &pa))
1422					goto retry;
1423				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1424				vm_page_hold(m);
1425			}
1426		}
1427	}
1428	PA_UNLOCK_COND(pa);
1429	PMAP_UNLOCK(pmap);
1430	return (m);
1431}
1432
1433/***************************************************
1434 * Low level mapping routines.....
1435 ***************************************************/
1436
1437/*
1438 * Add a wired page to the kva.
1439 * Note: not SMP coherent.
1440 *
1441 * This function may be used before pmap_bootstrap() is called.
1442 */
1443PMAP_INLINE void
1444pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1445{
1446	pt_entry_t *pte;
1447
1448	pte = vtopte(va);
1449	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1450}
1451
1452static __inline void
1453pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1454{
1455	pt_entry_t *pte;
1456
1457	pte = vtopte(va);
1458	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1459}
1460
1461/*
1462 * Remove a page from the kernel pagetables.
1463 * Note: not SMP coherent.
1464 *
1465 * This function may be used before pmap_bootstrap() is called.
1466 */
1467PMAP_INLINE void
1468pmap_kremove(vm_offset_t va)
1469{
1470	pt_entry_t *pte;
1471
1472	pte = vtopte(va);
1473	pte_clear(pte);
1474}
1475
1476/*
1477 *	Used to map a range of physical addresses into kernel
1478 *	virtual address space.
1479 *
1480 *	The value passed in '*virt' is a suggested virtual address for
1481 *	the mapping. Architectures which can support a direct-mapped
1482 *	physical to virtual region can return the appropriate address
1483 *	within that region, leaving '*virt' unchanged. Other
1484 *	architectures should map the pages starting at '*virt' and
1485 *	update '*virt' with the first usable address after the mapped
1486 *	region.
1487 */
1488vm_offset_t
1489pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1490{
1491	vm_offset_t va, sva;
1492	vm_paddr_t superpage_offset;
1493	pd_entry_t newpde;
1494
1495	va = *virt;
1496	/*
1497	 * Does the physical address range's size and alignment permit at
1498	 * least one superpage mapping to be created?
1499	 */
1500	superpage_offset = start & PDRMASK;
1501	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1502		/*
1503		 * Increase the starting virtual address so that its alignment
1504		 * does not preclude the use of superpage mappings.
1505		 */
1506		if ((va & PDRMASK) < superpage_offset)
1507			va = (va & ~PDRMASK) + superpage_offset;
1508		else if ((va & PDRMASK) > superpage_offset)
1509			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1510	}
1511	sva = va;
1512	while (start < end) {
1513		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1514		    pseflag) {
1515			KASSERT((va & PDRMASK) == 0,
1516			    ("pmap_map: misaligned va %#x", va));
1517			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1518			pmap_kenter_pde(va, newpde);
1519			va += NBPDR;
1520			start += NBPDR;
1521		} else {
1522			pmap_kenter(va, start);
1523			va += PAGE_SIZE;
1524			start += PAGE_SIZE;
1525		}
1526	}
1527	pmap_invalidate_range(kernel_pmap, sva, va);
1528	*virt = va;
1529	return (sva);
1530}
1531
1532
1533/*
1534 * Add a list of wired pages to the kva
1535 * this routine is only used for temporary
1536 * kernel mappings that do not need to have
1537 * page modification or references recorded.
1538 * Note that old mappings are simply written
1539 * over.  The page *must* be wired.
1540 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1541 */
1542void
1543pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1544{
1545	pt_entry_t *endpte, oldpte, pa, *pte;
1546	vm_page_t m;
1547
1548	oldpte = 0;
1549	pte = vtopte(sva);
1550	endpte = pte + count;
1551	while (pte < endpte) {
1552		m = *ma++;
1553		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1554		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1555			oldpte |= *pte;
1556			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1557		}
1558		pte++;
1559	}
1560	if (__predict_false((oldpte & PG_V) != 0))
1561		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1562		    PAGE_SIZE);
1563}
1564
1565/*
1566 * This routine tears out page mappings from the
1567 * kernel -- it is meant only for temporary mappings.
1568 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1569 */
1570void
1571pmap_qremove(vm_offset_t sva, int count)
1572{
1573	vm_offset_t va;
1574
1575	va = sva;
1576	while (count-- > 0) {
1577		pmap_kremove(va);
1578		va += PAGE_SIZE;
1579	}
1580	pmap_invalidate_range(kernel_pmap, sva, va);
1581}
1582
1583/***************************************************
1584 * Page table page management routines.....
1585 ***************************************************/
1586static __inline void
1587pmap_free_zero_pages(vm_page_t free)
1588{
1589	vm_page_t m;
1590
1591	while (free != NULL) {
1592		m = free;
1593		free = m->right;
1594		/* Preserve the page's PG_ZERO setting. */
1595		vm_page_free_toq(m);
1596	}
1597}
1598
1599/*
1600 * Schedule the specified unused page table page to be freed.  Specifically,
1601 * add the page to the specified list of pages that will be released to the
1602 * physical memory manager after the TLB has been updated.
1603 */
1604static __inline void
1605pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1606{
1607
1608	if (set_PG_ZERO)
1609		m->flags |= PG_ZERO;
1610	else
1611		m->flags &= ~PG_ZERO;
1612	m->right = *free;
1613	*free = m;
1614}
1615
1616/*
1617 * Inserts the specified page table page into the specified pmap's collection
1618 * of idle page table pages.  Each of a pmap's page table pages is responsible
1619 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1620 * ordered by this virtual address range.
1621 */
1622static void
1623pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1624{
1625	vm_page_t root;
1626
1627	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1628	root = pmap->pm_root;
1629	if (root == NULL) {
1630		mpte->left = NULL;
1631		mpte->right = NULL;
1632	} else {
1633		root = vm_page_splay(mpte->pindex, root);
1634		if (mpte->pindex < root->pindex) {
1635			mpte->left = root->left;
1636			mpte->right = root;
1637			root->left = NULL;
1638		} else if (mpte->pindex == root->pindex)
1639			panic("pmap_insert_pt_page: pindex already inserted");
1640		else {
1641			mpte->right = root->right;
1642			mpte->left = root;
1643			root->right = NULL;
1644		}
1645	}
1646	pmap->pm_root = mpte;
1647}
1648
1649/*
1650 * Looks for a page table page mapping the specified virtual address in the
1651 * specified pmap's collection of idle page table pages.  Returns NULL if there
1652 * is no page table page corresponding to the specified virtual address.
1653 */
1654static vm_page_t
1655pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1656{
1657	vm_page_t mpte;
1658	vm_pindex_t pindex = va >> PDRSHIFT;
1659
1660	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1661	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1662		mpte = vm_page_splay(pindex, mpte);
1663		if ((pmap->pm_root = mpte)->pindex != pindex)
1664			mpte = NULL;
1665	}
1666	return (mpte);
1667}
1668
1669/*
1670 * Removes the specified page table page from the specified pmap's collection
1671 * of idle page table pages.  The specified page table page must be a member of
1672 * the pmap's collection.
1673 */
1674static void
1675pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1676{
1677	vm_page_t root;
1678
1679	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1680	if (mpte != pmap->pm_root)
1681		vm_page_splay(mpte->pindex, pmap->pm_root);
1682	if (mpte->left == NULL)
1683		root = mpte->right;
1684	else {
1685		root = vm_page_splay(mpte->pindex, mpte->left);
1686		root->right = mpte->right;
1687	}
1688	pmap->pm_root = root;
1689}
1690
1691/*
1692 * Decrements a page table page's wire count, which is used to record the
1693 * number of valid page table entries within the page.  If the wire count
1694 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1695 * page table page was unmapped and FALSE otherwise.
1696 */
1697static inline boolean_t
1698pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1699{
1700
1701	--m->wire_count;
1702	if (m->wire_count == 0) {
1703		_pmap_unwire_ptp(pmap, m, free);
1704		return (TRUE);
1705	} else
1706		return (FALSE);
1707}
1708
1709static void
1710_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1711{
1712	vm_offset_t pteva;
1713
1714	/*
1715	 * unmap the page table page
1716	 */
1717	pmap->pm_pdir[m->pindex] = 0;
1718	--pmap->pm_stats.resident_count;
1719
1720	/*
1721	 * This is a release store so that the ordinary store unmapping
1722	 * the page table page is globally performed before TLB shoot-
1723	 * down is begun.
1724	 */
1725	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1726
1727	/*
1728	 * Do an invltlb to make the invalidated mapping
1729	 * take effect immediately.
1730	 */
1731	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1732	pmap_invalidate_page(pmap, pteva);
1733
1734	/*
1735	 * Put page on a list so that it is released after
1736	 * *ALL* TLB shootdown is done
1737	 */
1738	pmap_add_delayed_free_list(m, free, TRUE);
1739}
1740
1741/*
1742 * After removing a page table entry, this routine is used to
1743 * conditionally free the page, and manage the hold/wire counts.
1744 */
1745static int
1746pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1747{
1748	pd_entry_t ptepde;
1749	vm_page_t mpte;
1750
1751	if (va >= VM_MAXUSER_ADDRESS)
1752		return (0);
1753	ptepde = *pmap_pde(pmap, va);
1754	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1755	return (pmap_unwire_ptp(pmap, mpte, free));
1756}
1757
1758/*
1759 * Initialize the pmap for the swapper process.
1760 */
1761void
1762pmap_pinit0(pmap_t pmap)
1763{
1764
1765	PMAP_LOCK_INIT(pmap);
1766	/*
1767	 * Since the page table directory is shared with the kernel pmap,
1768	 * which is already included in the list "allpmaps", this pmap does
1769	 * not need to be inserted into that list.
1770	 */
1771	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1772#ifdef PAE
1773	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1774#endif
1775	pmap->pm_root = NULL;
1776	CPU_ZERO(&pmap->pm_active);
1777	PCPU_SET(curpmap, pmap);
1778	TAILQ_INIT(&pmap->pm_pvchunk);
1779	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1780}
1781
1782/*
1783 * Initialize a preallocated and zeroed pmap structure,
1784 * such as one in a vmspace structure.
1785 */
1786int
1787pmap_pinit(pmap_t pmap)
1788{
1789	vm_page_t m, ptdpg[NPGPTD];
1790	vm_paddr_t pa;
1791	int i;
1792
1793	PMAP_LOCK_INIT(pmap);
1794
1795	/*
1796	 * No need to allocate page table space yet but we do need a valid
1797	 * page directory table.
1798	 */
1799	if (pmap->pm_pdir == NULL) {
1800		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1801		    NBPTD);
1802		if (pmap->pm_pdir == NULL) {
1803			PMAP_LOCK_DESTROY(pmap);
1804			return (0);
1805		}
1806#ifdef PAE
1807		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1808		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1809		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1810		    ("pmap_pinit: pdpt misaligned"));
1811		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1812		    ("pmap_pinit: pdpt above 4g"));
1813#endif
1814		pmap->pm_root = NULL;
1815	}
1816	KASSERT(pmap->pm_root == NULL,
1817	    ("pmap_pinit: pmap has reserved page table page(s)"));
1818
1819	/*
1820	 * allocate the page directory page(s)
1821	 */
1822	for (i = 0; i < NPGPTD;) {
1823		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1824		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1825		if (m == NULL)
1826			VM_WAIT;
1827		else {
1828			ptdpg[i++] = m;
1829		}
1830	}
1831
1832	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1833
1834	for (i = 0; i < NPGPTD; i++)
1835		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1836			pagezero(pmap->pm_pdir + (i * NPDEPG));
1837
1838	mtx_lock_spin(&allpmaps_lock);
1839	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1840	/* Copy the kernel page table directory entries. */
1841	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1842	mtx_unlock_spin(&allpmaps_lock);
1843
1844	/* install self-referential address mapping entry(s) */
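	/*
	 * These entries map the page directory pages as if they were page
	 * tables, so every page table page of this pmap appears within a
	 * fixed virtual address window beginning at VM_MAXUSER_ADDRESS;
	 * vtopte() and _pmap_unwire_ptp() rely on that layout.
	 */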
1845	for (i = 0; i < NPGPTD; i++) {
1846		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1847		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1848#ifdef PAE
1849		pmap->pm_pdpt[i] = pa | PG_V;
1850#endif
1851	}
1852
1853	CPU_ZERO(&pmap->pm_active);
1854	TAILQ_INIT(&pmap->pm_pvchunk);
1855	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1856
1857	return (1);
1858}
1859
1860/*
1861 * This routine allocates and installs a new page table page when none is
1862 * mapped at the given page directory index.
1863 */
1864static vm_page_t
1865_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
1866{
1867	vm_paddr_t ptepa;
1868	vm_page_t m;
1869
1870	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1871	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1872	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1873
1874	/*
1875	 * Allocate a page table page.
1876	 */
1877	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1878	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1879		if (flags & M_WAITOK) {
1880			PMAP_UNLOCK(pmap);
1881			rw_wunlock(&pvh_global_lock);
1882			VM_WAIT;
1883			rw_wlock(&pvh_global_lock);
1884			PMAP_LOCK(pmap);
1885		}
1886
1887		/*
1888		 * Indicate the need to retry.  While waiting, the page table
1889		 * page may have been allocated.
1890		 */
1891		return (NULL);
1892	}
1893	if ((m->flags & PG_ZERO) == 0)
1894		pmap_zero_page(m);
1895
1896	/*
1897	 * Map the pagetable page into the process address space, if
1898	 * it isn't already there.
1899	 */
1900
1901	pmap->pm_stats.resident_count++;
1902
1903	ptepa = VM_PAGE_TO_PHYS(m);
1904	pmap->pm_pdir[ptepindex] =
1905		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1906
1907	return (m);
1908}
1909
1910static vm_page_t
1911pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1912{
1913	u_int ptepindex;
1914	pd_entry_t ptepa;
1915	vm_page_t m;
1916
1917	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1918	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1919	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1920
1921	/*
1922	 * Calculate pagetable page index
1923	 */
1924	ptepindex = va >> PDRSHIFT;
1925retry:
1926	/*
1927	 * Get the page directory entry
1928	 */
1929	ptepa = pmap->pm_pdir[ptepindex];
1930
1931	/*
1932	 * This supports switching from a 4MB page to a
1933	 * normal 4K page.
1934	 */
1935	if (ptepa & PG_PS) {
1936		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1937		ptepa = pmap->pm_pdir[ptepindex];
1938	}
1939
1940	/*
1941	 * If the page table page is mapped, we just increment the
1942	 * hold count, and activate it.
1943	 */
1944	if (ptepa) {
1945		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1946		m->wire_count++;
1947	} else {
1948		/*
1949		 * Here if the pte page isn't mapped, or if it has
1950		 * been deallocated.
1951		 */
1952		m = _pmap_allocpte(pmap, ptepindex, flags);
1953		if (m == NULL && (flags & M_WAITOK))
1954			goto retry;
1955	}
1956	return (m);
1957}
1958
1959
1960/***************************************************
1961 * Pmap allocation/deallocation routines.
1962 ***************************************************/
1963
1964#ifdef SMP
1965/*
1966 * Deal with an SMP shootdown of other users of the pmap that we are
1967 * trying to dispose of.  This can be a bit hairy.
1968 */
1969static cpuset_t *lazymask;
1970static u_int lazyptd;
1971static volatile u_int lazywait;
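/*
 * The teardown is driven one CPU at a time: pmap_lazyfix() points
 * "lazymask" at the pmap's active set and clears "lazywait" before
 * sending the IPI; the handler below reloads %cr3 if the target CPU is
 * still using the dying page tables, clears its bit in *lazymask, and
 * sets "lazywait" to release the initiator from its spin loop.
 */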
1972
1973void pmap_lazyfix_action(void);
1974
1975void
1976pmap_lazyfix_action(void)
1977{
1978
1979#ifdef COUNT_IPIS
1980	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1981#endif
1982	if (rcr3() == lazyptd)
1983		load_cr3(curpcb->pcb_cr3);
1984	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1985	atomic_store_rel_int(&lazywait, 1);
1986}
1987
1988static void
1989pmap_lazyfix_self(u_int cpuid)
1990{
1991
1992	if (rcr3() == lazyptd)
1993		load_cr3(curpcb->pcb_cr3);
1994	CPU_CLR_ATOMIC(cpuid, lazymask);
1995}
1996
1997
1998static void
1999pmap_lazyfix(pmap_t pmap)
2000{
2001	cpuset_t mymask, mask;
2002	u_int cpuid, spins;
2003	int lsb;
2004
2005	mask = pmap->pm_active;
2006	while (!CPU_EMPTY(&mask)) {
2007		spins = 50000000;
2008
2009		/* Find least significant set bit. */
2010		lsb = cpusetobj_ffs(&mask);
2011		MPASS(lsb != 0);
2012		lsb--;
2013		CPU_SETOF(lsb, &mask);
2014		mtx_lock_spin(&smp_ipi_mtx);
2015#ifdef PAE
2016		lazyptd = vtophys(pmap->pm_pdpt);
2017#else
2018		lazyptd = vtophys(pmap->pm_pdir);
2019#endif
2020		cpuid = PCPU_GET(cpuid);
2021
2022		/* Use a cpuset just for having an easy check. */
2023		CPU_SETOF(cpuid, &mymask);
2024		if (!CPU_CMP(&mask, &mymask)) {
2025			lazymask = &pmap->pm_active;
2026			pmap_lazyfix_self(cpuid);
2027		} else {
2028			atomic_store_rel_int((u_int *)&lazymask,
2029			    (u_int)&pmap->pm_active);
2030			atomic_store_rel_int(&lazywait, 0);
2031			ipi_selected(mask, IPI_LAZYPMAP);
2032			while (lazywait == 0) {
2033				ia32_pause();
2034				if (--spins == 0)
2035					break;
2036			}
2037		}
2038		mtx_unlock_spin(&smp_ipi_mtx);
2039		if (spins == 0)
2040			printf("pmap_lazyfix: spun for 50000000\n");
2041		mask = pmap->pm_active;
2042	}
2043}
2044
2045#else	/* SMP */
2046
2047/*
2048 * Cleaning up on uniprocessor is easy.  For various reasons, we're
2049 * unlikely to have to even execute this code, including the fact
2050 * that the cleanup is deferred until the parent does a wait(2), which
2051 * means that another userland process has run.
2052 */
2053static void
2054pmap_lazyfix(pmap_t pmap)
2055{
2056	u_int cr3;
2057
2058	cr3 = vtophys(pmap->pm_pdir);
2059	if (cr3 == rcr3()) {
2060		load_cr3(curpcb->pcb_cr3);
2061		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2062	}
2063}
2064#endif	/* SMP */
2065
2066/*
2067 * Release any resources held by the given physical map.
2068 * Called when a pmap initialized by pmap_pinit is being released.
2069 * Should only be called if the map contains no valid mappings.
2070 */
2071void
2072pmap_release(pmap_t pmap)
2073{
2074	vm_page_t m, ptdpg[NPGPTD];
2075	int i;
2076
2077	KASSERT(pmap->pm_stats.resident_count == 0,
2078	    ("pmap_release: pmap resident count %ld != 0",
2079	    pmap->pm_stats.resident_count));
2080	KASSERT(pmap->pm_root == NULL,
2081	    ("pmap_release: pmap has reserved page table page(s)"));
2082
2083	pmap_lazyfix(pmap);
2084	mtx_lock_spin(&allpmaps_lock);
2085	LIST_REMOVE(pmap, pm_list);
2086	mtx_unlock_spin(&allpmaps_lock);
2087
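	/*
	 * Recover the page directory page(s) through the recursive
	 * PTDPTDI entries before those entries are cleared below.
	 */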
2088	for (i = 0; i < NPGPTD; i++)
2089		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2090		    PG_FRAME);
2091
2092	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2093	    sizeof(*pmap->pm_pdir));
2094
2095	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2096
2097	for (i = 0; i < NPGPTD; i++) {
2098		m = ptdpg[i];
2099#ifdef PAE
2100		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2101		    ("pmap_release: got wrong ptd page"));
2102#endif
2103		m->wire_count--;
2104		atomic_subtract_int(&cnt.v_wire_count, 1);
2105		vm_page_free_zero(m);
2106	}
2107	PMAP_LOCK_DESTROY(pmap);
2108}
2109
2110static int
2111kvm_size(SYSCTL_HANDLER_ARGS)
2112{
2113	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2114
2115	return (sysctl_handle_long(oidp, &ksize, 0, req));
2116}
2117SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2118    0, 0, kvm_size, "IU", "Size of KVM");
2119
2120static int
2121kvm_free(SYSCTL_HANDLER_ARGS)
2122{
2123	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2124
2125	return (sysctl_handle_long(oidp, &kfree, 0, req));
2126}
2127SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2128    0, 0, kvm_free, "IU", "Amount of KVM free");
2129
2130/*
2131 * grow the number of kernel page table entries, if needed
2132 */
2133void
2134pmap_growkernel(vm_offset_t addr)
2135{
2136	vm_paddr_t ptppaddr;
2137	vm_page_t nkpg;
2138	pd_entry_t newpdir;
2139
2140	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2141	addr = roundup2(addr, NBPDR);
2142	if (addr - 1 >= kernel_map->max_offset)
2143		addr = kernel_map->max_offset;
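	/*
	 * The kernel page table grows one PDE (NBPDR bytes of KVA) at a
	 * time.  Because kernel PDEs are replicated in every pmap, each
	 * new page table page is installed with pmap_kenter_pde(), which
	 * propagates the new PDE to the pmaps on the "allpmaps" list.
	 */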
2144	while (kernel_vm_end < addr) {
2145		if (pdir_pde(PTD, kernel_vm_end)) {
2146			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2147			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2148				kernel_vm_end = kernel_map->max_offset;
2149				break;
2150			}
2151			continue;
2152		}
2153
2154		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2155		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2156		    VM_ALLOC_ZERO);
2157		if (nkpg == NULL)
2158			panic("pmap_growkernel: no memory to grow kernel");
2159
2160		nkpt++;
2161
2162		if ((nkpg->flags & PG_ZERO) == 0)
2163			pmap_zero_page(nkpg);
2164		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2165		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2166		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2167
2168		pmap_kenter_pde(kernel_vm_end, newpdir);
2169		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2170		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2171			kernel_vm_end = kernel_map->max_offset;
2172			break;
2173		}
2174	}
2175}
2176
2177
2178/***************************************************
2179 * page management routines.
2180 ***************************************************/
2181
2182CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2183CTASSERT(_NPCM == 11);
2184CTASSERT(_NPCPV == 336);
2185
2186static __inline struct pv_chunk *
2187pv_to_chunk(pv_entry_t pv)
2188{
2189
2190	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2191}
2192
2193#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2194
2195#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2196#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
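/*
 * A pv chunk occupies a single page and holds _NPCPV (336) pv entries,
 * tracked by the _NPCM (11) bitmap words of pc_map[]: ten fully used
 * 32-bit words plus the low 16 bits of the eleventh, hence the two
 * different free masks above.
 */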
2197
2198static const uint32_t pc_freemask[_NPCM] = {
2199	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2200	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2201	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2202	PC_FREE0_9, PC_FREE10
2203};
2204
2205SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2206	"Current number of pv entries");
2207
2208#ifdef PV_STATS
2209static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2210
2211SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2212	"Current number of pv entry chunks");
2213SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2214	"Total number of pv entry chunks allocated");
2215SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2216	"Total number of pv entry chunks freed");
2217SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2218	"Number of times a pv entry chunk page could not be allocated");
2219
2220static long pv_entry_frees, pv_entry_allocs;
2221static int pv_entry_spare;
2222
2223SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2224	"Total number of pv entries freed");
2225SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2226	"Total number of pv entries allocated");
2227SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2228	"Current number of spare pv entries");
2229#endif
2230
2231/*
2232 * We are in a serious low memory condition.  Resort to
2233 * drastic measures to free some pages so we can allocate
2234 * another pv entry chunk.
2235 */
2236static vm_page_t
2237pmap_pv_reclaim(pmap_t locked_pmap)
2238{
2239	struct pch newtail;
2240	struct pv_chunk *pc;
2241	struct md_page *pvh;
2242	pd_entry_t *pde;
2243	pmap_t pmap;
2244	pt_entry_t *pte, tpte;
2245	pv_entry_t pv;
2246	vm_offset_t va;
2247	vm_page_t free, m, m_pc;
2248	uint32_t inuse;
2249	int bit, field, freed;
2250
2251	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2252	pmap = NULL;
2253	free = m_pc = NULL;
2254	TAILQ_INIT(&newtail);
2255	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2256	    free == NULL)) {
2257		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2258		if (pmap != pc->pc_pmap) {
2259			if (pmap != NULL) {
2260				pmap_invalidate_all(pmap);
2261				if (pmap != locked_pmap)
2262					PMAP_UNLOCK(pmap);
2263			}
2264			pmap = pc->pc_pmap;
2265			/* Avoid deadlock and lock recursion. */
2266			if (pmap > locked_pmap)
2267				PMAP_LOCK(pmap);
2268			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2269				pmap = NULL;
2270				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2271				continue;
2272			}
2273		}
2274
2275		/*
2276		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2277		 */
2278		freed = 0;
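		/*
		 * A set bit in pc_map[] marks a free pv entry, so the
		 * complement (masked by pc_freemask) enumerates the
		 * entries that are currently in use.
		 */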
2279		for (field = 0; field < _NPCM; field++) {
2280			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2281			    inuse != 0; inuse &= ~(1UL << bit)) {
2282				bit = bsfl(inuse);
2283				pv = &pc->pc_pventry[field * 32 + bit];
2284				va = pv->pv_va;
2285				pde = pmap_pde(pmap, va);
2286				if ((*pde & PG_PS) != 0)
2287					continue;
2288				pte = pmap_pte(pmap, va);
2289				tpte = *pte;
2290				if ((tpte & PG_W) == 0)
2291					tpte = pte_load_clear(pte);
2292				pmap_pte_release(pte);
2293				if ((tpte & PG_W) != 0)
2294					continue;
2295				KASSERT(tpte != 0,
2296				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2297				    pmap, va));
2298				if ((tpte & PG_G) != 0)
2299					pmap_invalidate_page(pmap, va);
2300				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2301				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2302					vm_page_dirty(m);
2303				if ((tpte & PG_A) != 0)
2304					vm_page_aflag_set(m, PGA_REFERENCED);
2305				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2306				if (TAILQ_EMPTY(&m->md.pv_list) &&
2307				    (m->flags & PG_FICTITIOUS) == 0) {
2308					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2309					if (TAILQ_EMPTY(&pvh->pv_list)) {
2310						vm_page_aflag_clear(m,
2311						    PGA_WRITEABLE);
2312					}
2313				}
2314				pc->pc_map[field] |= 1UL << bit;
2315				pmap_unuse_pt(pmap, va, &free);
2316				freed++;
2317			}
2318		}
2319		if (freed == 0) {
2320			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2321			continue;
2322		}
2323		/* Every freed mapping is for a 4 KB page. */
2324		pmap->pm_stats.resident_count -= freed;
2325		PV_STAT(pv_entry_frees += freed);
2326		PV_STAT(pv_entry_spare += freed);
2327		pv_entry_count -= freed;
2328		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2329		for (field = 0; field < _NPCM; field++)
2330			if (pc->pc_map[field] != pc_freemask[field]) {
2331				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2332				    pc_list);
2333				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2334
2335				/*
2336				 * One freed pv entry in locked_pmap is
2337				 * sufficient.
2338				 */
2339				if (pmap == locked_pmap)
2340					goto out;
2341				break;
2342			}
2343		if (field == _NPCM) {
2344			PV_STAT(pv_entry_spare -= _NPCPV);
2345			PV_STAT(pc_chunk_count--);
2346			PV_STAT(pc_chunk_frees++);
2347			/* Entire chunk is free; return it. */
2348			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2349			pmap_qremove((vm_offset_t)pc, 1);
2350			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2351			break;
2352		}
2353	}
2354out:
2355	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2356	if (pmap != NULL) {
2357		pmap_invalidate_all(pmap);
2358		if (pmap != locked_pmap)
2359			PMAP_UNLOCK(pmap);
2360	}
2361	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
2362		m_pc = free;
2363		free = m_pc->right;
2364		/* Recycle a freed page table page. */
2365		m_pc->wire_count = 1;
2366		atomic_add_int(&cnt.v_wire_count, 1);
2367	}
2368	pmap_free_zero_pages(free);
2369	return (m_pc);
2370}
2371
2372/*
2373 * free the pv_entry back to the free list
2374 */
2375static void
2376free_pv_entry(pmap_t pmap, pv_entry_t pv)
2377{
2378	struct pv_chunk *pc;
2379	int idx, field, bit;
2380
2381	rw_assert(&pvh_global_lock, RA_WLOCKED);
2382	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2383	PV_STAT(pv_entry_frees++);
2384	PV_STAT(pv_entry_spare++);
2385	pv_entry_count--;
2386	pc = pv_to_chunk(pv);
2387	idx = pv - &pc->pc_pventry[0];
2388	field = idx / 32;
2389	bit = idx % 32;
2390	pc->pc_map[field] |= 1ul << bit;
2391	for (idx = 0; idx < _NPCM; idx++)
2392		if (pc->pc_map[idx] != pc_freemask[idx]) {
2393			/*
2394			 * 98% of the time, pc is already at the head of the
2395			 * list.  If it isn't already, move it to the head.
2396			 */
2397			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2398			    pc)) {
2399				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2400				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2401				    pc_list);
2402			}
2403			return;
2404		}
2405	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2406	free_pv_chunk(pc);
2407}
2408
2409static void
2410free_pv_chunk(struct pv_chunk *pc)
2411{
2412	vm_page_t m;
2413
2414	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2415	PV_STAT(pv_entry_spare -= _NPCPV);
2416	PV_STAT(pc_chunk_count--);
2417	PV_STAT(pc_chunk_frees++);
2418	/* The entire chunk is free; return it. */
2419	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2420	pmap_qremove((vm_offset_t)pc, 1);
2421	vm_page_unwire(m, 0);
2422	vm_page_free(m);
2423	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2424}
2425
2426/*
2427 * get a new pv_entry, allocating a block from the system
2428 * when needed.
2429 */
2430static pv_entry_t
2431get_pv_entry(pmap_t pmap, boolean_t try)
2432{
2433	static const struct timeval printinterval = { 60, 0 };
2434	static struct timeval lastprint;
2435	int bit, field;
2436	pv_entry_t pv;
2437	struct pv_chunk *pc;
2438	vm_page_t m;
2439
2440	rw_assert(&pvh_global_lock, RA_WLOCKED);
2441	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2442	PV_STAT(pv_entry_allocs++);
2443	pv_entry_count++;
2444	if (pv_entry_count > pv_entry_high_water)
2445		if (ratecheck(&lastprint, &printinterval))
2446			printf("Approaching the limit on PV entries, consider "
2447			    "increasing either the vm.pmap.shpgperproc or the "
2448			    "vm.pmap.pv_entry_max tunable.\n");
2449retry:
2450	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2451	if (pc != NULL) {
2452		for (field = 0; field < _NPCM; field++) {
2453			if (pc->pc_map[field]) {
2454				bit = bsfl(pc->pc_map[field]);
2455				break;
2456			}
2457		}
2458		if (field < _NPCM) {
2459			pv = &pc->pc_pventry[field * 32 + bit];
2460			pc->pc_map[field] &= ~(1ul << bit);
2461			/* If this was the last free entry, move the chunk to the tail. */
2462			for (field = 0; field < _NPCM; field++)
2463				if (pc->pc_map[field] != 0) {
2464					PV_STAT(pv_entry_spare--);
2465					return (pv);	/* not full, return */
2466				}
2467			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2468			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2469			PV_STAT(pv_entry_spare--);
2470			return (pv);
2471		}
2472	}
2473	/*
2474	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2475	 * global lock.  If "pv_vafree" is currently non-empty, it will
2476	 * remain non-empty until pmap_ptelist_alloc() completes.
2477	 */
2478	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2479	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2480		if (try) {
2481			pv_entry_count--;
2482			PV_STAT(pc_chunk_tryfail++);
2483			return (NULL);
2484		}
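		/*
		 * Reclaim pv entries: even if pmap_pv_reclaim() does not
		 * return a page for a new chunk, it may have freed entries
		 * in this pmap, so retrying the chunk list above can still
		 * succeed.
		 */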
2485		m = pmap_pv_reclaim(pmap);
2486		if (m == NULL)
2487			goto retry;
2488	}
2489	PV_STAT(pc_chunk_count++);
2490	PV_STAT(pc_chunk_allocs++);
2491	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2492	pmap_qenter((vm_offset_t)pc, &m, 1);
2493	pc->pc_pmap = pmap;
2494	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2495	for (field = 1; field < _NPCM; field++)
2496		pc->pc_map[field] = pc_freemask[field];
2497	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2498	pv = &pc->pc_pventry[0];
2499	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2500	PV_STAT(pv_entry_spare += _NPCPV - 1);
2501	return (pv);
2502}
2503
2504static __inline pv_entry_t
2505pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2506{
2507	pv_entry_t pv;
2508
2509	rw_assert(&pvh_global_lock, RA_WLOCKED);
2510	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2511		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2512			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2513			break;
2514		}
2515	}
2516	return (pv);
2517}
2518
2519static void
2520pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2521{
2522	struct md_page *pvh;
2523	pv_entry_t pv;
2524	vm_offset_t va_last;
2525	vm_page_t m;
2526
2527	rw_assert(&pvh_global_lock, RA_WLOCKED);
2528	KASSERT((pa & PDRMASK) == 0,
2529	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2530
2531	/*
2532	 * Transfer the 4mpage's pv entry for this mapping to the first
2533	 * page's pv list.
2534	 */
2535	pvh = pa_to_pvh(pa);
2536	va = trunc_4mpage(va);
2537	pv = pmap_pvh_remove(pvh, pmap, va);
2538	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2539	m = PHYS_TO_VM_PAGE(pa);
2540	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2541	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2542	va_last = va + NBPDR - PAGE_SIZE;
2543	do {
2544		m++;
2545		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2546		    ("pmap_pv_demote_pde: page %p is not managed", m));
2547		va += PAGE_SIZE;
2548		pmap_insert_entry(pmap, va, m);
2549	} while (va < va_last);
2550}
2551
2552static void
2553pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2554{
2555	struct md_page *pvh;
2556	pv_entry_t pv;
2557	vm_offset_t va_last;
2558	vm_page_t m;
2559
2560	rw_assert(&pvh_global_lock, RA_WLOCKED);
2561	KASSERT((pa & PDRMASK) == 0,
2562	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2563
2564	/*
2565	 * Transfer the first page's pv entry for this mapping to the
2566	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2567	 * to get_pv_entry(), a transfer avoids the possibility that
2568	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2569	 * removes one of the mappings that is being promoted.
2570	 */
2571	m = PHYS_TO_VM_PAGE(pa);
2572	va = trunc_4mpage(va);
2573	pv = pmap_pvh_remove(&m->md, pmap, va);
2574	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2575	pvh = pa_to_pvh(pa);
2576	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2577	/* Free the remaining NPTEPG - 1 pv entries. */
2578	va_last = va + NBPDR - PAGE_SIZE;
2579	do {
2580		m++;
2581		va += PAGE_SIZE;
2582		pmap_pvh_free(&m->md, pmap, va);
2583	} while (va < va_last);
2584}
2585
2586static void
2587pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2588{
2589	pv_entry_t pv;
2590
2591	pv = pmap_pvh_remove(pvh, pmap, va);
2592	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2593	free_pv_entry(pmap, pv);
2594}
2595
2596static void
2597pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2598{
2599	struct md_page *pvh;
2600
2601	rw_assert(&pvh_global_lock, RA_WLOCKED);
2602	pmap_pvh_free(&m->md, pmap, va);
2603	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2604		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2605		if (TAILQ_EMPTY(&pvh->pv_list))
2606			vm_page_aflag_clear(m, PGA_WRITEABLE);
2607	}
2608}
2609
2610/*
2611 * Create a pv entry for page at pa for
2612 * (pmap, va).
2613 */
2614static void
2615pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2616{
2617	pv_entry_t pv;
2618
2619	rw_assert(&pvh_global_lock, RA_WLOCKED);
2620	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2621	pv = get_pv_entry(pmap, FALSE);
2622	pv->pv_va = va;
2623	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2624}
2625
2626/*
2627 * Conditionally create a pv entry.
2628 */
2629static boolean_t
2630pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2631{
2632	pv_entry_t pv;
2633
2634	rw_assert(&pvh_global_lock, RA_WLOCKED);
2635	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2636	if (pv_entry_count < pv_entry_high_water &&
2637	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2638		pv->pv_va = va;
2639		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2640		return (TRUE);
2641	} else
2642		return (FALSE);
2643}
2644
2645/*
2646 * Conditionally create the pv entry for a 2- or 4MB page mapping.
2647 */
2648static boolean_t
2649pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2650{
2651	struct md_page *pvh;
2652	pv_entry_t pv;
2653
2654	rw_assert(&pvh_global_lock, RA_WLOCKED);
2655	if (pv_entry_count < pv_entry_high_water &&
2656	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2657		pv->pv_va = va;
2658		pvh = pa_to_pvh(pa);
2659		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2660		return (TRUE);
2661	} else
2662		return (FALSE);
2663}
2664
2665/*
2666 * Fills a page table page with mappings to consecutive physical pages.
2667 */
2668static void
2669pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2670{
2671	pt_entry_t *pte;
2672
2673	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2674		*pte = newpte;
2675		newpte += PAGE_SIZE;
2676	}
2677}
2678
2679/*
2680 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2681 * 2- or 4MB page mapping is invalidated.
2682 */
2683static boolean_t
2684pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2685{
2686	pd_entry_t newpde, oldpde;
2687	pt_entry_t *firstpte, newpte;
2688	vm_paddr_t mptepa;
2689	vm_page_t free, mpte;
2690
2691	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2692	oldpde = *pde;
2693	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2694	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2695	mpte = pmap_lookup_pt_page(pmap, va);
2696	if (mpte != NULL)
2697		pmap_remove_pt_page(pmap, mpte);
2698	else {
2699		KASSERT((oldpde & PG_W) == 0,
2700		    ("pmap_demote_pde: page table page for a wired mapping"
2701		    " is missing"));
2702
2703		/*
2704		 * Invalidate the 2- or 4MB page mapping and return
2705		 * "failure" if the mapping was never accessed or the
2706		 * allocation of the new page table page fails.
2707		 */
2708		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2709		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2710		    VM_ALLOC_WIRED)) == NULL) {
2711			free = NULL;
2712			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2713			pmap_invalidate_page(pmap, trunc_4mpage(va));
2714			pmap_free_zero_pages(free);
2715			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2716			    " in pmap %p", va, pmap);
2717			return (FALSE);
2718		}
2719		if (va < VM_MAXUSER_ADDRESS)
2720			pmap->pm_stats.resident_count++;
2721	}
2722	mptepa = VM_PAGE_TO_PHYS(mpte);
2723
2724	/*
2725	 * If the page mapping is in the kernel's address space, then the
2726	 * KPTmap can provide access to the page table page.  Otherwise,
2727	 * temporarily map the page table page (mpte) into the kernel's
2728	 * address space at either PADDR1 or PADDR2.
2729	 */
2730	if (va >= KERNBASE)
2731		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2732	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2733		if ((*PMAP1 & PG_FRAME) != mptepa) {
2734			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2735#ifdef SMP
2736			PMAP1cpu = PCPU_GET(cpuid);
2737#endif
2738			invlcaddr(PADDR1);
2739			PMAP1changed++;
2740		} else
2741#ifdef SMP
2742		if (PMAP1cpu != PCPU_GET(cpuid)) {
2743			PMAP1cpu = PCPU_GET(cpuid);
2744			invlcaddr(PADDR1);
2745			PMAP1changedcpu++;
2746		} else
2747#endif
2748			PMAP1unchanged++;
2749		firstpte = PADDR1;
2750	} else {
2751		mtx_lock(&PMAP2mutex);
2752		if ((*PMAP2 & PG_FRAME) != mptepa) {
2753			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2754			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2755		}
2756		firstpte = PADDR2;
2757	}
2758	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2759	KASSERT((oldpde & PG_A) != 0,
2760	    ("pmap_demote_pde: oldpde is missing PG_A"));
2761	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2762	    ("pmap_demote_pde: oldpde is missing PG_M"));
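	/*
	 * Convert the 2/4MB PDE into the template for the 4KB PTEs.  The
	 * PAT bit occupies bit 12 (PG_PDE_PAT) in a superpage PDE because
	 * bit 7 is PG_PS there, but bit 7 (PG_PTE_PAT) in a 4KB PTE, so
	 * it must be relocated when set.
	 */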
2763	newpte = oldpde & ~PG_PS;
2764	if ((newpte & PG_PDE_PAT) != 0)
2765		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2766
2767	/*
2768	 * If the page table page is new, initialize it.
2769	 */
2770	if (mpte->wire_count == 1) {
2771		mpte->wire_count = NPTEPG;
2772		pmap_fill_ptp(firstpte, newpte);
2773	}
2774	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2775	    ("pmap_demote_pde: firstpte and newpte map different physical"
2776	    " addresses"));
2777
2778	/*
2779	 * If the mapping has changed attributes, update the page table
2780	 * entries.
2781	 */
2782	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2783		pmap_fill_ptp(firstpte, newpte);
2784
2785	/*
2786	 * Demote the mapping.  This pmap is locked.  The old PDE has
2787	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2788	 * set.  Thus, there is no danger of a race with another
2789	 * processor changing the setting of PG_A and/or PG_M between
2790	 * the read above and the store below.
2791	 */
2792	if (workaround_erratum383)
2793		pmap_update_pde(pmap, va, pde, newpde);
2794	else if (pmap == kernel_pmap)
2795		pmap_kenter_pde(va, newpde);
2796	else
2797		pde_store(pde, newpde);
2798	if (firstpte == PADDR2)
2799		mtx_unlock(&PMAP2mutex);
2800
2801	/*
2802	 * Invalidate the recursive mapping of the page table page.
2803	 */
2804	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2805
2806	/*
2807	 * Demote the pv entry.  This depends on the earlier demotion
2808	 * of the mapping.  Specifically, the (re)creation of a per-
2809	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2810	 * which might reclaim a newly (re)created per-page pv entry
2811	 * and destroy the associated mapping.  In order to destroy
2812	 * the mapping, the PDE must have already changed from mapping
2813	 * the 2mpage to referencing the page table page.
2814	 */
2815	if ((oldpde & PG_MANAGED) != 0)
2816		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2817
2818	pmap_pde_demotions++;
2819	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2820	    " in pmap %p", va, pmap);
2821	return (TRUE);
2822}
2823
2824/*
2825 * pmap_remove_pde: unmap a 2- or 4MB superpage mapping from a pmap
2826 */
2827static void
2828pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2829    vm_page_t *free)
2830{
2831	struct md_page *pvh;
2832	pd_entry_t oldpde;
2833	vm_offset_t eva, va;
2834	vm_page_t m, mpte;
2835
2836	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2837	KASSERT((sva & PDRMASK) == 0,
2838	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2839	oldpde = pte_load_clear(pdq);
2840	if (oldpde & PG_W)
2841		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2842
2843	/*
2844	 * Machines that don't support invlpg also don't support
2845	 * PG_G.
2846	 */
2847	if (oldpde & PG_G)
2848		pmap_invalidate_page(kernel_pmap, sva);
2849	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2850	if (oldpde & PG_MANAGED) {
2851		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2852		pmap_pvh_free(pvh, pmap, sva);
2853		eva = sva + NBPDR;
2854		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2855		    va < eva; va += PAGE_SIZE, m++) {
2856			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2857				vm_page_dirty(m);
2858			if (oldpde & PG_A)
2859				vm_page_aflag_set(m, PGA_REFERENCED);
2860			if (TAILQ_EMPTY(&m->md.pv_list) &&
2861			    TAILQ_EMPTY(&pvh->pv_list))
2862				vm_page_aflag_clear(m, PGA_WRITEABLE);
2863		}
2864	}
2865	if (pmap == kernel_pmap) {
2866		if (!pmap_demote_pde(pmap, pdq, sva))
2867			panic("pmap_remove_pde: failed demotion");
2868	} else {
2869		mpte = pmap_lookup_pt_page(pmap, sva);
2870		if (mpte != NULL) {
2871			pmap_remove_pt_page(pmap, mpte);
2872			pmap->pm_stats.resident_count--;
2873			KASSERT(mpte->wire_count == NPTEPG,
2874			    ("pmap_remove_pde: pte page wire count error"));
2875			mpte->wire_count = 0;
2876			pmap_add_delayed_free_list(mpte, free, FALSE);
2877			atomic_subtract_int(&cnt.v_wire_count, 1);
2878		}
2879	}
2880}
2881
2882/*
2883 * pmap_remove_pte: unmap a single 4KB page mapping from a pmap
2884 */
2885static int
2886pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2887{
2888	pt_entry_t oldpte;
2889	vm_page_t m;
2890
2891	rw_assert(&pvh_global_lock, RA_WLOCKED);
2892	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2893	oldpte = pte_load_clear(ptq);
2894	KASSERT(oldpte != 0,
2895	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2896	if (oldpte & PG_W)
2897		pmap->pm_stats.wired_count -= 1;
2898	/*
2899	 * Machines that don't support invlpg also don't support
2900	 * PG_G.
2901	 */
2902	if (oldpte & PG_G)
2903		pmap_invalidate_page(kernel_pmap, va);
2904	pmap->pm_stats.resident_count -= 1;
2905	if (oldpte & PG_MANAGED) {
2906		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2907		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2908			vm_page_dirty(m);
2909		if (oldpte & PG_A)
2910			vm_page_aflag_set(m, PGA_REFERENCED);
2911		pmap_remove_entry(pmap, m, va);
2912	}
2913	return (pmap_unuse_pt(pmap, va, free));
2914}
2915
2916/*
2917 * Remove a single page from a process address space
2918 */
2919static void
2920pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2921{
2922	pt_entry_t *pte;
2923
2924	rw_assert(&pvh_global_lock, RA_WLOCKED);
2925	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2926	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2927	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2928		return;
2929	pmap_remove_pte(pmap, pte, va, free);
2930	pmap_invalidate_page(pmap, va);
2931}
2932
2933/*
2934 *	Remove the given range of addresses from the specified map.
2935 *
2936 *	It is assumed that the start and end are properly
2937 *	rounded to the page size.
2938 */
2939void
2940pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2941{
2942	vm_offset_t pdnxt;
2943	pd_entry_t ptpaddr;
2944	pt_entry_t *pte;
2945	vm_page_t free = NULL;
2946	int anyvalid;
2947
2948	/*
2949	 * Perform an unsynchronized read.  This is, however, safe.
2950	 */
2951	if (pmap->pm_stats.resident_count == 0)
2952		return;
2953
2954	anyvalid = 0;
2955
2956	rw_wlock(&pvh_global_lock);
2957	sched_pin();
2958	PMAP_LOCK(pmap);
2959
2960	/*
2961	 * Special case: removing a single page is a very common
2962	 * operation, so short-circuit the general loop below
2963	 * for it.
2964	 */
2965	if ((sva + PAGE_SIZE == eva) &&
2966	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2967		pmap_remove_page(pmap, sva, &free);
2968		goto out;
2969	}
2970
2971	for (; sva < eva; sva = pdnxt) {
2972		u_int pdirindex;
2973
2974		/*
2975		 * Calculate the address of the next page directory boundary.
2976		 */
2977		pdnxt = (sva + NBPDR) & ~PDRMASK;
2978		if (pdnxt < sva)
2979			pdnxt = eva;
2980		if (pmap->pm_stats.resident_count == 0)
2981			break;
2982
2983		pdirindex = sva >> PDRSHIFT;
2984		ptpaddr = pmap->pm_pdir[pdirindex];
2985
2986		/*
2987		 * Weed out invalid mappings. Note: we assume that the page
2988		 * directory table is always allocated, and in kernel virtual.
2989		 */
2990		if (ptpaddr == 0)
2991			continue;
2992
2993		/*
2994		 * Check for large page.
2995		 */
2996		if ((ptpaddr & PG_PS) != 0) {
2997			/*
2998			 * Are we removing the entire large page?  If not,
2999			 * demote the mapping and fall through.
3000			 */
3001			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3002				/*
3003				 * The TLB entry for a PG_G mapping is
3004				 * invalidated by pmap_remove_pde().
3005				 */
3006				if ((ptpaddr & PG_G) == 0)
3007					anyvalid = 1;
3008				pmap_remove_pde(pmap,
3009				    &pmap->pm_pdir[pdirindex], sva, &free);
3010				continue;
3011			} else if (!pmap_demote_pde(pmap,
3012			    &pmap->pm_pdir[pdirindex], sva)) {
3013				/* The large page mapping was destroyed. */
3014				continue;
3015			}
3016		}
3017
3018		/*
3019		 * Limit our scan to either the end of the va represented
3020		 * by the current page table page, or to the end of the
3021		 * range being removed.
3022		 */
3023		if (pdnxt > eva)
3024			pdnxt = eva;
3025
3026		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3027		    sva += PAGE_SIZE) {
3028			if (*pte == 0)
3029				continue;
3030
3031			/*
3032			 * The TLB entry for a PG_G mapping is invalidated
3033			 * by pmap_remove_pte().
3034			 */
3035			if ((*pte & PG_G) == 0)
3036				anyvalid = 1;
3037			if (pmap_remove_pte(pmap, pte, sva, &free))
3038				break;
3039		}
3040	}
3041out:
3042	sched_unpin();
3043	if (anyvalid)
3044		pmap_invalidate_all(pmap);
3045	rw_wunlock(&pvh_global_lock);
3046	PMAP_UNLOCK(pmap);
3047	pmap_free_zero_pages(free);
3048}
3049
3050/*
3051 *	Routine:	pmap_remove_all
3052 *	Function:
3053 *		Removes this physical page from
3054 *		all physical maps in which it resides.
3055 *		Reflects back modify bits to the pager.
3056 *
3057 *	Notes:
3058 *		Original versions of this routine were very
3059 *		inefficient because they iteratively called
3060 *		pmap_remove (slow...)
3061 */
3062
3063void
3064pmap_remove_all(vm_page_t m)
3065{
3066	struct md_page *pvh;
3067	pv_entry_t pv;
3068	pmap_t pmap;
3069	pt_entry_t *pte, tpte;
3070	pd_entry_t *pde;
3071	vm_offset_t va;
3072	vm_page_t free;
3073
3074	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3075	    ("pmap_remove_all: page %p is not managed", m));
3076	free = NULL;
3077	rw_wlock(&pvh_global_lock);
3078	sched_pin();
3079	if ((m->flags & PG_FICTITIOUS) != 0)
3080		goto small_mappings;
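	/*
	 * First demote any 2/4MB mappings of the page so that all of its
	 * mappings appear on the 4KB pv list and can be torn down by the
	 * loop below.
	 */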
3081	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3082	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3083		va = pv->pv_va;
3084		pmap = PV_PMAP(pv);
3085		PMAP_LOCK(pmap);
3086		pde = pmap_pde(pmap, va);
3087		(void)pmap_demote_pde(pmap, pde, va);
3088		PMAP_UNLOCK(pmap);
3089	}
3090small_mappings:
3091	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3092		pmap = PV_PMAP(pv);
3093		PMAP_LOCK(pmap);
3094		pmap->pm_stats.resident_count--;
3095		pde = pmap_pde(pmap, pv->pv_va);
3096		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3097		    " a 4mpage in page %p's pv list", m));
3098		pte = pmap_pte_quick(pmap, pv->pv_va);
3099		tpte = pte_load_clear(pte);
3100		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3101		    pmap, pv->pv_va));
3102		if (tpte & PG_W)
3103			pmap->pm_stats.wired_count--;
3104		if (tpte & PG_A)
3105			vm_page_aflag_set(m, PGA_REFERENCED);
3106
3107		/*
3108		 * Update the vm_page_t clean and reference bits.
3109		 */
3110		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3111			vm_page_dirty(m);
3112		pmap_unuse_pt(pmap, pv->pv_va, &free);
3113		pmap_invalidate_page(pmap, pv->pv_va);
3114		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3115		free_pv_entry(pmap, pv);
3116		PMAP_UNLOCK(pmap);
3117	}
3118	vm_page_aflag_clear(m, PGA_WRITEABLE);
3119	sched_unpin();
3120	rw_wunlock(&pvh_global_lock);
3121	pmap_free_zero_pages(free);
3122}
3123
3124/*
3125 * pmap_protect_pde: do the things to protect a 4mpage in a process
3126 */
3127static boolean_t
3128pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3129{
3130	pd_entry_t newpde, oldpde;
3131	vm_offset_t eva, va;
3132	vm_page_t m;
3133	boolean_t anychanged;
3134
3135	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3136	KASSERT((sva & PDRMASK) == 0,
3137	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3138	anychanged = FALSE;
3139retry:
3140	oldpde = newpde = *pde;
3141	if (oldpde & PG_MANAGED) {
3142		eva = sva + NBPDR;
3143		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3144		    va < eva; va += PAGE_SIZE, m++)
3145			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3146				vm_page_dirty(m);
3147	}
3148	if ((prot & VM_PROT_WRITE) == 0)
3149		newpde &= ~(PG_RW | PG_M);
3150#ifdef PAE
3151	if ((prot & VM_PROT_EXECUTE) == 0)
3152		newpde |= pg_nx;
3153#endif
3154	if (newpde != oldpde) {
3155		if (!pde_cmpset(pde, oldpde, newpde))
3156			goto retry;
3157		if (oldpde & PG_G)
3158			pmap_invalidate_page(pmap, sva);
3159		else
3160			anychanged = TRUE;
3161	}
3162	return (anychanged);
3163}
3164
3165/*
3166 *	Set the physical protection on the
3167 *	specified range of this map as requested.
3168 */
3169void
3170pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3171{
3172	vm_offset_t pdnxt;
3173	pd_entry_t ptpaddr;
3174	pt_entry_t *pte;
3175	boolean_t anychanged, pv_lists_locked;
3176
3177	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3178		pmap_remove(pmap, sva, eva);
3179		return;
3180	}
3181
3182#ifdef PAE
3183	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3184	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3185		return;
3186#else
3187	if (prot & VM_PROT_WRITE)
3188		return;
3189#endif
3190
3191	if (pmap_is_current(pmap))
3192		pv_lists_locked = FALSE;
3193	else {
3194		pv_lists_locked = TRUE;
3195resume:
3196		rw_wlock(&pvh_global_lock);
3197		sched_pin();
3198	}
3199	anychanged = FALSE;
3200
3201	PMAP_LOCK(pmap);
3202	for (; sva < eva; sva = pdnxt) {
3203		pt_entry_t obits, pbits;
3204		u_int pdirindex;
3205
3206		pdnxt = (sva + NBPDR) & ~PDRMASK;
3207		if (pdnxt < sva)
3208			pdnxt = eva;
3209
3210		pdirindex = sva >> PDRSHIFT;
3211		ptpaddr = pmap->pm_pdir[pdirindex];
3212
3213		/*
3214		 * Weed out invalid mappings. Note: we assume that the page
3215		 * directory table is always allocated, and in kernel virtual.
3216		 */
3217		if (ptpaddr == 0)
3218			continue;
3219
3220		/*
3221		 * Check for large page.
3222		 */
3223		if ((ptpaddr & PG_PS) != 0) {
3224			/*
3225			 * Are we protecting the entire large page?  If not,
3226			 * demote the mapping and fall through.
3227			 */
3228			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3229				/*
3230				 * The TLB entry for a PG_G mapping is
3231				 * invalidated by pmap_protect_pde().
3232				 */
3233				if (pmap_protect_pde(pmap,
3234				    &pmap->pm_pdir[pdirindex], sva, prot))
3235					anychanged = TRUE;
3236				continue;
3237			} else {
3238				if (!pv_lists_locked) {
3239					pv_lists_locked = TRUE;
3240					if (!rw_try_wlock(&pvh_global_lock)) {
3241						if (anychanged)
3242							pmap_invalidate_all(
3243							    pmap);
3244						PMAP_UNLOCK(pmap);
3245						goto resume;
3246					}
3247					sched_pin();
3248				}
3249				if (!pmap_demote_pde(pmap,
3250				    &pmap->pm_pdir[pdirindex], sva)) {
3251					/*
3252					 * The large page mapping was
3253					 * destroyed.
3254					 */
3255					continue;
3256				}
3257			}
3258		}
3259
3260		if (pdnxt > eva)
3261			pdnxt = eva;
3262
3263		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3264		    sva += PAGE_SIZE) {
3265			vm_page_t m;
3266
3267retry:
3268			/*
3269			 * Regardless of whether a pte is 32 or 64 bits in
3270			 * size, PG_RW, PG_A, and PG_M are among the least
3271			 * significant 32 bits.
3272			 */
3273			obits = pbits = *pte;
3274			if ((pbits & PG_V) == 0)
3275				continue;
3276
3277			if ((prot & VM_PROT_WRITE) == 0) {
3278				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3279				    (PG_MANAGED | PG_M | PG_RW)) {
3280					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3281					vm_page_dirty(m);
3282				}
3283				pbits &= ~(PG_RW | PG_M);
3284			}
3285#ifdef PAE
3286			if ((prot & VM_PROT_EXECUTE) == 0)
3287				pbits |= pg_nx;
3288#endif
3289
3290			if (pbits != obits) {
3291#ifdef PAE
3292				if (!atomic_cmpset_64(pte, obits, pbits))
3293					goto retry;
3294#else
3295				if (!atomic_cmpset_int((u_int *)pte, obits,
3296				    pbits))
3297					goto retry;
3298#endif
3299				if (obits & PG_G)
3300					pmap_invalidate_page(pmap, sva);
3301				else
3302					anychanged = TRUE;
3303			}
3304		}
3305	}
3306	if (anychanged)
3307		pmap_invalidate_all(pmap);
3308	if (pv_lists_locked) {
3309		sched_unpin();
3310		rw_wunlock(&pvh_global_lock);
3311	}
3312	PMAP_UNLOCK(pmap);
3313}
3314
3315/*
3316 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3317 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3318 * For promotion to occur, two conditions must be met: (1) the 4KB page
3319 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3320 * mappings must have identical characteristics.
3321 *
3322 * Managed (PG_MANAGED) mappings within the kernel address space are not
3323 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3324 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3325 * pmap.
3326 */
3327static void
3328pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3329{
3330	pd_entry_t newpde;
3331	pt_entry_t *firstpte, oldpte, pa, *pte;
3332	vm_offset_t oldpteva;
3333	vm_page_t mpte;
3334
3335	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3336
3337	/*
3338	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3339	 * either invalid, unused, or does not map the first 4KB physical page
3340	 * within a 2- or 4MB page.
3341	 */
3342	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3343setpde:
3344	newpde = *firstpte;
3345	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3346		pmap_pde_p_failures++;
3347		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3348		    " in pmap %p", va, pmap);
3349		return;
3350	}
3351	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3352		pmap_pde_p_failures++;
3353		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3354		    " in pmap %p", va, pmap);
3355		return;
3356	}
3357	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3358		/*
3359		 * When PG_M is already clear, PG_RW can be cleared without
3360		 * a TLB invalidation.
3361		 */
3362		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3363		    ~PG_RW))
3364			goto setpde;
3365		newpde &= ~PG_RW;
3366	}
3367
3368	/*
3369	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3370	 * PTE maps an unexpected 4KB physical page or does not have identical
3371	 * characteristics to the first PTE.
3372	 */
3373	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
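	/*
	 * "pa" is primed with the physical address and the PG_A/PG_V bits
	 * expected in the last PTE of the run and is stepped back one page
	 * per iteration as the loop walks the PTP from its end toward
	 * firstpte.
	 */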
3374	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3375setpte:
3376		oldpte = *pte;
3377		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3378			pmap_pde_p_failures++;
3379			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3380			    " in pmap %p", va, pmap);
3381			return;
3382		}
3383		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3384			/*
3385			 * When PG_M is already clear, PG_RW can be cleared
3386			 * without a TLB invalidation.
3387			 */
3388			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3389			    oldpte & ~PG_RW))
3390				goto setpte;
3391			oldpte &= ~PG_RW;
3392			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3393			    (va & ~PDRMASK);
3394			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3395			    " in pmap %p", oldpteva, pmap);
3396		}
3397		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3398			pmap_pde_p_failures++;
3399			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3400			    " in pmap %p", va, pmap);
3401			return;
3402		}
3403		pa -= PAGE_SIZE;
3404	}
3405
3406	/*
3407	 * Save the page table page in its current state until the PDE
3408	 * mapping the superpage is demoted by pmap_demote_pde() or
3409	 * destroyed by pmap_remove_pde().
3410	 */
3411	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3412	KASSERT(mpte >= vm_page_array &&
3413	    mpte < &vm_page_array[vm_page_array_size],
3414	    ("pmap_promote_pde: page table page is out of range"));
3415	KASSERT(mpte->pindex == va >> PDRSHIFT,
3416	    ("pmap_promote_pde: page table page's pindex is wrong"));
3417	pmap_insert_pt_page(pmap, mpte);
3418
3419	/*
3420	 * Promote the pv entries.
3421	 */
3422	if ((newpde & PG_MANAGED) != 0)
3423		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3424
3425	/*
3426	 * Propagate the PAT index to its proper position.
3427	 */
3428	if ((newpde & PG_PTE_PAT) != 0)
3429		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3430
3431	/*
3432	 * Map the superpage.
3433	 */
3434	if (workaround_erratum383)
3435		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3436	else if (pmap == kernel_pmap)
3437		pmap_kenter_pde(va, PG_PS | newpde);
3438	else
3439		pde_store(pde, PG_PS | newpde);
3440
3441	pmap_pde_promotions++;
3442	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3443	    " in pmap %p", va, pmap);
3444}
3445
3446/*
3447 *	Insert the given physical page (m) at
3448 *	the specified virtual address (va) in the
3449 *	target physical map with the protection requested.
3450 *
3451 *	If specified, the page will be wired down, meaning
3452 *	that the related pte can not be reclaimed.
3453 *
3454 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3455 *	or lose information.  That is, this routine must actually
3456 *	insert this page into the given map NOW.
3457 */
3458void
3459pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3460    vm_prot_t prot, boolean_t wired)
3461{
3462	pd_entry_t *pde;
3463	pt_entry_t *pte;
3464	pt_entry_t newpte, origpte;
3465	pv_entry_t pv;
3466	vm_paddr_t opa, pa;
3467	vm_page_t mpte, om;
3468	boolean_t invlva;
3469
3470	va = trunc_page(va);
3471	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3472	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3473	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3474	    va));
3475	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3476	    VM_OBJECT_LOCKED(m->object),
3477	    ("pmap_enter: page %p is not busy", m));
3478
3479	mpte = NULL;
3480
3481	rw_wlock(&pvh_global_lock);
3482	PMAP_LOCK(pmap);
3483	sched_pin();
3484
3485	/*
3486	 * In the case that a page table page is not
3487	 * resident, we are creating it here.
3488	 */
3489	if (va < VM_MAXUSER_ADDRESS) {
3490		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3491	}
3492
3493	pde = pmap_pde(pmap, va);
3494	if ((*pde & PG_PS) != 0)
3495		panic("pmap_enter: attempted pmap_enter on 4MB page");
3496	pte = pmap_pte_quick(pmap, va);
3497
3498	/*
3499	 * The PDE must be valid by now; a NULL pte indicates corruption.
3500	 */
3501	if (pte == NULL) {
3502		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3503			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3504	}
3505
3506	pa = VM_PAGE_TO_PHYS(m);
3507	om = NULL;
3508	origpte = *pte;
3509	opa = origpte & PG_FRAME;
3510
3511	/*
3512	 * Mapping has not changed, must be protection or wiring change.
3513	 */
3514	if (origpte && (opa == pa)) {
3515		/*
3516		 * Wiring change, just update stats. We don't worry about
3517		 * wiring PT pages as they remain resident as long as there
3518		 * are valid mappings in them. Hence, if a user page is wired,
3519		 * the PT page will be also.
3520		 */
3521		if (wired && ((origpte & PG_W) == 0))
3522			pmap->pm_stats.wired_count++;
3523		else if (!wired && (origpte & PG_W))
3524			pmap->pm_stats.wired_count--;
3525
3526		/*
3527		 * Remove extra pte reference
3528		 */
3529		if (mpte)
3530			mpte->wire_count--;
3531
3532		if (origpte & PG_MANAGED) {
3533			om = m;
3534			pa |= PG_MANAGED;
3535		}
3536		goto validate;
3537	}
3538
3539	pv = NULL;
3540
3541	/*
3542	 * Mapping has changed, invalidate old range and fall through to
3543	 * handle validating new mapping.
3544	 */
3545	if (opa) {
3546		if (origpte & PG_W)
3547			pmap->pm_stats.wired_count--;
3548		if (origpte & PG_MANAGED) {
3549			om = PHYS_TO_VM_PAGE(opa);
3550			pv = pmap_pvh_remove(&om->md, pmap, va);
3551		}
3552		if (mpte != NULL) {
3553			mpte->wire_count--;
3554			KASSERT(mpte->wire_count > 0,
3555			    ("pmap_enter: missing reference to page table page,"
3556			     " va: 0x%x", va));
3557		}
3558	} else
3559		pmap->pm_stats.resident_count++;
3560
3561	/*
3562	 * Enter on the PV list if part of our managed memory.
3563	 */
3564	if ((m->oflags & VPO_UNMANAGED) == 0) {
3565		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3566		    ("pmap_enter: managed mapping within the clean submap"));
3567		if (pv == NULL)
3568			pv = get_pv_entry(pmap, FALSE);
3569		pv->pv_va = va;
3570		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3571		pa |= PG_MANAGED;
3572	} else if (pv != NULL)
3573		free_pv_entry(pmap, pv);
3574
3575	/*
3576	 * Increment counters
3577	 */
3578	if (wired)
3579		pmap->pm_stats.wired_count++;
3580
3581validate:
3582	/*
3583	 * Now validate mapping with desired protection/wiring.
3584	 */
3585	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3586	if ((prot & VM_PROT_WRITE) != 0) {
3587		newpte |= PG_RW;
3588		if ((newpte & PG_MANAGED) != 0)
3589			vm_page_aflag_set(m, PGA_WRITEABLE);
3590	}
3591#ifdef PAE
3592	if ((prot & VM_PROT_EXECUTE) == 0)
3593		newpte |= pg_nx;
3594#endif
3595	if (wired)
3596		newpte |= PG_W;
3597	if (va < VM_MAXUSER_ADDRESS)
3598		newpte |= PG_U;
3599	if (pmap == kernel_pmap)
3600		newpte |= pgeflag;
3601
3602	/*
3603	 * if the mapping or permission bits are different, we need
3604	 * to update the pte.
3605	 */
3606	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3607		newpte |= PG_A;
3608		if ((access & VM_PROT_WRITE) != 0)
3609			newpte |= PG_M;
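		/*
		 * If the old mapping was valid it may be cached in the TLB
		 * (the hardware sets PG_A before caching a translation), so
		 * decide below whether the update requires an invalidation:
		 * the physical page changed, no-execute was added, or write
		 * permission was removed from a dirty mapping.
		 */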
3610		if (origpte & PG_V) {
3611			invlva = FALSE;
3612			origpte = pte_load_store(pte, newpte);
3613			if (origpte & PG_A) {
3614				if (origpte & PG_MANAGED)
3615					vm_page_aflag_set(om, PGA_REFERENCED);
3616				if (opa != VM_PAGE_TO_PHYS(m))
3617					invlva = TRUE;
3618#ifdef PAE
3619				if ((origpte & PG_NX) == 0 &&
3620				    (newpte & PG_NX) != 0)
3621					invlva = TRUE;
3622#endif
3623			}
3624			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3625				if ((origpte & PG_MANAGED) != 0)
3626					vm_page_dirty(om);
3627				if ((prot & VM_PROT_WRITE) == 0)
3628					invlva = TRUE;
3629			}
3630			if ((origpte & PG_MANAGED) != 0 &&
3631			    TAILQ_EMPTY(&om->md.pv_list) &&
3632			    ((om->flags & PG_FICTITIOUS) != 0 ||
3633			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3634				vm_page_aflag_clear(om, PGA_WRITEABLE);
3635			if (invlva)
3636				pmap_invalidate_page(pmap, va);
3637		} else
3638			pte_store(pte, newpte);
3639	}
3640
3641	/*
3642	 * If both the page table page and the reservation are fully
3643	 * populated, then attempt promotion.
3644	 */
3645	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3646	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3647	    vm_reserv_level_iffullpop(m) == 0)
3648		pmap_promote_pde(pmap, pde, va);
3649
3650	sched_unpin();
3651	rw_wunlock(&pvh_global_lock);
3652	PMAP_UNLOCK(pmap);
3653}
3654
3655/*
3656 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3657 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3658 * blocking, (2) a mapping already exists at the specified virtual address, or
3659 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3660 */
3661static boolean_t
3662pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3663{
3664	pd_entry_t *pde, newpde;
3665
3666	rw_assert(&pvh_global_lock, RA_WLOCKED);
3667	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3668	pde = pmap_pde(pmap, va);
3669	if (*pde != 0) {
3670		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3671		    " in pmap %p", va, pmap);
3672		return (FALSE);
3673	}
3674	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3675	    PG_PS | PG_V;
3676	if ((m->oflags & VPO_UNMANAGED) == 0) {
3677		newpde |= PG_MANAGED;
3678
3679		/*
3680		 * Abort this mapping if its PV entry could not be created.
3681		 */
3682		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3683			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3684			    " in pmap %p", va, pmap);
3685			return (FALSE);
3686		}
3687	}
3688#ifdef PAE
3689	if ((prot & VM_PROT_EXECUTE) == 0)
3690		newpde |= pg_nx;
3691#endif
3692	if (va < VM_MAXUSER_ADDRESS)
3693		newpde |= PG_U;
3694
3695	/*
3696	 * Increment counters.
3697	 */
3698	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3699
3700	/*
3701	 * Map the superpage.
3702	 */
3703	pde_store(pde, newpde);
3704
3705	pmap_pde_mappings++;
3706	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3707	    " in pmap %p", va, pmap);
3708	return (TRUE);
3709}
3710
3711/*
3712 * Maps a sequence of resident pages belonging to the same object.
3713 * The sequence begins with the given page m_start.  This page is
3714 * mapped at the given virtual address start.  Each subsequent page is
3715 * mapped at a virtual address that is offset from start by the same
3716 * amount as the page is offset from m_start within the object.  The
3717 * last page in the sequence is the page with the largest offset from
3718 * m_start that can be mapped at a virtual address less than the given
3719 * virtual address end.  Not every virtual page between start and end
3720 * is mapped; only those for which a resident page exists with the
3721 * corresponding offset from m_start are mapped.
3722 */
3723void
3724pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3725    vm_page_t m_start, vm_prot_t prot)
3726{
3727	vm_offset_t va;
3728	vm_page_t m, mpte;
3729	vm_pindex_t diff, psize;
3730
3731	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3732	psize = atop(end - start);
3733	mpte = NULL;
3734	m = m_start;
3735	rw_wlock(&pvh_global_lock);
3736	PMAP_LOCK(pmap);
3737	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3738		va = start + ptoa(diff);
3739		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3740		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3741		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3742		    pmap_enter_pde(pmap, va, m, prot))
3743			m = &m[NBPDR / PAGE_SIZE - 1];
3744		else
3745			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3746			    mpte);
3747		m = TAILQ_NEXT(m, listq);
3748	}
3749	rw_wunlock(&pvh_global_lock);
3750	PMAP_UNLOCK(pmap);
3751}
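
/*
 * Example (illustrative sketch, not taken from any in-tree caller): a
 * prefaulting path that already holds the object lock could use
 * pmap_enter_object() to map whatever pages are resident in the range
 * [start, end).  Here "pindex" is assumed to be the object page index
 * corresponding to "start"; pmap_enter_object() itself only creates
 * read-only mappings, so a later write fault upgrades them as needed.
 *
 *	VM_OBJECT_LOCK(object);
 *	m = vm_page_lookup(object, pindex);
 *	if (m != NULL)
 *		pmap_enter_object(pmap, start, end, m, prot);
 *	VM_OBJECT_UNLOCK(object);
 */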
3752
3753/*
3754 * This code makes some *MAJOR* assumptions:
3755 * 1. The pmap is the current pmap and it exists.
3756 * 2. The mapping is not wired.
3757 * 3. Read access only.
3758 * 4. No page table pages.
3759 * Nonetheless, it is *MUCH* faster than pmap_enter...
3760 */
3761
3762void
3763pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3764{
3765
3766	rw_wlock(&pvh_global_lock);
3767	PMAP_LOCK(pmap);
3768	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3769	rw_wunlock(&pvh_global_lock);
3770	PMAP_UNLOCK(pmap);
3771}
3772
3773static vm_page_t
3774pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3775    vm_prot_t prot, vm_page_t mpte)
3776{
3777	pt_entry_t *pte;
3778	vm_paddr_t pa;
3779	vm_page_t free;
3780
3781	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3782	    (m->oflags & VPO_UNMANAGED) != 0,
3783	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3784	rw_assert(&pvh_global_lock, RA_WLOCKED);
3785	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3786
3787	/*
3788	 * In the case that a page table page is not
3789	 * resident, we are creating it here.
3790	 */
3791	if (va < VM_MAXUSER_ADDRESS) {
3792		u_int ptepindex;
3793		pd_entry_t ptepa;
3794
3795		/*
3796		 * Calculate pagetable page index
3797		 */
3798		ptepindex = va >> PDRSHIFT;
3799		if (mpte && (mpte->pindex == ptepindex)) {
3800			mpte->wire_count++;
3801		} else {
3802			/*
3803			 * Get the page directory entry
3804			 */
3805			ptepa = pmap->pm_pdir[ptepindex];
3806
3807			/*
3808			 * If the page table page is mapped, we just increment
3809			 * the hold count, and activate it.
3810			 */
3811			if (ptepa) {
3812				if (ptepa & PG_PS)
3813					return (NULL);
3814				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3815				mpte->wire_count++;
3816			} else {
3817				mpte = _pmap_allocpte(pmap, ptepindex,
3818				    M_NOWAIT);
3819				if (mpte == NULL)
3820					return (mpte);
3821			}
3822		}
3823	} else {
3824		mpte = NULL;
3825	}
3826
3827	/*
3828	 * This call to vtopte makes the assumption that we are
3829	 * entering the page into the current pmap.  In order to support
3830	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3831	 * But that isn't as quick as vtopte.
3832	 */
3833	pte = vtopte(va);
3834	if (*pte) {
3835		if (mpte != NULL) {
3836			mpte->wire_count--;
3837			mpte = NULL;
3838		}
3839		return (mpte);
3840	}
3841
3842	/*
3843	 * Enter on the PV list if part of our managed memory.
3844	 */
3845	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3846	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3847		if (mpte != NULL) {
3848			free = NULL;
3849			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3850				pmap_invalidate_page(pmap, va);
3851				pmap_free_zero_pages(free);
3852			}
3853
3854			mpte = NULL;
3855		}
3856		return (mpte);
3857	}
3858
3859	/*
3860	 * Increment counters
3861	 */
3862	pmap->pm_stats.resident_count++;
3863
3864	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3865#ifdef PAE
3866	if ((prot & VM_PROT_EXECUTE) == 0)
3867		pa |= pg_nx;
3868#endif
3869
3870	/*
3871	 * Now validate mapping with RO protection
3872	 */
3873	if ((m->oflags & VPO_UNMANAGED) != 0)
3874		pte_store(pte, pa | PG_V | PG_U);
3875	else
3876		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3877	return (mpte);
3878}
3879
3880/*
3881 * Make a temporary mapping for a physical address.  This is only intended
3882 * to be used for panic dumps.
3883 */
3884void *
3885pmap_kenter_temporary(vm_paddr_t pa, int i)
3886{
3887	vm_offset_t va;
3888
3889	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3890	pmap_kenter(va, pa);
3891	invlpg(va);
3892	return ((void *)crashdumpmap);
3893}
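
/*
 * Example (illustrative sketch): a dump routine can walk physical
 * memory one page at a time through the crashdump map.  The index
 * selects a slot within crashdumpmap; only slot 0 is used here, and
 * dump_write_page() is a hypothetical output routine.
 *
 *	for (pa = first_pa; pa < last_pa; pa += PAGE_SIZE) {
 *		va = pmap_kenter_temporary(pa, 0);
 *		if (dump_write_page(di, va) != 0)
 *			break;
 *	}
 */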
3894
3895/*
3896 * This code maps large physical mmap regions into the
3897 * processor address space.  Note that some shortcuts
3898 * are taken, but the code works.
3899 */
3900void
3901pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3902    vm_pindex_t pindex, vm_size_t size)
3903{
3904	pd_entry_t *pde;
3905	vm_paddr_t pa, ptepa;
3906	vm_page_t p;
3907	int pat_mode;
3908
3909	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3910	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3911	    ("pmap_object_init_pt: non-device object"));
3912	if (pseflag &&
3913	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3914		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3915			return;
3916		p = vm_page_lookup(object, pindex);
3917		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3918		    ("pmap_object_init_pt: invalid page %p", p));
3919		pat_mode = p->md.pat_mode;
3920
3921		/*
3922		 * Abort the mapping if the first page is not physically
3923		 * aligned to a 2/4MB page boundary.
3924		 */
3925		ptepa = VM_PAGE_TO_PHYS(p);
3926		if (ptepa & (NBPDR - 1))
3927			return;
3928
3929		/*
3930		 * Skip the first page.  Abort the mapping if the rest of
3931		 * the pages are not physically contiguous or have differing
3932		 * memory attributes.
3933		 */
3934		p = TAILQ_NEXT(p, listq);
3935		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3936		    pa += PAGE_SIZE) {
3937			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3938			    ("pmap_object_init_pt: invalid page %p", p));
3939			if (pa != VM_PAGE_TO_PHYS(p) ||
3940			    pat_mode != p->md.pat_mode)
3941				return;
3942			p = TAILQ_NEXT(p, listq);
3943		}
3944
3945		/*
3946		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3947		 * "size" is a multiple of 2/4M, adding the PAT setting to
3948		 * "pa" will not affect the termination of this loop.
3949		 */
3950		PMAP_LOCK(pmap);
3951		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3952		    size; pa += NBPDR) {
3953			pde = pmap_pde(pmap, addr);
3954			if (*pde == 0) {
3955				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3956				    PG_U | PG_RW | PG_V);
3957				pmap->pm_stats.resident_count += NBPDR /
3958				    PAGE_SIZE;
3959				pmap_pde_mappings++;
3960			}
3961			/* Else continue on if the PDE is already valid. */
3962			addr += NBPDR;
3963		}
3964		PMAP_UNLOCK(pmap);
3965	}
3966}
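
/*
 * Example (illustrative sketch): when a device or scatter/gather
 * object is mmap()ed with a 2/4MB-aligned offset and size, the VM map
 * layer may ask the pmap to premap the entire range with superpages.
 * "map", "addr", "offset", and "size" stand for the caller's mapping
 * parameters; the object lock must be held across the call.
 *
 *	VM_OBJECT_LOCK(object);
 *	if (object->type == OBJT_DEVICE || object->type == OBJT_SG)
 *		pmap_object_init_pt(vm_map_pmap(map), addr, object,
 *		    OFF_TO_IDX(offset), size);
 *	VM_OBJECT_UNLOCK(object);
 */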
3967
3968/*
3969 *	Routine:	pmap_change_wiring
3970 *	Function:	Change the wiring attribute for a map/virtual-address
3971 *			pair.
3972 *	In/out conditions:
3973 *			The mapping must already exist in the pmap.
3974 */
3975void
3976pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3977{
3978	pd_entry_t *pde;
3979	pt_entry_t *pte;
3980	boolean_t are_queues_locked;
3981
3982	are_queues_locked = FALSE;
3983retry:
3984	PMAP_LOCK(pmap);
3985	pde = pmap_pde(pmap, va);
3986	if ((*pde & PG_PS) != 0) {
3987		if (!wired != ((*pde & PG_W) == 0)) {
3988			if (!are_queues_locked) {
3989				are_queues_locked = TRUE;
3990				if (!rw_try_wlock(&pvh_global_lock)) {
3991					PMAP_UNLOCK(pmap);
3992					rw_wlock(&pvh_global_lock);
3993					goto retry;
3994				}
3995			}
3996			if (!pmap_demote_pde(pmap, pde, va))
3997				panic("pmap_change_wiring: demotion failed");
3998		} else
3999			goto out;
4000	}
4001	pte = pmap_pte(pmap, va);
4002
4003	if (wired && !pmap_pte_w(pte))
4004		pmap->pm_stats.wired_count++;
4005	else if (!wired && pmap_pte_w(pte))
4006		pmap->pm_stats.wired_count--;
4007
4008	/*
4009	 * Wiring is not a hardware characteristic so there is no need to
4010	 * invalidate TLB.
4011	 */
4012	pmap_pte_set_w(pte, wired);
4013	pmap_pte_release(pte);
4014out:
4015	if (are_queues_locked)
4016		rw_wunlock(&pvh_global_lock);
4017	PMAP_UNLOCK(pmap);
4018}
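
/*
 * Example (illustrative sketch): once every page in [sva, eva) has
 * been faulted in, a wiring pass can mark each existing mapping wired
 * so that it is not reclaimed while the wiring remains in effect.
 *
 *	for (va = sva; va < eva; va += PAGE_SIZE)
 *		pmap_change_wiring(pmap, va, TRUE);
 */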
4019
4020
4021
4022/*
4023 *	Copy the range specified by src_addr/len
4024 *	from the source map to the range dst_addr/len
4025 *	in the destination map.
4026 *
4027 *	This routine is only advisory and need not do anything.
4028 */
4029
4030void
4031pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4032    vm_offset_t src_addr)
4033{
4034	vm_page_t   free;
4035	vm_offset_t addr;
4036	vm_offset_t end_addr = src_addr + len;
4037	vm_offset_t pdnxt;
4038
4039	if (dst_addr != src_addr)
4040		return;
4041
4042	if (!pmap_is_current(src_pmap))
4043		return;
4044
4045	rw_wlock(&pvh_global_lock);
4046	if (dst_pmap < src_pmap) {
4047		PMAP_LOCK(dst_pmap);
4048		PMAP_LOCK(src_pmap);
4049	} else {
4050		PMAP_LOCK(src_pmap);
4051		PMAP_LOCK(dst_pmap);
4052	}
4053	sched_pin();
4054	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4055		pt_entry_t *src_pte, *dst_pte;
4056		vm_page_t dstmpte, srcmpte;
4057		pd_entry_t srcptepaddr;
4058		u_int ptepindex;
4059
4060		KASSERT(addr < UPT_MIN_ADDRESS,
4061		    ("pmap_copy: invalid to pmap_copy page tables"));
4062
4063		pdnxt = (addr + NBPDR) & ~PDRMASK;
4064		if (pdnxt < addr)
4065			pdnxt = end_addr;
4066		ptepindex = addr >> PDRSHIFT;
4067
4068		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4069		if (srcptepaddr == 0)
4070			continue;
4071
4072		if (srcptepaddr & PG_PS) {
4073			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4074				continue;
4075			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4076			    ((srcptepaddr & PG_MANAGED) == 0 ||
4077			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4078			    PG_PS_FRAME))) {
4079				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4080				    ~PG_W;
4081				dst_pmap->pm_stats.resident_count +=
4082				    NBPDR / PAGE_SIZE;
4083			}
4084			continue;
4085		}
4086
4087		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4088		KASSERT(srcmpte->wire_count > 0,
4089		    ("pmap_copy: source page table page is unused"));
4090
4091		if (pdnxt > end_addr)
4092			pdnxt = end_addr;
4093
4094		src_pte = vtopte(addr);
4095		while (addr < pdnxt) {
4096			pt_entry_t ptetemp;
4097			ptetemp = *src_pte;
4098			/*
4099			 * We only copy mappings of managed pages.
4100			 */
4101			if ((ptetemp & PG_MANAGED) != 0) {
4102				dstmpte = pmap_allocpte(dst_pmap, addr,
4103				    M_NOWAIT);
4104				if (dstmpte == NULL)
4105					goto out;
4106				dst_pte = pmap_pte_quick(dst_pmap, addr);
4107				if (*dst_pte == 0 &&
4108				    pmap_try_insert_pv_entry(dst_pmap, addr,
4109				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4110					/*
4111					 * Clear the wired, modified, and
4112					 * accessed (referenced) bits
4113					 * during the copy.
4114					 */
4115					*dst_pte = ptetemp & ~(PG_W | PG_M |
4116					    PG_A);
4117					dst_pmap->pm_stats.resident_count++;
4118	 			} else {
4119					free = NULL;
4120					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4121					    &free)) {
4122						pmap_invalidate_page(dst_pmap,
4123						    addr);
4124						pmap_free_zero_pages(free);
4125					}
4126					goto out;
4127				}
4128				if (dstmpte->wire_count >= srcmpte->wire_count)
4129					break;
4130			}
4131			addr += PAGE_SIZE;
4132			src_pte++;
4133		}
4134	}
4135out:
4136	sched_unpin();
4137	rw_wunlock(&pvh_global_lock);
4138	PMAP_UNLOCK(src_pmap);
4139	PMAP_UNLOCK(dst_pmap);
4140}
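
/*
 * Example (illustrative sketch): during fork, the child's address
 * space may be seeded with copies of the parent's managed mappings so
 * that it does not immediately fault on every resident page.  "vm1"
 * and "vm2" stand for the parent's and child's vmspace and "entry"
 * for a copied map entry; because this routine is advisory, skipping
 * the call is also correct.
 *
 *	pmap_copy(vmspace_pmap(vm2), vmspace_pmap(vm1), entry->start,
 *	    entry->end - entry->start, entry->start);
 */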
4141
4142static __inline void
4143pagezero(void *page)
4144{
4145#if defined(I686_CPU)
4146	if (cpu_class == CPUCLASS_686) {
4147#if defined(CPU_ENABLE_SSE)
4148		if (cpu_feature & CPUID_SSE2)
4149			sse2_pagezero(page);
4150		else
4151#endif
4152			i686_pagezero(page);
4153	} else
4154#endif
4155		bzero(page, PAGE_SIZE);
4156}
4157
4158/*
4159 *	pmap_zero_page zeros the specified hardware page by mapping
4160 *	the page into KVM and using bzero to clear its contents.
4161 */
4162void
4163pmap_zero_page(vm_page_t m)
4164{
4165	struct sysmaps *sysmaps;
4166
4167	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4168	mtx_lock(&sysmaps->lock);
4169	if (*sysmaps->CMAP2)
4170		panic("pmap_zero_page: CMAP2 busy");
4171	sched_pin();
4172	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4173	    pmap_cache_bits(m->md.pat_mode, 0);
4174	invlcaddr(sysmaps->CADDR2);
4175	pagezero(sysmaps->CADDR2);
4176	*sysmaps->CMAP2 = 0;
4177	sched_unpin();
4178	mtx_unlock(&sysmaps->lock);
4179}
4180
4181/*
4182 *	pmap_zero_page_area zeros the specified hardware page by mapping
4183 *	the page into KVM and using bzero to clear its contents.
4184 *
4185 *	off and size may not cover an area beyond a single hardware page.
4186 */
4187void
4188pmap_zero_page_area(vm_page_t m, int off, int size)
4189{
4190	struct sysmaps *sysmaps;
4191
4192	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4193	mtx_lock(&sysmaps->lock);
4194	if (*sysmaps->CMAP2)
4195		panic("pmap_zero_page_area: CMAP2 busy");
4196	sched_pin();
4197	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4198	    pmap_cache_bits(m->md.pat_mode, 0);
4199	invlcaddr(sysmaps->CADDR2);
4200	if (off == 0 && size == PAGE_SIZE)
4201		pagezero(sysmaps->CADDR2);
4202	else
4203		bzero((char *)sysmaps->CADDR2 + off, size);
4204	*sysmaps->CMAP2 = 0;
4205	sched_unpin();
4206	mtx_unlock(&sysmaps->lock);
4207}
4208
4209/*
4210 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4211 *	the page into KVM and using bzero to clear its contents.  This
4212 *	is intended to be called from the vm_pagezero process only and
4213 *	outside of Giant.
4214 */
4215void
4216pmap_zero_page_idle(vm_page_t m)
4217{
4218
4219	if (*CMAP3)
4220		panic("pmap_zero_page_idle: CMAP3 busy");
4221	sched_pin();
4222	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4223	    pmap_cache_bits(m->md.pat_mode, 0);
4224	invlcaddr(CADDR3);
4225	pagezero(CADDR3);
4226	*CMAP3 = 0;
4227	sched_unpin();
4228}
4229
4230/*
4231 *	pmap_copy_page copies the specified (machine independent)
4232 *	page by mapping the page into virtual memory and using
4233 *	bcopy to copy the page, one machine dependent page at a
4234 *	time.
4235 */
4236void
4237pmap_copy_page(vm_page_t src, vm_page_t dst)
4238{
4239	struct sysmaps *sysmaps;
4240
4241	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4242	mtx_lock(&sysmaps->lock);
4243	if (*sysmaps->CMAP1)
4244		panic("pmap_copy_page: CMAP1 busy");
4245	if (*sysmaps->CMAP2)
4246		panic("pmap_copy_page: CMAP2 busy");
4247	sched_pin();
4248	invlpg((u_int)sysmaps->CADDR1);
4249	invlpg((u_int)sysmaps->CADDR2);
4250	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4251	    pmap_cache_bits(src->md.pat_mode, 0);
4252	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4253	    pmap_cache_bits(dst->md.pat_mode, 0);
4254	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4255	*sysmaps->CMAP1 = 0;
4256	*sysmaps->CMAP2 = 0;
4257	sched_unpin();
4258	mtx_unlock(&sysmaps->lock);
4259}
4260
4261int unmapped_buf_allowed = 1;
4262
4263void
4264pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4265    vm_offset_t b_offset, int xfersize)
4266{
4267	struct sysmaps *sysmaps;
4268	vm_page_t a_pg, b_pg;
4269	char *a_cp, *b_cp;
4270	vm_offset_t a_pg_offset, b_pg_offset;
4271	int cnt;
4272
4273	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4274	mtx_lock(&sysmaps->lock);
4275	if (*sysmaps->CMAP1 != 0)
4276		panic("pmap_copy_pages: CMAP1 busy");
4277	if (*sysmaps->CMAP2 != 0)
4278		panic("pmap_copy_pages: CMAP2 busy");
4279	sched_pin();
4280	while (xfersize > 0) {
4281		invlpg((u_int)sysmaps->CADDR1);
4282		invlpg((u_int)sysmaps->CADDR2);
4283		a_pg = ma[a_offset >> PAGE_SHIFT];
4284		a_pg_offset = a_offset & PAGE_MASK;
4285		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4286		b_pg = mb[b_offset >> PAGE_SHIFT];
4287		b_pg_offset = b_offset & PAGE_MASK;
4288		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4289		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4290		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4291		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4292		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4293		a_cp = sysmaps->CADDR1 + a_pg_offset;
4294		b_cp = sysmaps->CADDR2 + b_pg_offset;
4295		bcopy(a_cp, b_cp, cnt);
4296		a_offset += cnt;
4297		b_offset += cnt;
4298		xfersize -= cnt;
4299	}
4300	*sysmaps->CMAP1 = 0;
4301	*sysmaps->CMAP2 = 0;
4302	sched_unpin();
4303	mtx_unlock(&sysmaps->lock);
4304}
4305
4306/*
4307 * Returns true if the pmap's pv is one of the first
4308 * 16 pvs linked to from this page.  This count may
4309 * be changed upwards or downwards in the future; it
4310 * is only necessary that true be returned for a small
4311 * subset of pmaps for proper page aging.
4312 */
4313boolean_t
4314pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4315{
4316	struct md_page *pvh;
4317	pv_entry_t pv;
4318	int loops = 0;
4319	boolean_t rv;
4320
4321	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4322	    ("pmap_page_exists_quick: page %p is not managed", m));
4323	rv = FALSE;
4324	rw_wlock(&pvh_global_lock);
4325	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4326		if (PV_PMAP(pv) == pmap) {
4327			rv = TRUE;
4328			break;
4329		}
4330		loops++;
4331		if (loops >= 16)
4332			break;
4333	}
4334	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4335		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4336		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4337			if (PV_PMAP(pv) == pmap) {
4338				rv = TRUE;
4339				break;
4340			}
4341			loops++;
4342			if (loops >= 16)
4343				break;
4344		}
4345	}
4346	rw_wunlock(&pvh_global_lock);
4347	return (rv);
4348}
4349
4350/*
4351 *	pmap_page_wired_mappings:
4352 *
4353 *	Return the number of managed mappings to the given physical page
4354 *	that are wired.
4355 */
4356int
4357pmap_page_wired_mappings(vm_page_t m)
4358{
4359	int count;
4360
4361	count = 0;
4362	if ((m->oflags & VPO_UNMANAGED) != 0)
4363		return (count);
4364	rw_wlock(&pvh_global_lock);
4365	count = pmap_pvh_wired_mappings(&m->md, count);
4366	if ((m->flags & PG_FICTITIOUS) == 0) {
4367		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4368		    count);
4369	}
4370	rw_wunlock(&pvh_global_lock);
4371	return (count);
4372}
4373
4374/*
4375 *	pmap_pvh_wired_mappings:
4376 *
4377 *	Return the updated number "count" of managed mappings that are wired.
4378 */
4379static int
4380pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4381{
4382	pmap_t pmap;
4383	pt_entry_t *pte;
4384	pv_entry_t pv;
4385
4386	rw_assert(&pvh_global_lock, RA_WLOCKED);
4387	sched_pin();
4388	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4389		pmap = PV_PMAP(pv);
4390		PMAP_LOCK(pmap);
4391		pte = pmap_pte_quick(pmap, pv->pv_va);
4392		if ((*pte & PG_W) != 0)
4393			count++;
4394		PMAP_UNLOCK(pmap);
4395	}
4396	sched_unpin();
4397	return (count);
4398}
4399
4400/*
4401 * Returns TRUE if the given page is mapped individually or as part of
4402 * a 4mpage.  Otherwise, returns FALSE.
4403 */
4404boolean_t
4405pmap_page_is_mapped(vm_page_t m)
4406{
4407	boolean_t rv;
4408
4409	if ((m->oflags & VPO_UNMANAGED) != 0)
4410		return (FALSE);
4411	rw_wlock(&pvh_global_lock);
4412	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4413	    ((m->flags & PG_FICTITIOUS) == 0 &&
4414	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4415	rw_wunlock(&pvh_global_lock);
4416	return (rv);
4417}
4418
4419/*
4420 * Remove all pages from the specified address space;
4421 * this aids process exit speeds.  Also, this code is
4422 * special-cased for the current process only, but it
4423 * can have the more generic (and slightly slower)
4424 * mode enabled.  This is much faster than pmap_remove
4425 * in the case of running down an entire address space.
4426 */
4427void
4428pmap_remove_pages(pmap_t pmap)
4429{
4430	pt_entry_t *pte, tpte;
4431	vm_page_t free = NULL;
4432	vm_page_t m, mpte, mt;
4433	pv_entry_t pv;
4434	struct md_page *pvh;
4435	struct pv_chunk *pc, *npc;
4436	int field, idx;
4437	int32_t bit;
4438	uint32_t inuse, bitmask;
4439	int allfree;
4440
4441	if (pmap != PCPU_GET(curpmap)) {
4442		printf("warning: pmap_remove_pages called with non-current pmap\n");
4443		return;
4444	}
4445	rw_wlock(&pvh_global_lock);
4446	PMAP_LOCK(pmap);
4447	sched_pin();
4448	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4449		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4450		    pc->pc_pmap));
4451		allfree = 1;
4452		for (field = 0; field < _NPCM; field++) {
4453			inuse = ~pc->pc_map[field] & pc_freemask[field];
4454			while (inuse != 0) {
4455				bit = bsfl(inuse);
4456				bitmask = 1UL << bit;
4457				idx = field * 32 + bit;
4458				pv = &pc->pc_pventry[idx];
4459				inuse &= ~bitmask;
4460
4461				pte = pmap_pde(pmap, pv->pv_va);
4462				tpte = *pte;
4463				if ((tpte & PG_PS) == 0) {
4464					pte = vtopte(pv->pv_va);
4465					tpte = *pte & ~PG_PTE_PAT;
4466				}
4467
4468				if (tpte == 0) {
4469					printf(
4470					    "TPTE at %p  IS ZERO @ VA %08x\n",
4471					    pte, pv->pv_va);
4472					panic("bad pte");
4473				}
4474
4475/*
4476 * We cannot remove wired pages from a process' mapping at this time
4477 */
4478				if (tpte & PG_W) {
4479					allfree = 0;
4480					continue;
4481				}
4482
4483				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4484				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4485				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4486				    m, (uintmax_t)m->phys_addr,
4487				    (uintmax_t)tpte));
4488
4489				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4490				    m < &vm_page_array[vm_page_array_size],
4491				    ("pmap_remove_pages: bad tpte %#jx",
4492				    (uintmax_t)tpte));
4493
4494				pte_clear(pte);
4495
4496				/*
4497				 * Update the vm_page_t clean/reference bits.
4498				 */
4499				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4500					if ((tpte & PG_PS) != 0) {
4501						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4502							vm_page_dirty(mt);
4503					} else
4504						vm_page_dirty(m);
4505				}
4506
4507				/* Mark free */
4508				PV_STAT(pv_entry_frees++);
4509				PV_STAT(pv_entry_spare++);
4510				pv_entry_count--;
4511				pc->pc_map[field] |= bitmask;
4512				if ((tpte & PG_PS) != 0) {
4513					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4514					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4515					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4516					if (TAILQ_EMPTY(&pvh->pv_list)) {
4517						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4518							if (TAILQ_EMPTY(&mt->md.pv_list))
4519								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4520					}
4521					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4522					if (mpte != NULL) {
4523						pmap_remove_pt_page(pmap, mpte);
4524						pmap->pm_stats.resident_count--;
4525						KASSERT(mpte->wire_count == NPTEPG,
4526						    ("pmap_remove_pages: pte page wire count error"));
4527						mpte->wire_count = 0;
4528						pmap_add_delayed_free_list(mpte, &free, FALSE);
4529						atomic_subtract_int(&cnt.v_wire_count, 1);
4530					}
4531				} else {
4532					pmap->pm_stats.resident_count--;
4533					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4534					if (TAILQ_EMPTY(&m->md.pv_list) &&
4535					    (m->flags & PG_FICTITIOUS) == 0) {
4536						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4537						if (TAILQ_EMPTY(&pvh->pv_list))
4538							vm_page_aflag_clear(m, PGA_WRITEABLE);
4539					}
4540					pmap_unuse_pt(pmap, pv->pv_va, &free);
4541				}
4542			}
4543		}
4544		if (allfree) {
4545			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4546			free_pv_chunk(pc);
4547		}
4548	}
4549	sched_unpin();
4550	pmap_invalidate_all(pmap);
4551	rw_wunlock(&pvh_global_lock);
4552	PMAP_UNLOCK(pmap);
4553	pmap_free_zero_pages(free);
4554}
4555
4556/*
4557 *	pmap_is_modified:
4558 *
4559 *	Return whether or not the specified physical page was modified
4560 *	in any physical maps.
4561 */
4562boolean_t
4563pmap_is_modified(vm_page_t m)
4564{
4565	boolean_t rv;
4566
4567	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4568	    ("pmap_is_modified: page %p is not managed", m));
4569
4570	/*
4571	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4572	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4573	 * is clear, no PTEs can have PG_M set.
4574	 */
4575	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4576	if ((m->oflags & VPO_BUSY) == 0 &&
4577	    (m->aflags & PGA_WRITEABLE) == 0)
4578		return (FALSE);
4579	rw_wlock(&pvh_global_lock);
4580	rv = pmap_is_modified_pvh(&m->md) ||
4581	    ((m->flags & PG_FICTITIOUS) == 0 &&
4582	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4583	rw_wunlock(&pvh_global_lock);
4584	return (rv);
4585}
4586
4587/*
4588 * Returns TRUE if any of the given mappings were used to modify
4589 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4590 * mappings are supported.
4591 */
4592static boolean_t
4593pmap_is_modified_pvh(struct md_page *pvh)
4594{
4595	pv_entry_t pv;
4596	pt_entry_t *pte;
4597	pmap_t pmap;
4598	boolean_t rv;
4599
4600	rw_assert(&pvh_global_lock, RA_WLOCKED);
4601	rv = FALSE;
4602	sched_pin();
4603	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4604		pmap = PV_PMAP(pv);
4605		PMAP_LOCK(pmap);
4606		pte = pmap_pte_quick(pmap, pv->pv_va);
4607		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4608		PMAP_UNLOCK(pmap);
4609		if (rv)
4610			break;
4611	}
4612	sched_unpin();
4613	return (rv);
4614}
4615
4616/*
4617 *	pmap_is_prefaultable:
4618 *
4619 *	Return whether or not the specified virtual address is eligible
4620 *	for prefault.
4621 */
4622boolean_t
4623pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4624{
4625	pd_entry_t *pde;
4626	pt_entry_t *pte;
4627	boolean_t rv;
4628
4629	rv = FALSE;
4630	PMAP_LOCK(pmap);
4631	pde = pmap_pde(pmap, addr);
4632	if (*pde != 0 && (*pde & PG_PS) == 0) {
4633		pte = vtopte(addr);
4634		rv = *pte == 0;
4635	}
4636	PMAP_UNLOCK(pmap);
4637	return (rv);
4638}
4639
4640/*
4641 *	pmap_is_referenced:
4642 *
4643 *	Return whether or not the specified physical page was referenced
4644 *	in any physical maps.
4645 */
4646boolean_t
4647pmap_is_referenced(vm_page_t m)
4648{
4649	boolean_t rv;
4650
4651	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4652	    ("pmap_is_referenced: page %p is not managed", m));
4653	rw_wlock(&pvh_global_lock);
4654	rv = pmap_is_referenced_pvh(&m->md) ||
4655	    ((m->flags & PG_FICTITIOUS) == 0 &&
4656	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4657	rw_wunlock(&pvh_global_lock);
4658	return (rv);
4659}
4660
4661/*
4662 * Returns TRUE if any of the given mappings were referenced and FALSE
4663 * otherwise.  Both page and 4mpage mappings are supported.
4664 */
4665static boolean_t
4666pmap_is_referenced_pvh(struct md_page *pvh)
4667{
4668	pv_entry_t pv;
4669	pt_entry_t *pte;
4670	pmap_t pmap;
4671	boolean_t rv;
4672
4673	rw_assert(&pvh_global_lock, RA_WLOCKED);
4674	rv = FALSE;
4675	sched_pin();
4676	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4677		pmap = PV_PMAP(pv);
4678		PMAP_LOCK(pmap);
4679		pte = pmap_pte_quick(pmap, pv->pv_va);
4680		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4681		PMAP_UNLOCK(pmap);
4682		if (rv)
4683			break;
4684	}
4685	sched_unpin();
4686	return (rv);
4687}
4688
4689/*
4690 * Clear the write and modified bits in each of the given page's mappings.
4691 */
4692void
4693pmap_remove_write(vm_page_t m)
4694{
4695	struct md_page *pvh;
4696	pv_entry_t next_pv, pv;
4697	pmap_t pmap;
4698	pd_entry_t *pde;
4699	pt_entry_t oldpte, *pte;
4700	vm_offset_t va;
4701
4702	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4703	    ("pmap_remove_write: page %p is not managed", m));
4704
4705	/*
4706	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4707	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4708	 * is clear, no page table entries need updating.
4709	 */
4710	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4711	if ((m->oflags & VPO_BUSY) == 0 &&
4712	    (m->aflags & PGA_WRITEABLE) == 0)
4713		return;
4714	rw_wlock(&pvh_global_lock);
4715	sched_pin();
4716	if ((m->flags & PG_FICTITIOUS) != 0)
4717		goto small_mappings;
4718	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4719	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4720		va = pv->pv_va;
4721		pmap = PV_PMAP(pv);
4722		PMAP_LOCK(pmap);
4723		pde = pmap_pde(pmap, va);
4724		if ((*pde & PG_RW) != 0)
4725			(void)pmap_demote_pde(pmap, pde, va);
4726		PMAP_UNLOCK(pmap);
4727	}
4728small_mappings:
4729	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4730		pmap = PV_PMAP(pv);
4731		PMAP_LOCK(pmap);
4732		pde = pmap_pde(pmap, pv->pv_va);
4733		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4734		    " a 4mpage in page %p's pv list", m));
4735		pte = pmap_pte_quick(pmap, pv->pv_va);
4736retry:
4737		oldpte = *pte;
4738		if ((oldpte & PG_RW) != 0) {
4739			/*
4740			 * Regardless of whether a pte is 32 or 64 bits
4741			 * in size, PG_RW and PG_M are among the least
4742			 * significant 32 bits.
4743			 */
4744			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4745			    oldpte & ~(PG_RW | PG_M)))
4746				goto retry;
4747			if ((oldpte & PG_M) != 0)
4748				vm_page_dirty(m);
4749			pmap_invalidate_page(pmap, pv->pv_va);
4750		}
4751		PMAP_UNLOCK(pmap);
4752	}
4753	vm_page_aflag_clear(m, PGA_WRITEABLE);
4754	sched_unpin();
4755	rw_wunlock(&pvh_global_lock);
4756}
4757
4758/*
4759 *	pmap_ts_referenced:
4760 *
4761 *	Return a count of reference bits for a page, clearing those bits.
4762 *	It is not necessary for every reference bit to be cleared, but it
4763 *	is necessary that 0 only be returned when there are truly no
4764 *	reference bits set.
4765 *
4766 *	XXX: The exact number of bits to check and clear is a matter that
4767 *	should be tested and standardized at some point in the future for
4768 *	optimal aging of shared pages.
4769 */
4770int
4771pmap_ts_referenced(vm_page_t m)
4772{
4773	struct md_page *pvh;
4774	pv_entry_t pv, pvf, pvn;
4775	pmap_t pmap;
4776	pd_entry_t oldpde, *pde;
4777	pt_entry_t *pte;
4778	vm_offset_t va;
4779	int rtval = 0;
4780
4781	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4782	    ("pmap_ts_referenced: page %p is not managed", m));
4783	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4784	rw_wlock(&pvh_global_lock);
4785	sched_pin();
4786	if ((m->flags & PG_FICTITIOUS) != 0)
4787		goto small_mappings;
4788	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4789		va = pv->pv_va;
4790		pmap = PV_PMAP(pv);
4791		PMAP_LOCK(pmap);
4792		pde = pmap_pde(pmap, va);
4793		oldpde = *pde;
4794		if ((oldpde & PG_A) != 0) {
4795			if (pmap_demote_pde(pmap, pde, va)) {
4796				if ((oldpde & PG_W) == 0) {
4797					/*
4798					 * Remove the mapping to a single page
4799					 * so that a subsequent access may
4800					 * repromote.  Since the underlying
4801					 * page table page is fully populated,
4802					 * this removal never frees a page
4803					 * table page.
4804					 */
4805					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4806					    PG_PS_FRAME);
4807					pmap_remove_page(pmap, va, NULL);
4808					rtval++;
4809					if (rtval > 4) {
4810						PMAP_UNLOCK(pmap);
4811						goto out;
4812					}
4813				}
4814			}
4815		}
4816		PMAP_UNLOCK(pmap);
4817	}
4818small_mappings:
4819	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4820		pvf = pv;
4821		do {
4822			pvn = TAILQ_NEXT(pv, pv_list);
4823			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4824			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4825			pmap = PV_PMAP(pv);
4826			PMAP_LOCK(pmap);
4827			pde = pmap_pde(pmap, pv->pv_va);
4828			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4829			    " found a 4mpage in page %p's pv list", m));
4830			pte = pmap_pte_quick(pmap, pv->pv_va);
4831			if ((*pte & PG_A) != 0) {
4832				atomic_clear_int((u_int *)pte, PG_A);
4833				pmap_invalidate_page(pmap, pv->pv_va);
4834				rtval++;
4835				if (rtval > 4)
4836					pvn = NULL;
4837			}
4838			PMAP_UNLOCK(pmap);
4839		} while ((pv = pvn) != NULL && pv != pvf);
4840	}
4841out:
4842	sched_unpin();
4843	rw_wunlock(&pvh_global_lock);
4844	return (rtval);
4845}
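
/*
 * Example (illustrative sketch): a page-daemon style aging loop can
 * fold the returned count into a per-page activity counter.  The
 * bookkeeping below is simplified relative to the real page daemon.
 *
 *	refs = pmap_ts_referenced(m);
 *	if (refs != 0)
 *		m->act_count = min(m->act_count + ACT_ADVANCE + refs,
 *		    ACT_MAX);
 *	else {
 *		m->act_count -= min(m->act_count, ACT_DECLINE);
 *		if (m->act_count == 0)
 *			vm_page_deactivate(m);
 *	}
 */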
4846
4847/*
4848 *	Clear the modify bits on the specified physical page.
4849 */
4850void
4851pmap_clear_modify(vm_page_t m)
4852{
4853	struct md_page *pvh;
4854	pv_entry_t next_pv, pv;
4855	pmap_t pmap;
4856	pd_entry_t oldpde, *pde;
4857	pt_entry_t oldpte, *pte;
4858	vm_offset_t va;
4859
4860	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4861	    ("pmap_clear_modify: page %p is not managed", m));
4862	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4863	KASSERT((m->oflags & VPO_BUSY) == 0,
4864	    ("pmap_clear_modify: page %p is busy", m));
4865
4866	/*
4867	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4868	 * If the object containing the page is locked and the page is not
4869	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4870	 */
4871	if ((m->aflags & PGA_WRITEABLE) == 0)
4872		return;
4873	rw_wlock(&pvh_global_lock);
4874	sched_pin();
4875	if ((m->flags & PG_FICTITIOUS) != 0)
4876		goto small_mappings;
4877	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4878	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4879		va = pv->pv_va;
4880		pmap = PV_PMAP(pv);
4881		PMAP_LOCK(pmap);
4882		pde = pmap_pde(pmap, va);
4883		oldpde = *pde;
4884		if ((oldpde & PG_RW) != 0) {
4885			if (pmap_demote_pde(pmap, pde, va)) {
4886				if ((oldpde & PG_W) == 0) {
4887					/*
4888					 * Write protect the mapping to a
4889					 * single page so that a subsequent
4890					 * write access may repromote.
4891					 */
4892					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4893					    PG_PS_FRAME);
4894					pte = pmap_pte_quick(pmap, va);
4895					oldpte = *pte;
4896					if ((oldpte & PG_V) != 0) {
4897						/*
4898						 * Regardless of whether a pte is 32 or 64 bits
4899						 * in size, PG_RW and PG_M are among the least
4900						 * significant 32 bits.
4901						 */
4902						while (!atomic_cmpset_int((u_int *)pte,
4903						    oldpte,
4904						    oldpte & ~(PG_M | PG_RW)))
4905							oldpte = *pte;
4906						vm_page_dirty(m);
4907						pmap_invalidate_page(pmap, va);
4908					}
4909				}
4910			}
4911		}
4912		PMAP_UNLOCK(pmap);
4913	}
4914small_mappings:
4915	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4916		pmap = PV_PMAP(pv);
4917		PMAP_LOCK(pmap);
4918		pde = pmap_pde(pmap, pv->pv_va);
4919		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4920		    " a 4mpage in page %p's pv list", m));
4921		pte = pmap_pte_quick(pmap, pv->pv_va);
4922		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4923			/*
4924			 * Regardless of whether a pte is 32 or 64 bits
4925			 * in size, PG_M is among the least significant
4926			 * 32 bits.
4927			 */
4928			atomic_clear_int((u_int *)pte, PG_M);
4929			pmap_invalidate_page(pmap, pv->pv_va);
4930		}
4931		PMAP_UNLOCK(pmap);
4932	}
4933	sched_unpin();
4934	rw_wunlock(&pvh_global_lock);
4935}
4936
4937/*
4938 *	pmap_clear_reference:
4939 *
4940 *	Clear the reference bit on the specified physical page.
4941 */
4942void
4943pmap_clear_reference(vm_page_t m)
4944{
4945	struct md_page *pvh;
4946	pv_entry_t next_pv, pv;
4947	pmap_t pmap;
4948	pd_entry_t oldpde, *pde;
4949	pt_entry_t *pte;
4950	vm_offset_t va;
4951
4952	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4953	    ("pmap_clear_reference: page %p is not managed", m));
4954	rw_wlock(&pvh_global_lock);
4955	sched_pin();
4956	if ((m->flags & PG_FICTITIOUS) != 0)
4957		goto small_mappings;
4958	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4959	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4960		va = pv->pv_va;
4961		pmap = PV_PMAP(pv);
4962		PMAP_LOCK(pmap);
4963		pde = pmap_pde(pmap, va);
4964		oldpde = *pde;
4965		if ((oldpde & PG_A) != 0) {
4966			if (pmap_demote_pde(pmap, pde, va)) {
4967				/*
4968				 * Remove the mapping to a single page so
4969				 * that a subsequent access may repromote.
4970				 * Since the underlying page table page is
4971				 * fully populated, this removal never frees
4972				 * a page table page.
4973				 */
4974				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4975				    PG_PS_FRAME);
4976				pmap_remove_page(pmap, va, NULL);
4977			}
4978		}
4979		PMAP_UNLOCK(pmap);
4980	}
4981small_mappings:
4982	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4983		pmap = PV_PMAP(pv);
4984		PMAP_LOCK(pmap);
4985		pde = pmap_pde(pmap, pv->pv_va);
4986		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4987		    " a 4mpage in page %p's pv list", m));
4988		pte = pmap_pte_quick(pmap, pv->pv_va);
4989		if ((*pte & PG_A) != 0) {
4990			/*
4991			 * Regardless of whether a pte is 32 or 64 bits
4992			 * in size, PG_A is among the least significant
4993			 * 32 bits.
4994			 */
4995			atomic_clear_int((u_int *)pte, PG_A);
4996			pmap_invalidate_page(pmap, pv->pv_va);
4997		}
4998		PMAP_UNLOCK(pmap);
4999	}
5000	sched_unpin();
5001	rw_wunlock(&pvh_global_lock);
5002}
5003
5004/*
5005 * Miscellaneous support routines follow
5006 */
5007
5008/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5009static __inline void
5010pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5011{
5012	u_int opte, npte;
5013
5014	/*
5015	 * The cache mode bits are all in the low 32-bits of the
5016	 * PTE, so we can just spin on updating the low 32-bits.
5017	 */
5018	do {
5019		opte = *(u_int *)pte;
5020		npte = opte & ~PG_PTE_CACHE;
5021		npte |= cache_bits;
5022	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5023}
5024
5025/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5026static __inline void
5027pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5028{
5029	u_int opde, npde;
5030
5031	/*
5032	 * The cache mode bits are all in the low 32-bits of the
5033	 * PDE, so we can just spin on updating the low 32-bits.
5034	 */
5035	do {
5036		opde = *(u_int *)pde;
5037		npde = opde & ~PG_PDE_CACHE;
5038		npde |= cache_bits;
5039	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5040}
5041
5042/*
5043 * Map a set of physical memory pages into the kernel virtual
5044 * address space. Return a pointer to where it is mapped. This
5045 * routine is intended to be used for mapping device memory,
5046 * NOT real memory.
5047 */
5048void *
5049pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5050{
5051	vm_offset_t va, offset;
5052	vm_size_t tmpsize;
5053
5054	offset = pa & PAGE_MASK;
5055	size = roundup(offset + size, PAGE_SIZE);
5056	pa = pa & PG_FRAME;
5057
5058	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5059		va = KERNBASE + pa;
5060	else
5061		va = kmem_alloc_nofault(kernel_map, size);
5062	if (!va)
5063		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5064
5065	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5066		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5067	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5068	pmap_invalidate_cache_range(va, va + size);
5069	return ((void *)(va + offset));
5070}
5071
5072void *
5073pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5074{
5075
5076	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5077}
5078
5079void *
5080pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5081{
5082
5083	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5084}
5085
5086void
5087pmap_unmapdev(vm_offset_t va, vm_size_t size)
5088{
5089	vm_offset_t base, offset;
5090
5091	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5092		return;
5093	base = trunc_page(va);
5094	offset = va & PAGE_MASK;
5095	size = roundup(offset + size, PAGE_SIZE);
5096	kmem_free(kernel_map, base, size);
5097}
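
/*
 * Example (illustrative sketch): a driver can map a device register
 * window uncacheable for programmed I/O and release it on detach.
 * "sc" and its members are hypothetical softc fields; most drivers
 * reach this functionality indirectly through bus_space(9).
 *
 *	sc->regs = pmap_mapdev(rman_get_start(sc->res),
 *	    rman_get_size(sc->res));
 *	...
 *	pmap_unmapdev((vm_offset_t)sc->regs, rman_get_size(sc->res));
 */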
5098
5099/*
5100 * Sets the memory attribute for the specified page.
5101 */
5102void
5103pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5104{
5105
5106	m->md.pat_mode = ma;
5107	if ((m->flags & PG_FICTITIOUS) != 0)
5108		return;
5109
5110	/*
5111	 * If "m" is a normal page, flush it from the cache.
5112	 * See pmap_invalidate_cache_range().
5113	 *
5114	 * First, try to find an existing mapping of the page by an sf
5115	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
5116	 * flushes the cache.
5117	 */
5118	if (sf_buf_invalidate_cache(m))
5119		return;
5120
5121	/*
5122	 * If the page is not mapped by an sf buffer and the CPU does
5123	 * not support self-snoop, map the page transiently and do the
5124	 * invalidation.  In the worst case, the whole cache is flushed
5125	 * by pmap_invalidate_cache_range().
5126	 */
5127	if ((cpu_feature & CPUID_SS) == 0)
5128		pmap_flush_page(m);
5129}
5130
5131static void
5132pmap_flush_page(vm_page_t m)
5133{
5134	struct sysmaps *sysmaps;
5135	vm_offset_t sva, eva;
5136
5137	if ((cpu_feature & CPUID_CLFSH) != 0) {
5138		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5139		mtx_lock(&sysmaps->lock);
5140		if (*sysmaps->CMAP2)
5141			panic("pmap_flush_page: CMAP2 busy");
5142		sched_pin();
5143		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5144		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5145		invlcaddr(sysmaps->CADDR2);
5146		sva = (vm_offset_t)sysmaps->CADDR2;
5147		eva = sva + PAGE_SIZE;
5148
5149		/*
5150		 * Use mfence despite the ordering implied by
5151		 * mtx_{un,}lock() because clflush is not guaranteed
5152		 * to be ordered by any other instruction.
5153		 */
5154		mfence();
5155		for (; sva < eva; sva += cpu_clflush_line_size)
5156			clflush(sva);
5157		mfence();
5158		*sysmaps->CMAP2 = 0;
5159		sched_unpin();
5160		mtx_unlock(&sysmaps->lock);
5161	} else
5162		pmap_invalidate_cache();
5163}
5164
5165/*
5166 * Changes the specified virtual address range's memory type to that given by
5167 * the parameter "mode".  The specified virtual address range must be
5168 * completely contained within the kernel map.
5169 *
5170 * Returns zero if the change completed successfully, and either EINVAL or
5171 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5172 * of the virtual address range was not mapped, and ENOMEM is returned if
5173 * there was insufficient memory available to complete the change.
5174 */
5175int
5176pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5177{
5178	vm_offset_t base, offset, tmpva;
5179	pd_entry_t *pde;
5180	pt_entry_t *pte;
5181	int cache_bits_pte, cache_bits_pde;
5182	boolean_t changed;
5183
5184	base = trunc_page(va);
5185	offset = va & PAGE_MASK;
5186	size = roundup(offset + size, PAGE_SIZE);
5187
5188	/*
5189	 * Only supported on kernel virtual addresses above the recursive map.
5190	 */
5191	if (base < VM_MIN_KERNEL_ADDRESS)
5192		return (EINVAL);
5193
5194	cache_bits_pde = pmap_cache_bits(mode, 1);
5195	cache_bits_pte = pmap_cache_bits(mode, 0);
5196	changed = FALSE;
5197
5198	/*
5199	 * Pages that aren't mapped aren't supported.  Also break down
5200	 * 2/4MB pages into 4KB pages if required.
5201	 */
5202	PMAP_LOCK(kernel_pmap);
5203	for (tmpva = base; tmpva < base + size; ) {
5204		pde = pmap_pde(kernel_pmap, tmpva);
5205		if (*pde == 0) {
5206			PMAP_UNLOCK(kernel_pmap);
5207			return (EINVAL);
5208		}
5209		if (*pde & PG_PS) {
5210			/*
5211			 * If the current 2/4MB page already has
5212			 * the required memory type, then we need not
5213			 * demote this page.  Just increment tmpva to
5214			 * the next 2/4MB page frame.
5215			 */
5216			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5217				tmpva = trunc_4mpage(tmpva) + NBPDR;
5218				continue;
5219			}
5220
5221			/*
5222			 * If the current offset aligns with a 2/4MB
5223			 * page frame and there is at least 2/4MB left
5224			 * within the range, then we need not break
5225			 * down this page into 4KB pages.
5226			 */
5227			if ((tmpva & PDRMASK) == 0 &&
5228			    tmpva + PDRMASK < base + size) {
5229				tmpva += NBPDR;
5230				continue;
5231			}
5232			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5233				PMAP_UNLOCK(kernel_pmap);
5234				return (ENOMEM);
5235			}
5236		}
5237		pte = vtopte(tmpva);
5238		if (*pte == 0) {
5239			PMAP_UNLOCK(kernel_pmap);
5240			return (EINVAL);
5241		}
5242		tmpva += PAGE_SIZE;
5243	}
5244	PMAP_UNLOCK(kernel_pmap);
5245
5246	/*
5247	 * Ok, all the pages exist, so run through them updating their
5248	 * cache mode if required.
5249	 */
5250	for (tmpva = base; tmpva < base + size; ) {
5251		pde = pmap_pde(kernel_pmap, tmpva);
5252		if (*pde & PG_PS) {
5253			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5254				pmap_pde_attr(pde, cache_bits_pde);
5255				changed = TRUE;
5256			}
5257			tmpva = trunc_4mpage(tmpva) + NBPDR;
5258		} else {
5259			pte = vtopte(tmpva);
5260			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5261				pmap_pte_attr(pte, cache_bits_pte);
5262				changed = TRUE;
5263			}
5264			tmpva += PAGE_SIZE;
5265		}
5266	}
5267
5268	/*
5269	 * Flush the CPU caches so that no data remains cached under
5270	 * the old memory type for any part of the changed range.
5271	 */
5272	if (changed) {
5273		pmap_invalidate_range(kernel_pmap, base, tmpva);
5274		pmap_invalidate_cache_range(base, tmpva);
5275	}
5276	return (0);
5277}
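
/*
 * Example (illustrative sketch): a graphics driver that has already
 * mapped its framebuffer into the kernel map can switch that mapping
 * to write-combining to speed up blits, tolerating failure.  The "sc"
 * softc and its fields are hypothetical.
 *
 *	if (pmap_change_attr(sc->fb_va, sc->fb_size,
 *	    PAT_WRITE_COMBINING) != 0)
 *		device_printf(sc->dev, "write combining not enabled\n");
 */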
5278
5279/*
5280 * perform the pmap work for mincore
5281 */
5282int
5283pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5284{
5285	pd_entry_t *pdep;
5286	pt_entry_t *ptep, pte;
5287	vm_paddr_t pa;
5288	int val;
5289
5290	PMAP_LOCK(pmap);
5291retry:
5292	pdep = pmap_pde(pmap, addr);
5293	if (*pdep != 0) {
5294		if (*pdep & PG_PS) {
5295			pte = *pdep;
5296			/* Compute the physical address of the 4KB page. */
5297			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5298			    PG_FRAME;
5299			val = MINCORE_SUPER;
5300		} else {
5301			ptep = pmap_pte(pmap, addr);
5302			pte = *ptep;
5303			pmap_pte_release(ptep);
5304			pa = pte & PG_FRAME;
5305			val = 0;
5306		}
5307	} else {
5308		pte = 0;
5309		pa = 0;
5310		val = 0;
5311	}
5312	if ((pte & PG_V) != 0) {
5313		val |= MINCORE_INCORE;
5314		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5315			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5316		if ((pte & PG_A) != 0)
5317			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5318	}
5319	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5320	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5321	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5322		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5323		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5324			goto retry;
5325	} else
5326		PA_UNLOCK_COND(*locked_pa);
5327	PMAP_UNLOCK(pmap);
5328	return (val);
5329}
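
/*
 * Example (illustrative sketch): the mincore() system call layer
 * queries one virtual page at a time and releases the page lock that
 * this routine may have acquired through "locked_pa".
 *
 *	locked_pa = 0;
 *	val = pmap_mincore(pmap, addr, &locked_pa);
 *	resident = (val & MINCORE_INCORE) != 0;
 *	PA_UNLOCK_COND(locked_pa);
 *
 * A non-zero return additionally carries MINCORE_SUPER when the
 * address is covered by a 2/4MB mapping, and the MINCORE_MODIFIED and
 * MINCORE_REFERENCED flags (and their _OTHER variants) for dirty and
 * referenced mappings.
 */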
5330
5331void
5332pmap_activate(struct thread *td)
5333{
5334	pmap_t	pmap, oldpmap;
5335	u_int	cpuid;
5336	u_int32_t  cr3;
5337
5338	critical_enter();
5339	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5340	oldpmap = PCPU_GET(curpmap);
5341	cpuid = PCPU_GET(cpuid);
5342#if defined(SMP)
5343	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5344	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5345#else
5346	CPU_CLR(cpuid, &oldpmap->pm_active);
5347	CPU_SET(cpuid, &pmap->pm_active);
5348#endif
5349#ifdef PAE
5350	cr3 = vtophys(pmap->pm_pdpt);
5351#else
5352	cr3 = vtophys(pmap->pm_pdir);
5353#endif
5354	/*
5355	 * pmap_activate is for the current thread on the current cpu
5356	 */
5357	td->td_pcb->pcb_cr3 = cr3;
5358	load_cr3(cr3);
5359	PCPU_SET(curpmap, pmap);
5360	critical_exit();
5361}
5362
5363void
5364pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5365{
5366}
5367
5368/*
5369 *	Increase the starting virtual address of the given mapping if a
5370 *	different alignment might result in more superpage mappings.
5371 */
5372void
5373pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5374    vm_offset_t *addr, vm_size_t size)
5375{
5376	vm_offset_t superpage_offset;
5377
5378	if (size < NBPDR)
5379		return;
5380	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5381		offset += ptoa(object->pg_color);
5382	superpage_offset = offset & PDRMASK;
5383	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5384	    (*addr & PDRMASK) == superpage_offset)
5385		return;
5386	if ((*addr & PDRMASK) < superpage_offset)
5387		*addr = (*addr & ~PDRMASK) + superpage_offset;
5388	else
5389		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5390}
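
/*
 * Worked example (illustrative): with 4MB superpages, suppose the
 * color-adjusted "offset" is 1MB past a 4MB boundary, the caller
 * proposed *addr = 0x20000000, and size = 8MB.  Then superpage_offset
 * is 1MB and *addr is advanced to 0x20100000, so that the virtual
 * address 0x20400000 corresponds to a 4MB-aligned object offset and
 * the 4MB region starting there becomes eligible for a superpage
 * mapping.  Without the adjustment, no part of the mapping would line
 * up with the object's reservations.
 */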
5391
5392
5393#if defined(PMAP_DEBUG)
5394int pmap_pid_dump(int pid)
5395{
5396	pmap_t pmap;
5397	struct proc *p;
5398	int npte = 0;
5399	int index;
5400
5401	sx_slock(&allproc_lock);
5402	FOREACH_PROC_IN_SYSTEM(p) {
5403		if (p->p_pid != pid)
5404			continue;
5405
5406		if (p->p_vmspace) {
5407			int i,j;
5408			index = 0;
5409			pmap = vmspace_pmap(p->p_vmspace);
5410			for (i = 0; i < NPDEPTD; i++) {
5411				pd_entry_t *pde;
5412				pt_entry_t *pte;
5413				vm_offset_t base = i << PDRSHIFT;
5414
5415				pde = &pmap->pm_pdir[i];
5416				if (pde && pmap_pde_v(pde)) {
5417					for (j = 0; j < NPTEPG; j++) {
5418						vm_offset_t va = base + (j << PAGE_SHIFT);
5419						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5420							if (index) {
5421								index = 0;
5422								printf("\n");
5423							}
5424							sx_sunlock(&allproc_lock);
5425							return (npte);
5426						}
5427						pte = pmap_pte(pmap, va);
5428						if (pte && pmap_pte_v(pte)) {
5429							pt_entry_t pa;
5430							vm_page_t m;
5431							pa = *pte;
5432							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5433							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5434								va, pa, m->hold_count, m->wire_count, m->flags);
5435							npte++;
5436							index++;
5437							if (index >= 2) {
5438								index = 0;
5439								printf("\n");
5440							} else {
5441								printf(" ");
5442							}
5443						}
5444					}
5445				}
5446			}
5447		}
5448	}
5449	sx_sunlock(&allproc_lock);
5450	return (npte);
5451}
5452#endif
5453
5454#if defined(DEBUG)
5455
5456static void	pads(pmap_t pm);
5457void		pmap_pvdump(vm_paddr_t pa);
5458
5459/* print address space of pmap*/
5460static void
5461pads(pmap_t pm)
5462{
5463	int i, j;
5464	vm_paddr_t va;
5465	vm_offset_t va;
5466
5467	if (pm == kernel_pmap)
5468		return;
5469	for (i = 0; i < NPDEPTD; i++)
5470		if (pm->pm_pdir[i])
5471			for (j = 0; j < NPTEPG; j++) {
5472				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5473				if (pm == kernel_pmap && va < KERNBASE)
5474					continue;
5475				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5476					continue;
5477				ptep = pmap_pte(pm, va);
5478				if (pmap_pte_v(ptep))
5479					printf("%x:%x ", va, *ptep);
5480			}
5481
5482}
5483
5484void
5485pmap_pvdump(vm_paddr_t pa)
5486{
5487	pv_entry_t pv;
5488	pmap_t pmap;
5489	vm_page_t m;
5490
5491	printf("pa %x", pa);
5492	m = PHYS_TO_VM_PAGE(pa);
5493	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5494		pmap = PV_PMAP(pv);
5495		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5496		pads(pmap);
5497	}
5498	printf(" ");
5499}
5500#endif
5501