pmap.c revision 251897
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: stable/9/sys/i386/i386/pmap.c 251897 2013-06-18 05:21:40Z scottl $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidation or protection-reduction
100 *	operations until such time as they are actually
101 *	necessary.  This module is given full information as
102 *	to which processors are currently using which maps,
103 *	and to when physical maps must be made correct.
104 */
105
106#include "opt_apic.h"
107#include "opt_cpu.h"
108#include "opt_pmap.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/rwlock.h>
123#include <sys/sf_buf.h>
124#include <sys/sx.h>
125#include <sys/vmmeter.h>
126#include <sys/sched.h>
127#include <sys/sysctl.h>
128#ifdef SMP
129#include <sys/smp.h>
130#else
131#include <sys/cpuset.h>
132#endif
133
134#include <vm/vm.h>
135#include <vm/vm_param.h>
136#include <vm/vm_kern.h>
137#include <vm/vm_page.h>
138#include <vm/vm_map.h>
139#include <vm/vm_object.h>
140#include <vm/vm_extern.h>
141#include <vm/vm_pageout.h>
142#include <vm/vm_pager.h>
143#include <vm/vm_reserv.h>
144#include <vm/uma.h>
145
146#ifdef DEV_APIC
147#include <sys/bus.h>
148#include <machine/intr_machdep.h>
149#include <machine/apicvar.h>
150#endif
151#include <machine/cpu.h>
152#include <machine/cputypes.h>
153#include <machine/md_var.h>
154#include <machine/pcb.h>
155#include <machine/specialreg.h>
156#ifdef SMP
157#include <machine/smp.h>
158#endif
159
160#ifdef XBOX
161#include <machine/xbox.h>
162#endif
163
164#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
165#define CPU_ENABLE_SSE
166#endif
167
168#ifndef PMAP_SHPGPERPROC
169#define PMAP_SHPGPERPROC 200
170#endif
171
172#if !defined(DIAGNOSTIC)
173#ifdef __GNUC_GNU_INLINE__
174#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
175#else
176#define PMAP_INLINE	extern inline
177#endif
178#else
179#define PMAP_INLINE
180#endif
181
182#ifdef PV_STATS
183#define PV_STAT(x)	do { x ; } while (0)
184#else
185#define PV_STAT(x)	do { } while (0)
186#endif
187
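/*
 * pa_index() converts a physical address into the index of the 2/4MB
 * superpage that contains it; pa_to_pvh() returns the corresponding pv
 * list head in pv_table, which has one entry per such superpage.
 */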
188#define	pa_index(pa)	((pa) >> PDRSHIFT)
189#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
190
191/*
192 * Get PDEs and PTEs for user/kernel address space
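 * pmap_pde() returns the address of the PDE that maps "v" in pmap "m";
 * pdir_pde() performs the same lookup on a raw page directory array.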
193 */
194#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
195#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
196
197#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
198#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
199#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
200#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
201#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
202
203#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
204    atomic_clear_int((u_int *)(pte), PG_W))
205#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
206
207struct pmap kernel_pmap_store;
208LIST_HEAD(pmaplist, pmap);
209static struct pmaplist allpmaps;
210static struct mtx allpmaps_lock;
211
212vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
213vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
214int pgeflag = 0;		/* PG_G or-in */
215int pseflag = 0;		/* PG_PS or-in */
216
217static int nkpt = NKPT;
218vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
219extern u_int32_t KERNend;
220extern u_int32_t KPTphys;
221
222#ifdef PAE
223pt_entry_t pg_nx;
224static uma_zone_t pdptzone;
225#endif
226
227static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
228
229static int pat_works = 1;
230SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
231    "Is page attribute table fully functional?");
232
233static int pg_ps_enabled = 1;
234SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
235    "Are large page mappings enabled?");
236
237#define	PAT_INDEX_SIZE	8
238static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
239
240/*
241 * Isolate the global pv list lock from data and other locks to prevent false
242 * sharing within the cache.
243 */
244static struct {
245	struct rwlock	lock;
246	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
247} pvh_global __aligned(CACHE_LINE_SIZE);
248
249#define	pvh_global_lock	pvh_global.lock
250
251/*
252 * Data for the pv entry allocation mechanism
253 */
254static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
255static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
256static struct md_page *pv_table;
257static int shpgperproc = PMAP_SHPGPERPROC;
258
259struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
260int pv_maxchunks;			/* How many chunks we have KVA for */
261vm_offset_t pv_vafree;			/* freelist stored in the PTE */
262
263/*
264 * All those kernel PT submaps that BSD is so fond of
265 */
266struct sysmaps {
267	struct	mtx lock;
268	pt_entry_t *CMAP1;
269	pt_entry_t *CMAP2;
270	caddr_t	CADDR1;
271	caddr_t	CADDR2;
272};
273static struct sysmaps sysmaps_pcpu[MAXCPU];
274pt_entry_t *CMAP1 = 0;
275static pt_entry_t *CMAP3;
276static pd_entry_t *KPTD;
277caddr_t CADDR1 = 0, ptvmmap = 0;
278static caddr_t CADDR3;
279struct msgbuf *msgbufp = 0;
280
281/*
282 * Crashdump maps.
283 */
284static caddr_t crashdumpmap;
285
286static pt_entry_t *PMAP1 = 0, *PMAP2;
287static pt_entry_t *PADDR1 = 0, *PADDR2;
288#ifdef SMP
289static int PMAP1cpu;
290static int PMAP1changedcpu;
291SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
292	   &PMAP1changedcpu, 0,
293	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
294#endif
295static int PMAP1changed;
296SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
297	   &PMAP1changed, 0,
298	   "Number of times pmap_pte_quick changed PMAP1");
299static int PMAP1unchanged;
300SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
301	   &PMAP1unchanged, 0,
302	   "Number of times pmap_pte_quick didn't change PMAP1");
303static struct mtx PMAP2mutex;
304
305static void	free_pv_chunk(struct pv_chunk *pc);
306static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
307static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
308static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
309static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
310static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
311static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
312static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
313		    vm_offset_t va);
314static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
315
316static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
317static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
318    vm_prot_t prot);
319static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
320    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
321static void pmap_flush_page(vm_page_t m);
322static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
323static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
324static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
325static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
326static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
327static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
328static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
329static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
330static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
331static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
332    vm_prot_t prot);
333static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
334static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
335    vm_page_t *free);
336static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
337    vm_page_t *free);
338static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
339static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
340    vm_page_t *free);
341static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
342					vm_offset_t va);
343static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
344static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
345    vm_page_t m);
346static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
347    pd_entry_t newpde);
348static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
349
350static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
351
352static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
353static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
354static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
355static void pmap_pte_release(pt_entry_t *pte);
356static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
357#ifdef PAE
358static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
359#endif
360static void pmap_set_pg(void);
361
362static __inline void pagezero(void *page);
363
364CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
365CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
366
367/*
368 * If you get an error here, then you set KVA_PAGES wrong! See the
369 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
370 * a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
371 */
372CTASSERT(KERNBASE % (1 << 24) == 0);
373
374/*
375 *	Bootstrap the system enough to run with virtual memory.
376 *
377 *	On the i386 this is called after mapping has already been enabled
378 *	and just syncs the pmap module with what has already been done.
379 *	[We can't call it easily with mapping off since the kernel is not
380 *	mapped with PA == VA, hence we would have to relocate every address
381 *	from the linked base (virtual) address "KERNBASE" to the actual
382 *	(physical) address starting relative to 0]
383 */
384void
385pmap_bootstrap(vm_paddr_t firstaddr)
386{
387	vm_offset_t va;
388	pt_entry_t *pte, *unused;
389	struct sysmaps *sysmaps;
390	int i;
391
392	/*
393	 * Initialize the first available kernel virtual address.  However,
394	 * using "firstaddr" may waste a few pages of the kernel virtual
395	 * address space, because locore may not have mapped every physical
396	 * page that it allocated.  Preferably, locore would provide a first
397	 * unused virtual address in addition to "firstaddr".
398	 */
399	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
400
401	virtual_end = VM_MAX_KERNEL_ADDRESS;
402
403	/*
404	 * Initialize the kernel pmap (which is statically allocated).
405	 */
406	PMAP_LOCK_INIT(kernel_pmap);
407	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
408#ifdef PAE
409	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
410#endif
411	kernel_pmap->pm_root = NULL;
412	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
413	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
414
415 	/*
416	 * Initialize the global pv list lock.
417	 */
418	rw_init(&pvh_global_lock, "pmap pv global");
419
420	LIST_INIT(&allpmaps);
421
422	/*
423	 * Request a spin mutex so that changes to allpmaps cannot be
424	 * preempted by smp_rendezvous_cpus().  Otherwise,
425	 * pmap_update_pde_kernel() could access allpmaps while it is
426	 * being changed.
427	 */
428	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
429	mtx_lock_spin(&allpmaps_lock);
430	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
431	mtx_unlock_spin(&allpmaps_lock);
432
433	/*
434	 * Reserve some special page table entries/VA space for temporary
435	 * mapping of pages.
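	 * SYSMAP(c, p, v, n) carves "n" pages of KVA out of "va", stores that
	 * address in "v" (cast to type "c"), and records a pointer to the
	 * first of the corresponding PTEs in "p".  It relies on "va" and
	 * "pte" being advanced in lockstep below.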
436	 */
437#define	SYSMAP(c, p, v, n)	\
438	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
439
440	va = virtual_avail;
441	pte = vtopte(va);
442
443	/*
444	 * CMAP1/CMAP2 are used for zeroing and copying pages.
445	 * CMAP3 is used for the idle process page zeroing.
446	 */
447	for (i = 0; i < MAXCPU; i++) {
448		sysmaps = &sysmaps_pcpu[i];
449		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
450		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
451		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
452	}
453	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
454	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
455
456	/*
457	 * Crashdump maps.
458	 */
459	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
460
461	/*
462	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
463	 */
464	SYSMAP(caddr_t, unused, ptvmmap, 1)
465
466	/*
467	 * msgbufp is used to map the system message buffer.
468	 */
469	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
470
471	/*
472	 * KPTmap is used by pmap_kextract().
473	 *
474	 * KPTmap is first initialized by locore.  However, that initial
475	 * KPTmap can only support NKPT page table pages.  Here, a larger
476	 * KPTmap is created that can support KVA_PAGES page table pages.
477	 */
478	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
479
480	for (i = 0; i < NKPT; i++)
481		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
482
483	/*
484	 * Adjust the start of the KPTD and KPTmap so that the implementation
485	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
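	 * With this bias applied, KPTmap[i386_btop(va)] and
	 * KPTD[va >> PDRSHIFT] can be indexed directly with a kernel
	 * virtual address.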
486	 */
487	KPTD -= KPTDI;
488	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
489
490	/*
491	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
492	 * respectively.
493	 */
494	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
495	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
496
497	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
498
499	virtual_avail = va;
500
501	/*
502	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
503	 * physical memory region that is used by the ACPI wakeup code.  This
504	 * mapping must not have PG_G set.
505	 */
506#ifdef XBOX
507	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
508	 * an early stage, we cannot yet neatly map video memory ... :-(
509	 * Better fixes are very welcome! */
510	if (!arch_i386_is_xbox)
511#endif
512	for (i = 1; i < NKPT; i++)
513		PTD[i] = 0;
514
515	/* Initialize the PAT MSR if present. */
516	pmap_init_pat();
517
518	/* Turn on PG_G on kernel page(s) */
519	pmap_set_pg();
520}
521
522/*
523 * Setup the PAT MSR.
524 */
525void
526pmap_init_pat(void)
527{
528	int pat_table[PAT_INDEX_SIZE];
529	uint64_t pat_msr;
530	u_long cr0, cr4;
531	int i;
532
533	/* Set default PAT index table. */
534	for (i = 0; i < PAT_INDEX_SIZE; i++)
535		pat_table[i] = -1;
536	pat_table[PAT_WRITE_BACK] = 0;
537	pat_table[PAT_WRITE_THROUGH] = 1;
538	pat_table[PAT_UNCACHEABLE] = 3;
539	pat_table[PAT_WRITE_COMBINING] = 3;
540	pat_table[PAT_WRITE_PROTECTED] = 3;
541	pat_table[PAT_UNCACHED] = 3;
542
543	/* Bail if this CPU doesn't implement PAT. */
544	if ((cpu_feature & CPUID_PAT) == 0) {
545		for (i = 0; i < PAT_INDEX_SIZE; i++)
546			pat_index[i] = pat_table[i];
547		pat_works = 0;
548		return;
549	}
550
551	/*
552	 * Due to some Intel errata, we can only safely use the lower 4
553	 * PAT entries.
554	 *
555	 *   Intel Pentium III Processor Specification Update
556	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
557	 * or Mode C Paging)
558	 *
559	 *   Intel Pentium IV  Processor Specification Update
560	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
561	 */
562	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
563	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
564		pat_works = 0;
565
566	/* Initialize default PAT entries. */
567	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
568	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
569	    PAT_VALUE(2, PAT_UNCACHED) |
570	    PAT_VALUE(3, PAT_UNCACHEABLE) |
571	    PAT_VALUE(4, PAT_WRITE_BACK) |
572	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
573	    PAT_VALUE(6, PAT_UNCACHED) |
574	    PAT_VALUE(7, PAT_UNCACHEABLE);
575
576	if (pat_works) {
577		/*
578		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
579		 * Program 5 and 6 as WP and WC.
580		 * Leave 4 and 7 as WB and UC.
581		 */
582		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
583		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
584		    PAT_VALUE(6, PAT_WRITE_COMBINING);
585		pat_table[PAT_UNCACHED] = 2;
586		pat_table[PAT_WRITE_PROTECTED] = 5;
587		pat_table[PAT_WRITE_COMBINING] = 6;
588	} else {
589		/*
590		 * Just replace PAT Index 2 with WC instead of UC-.
591		 */
592		pat_msr &= ~PAT_MASK(2);
593		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
594		pat_table[PAT_WRITE_COMBINING] = 2;
595	}
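	/*
	 * The resulting layout is 0 = WB, 1 = WT, 2 = UC-, 3 = UC, 4 = WB,
	 * 5 = WP, 6 = WC, and 7 = UC when the full table is usable;
	 * otherwise only entry 2 changes, from UC- to WC.
	 */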
596
597	/* Disable PGE. */
598	cr4 = rcr4();
599	load_cr4(cr4 & ~CR4_PGE);
600
601	/* Disable caches (CD = 1, NW = 0). */
602	cr0 = rcr0();
603	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
604
605	/* Flush caches and TLBs. */
606	wbinvd();
607	invltlb();
608
609	/* Update PAT and index table. */
610	wrmsr(MSR_PAT, pat_msr);
611	for (i = 0; i < PAT_INDEX_SIZE; i++)
612		pat_index[i] = pat_table[i];
613
614	/* Flush caches and TLBs again. */
615	wbinvd();
616	invltlb();
617
618	/* Restore caches and PGE. */
619	load_cr0(cr0);
620	load_cr4(cr4);
621}
622
623/*
624 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
625 */
626static void
627pmap_set_pg(void)
628{
629	pt_entry_t *pte;
630	vm_offset_t va, endva;
631
632	if (pgeflag == 0)
633		return;
634
635	endva = KERNBASE + KERNend;
636
637	if (pseflag) {
638		va = KERNBASE + KERNLOAD;
639		while (va  < endva) {
640			pdir_pde(PTD, va) |= pgeflag;
641			invltlb();	/* Play it safe, invltlb() every time */
642			va += NBPDR;
643		}
644	} else {
645		va = (vm_offset_t)btext;
646		while (va < endva) {
647			pte = vtopte(va);
648			if (*pte)
649				*pte |= pgeflag;
650			invltlb();	/* Play it safe, invltlb() every time */
651			va += PAGE_SIZE;
652		}
653	}
654}
655
656/*
657 * Initialize a vm_page's machine-dependent fields.
658 */
659void
660pmap_page_init(vm_page_t m)
661{
662
663	TAILQ_INIT(&m->md.pv_list);
664	m->md.pat_mode = PAT_WRITE_BACK;
665}
666
667#ifdef PAE
668static void *
669pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
670{
671
672	/* Inform UMA that this allocator uses kernel_map/object. */
673	*flags = UMA_SLAB_KERNEL;
674	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
675	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
676}
677#endif
678
679/*
680 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
681 * Requirements:
682 *  - Must deal with pages in order to ensure that none of the PG_* bits
683 *    are ever set, PG_V in particular.
684 *  - Assumes we can write to ptes without pte_store() atomic ops, even
685 *    on PAE systems.  This should be ok.
686 *  - Assumes nothing will ever test these addresses for 0 to indicate
687 *    no mapping instead of correctly checking PG_V.
688 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
689 * Because PG_V is never set, there can be no mappings to invalidate.
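 * The PTE of each free KVA page holds the virtual address of the next free
 * page, so "*head" together with the PTEs forms a singly linked free list.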
690 */
691static vm_offset_t
692pmap_ptelist_alloc(vm_offset_t *head)
693{
694	pt_entry_t *pte;
695	vm_offset_t va;
696
697	va = *head;
698	if (va == 0)
699		return (va);	/* Out of memory */
700	pte = vtopte(va);
701	*head = *pte;
702	if (*head & PG_V)
703		panic("pmap_ptelist_alloc: va with PG_V set!");
704	*pte = 0;
705	return (va);
706}
707
708static void
709pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
710{
711	pt_entry_t *pte;
712
713	if (va & PG_V)
714		panic("pmap_ptelist_free: freeing va with PG_V set!");
715	pte = vtopte(va);
716	*pte = *head;		/* virtual! PG_V is 0 though */
717	*head = va;
718}
719
720static void
721pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
722{
723	int i;
724	vm_offset_t va;
725
726	*head = 0;
727	for (i = npages - 1; i >= 0; i--) {
728		va = (vm_offset_t)base + i * PAGE_SIZE;
729		pmap_ptelist_free(head, va);
730	}
731}
732
733
734/*
735 *	Initialize the pmap module.
736 *	Called by vm_init, to initialize any structures that the pmap
737 *	system needs to map virtual memory.
738 */
739void
740pmap_init(void)
741{
742	vm_page_t mpte;
743	vm_size_t s;
744	int i, pv_npg;
745
746	/*
747	 * Initialize the vm page array entries for the kernel pmap's
748	 * page table pages.
749	 */
750	for (i = 0; i < NKPT; i++) {
751		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
752		KASSERT(mpte >= vm_page_array &&
753		    mpte < &vm_page_array[vm_page_array_size],
754		    ("pmap_init: page table page is out of range"));
755		mpte->pindex = i + KPTDI;
756		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
757	}
758
759	/*
760	 * Initialize the address space (zone) for the pv entries.  Set a
761	 * high water mark so that the system can recover from excessive
762	 * numbers of pv entries.
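	 * Entries are allocated in chunks of _NPCPV, so the limit is rounded
	 * up to a whole chunk, and the high water mark is 90% of the limit.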
763	 */
764	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
765	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
766	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
767	pv_entry_max = roundup(pv_entry_max, _NPCPV);
768	pv_entry_high_water = 9 * (pv_entry_max / 10);
769
770	/*
771	 * If the kernel is running in a virtual machine on an AMD Family 10h
772	 * processor, then it must assume that MCA is enabled by the virtual
773	 * machine monitor.
774	 */
775	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
776	    CPUID_TO_FAMILY(cpu_id) == 0x10)
777		workaround_erratum383 = 1;
778
779	/*
780	 * Are large page mappings supported and enabled?
781	 */
782	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
783	if (pseflag == 0)
784		pg_ps_enabled = 0;
785	else if (pg_ps_enabled) {
786		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
787		    ("pmap_init: can't assign to pagesizes[1]"));
788		pagesizes[1] = NBPDR;
789	}
790
791	/*
792	 * Calculate the size of the pv head table for superpages.
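	 * One pv list head is needed for each 2/4MB superpage of physical
	 * memory, up to the end of the last phys_avail[] segment.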
793	 */
794	for (i = 0; phys_avail[i + 1]; i += 2);
795	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
796
797	/*
798	 * Allocate memory for the pv head table for superpages.
799	 */
800	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
801	s = round_page(s);
802	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
803	for (i = 0; i < pv_npg; i++)
804		TAILQ_INIT(&pv_table[i].pv_list);
805
806	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
807	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
808	    PAGE_SIZE * pv_maxchunks);
809	if (pv_chunkbase == NULL)
810		panic("pmap_init: not enough kvm for pv chunks");
811	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
812#ifdef PAE
813	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
814	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
815	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
816	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
817#endif
818}
819
820
821SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
822	"Max number of PV entries");
823SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
824	"Page share factor per proc");
825
826static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
827    "2/4MB page mapping counters");
828
829static u_long pmap_pde_demotions;
830SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
831    &pmap_pde_demotions, 0, "2/4MB page demotions");
832
833static u_long pmap_pde_mappings;
834SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
835    &pmap_pde_mappings, 0, "2/4MB page mappings");
836
837static u_long pmap_pde_p_failures;
838SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
839    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
840
841static u_long pmap_pde_promotions;
842SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
843    &pmap_pde_promotions, 0, "2/4MB page promotions");
844
845/***************************************************
846 * Low level helper routines.....
847 ***************************************************/
848
849/*
850 * Determine the appropriate bits to set in a PTE or PDE for a specified
851 * caching mode.
852 */
853int
854pmap_cache_bits(int mode, boolean_t is_pde)
855{
856	int cache_bits, pat_flag, pat_idx;
857
858	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
859		panic("Unknown caching mode %d\n", mode);
860
861	/* The PAT bit is different for PTE's and PDE's. */
862	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
863
864	/* Map the caching mode to a PAT index. */
865	pat_idx = pat_index[mode];
866
867	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
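	/* For example, PAT index 6 (110b) selects PAT and PCD but not PWT. */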
868	cache_bits = 0;
869	if (pat_idx & 0x4)
870		cache_bits |= pat_flag;
871	if (pat_idx & 0x2)
872		cache_bits |= PG_NC_PCD;
873	if (pat_idx & 0x1)
874		cache_bits |= PG_NC_PWT;
875	return (cache_bits);
876}
877
878/*
879 * The caller is responsible for maintaining TLB consistency.
880 */
881static void
882pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
883{
884	pd_entry_t *pde;
885	pmap_t pmap;
886	boolean_t PTD_updated;
887
888	PTD_updated = FALSE;
889	mtx_lock_spin(&allpmaps_lock);
890	LIST_FOREACH(pmap, &allpmaps, pm_list) {
891		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
892		    PG_FRAME))
893			PTD_updated = TRUE;
894		pde = pmap_pde(pmap, va);
895		pde_store(pde, newpde);
896	}
897	mtx_unlock_spin(&allpmaps_lock);
898	KASSERT(PTD_updated,
899	    ("pmap_kenter_pde: current page table is not in allpmaps"));
900}
901
902/*
903 * After changing the page size for the specified virtual address in the page
904 * table, flush the corresponding entries from the processor's TLB.  Only the
905 * calling processor's TLB is affected.
906 *
907 * The calling thread must be pinned to a processor.
908 */
909static void
910pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
911{
912	u_long cr4;
913
914	if ((newpde & PG_PS) == 0)
915		/* Demotion: flush a specific 2MB page mapping. */
916		invlpg(va);
917	else if ((newpde & PG_G) == 0)
918		/*
919		 * Promotion: flush every 4KB page mapping from the TLB
920		 * because there are too many to flush individually.
921		 */
922		invltlb();
923	else {
924		/*
925		 * Promotion: flush every 4KB page mapping from the TLB,
926		 * including any global (PG_G) mappings.
927		 */
928		cr4 = rcr4();
929		load_cr4(cr4 & ~CR4_PGE);
930		/*
931		 * Although preemption at this point could be detrimental to
932		 * performance, it would not lead to an error.  PG_G is simply
933		 * ignored if CR4.PGE is clear.  Moreover, in case this block
934		 * is re-entered, the load_cr4() either above or below will
935		 * modify CR4.PGE flushing the TLB.
936		 */
937		load_cr4(cr4 | CR4_PGE);
938	}
939}
940#ifdef SMP
941/*
942 * For SMP, these functions have to use the IPI mechanism for coherence.
943 *
944 * N.B.: Before calling any of the following TLB invalidation functions,
945 * the calling processor must ensure that all stores updating a non-
946 * kernel page table are globally performed.  Otherwise, another
947 * processor could cache an old, pre-update entry without being
948 * invalidated.  This can happen one of two ways: (1) The pmap becomes
949 * active on another processor after its pm_active field is checked by
950 * one of the following functions but before a store updating the page
951 * table is globally performed. (2) The pmap becomes active on another
952 * processor before its pm_active field is checked but due to
953 * speculative loads one of the following functions still reads the
954 * pmap as inactive on the other processor.
955 *
956 * The kernel page table is exempt because its pm_active field is
957 * immutable.  The kernel page table is always active on every
958 * processor.
959 */
960void
961pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
962{
963	cpuset_t other_cpus;
964	u_int cpuid;
965
966	sched_pin();
967	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
968		invlpg(va);
969		smp_invlpg(va);
970	} else {
971		cpuid = PCPU_GET(cpuid);
972		other_cpus = all_cpus;
973		CPU_CLR(cpuid, &other_cpus);
974		if (CPU_ISSET(cpuid, &pmap->pm_active))
975			invlpg(va);
976		CPU_AND(&other_cpus, &pmap->pm_active);
977		if (!CPU_EMPTY(&other_cpus))
978			smp_masked_invlpg(other_cpus, va);
979	}
980	sched_unpin();
981}
982
983void
984pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
985{
986	cpuset_t other_cpus;
987	vm_offset_t addr;
988	u_int cpuid;
989
990	sched_pin();
991	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
992		for (addr = sva; addr < eva; addr += PAGE_SIZE)
993			invlpg(addr);
994		smp_invlpg_range(sva, eva);
995	} else {
996		cpuid = PCPU_GET(cpuid);
997		other_cpus = all_cpus;
998		CPU_CLR(cpuid, &other_cpus);
999		if (CPU_ISSET(cpuid, &pmap->pm_active))
1000			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1001				invlpg(addr);
1002		CPU_AND(&other_cpus, &pmap->pm_active);
1003		if (!CPU_EMPTY(&other_cpus))
1004			smp_masked_invlpg_range(other_cpus, sva, eva);
1005	}
1006	sched_unpin();
1007}
1008
1009void
1010pmap_invalidate_all(pmap_t pmap)
1011{
1012	cpuset_t other_cpus;
1013	u_int cpuid;
1014
1015	sched_pin();
1016	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1017		invltlb();
1018		smp_invltlb();
1019	} else {
1020		cpuid = PCPU_GET(cpuid);
1021		other_cpus = all_cpus;
1022		CPU_CLR(cpuid, &other_cpus);
1023		if (CPU_ISSET(cpuid, &pmap->pm_active))
1024			invltlb();
1025		CPU_AND(&other_cpus, &pmap->pm_active);
1026		if (!CPU_EMPTY(&other_cpus))
1027			smp_masked_invltlb(other_cpus);
1028	}
1029	sched_unpin();
1030}
1031
1032void
1033pmap_invalidate_cache(void)
1034{
1035
1036	sched_pin();
1037	wbinvd();
1038	smp_cache_flush();
1039	sched_unpin();
1040}
1041
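/*
 * A pde_action describes a single PDE update performed during an
 * smp_rendezvous_cpus() call: the CPU named by "store" writes the new PDE,
 * and every CPU in "invalidate" flushes the old translation from its TLB.
 */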
1042struct pde_action {
1043	cpuset_t invalidate;	/* processors that invalidate their TLB */
1044	vm_offset_t va;
1045	pd_entry_t *pde;
1046	pd_entry_t newpde;
1047	u_int store;		/* processor that updates the PDE */
1048};
1049
1050static void
1051pmap_update_pde_kernel(void *arg)
1052{
1053	struct pde_action *act = arg;
1054	pd_entry_t *pde;
1055	pmap_t pmap;
1056
1057	if (act->store == PCPU_GET(cpuid)) {
1058
1059		/*
1060		 * Elsewhere, this operation requires allpmaps_lock for
1061		 * synchronization.  Here, it does not because it is being
1062		 * performed in the context of an all_cpus rendezvous.
1063		 */
1064		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1065			pde = pmap_pde(pmap, act->va);
1066			pde_store(pde, act->newpde);
1067		}
1068	}
1069}
1070
1071static void
1072pmap_update_pde_user(void *arg)
1073{
1074	struct pde_action *act = arg;
1075
1076	if (act->store == PCPU_GET(cpuid))
1077		pde_store(act->pde, act->newpde);
1078}
1079
1080static void
1081pmap_update_pde_teardown(void *arg)
1082{
1083	struct pde_action *act = arg;
1084
1085	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1086		pmap_update_pde_invalidate(act->va, act->newpde);
1087}
1088
1089/*
1090 * Change the page size for the specified virtual address in a way that
1091 * prevents any possibility of the TLB ever having two entries that map the
1092 * same virtual address using different page sizes.  This is the recommended
1093 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1094 * machine check exception for a TLB state that is improperly diagnosed as a
1095 * hardware error.
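 *
 * When other processors may have the mapping cached, the update is done
 * inside an smp_rendezvous_cpus() call: one CPU stores the new PDE while
 * every CPU that might hold the old translation invalidates it before
 * resuming.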
1096 */
1097static void
1098pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1099{
1100	struct pde_action act;
1101	cpuset_t active, other_cpus;
1102	u_int cpuid;
1103
1104	sched_pin();
1105	cpuid = PCPU_GET(cpuid);
1106	other_cpus = all_cpus;
1107	CPU_CLR(cpuid, &other_cpus);
1108	if (pmap == kernel_pmap)
1109		active = all_cpus;
1110	else
1111		active = pmap->pm_active;
1112	if (CPU_OVERLAP(&active, &other_cpus)) {
1113		act.store = cpuid;
1114		act.invalidate = active;
1115		act.va = va;
1116		act.pde = pde;
1117		act.newpde = newpde;
1118		CPU_SET(cpuid, &active);
1119		smp_rendezvous_cpus(active,
1120		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1121		    pmap_update_pde_kernel : pmap_update_pde_user,
1122		    pmap_update_pde_teardown, &act);
1123	} else {
1124		if (pmap == kernel_pmap)
1125			pmap_kenter_pde(va, newpde);
1126		else
1127			pde_store(pde, newpde);
1128		if (CPU_ISSET(cpuid, &active))
1129			pmap_update_pde_invalidate(va, newpde);
1130	}
1131	sched_unpin();
1132}
1133#else /* !SMP */
1134/*
1135 * Normal, non-SMP, 486+ invalidation functions.
1136 * We inline these within pmap.c for speed.
1137 */
1138PMAP_INLINE void
1139pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1140{
1141
1142	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1143		invlpg(va);
1144}
1145
1146PMAP_INLINE void
1147pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1148{
1149	vm_offset_t addr;
1150
1151	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1152		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1153			invlpg(addr);
1154}
1155
1156PMAP_INLINE void
1157pmap_invalidate_all(pmap_t pmap)
1158{
1159
1160	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1161		invltlb();
1162}
1163
1164PMAP_INLINE void
1165pmap_invalidate_cache(void)
1166{
1167
1168	wbinvd();
1169}
1170
1171static void
1172pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1173{
1174
1175	if (pmap == kernel_pmap)
1176		pmap_kenter_pde(va, newpde);
1177	else
1178		pde_store(pde, newpde);
1179	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1180		pmap_update_pde_invalidate(va, newpde);
1181}
1182#endif /* !SMP */
1183
1184#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
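/*
 * Ranges of PMAP_CLFLUSH_THRESHOLD bytes or more are flushed with a full
 * cache invalidation rather than a CLFLUSH loop.
 */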
1185
1186void
1187pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1188{
1189
1190	KASSERT((sva & PAGE_MASK) == 0,
1191	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1192	KASSERT((eva & PAGE_MASK) == 0,
1193	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1194
1195	if (cpu_feature & CPUID_SS)
1196		; /* If "Self Snoop" is supported, do nothing. */
1197	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1198	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1199
1200#ifdef DEV_APIC
1201		/*
1202		 * XXX: Some CPUs fault, hang, or trash the local APIC
1203		 * registers if we use CLFLUSH on the local APIC
1204		 * range.  The local APIC is always uncached, so we
1205		 * don't need to flush for that range anyway.
1206		 */
1207		if (pmap_kextract(sva) == lapic_paddr)
1208			return;
1209#endif
1210		/*
1211		 * Otherwise, do a per-cache-line flush.  Use the mfence
1212		 * instruction to ensure that previous stores are
1213		 * included in the write-back.  The processor
1214		 * propagates the flush to other processors in the cache
1215		 * coherence domain.
1216		 */
1217		mfence();
1218		for (; sva < eva; sva += cpu_clflush_line_size)
1219			clflush(sva);
1220		mfence();
1221	} else {
1222
1223		/*
1224		 * No targeted cache flush methods are supported by the CPU,
1225		 * or the supplied range is bigger than 2MB.
1226		 * Globally invalidate cache.
1227		 */
1228		pmap_invalidate_cache();
1229	}
1230}
1231
1232void
1233pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1234{
1235	int i;
1236
1237	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1238	    (cpu_feature & CPUID_CLFSH) == 0) {
1239		pmap_invalidate_cache();
1240	} else {
1241		for (i = 0; i < count; i++)
1242			pmap_flush_page(pages[i]);
1243	}
1244}
1245
1246/*
1247 * Are we current address space or kernel?  N.B. We return FALSE when
1248 * a pmap's page table is in use because a kernel thread is borrowing
1249 * it.  The borrowed page table can change spontaneously, making any
1250 * dependence on its continued use subject to a race condition.
1251 */
1252static __inline int
1253pmap_is_current(pmap_t pmap)
1254{
1255
1256	return (pmap == kernel_pmap ||
1257	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1258	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1259}
1260
1261/*
1262 * If the given pmap is not the current or kernel pmap, the returned pte must
1263 * be released by passing it to pmap_pte_release().
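 * Such a pte is mapped through the reserved PMAP2/PADDR2 window while
 * PMAP2mutex is held.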
1264 */
1265pt_entry_t *
1266pmap_pte(pmap_t pmap, vm_offset_t va)
1267{
1268	pd_entry_t newpf;
1269	pd_entry_t *pde;
1270
1271	pde = pmap_pde(pmap, va);
1272	if (*pde & PG_PS)
1273		return (pde);
1274	if (*pde != 0) {
1275		/* are we current address space or kernel? */
1276		if (pmap_is_current(pmap))
1277			return (vtopte(va));
1278		mtx_lock(&PMAP2mutex);
1279		newpf = *pde & PG_FRAME;
1280		if ((*PMAP2 & PG_FRAME) != newpf) {
1281			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1282			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1283		}
1284		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1285	}
1286	return (NULL);
1287}
1288
1289/*
1290 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1291 * being NULL.
1292 */
1293static __inline void
1294pmap_pte_release(pt_entry_t *pte)
1295{
1296
1297	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1298		mtx_unlock(&PMAP2mutex);
1299}
1300
1301static __inline void
1302invlcaddr(void *caddr)
1303{
1304
1305	invlpg((u_int)caddr);
1306}
1307
1308/*
1309 * Super fast pmap_pte routine best used when scanning
1310 * the pv lists.  This eliminates many coarse-grained
1311 * invltlb calls.  Note that many of the pv list
1312 * scans are across different pmaps.  It is very wasteful
1313 * to do an entire invltlb for checking a single mapping.
1314 *
1315 * If the given pmap is not the current pmap, pvh_global_lock
1316 * must be held and curthread pinned to a CPU.
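 * The lookup is made through the reserved PMAP1/PADDR1 window, which is why
 * the lock and the pinning are required.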
1317 */
1318static pt_entry_t *
1319pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1320{
1321	pd_entry_t newpf;
1322	pd_entry_t *pde;
1323
1324	pde = pmap_pde(pmap, va);
1325	if (*pde & PG_PS)
1326		return (pde);
1327	if (*pde != 0) {
1328		/* are we current address space or kernel? */
1329		if (pmap_is_current(pmap))
1330			return (vtopte(va));
1331		rw_assert(&pvh_global_lock, RA_WLOCKED);
1332		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1333		newpf = *pde & PG_FRAME;
1334		if ((*PMAP1 & PG_FRAME) != newpf) {
1335			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1336#ifdef SMP
1337			PMAP1cpu = PCPU_GET(cpuid);
1338#endif
1339			invlcaddr(PADDR1);
1340			PMAP1changed++;
1341		} else
1342#ifdef SMP
1343		if (PMAP1cpu != PCPU_GET(cpuid)) {
1344			PMAP1cpu = PCPU_GET(cpuid);
1345			invlcaddr(PADDR1);
1346			PMAP1changedcpu++;
1347		} else
1348#endif
1349			PMAP1unchanged++;
1350		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1351	}
1352	return (0);
1353}
1354
1355/*
1356 *	Routine:	pmap_extract
1357 *	Function:
1358 *		Extract the physical page address associated
1359 *		with the given map/virtual_address pair.
1360 */
1361vm_paddr_t
1362pmap_extract(pmap_t pmap, vm_offset_t va)
1363{
1364	vm_paddr_t rtval;
1365	pt_entry_t *pte;
1366	pd_entry_t pde;
1367
1368	rtval = 0;
1369	PMAP_LOCK(pmap);
1370	pde = pmap->pm_pdir[va >> PDRSHIFT];
1371	if (pde != 0) {
1372		if ((pde & PG_PS) != 0)
1373			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1374		else {
1375			pte = pmap_pte(pmap, va);
1376			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1377			pmap_pte_release(pte);
1378		}
1379	}
1380	PMAP_UNLOCK(pmap);
1381	return (rtval);
1382}
1383
1384/*
1385 *	Routine:	pmap_extract_and_hold
1386 *	Function:
1387 *		Atomically extract and hold the physical page
1388 *		with the given pmap and virtual address pair
1389 *		if that mapping permits the given protection.
1390 */
1391vm_page_t
1392pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1393{
1394	pd_entry_t pde;
1395	pt_entry_t pte, *ptep;
1396	vm_page_t m;
1397	vm_paddr_t pa;
1398
1399	pa = 0;
1400	m = NULL;
1401	PMAP_LOCK(pmap);
1402retry:
1403	pde = *pmap_pde(pmap, va);
1404	if (pde != 0) {
1405		if (pde & PG_PS) {
1406			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1407				if (vm_page_pa_tryrelock(pmap, (pde &
1408				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1409					goto retry;
1410				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1411				    (va & PDRMASK));
1412				vm_page_hold(m);
1413			}
1414		} else {
1415			ptep = pmap_pte(pmap, va);
1416			pte = *ptep;
1417			pmap_pte_release(ptep);
1418			if (pte != 0 &&
1419			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1420				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1421				    &pa))
1422					goto retry;
1423				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1424				vm_page_hold(m);
1425			}
1426		}
1427	}
1428	PA_UNLOCK_COND(pa);
1429	PMAP_UNLOCK(pmap);
1430	return (m);
1431}
1432
1433/***************************************************
1434 * Low level mapping routines.....
1435 ***************************************************/
1436
1437/*
1438 * Add a wired page to the kva.
1439 * Note: not SMP coherent.
1440 *
1441 * This function may be used before pmap_bootstrap() is called.
1442 */
1443PMAP_INLINE void
1444pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1445{
1446	pt_entry_t *pte;
1447
1448	pte = vtopte(va);
1449	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1450}
1451
1452static __inline void
1453pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1454{
1455	pt_entry_t *pte;
1456
1457	pte = vtopte(va);
1458	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1459}
1460
1461/*
1462 * Remove a page from the kernel pagetables.
1463 * Note: not SMP coherent.
1464 *
1465 * This function may be used before pmap_bootstrap() is called.
1466 */
1467PMAP_INLINE void
1468pmap_kremove(vm_offset_t va)
1469{
1470	pt_entry_t *pte;
1471
1472	pte = vtopte(va);
1473	pte_clear(pte);
1474}
1475
1476/*
1477 *	Used to map a range of physical addresses into kernel
1478 *	virtual address space.
1479 *
1480 *	The value passed in '*virt' is a suggested virtual address for
1481 *	the mapping. Architectures which can support a direct-mapped
1482 *	physical to virtual region can return the appropriate address
1483 *	within that region, leaving '*virt' unchanged. Other
1484 *	architectures should map the pages starting at '*virt' and
1485 *	update '*virt' with the first usable address after the mapped
1486 *	region.
1487 */
1488vm_offset_t
1489pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1490{
1491	vm_offset_t va, sva;
1492	vm_paddr_t superpage_offset;
1493	pd_entry_t newpde;
1494
1495	va = *virt;
1496	/*
1497	 * Does the physical address range's size and alignment permit at
1498	 * least one superpage mapping to be created?
1499	 */
1500	superpage_offset = start & PDRMASK;
1501	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1502		/*
1503		 * Increase the starting virtual address so that its alignment
1504		 * does not preclude the use of superpage mappings.
1505		 */
1506		if ((va & PDRMASK) < superpage_offset)
1507			va = (va & ~PDRMASK) + superpage_offset;
1508		else if ((va & PDRMASK) > superpage_offset)
1509			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1510	}
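	/*
	 * If a superpage mapping is possible, "va" now has the same offset
	 * within a 2/4MB frame as "start", so aligned chunks of the range
	 * can be mapped with PDEs in the loop below.
	 */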
1511	sva = va;
1512	while (start < end) {
1513		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1514		    pseflag) {
1515			KASSERT((va & PDRMASK) == 0,
1516			    ("pmap_map: misaligned va %#x", va));
1517			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1518			pmap_kenter_pde(va, newpde);
1519			va += NBPDR;
1520			start += NBPDR;
1521		} else {
1522			pmap_kenter(va, start);
1523			va += PAGE_SIZE;
1524			start += PAGE_SIZE;
1525		}
1526	}
1527	pmap_invalidate_range(kernel_pmap, sva, va);
1528	*virt = va;
1529	return (sva);
1530}
1531
1532
1533/*
1534 * Add a list of wired pages to the kva.
1535 * This routine is only used for temporary
1536 * kernel mappings that do not need to have
1537 * page modification or references recorded.
1538 * Note that old mappings are simply written
1539 * over.  The page *must* be wired.
1540 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1541 */
1542void
1543pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1544{
1545	pt_entry_t *endpte, oldpte, pa, *pte;
1546	vm_page_t m;
1547
1548	oldpte = 0;
1549	pte = vtopte(sva);
1550	endpte = pte + count;
1551	while (pte < endpte) {
1552		m = *ma++;
1553		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1554		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1555			oldpte |= *pte;
1556			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1557		}
1558		pte++;
1559	}
1560	if (__predict_false((oldpte & PG_V) != 0))
1561		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1562		    PAGE_SIZE);
1563}
1564
1565/*
1566 * This routine tears out page mappings from the
1567 * kernel -- it is meant only for temporary mappings.
1568 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1569 */
1570void
1571pmap_qremove(vm_offset_t sva, int count)
1572{
1573	vm_offset_t va;
1574
1575	va = sva;
1576	while (count-- > 0) {
1577		pmap_kremove(va);
1578		va += PAGE_SIZE;
1579	}
1580	pmap_invalidate_range(kernel_pmap, sva, va);
1581}
1582
1583/***************************************************
1584 * Page table page management routines.....
1585 ***************************************************/
1586static __inline void
1587pmap_free_zero_pages(vm_page_t free)
1588{
1589	vm_page_t m;
1590
1591	while (free != NULL) {
1592		m = free;
1593		free = m->right;
1594		/* Preserve the page's PG_ZERO setting. */
1595		vm_page_free_toq(m);
1596	}
1597}
1598
1599/*
1600 * Schedule the specified unused page table page to be freed.  Specifically,
1601 * add the page to the specified list of pages that will be released to the
1602 * physical memory manager after the TLB has been updated.
1603 */
1604static __inline void
1605pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1606{
1607
1608	if (set_PG_ZERO)
1609		m->flags |= PG_ZERO;
1610	else
1611		m->flags &= ~PG_ZERO;
1612	m->right = *free;
1613	*free = m;
1614}
1615
1616/*
1617 * Inserts the specified page table page into the specified pmap's collection
1618 * of idle page table pages.  Each of a pmap's page table pages is responsible
1619 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1620 * ordered by this virtual address range.
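 * The collection is implemented as a splay tree, keyed by pindex and
 * threaded through the vm_page's left and right fields.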
1621 */
1622static void
1623pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1624{
1625	vm_page_t root;
1626
1627	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1628	root = pmap->pm_root;
1629	if (root == NULL) {
1630		mpte->left = NULL;
1631		mpte->right = NULL;
1632	} else {
1633		root = vm_page_splay(mpte->pindex, root);
1634		if (mpte->pindex < root->pindex) {
1635			mpte->left = root->left;
1636			mpte->right = root;
1637			root->left = NULL;
1638		} else if (mpte->pindex == root->pindex)
1639			panic("pmap_insert_pt_page: pindex already inserted");
1640		else {
1641			mpte->right = root->right;
1642			mpte->left = root;
1643			root->right = NULL;
1644		}
1645	}
1646	pmap->pm_root = mpte;
1647}
1648
1649/*
1650 * Looks for a page table page mapping the specified virtual address in the
1651 * specified pmap's collection of idle page table pages.  Returns NULL if there
1652 * is no page table page corresponding to the specified virtual address.
1653 */
1654static vm_page_t
1655pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1656{
1657	vm_page_t mpte;
1658	vm_pindex_t pindex = va >> PDRSHIFT;
1659
1660	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1661	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1662		mpte = vm_page_splay(pindex, mpte);
1663		if ((pmap->pm_root = mpte)->pindex != pindex)
1664			mpte = NULL;
1665	}
1666	return (mpte);
1667}
1668
1669/*
1670 * Removes the specified page table page from the specified pmap's collection
1671 * of idle page table pages.  The specified page table page must be a member of
1672 * the pmap's collection.
1673 */
1674static void
1675pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1676{
1677	vm_page_t root;
1678
1679	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1680	if (mpte != pmap->pm_root)
1681		vm_page_splay(mpte->pindex, pmap->pm_root);
1682	if (mpte->left == NULL)
1683		root = mpte->right;
1684	else {
1685		root = vm_page_splay(mpte->pindex, mpte->left);
1686		root->right = mpte->right;
1687	}
1688	pmap->pm_root = root;
1689}
1690
1691/*
1692 * Decrements a page table page's wire count, which is used to record the
1693 * number of valid page table entries within the page.  If the wire count
1694 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1695 * page table page was unmapped and FALSE otherwise.
1696 */
1697static inline boolean_t
1698pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1699{
1700
1701	--m->wire_count;
1702	if (m->wire_count == 0) {
1703		_pmap_unwire_ptp(pmap, m, free);
1704		return (TRUE);
1705	} else
1706		return (FALSE);
1707}
1708
1709static void
1710_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1711{
1712	vm_offset_t pteva;
1713
1714	/*
1715	 * unmap the page table page
1716	 */
1717	pmap->pm_pdir[m->pindex] = 0;
1718	--pmap->pm_stats.resident_count;
1719
1720	/*
1721	 * This is a release store so that the ordinary store unmapping
1722	 * the page table page is globally performed before TLB shoot-
1723	 * down is begun.
1724	 */
1725	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1726
1727	/*
1728	 * Do an invltlb to make the invalidated mapping
1729	 * take effect immediately.
1730	 */
1731	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1732	pmap_invalidate_page(pmap, pteva);
1733
1734	/*
1735	 * Put page on a list so that it is released after
1736	 * *ALL* TLB shootdown is done
1737	 */
1738	pmap_add_delayed_free_list(m, free, TRUE);
1739}
1740
1741/*
1742 * After removing a page table entry, this routine is used to
1743 * conditionally free the page, and manage the hold/wire counts.
1744 */
1745static int
1746pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1747{
1748	pd_entry_t ptepde;
1749	vm_page_t mpte;
1750
1751	if (va >= VM_MAXUSER_ADDRESS)
1752		return (0);
1753	ptepde = *pmap_pde(pmap, va);
1754	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1755	return (pmap_unwire_ptp(pmap, mpte, free));
1756}
1757
1758/*
1759 * Initialize the pmap for the swapper process.
1760 */
1761void
1762pmap_pinit0(pmap_t pmap)
1763{
1764
1765	PMAP_LOCK_INIT(pmap);
1766	/*
1767	 * Since the page table directory is shared with the kernel pmap,
1768	 * which is already included in the list "allpmaps", this pmap does
1769	 * not need to be inserted into that list.
1770	 */
1771	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1772#ifdef PAE
1773	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1774#endif
1775	pmap->pm_root = NULL;
1776	CPU_ZERO(&pmap->pm_active);
1777	PCPU_SET(curpmap, pmap);
1778	TAILQ_INIT(&pmap->pm_pvchunk);
1779	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1780}
1781
1782/*
1783 * Initialize a preallocated and zeroed pmap structure,
1784 * such as one in a vmspace structure.
1785 */
1786int
1787pmap_pinit(pmap_t pmap)
1788{
1789	vm_page_t m, ptdpg[NPGPTD];
1790	vm_paddr_t pa;
1791	int i;
1792
1793	PMAP_LOCK_INIT(pmap);
1794
1795	/*
1796	 * No need to allocate page table space yet but we do need a valid
1797	 * page directory table.
1798	 */
1799	if (pmap->pm_pdir == NULL) {
1800		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1801		    NBPTD);
1802		if (pmap->pm_pdir == NULL) {
1803			PMAP_LOCK_DESTROY(pmap);
1804			return (0);
1805		}
1806#ifdef PAE
1807		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1808		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1809		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1810		    ("pmap_pinit: pdpt misaligned"));
1811		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1812		    ("pmap_pinit: pdpt above 4g"));
1813#endif
1814		pmap->pm_root = NULL;
1815	}
1816	KASSERT(pmap->pm_root == NULL,
1817	    ("pmap_pinit: pmap has reserved page table page(s)"));
1818
1819	/*
1820	 * allocate the page directory page(s)
1821	 */
1822	for (i = 0; i < NPGPTD;) {
1823		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1824		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1825		if (m == NULL)
1826			VM_WAIT;
1827		else {
1828			ptdpg[i++] = m;
1829		}
1830	}
1831
1832	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1833
1834	for (i = 0; i < NPGPTD; i++)
1835		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1836			pagezero(pmap->pm_pdir + (i * NPDEPG));
1837
1838	mtx_lock_spin(&allpmaps_lock);
1839	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1840	/* Copy the kernel page table directory entries. */
1841	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1842	mtx_unlock_spin(&allpmaps_lock);
1843
1844	/* Install the self-referential address mapping entries. */
1845	for (i = 0; i < NPGPTD; i++) {
1846		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1847		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1848#ifdef PAE
1849		pmap->pm_pdpt[i] = pa | PG_V;
1850#endif
1851	}
1852
1853	CPU_ZERO(&pmap->pm_active);
1854	TAILQ_INIT(&pmap->pm_pvchunk);
1855	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1856
1857	return (1);
1858}
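/*
 * A note on the self-referential entries installed above: pointing
 * NPGPTD consecutive PDEs (starting at PTDPTDI) back at the page
 * directory pages makes the page directory double as a page table, so
 * every page table page becomes visible in a fixed virtual window.
 * Roughly, that is what lets vtopte() be computed as
 *
 *	pte = PTmap + i386_btop(va);
 *
 * without walking the page directory by hand (see the uses of vtopte()
 * elsewhere in this file).
 */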
1859
1860/*
1861 * This routine is called when the needed page table page is not
1862 * resident: it allocates and maps a new page table page.
1863 */
1864static vm_page_t
1865_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
1866{
1867	vm_paddr_t ptepa;
1868	vm_page_t m;
1869
1870	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1871	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1872	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1873
1874	/*
1875	 * Allocate a page table page.
1876	 */
1877	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1878	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1879		if (flags & M_WAITOK) {
1880			PMAP_UNLOCK(pmap);
1881			rw_wunlock(&pvh_global_lock);
1882			VM_WAIT;
1883			rw_wlock(&pvh_global_lock);
1884			PMAP_LOCK(pmap);
1885		}
1886
1887		/*
1888		 * Indicate the need to retry.  While waiting, the page table
1889		 * page may have been allocated.
1890		 */
1891		return (NULL);
1892	}
1893	if ((m->flags & PG_ZERO) == 0)
1894		pmap_zero_page(m);
1895
1896	/*
1897	 * Map the page table page into the process address space, if
1898	 * it isn't already there.
1899	 */
1900
1901	pmap->pm_stats.resident_count++;
1902
1903	ptepa = VM_PAGE_TO_PHYS(m);
1904	pmap->pm_pdir[ptepindex] =
1905		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1906
1907	return (m);
1908}
1909
1910static vm_page_t
1911pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1912{
1913	u_int ptepindex;
1914	pd_entry_t ptepa;
1915	vm_page_t m;
1916
1917	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1918	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1919	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1920
1921	/*
1922	 * Calculate the page table page index.
1923	 */
1924	ptepindex = va >> PDRSHIFT;
1925retry:
1926	/*
1927	 * Get the page directory entry
1928	 */
1929	ptepa = pmap->pm_pdir[ptepindex];
1930
1931	/*
1932	 * This supports switching from a 4MB page to a
1933	 * normal 4K page.
1934	 */
1935	if (ptepa & PG_PS) {
1936		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1937		ptepa = pmap->pm_pdir[ptepindex];
1938	}
1939
1940	/*
1941	 * If the page table page is mapped, we just increment the
1942	 * wire count and activate it.
1943	 */
1944	if (ptepa) {
1945		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1946		m->wire_count++;
1947	} else {
1948		/*
1949		 * The page table page isn't mapped, or it has been
1950		 * deallocated; allocate a new one.
1951		 */
1952		m = _pmap_allocpte(pmap, ptepindex, flags);
1953		if (m == NULL && (flags & M_WAITOK))
1954			goto retry;
1955	}
1956	return (m);
1957}
1958
1959
1960/***************************************************
1961 * Pmap allocation/deallocation routines.
1962 ***************************************************/
1963
1964#ifdef SMP
1965/*
1966 * Deal with an SMP shootdown of other users of the pmap that we are
1967 * trying to dispose of.  This can be a bit hairy.
1968 */
1969static cpuset_t *lazymask;
1970static u_int lazyptd;
1971static volatile u_int lazywait;
1972
1973void pmap_lazyfix_action(void);
1974
1975void
1976pmap_lazyfix_action(void)
1977{
1978
1979#ifdef COUNT_IPIS
1980	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1981#endif
1982	if (rcr3() == lazyptd)
1983		load_cr3(curpcb->pcb_cr3);
1984	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1985	atomic_store_rel_int(&lazywait, 1);
1986}
1987
1988static void
1989pmap_lazyfix_self(u_int cpuid)
1990{
1991
1992	if (rcr3() == lazyptd)
1993		load_cr3(curpcb->pcb_cr3);
1994	CPU_CLR_ATOMIC(cpuid, lazymask);
1995}
1996
1997
1998static void
1999pmap_lazyfix(pmap_t pmap)
2000{
2001	cpuset_t mymask, mask;
2002	u_int cpuid, spins;
2003	int lsb;
2004
2005	mask = pmap->pm_active;
2006	while (!CPU_EMPTY(&mask)) {
2007		spins = 50000000;
2008
2009		/* Find least significant set bit. */
2010		lsb = cpusetobj_ffs(&mask);
2011		MPASS(lsb != 0);
2012		lsb--;
2013		CPU_SETOF(lsb, &mask);
2014		mtx_lock_spin(&smp_ipi_mtx);
2015#ifdef PAE
2016		lazyptd = vtophys(pmap->pm_pdpt);
2017#else
2018		lazyptd = vtophys(pmap->pm_pdir);
2019#endif
2020		cpuid = PCPU_GET(cpuid);
2021
2022		/* Build a single-CPU cpuset to make the comparison below easy. */
2023		CPU_SETOF(cpuid, &mymask);
2024		if (!CPU_CMP(&mask, &mymask)) {
2025			lazymask = &pmap->pm_active;
2026			pmap_lazyfix_self(cpuid);
2027		} else {
2028			atomic_store_rel_int((u_int *)&lazymask,
2029			    (u_int)&pmap->pm_active);
2030			atomic_store_rel_int(&lazywait, 0);
2031			ipi_selected(mask, IPI_LAZYPMAP);
2032			while (lazywait == 0) {
2033				ia32_pause();
2034				if (--spins == 0)
2035					break;
2036			}
2037		}
2038		mtx_unlock_spin(&smp_ipi_mtx);
2039		if (spins == 0)
2040			printf("pmap_lazyfix: spun for 50000000\n");
2041		mask = pmap->pm_active;
2042	}
2043}
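/*
 * In outline, pmap_lazyfix() works as follows: a CPU may still have this
 * pmap's page directory loaded in %cr3 even though the owning process
 * has exited (the "lazy" switch).  For each such CPU recorded in
 * pm_active, an IPI_LAZYPMAP is sent; the handler above reloads %cr3
 * from the current pcb if it still points at the dying pmap and clears
 * the CPU from the active mask.  Only once pm_active drains can the
 * page directory pages be freed safely in pmap_release().
 */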
2044
2045#else	/* SMP */
2046
2047/*
2048 * Cleaning up on a uniprocessor is easy.  For various reasons, we are
2049 * unlikely to ever execute this code; among them, the cleanup is
2050 * deferred until the parent does a wait(2), which means that another
2051 * userland process has run.
2052 */
2053static void
2054pmap_lazyfix(pmap_t pmap)
2055{
2056	u_int cr3;
2057
2058	cr3 = vtophys(pmap->pm_pdir);
2059	if (cr3 == rcr3()) {
2060		load_cr3(curpcb->pcb_cr3);
2061		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2062	}
2063}
2064#endif	/* SMP */
2065
2066/*
2067 * Release any resources held by the given physical map.
2068 * Called when a pmap initialized by pmap_pinit is being released.
2069 * Should only be called if the map contains no valid mappings.
2070 */
2071void
2072pmap_release(pmap_t pmap)
2073{
2074	vm_page_t m, ptdpg[NPGPTD];
2075	int i;
2076
2077	KASSERT(pmap->pm_stats.resident_count == 0,
2078	    ("pmap_release: pmap resident count %ld != 0",
2079	    pmap->pm_stats.resident_count));
2080	KASSERT(pmap->pm_root == NULL,
2081	    ("pmap_release: pmap has reserved page table page(s)"));
2082
2083	pmap_lazyfix(pmap);
2084	mtx_lock_spin(&allpmaps_lock);
2085	LIST_REMOVE(pmap, pm_list);
2086	mtx_unlock_spin(&allpmaps_lock);
2087
2088	for (i = 0; i < NPGPTD; i++)
2089		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2090		    PG_FRAME);
2091
2092	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2093	    sizeof(*pmap->pm_pdir));
2094
2095	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2096
2097	for (i = 0; i < NPGPTD; i++) {
2098		m = ptdpg[i];
2099#ifdef PAE
2100		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2101		    ("pmap_release: got wrong ptd page"));
2102#endif
2103		m->wire_count--;
2104		atomic_subtract_int(&cnt.v_wire_count, 1);
2105		vm_page_free_zero(m);
2106	}
2107	PMAP_LOCK_DESTROY(pmap);
2108}
2109
2110static int
2111kvm_size(SYSCTL_HANDLER_ARGS)
2112{
2113	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2114
2115	return (sysctl_handle_long(oidp, &ksize, 0, req));
2116}
2117SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2118    0, 0, kvm_size, "IU", "Size of KVM");
2119
2120static int
2121kvm_free(SYSCTL_HANDLER_ARGS)
2122{
2123	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2124
2125	return (sysctl_handle_long(oidp, &kfree, 0, req));
2126}
2127SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2128    0, 0, kvm_free, "IU", "Amount of KVM free");
2129
2130/*
2131 * grow the number of kernel page table entries, if needed
2132 */
2133void
2134pmap_growkernel(vm_offset_t addr)
2135{
2136	vm_paddr_t ptppaddr;
2137	vm_page_t nkpg;
2138	pd_entry_t newpdir;
2139
2140	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2141	addr = roundup2(addr, NBPDR);
2142	if (addr - 1 >= kernel_map->max_offset)
2143		addr = kernel_map->max_offset;
2144	while (kernel_vm_end < addr) {
2145		if (pdir_pde(PTD, kernel_vm_end)) {
2146			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2147			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2148				kernel_vm_end = kernel_map->max_offset;
2149				break;
2150			}
2151			continue;
2152		}
2153
2154		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2155		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2156		    VM_ALLOC_ZERO);
2157		if (nkpg == NULL)
2158			panic("pmap_growkernel: no memory to grow kernel");
2159
2160		nkpt++;
2161
2162		if ((nkpg->flags & PG_ZERO) == 0)
2163			pmap_zero_page(nkpg);
2164		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2165		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2166		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2167
2168		pmap_kenter_pde(kernel_vm_end, newpdir);
2169		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2170		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2171			kernel_vm_end = kernel_map->max_offset;
2172			break;
2173		}
2174	}
2175}
2176
2177
2178/***************************************************
2179 * page management routines.
2180 ***************************************************/
2181
2182CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2183CTASSERT(_NPCM == 11);
2184CTASSERT(_NPCPV == 336);
2185
2186static __inline struct pv_chunk *
2187pv_to_chunk(pv_entry_t pv)
2188{
2189
2190	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2191}
2192
2193#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2194
2195#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2196#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2197
2198static const uint32_t pc_freemask[_NPCM] = {
2199	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2200	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2201	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2202	PC_FREE0_9, PC_FREE10
2203};
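/*
 * The arithmetic behind the free masks, for reference: each pv chunk is
 * one page and holds _NPCPV == 336 pv entries, tracked by _NPCM == 11
 * 32-bit bitmap words.  Since 336 = 10 * 32 + 16, the first ten words
 * use all 32 bits (PC_FREE0_9) while the eleventh uses only its low 16
 * bits (PC_FREE10); a set bit means the corresponding entry is free.
 */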
2204
2205SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2206	"Current number of pv entries");
2207
2208#ifdef PV_STATS
2209static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2210
2211SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2212	"Current number of pv entry chunks");
2213SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2214	"Total number of pv entry chunks allocated");
2215SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2216	"Total number of pv entry chunks freed");
2217SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2218	"Number of times tried to get a chunk page but failed.");
2219
2220static long pv_entry_frees, pv_entry_allocs;
2221static int pv_entry_spare;
2222
2223SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2224	"Total number of pv entry frees");
2225SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2226	"Total number of pv entry allocations");
2227SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2228	"Current number of spare pv entries");
2229#endif
2230
2231/*
2232 * We are in a serious low memory condition.  Resort to
2233 * drastic measures to free some pages so we can allocate
2234 * another pv entry chunk.
2235 */
2236static vm_page_t
2237pmap_pv_reclaim(pmap_t locked_pmap)
2238{
2239	struct pch newtail;
2240	struct pv_chunk *pc;
2241	struct md_page *pvh;
2242	pd_entry_t *pde;
2243	pmap_t pmap;
2244	pt_entry_t *pte, tpte;
2245	pv_entry_t pv;
2246	vm_offset_t va;
2247	vm_page_t free, m, m_pc;
2248	uint32_t inuse;
2249	int bit, field, freed;
2250
2251	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2252	pmap = NULL;
2253	free = m_pc = NULL;
2254	TAILQ_INIT(&newtail);
2255	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2256	    free == NULL)) {
2257		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2258		if (pmap != pc->pc_pmap) {
2259			if (pmap != NULL) {
2260				pmap_invalidate_all(pmap);
2261				if (pmap != locked_pmap)
2262					PMAP_UNLOCK(pmap);
2263			}
2264			pmap = pc->pc_pmap;
2265			/* Avoid deadlock and lock recursion. */
2266			if (pmap > locked_pmap)
2267				PMAP_LOCK(pmap);
2268			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2269				pmap = NULL;
2270				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2271				continue;
2272			}
2273		}
2274
2275		/*
2276		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2277		 */
2278		freed = 0;
2279		for (field = 0; field < _NPCM; field++) {
2280			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2281			    inuse != 0; inuse &= ~(1UL << bit)) {
2282				bit = bsfl(inuse);
2283				pv = &pc->pc_pventry[field * 32 + bit];
2284				va = pv->pv_va;
2285				pde = pmap_pde(pmap, va);
2286				if ((*pde & PG_PS) != 0)
2287					continue;
2288				pte = pmap_pte(pmap, va);
2289				tpte = *pte;
2290				if ((tpte & PG_W) == 0)
2291					tpte = pte_load_clear(pte);
2292				pmap_pte_release(pte);
2293				if ((tpte & PG_W) != 0)
2294					continue;
2295				KASSERT(tpte != 0,
2296				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2297				    pmap, va));
2298				if ((tpte & PG_G) != 0)
2299					pmap_invalidate_page(pmap, va);
2300				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2301				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2302					vm_page_dirty(m);
2303				if ((tpte & PG_A) != 0)
2304					vm_page_aflag_set(m, PGA_REFERENCED);
2305				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2306				if (TAILQ_EMPTY(&m->md.pv_list) &&
2307				    (m->flags & PG_FICTITIOUS) == 0) {
2308					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2309					if (TAILQ_EMPTY(&pvh->pv_list)) {
2310						vm_page_aflag_clear(m,
2311						    PGA_WRITEABLE);
2312					}
2313				}
2314				pc->pc_map[field] |= 1UL << bit;
2315				pmap_unuse_pt(pmap, va, &free);
2316				freed++;
2317			}
2318		}
2319		if (freed == 0) {
2320			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2321			continue;
2322		}
2323		/* Every freed mapping is for a 4 KB page. */
2324		pmap->pm_stats.resident_count -= freed;
2325		PV_STAT(pv_entry_frees += freed);
2326		PV_STAT(pv_entry_spare += freed);
2327		pv_entry_count -= freed;
2328		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2329		for (field = 0; field < _NPCM; field++)
2330			if (pc->pc_map[field] != pc_freemask[field]) {
2331				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2332				    pc_list);
2333				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2334
2335				/*
2336				 * One freed pv entry in locked_pmap is
2337				 * sufficient.
2338				 */
2339				if (pmap == locked_pmap)
2340					goto out;
2341				break;
2342			}
2343		if (field == _NPCM) {
2344			PV_STAT(pv_entry_spare -= _NPCPV);
2345			PV_STAT(pc_chunk_count--);
2346			PV_STAT(pc_chunk_frees++);
2347			/* Entire chunk is free; return it. */
2348			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2349			pmap_qremove((vm_offset_t)pc, 1);
2350			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2351			break;
2352		}
2353	}
2354out:
2355	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2356	if (pmap != NULL) {
2357		pmap_invalidate_all(pmap);
2358		if (pmap != locked_pmap)
2359			PMAP_UNLOCK(pmap);
2360	}
2361	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
2362		m_pc = free;
2363		free = m_pc->right;
2364		/* Recycle a freed page table page. */
2365		m_pc->wire_count = 1;
2366		atomic_add_int(&cnt.v_wire_count, 1);
2367	}
2368	pmap_free_zero_pages(free);
2369	return (m_pc);
2370}
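/*
 * In short, pmap_pv_reclaim() walks the global pv chunk list in LRU
 * order, tearing down unwired 4KB mappings so that their pv entries
 * (and, ideally, an entire chunk page or a freed page table page) can
 * be recycled for the caller.  The caller still holds locked_pmap
 * locked on return.
 */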
2371
2372/*
2373 * free the pv_entry back to the free list
2374 */
2375static void
2376free_pv_entry(pmap_t pmap, pv_entry_t pv)
2377{
2378	struct pv_chunk *pc;
2379	int idx, field, bit;
2380
2381	rw_assert(&pvh_global_lock, RA_WLOCKED);
2382	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2383	PV_STAT(pv_entry_frees++);
2384	PV_STAT(pv_entry_spare++);
2385	pv_entry_count--;
2386	pc = pv_to_chunk(pv);
2387	idx = pv - &pc->pc_pventry[0];
2388	field = idx / 32;
2389	bit = idx % 32;
2390	pc->pc_map[field] |= 1ul << bit;
2391	for (idx = 0; idx < _NPCM; idx++)
2392		if (pc->pc_map[idx] != pc_freemask[idx]) {
2393			/*
2394			 * 98% of the time, pc is already at the head of the
2395			 * list.  If it isn't already, move it to the head.
2396			 */
2397			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2398			    pc)) {
2399				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2400				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2401				    pc_list);
2402			}
2403			return;
2404		}
2405	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2406	free_pv_chunk(pc);
2407}
2408
2409static void
2410free_pv_chunk(struct pv_chunk *pc)
2411{
2412	vm_page_t m;
2413
2414	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2415	PV_STAT(pv_entry_spare -= _NPCPV);
2416	PV_STAT(pc_chunk_count--);
2417	PV_STAT(pc_chunk_frees++);
2418	/* entire chunk is free, return it */
2419	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2420	pmap_qremove((vm_offset_t)pc, 1);
2421	vm_page_unwire(m, 0);
2422	vm_page_free(m);
2423	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2424}
2425
2426/*
2427 * get a new pv_entry, allocating a block from the system
2428 * when needed.
2429 */
2430static pv_entry_t
2431get_pv_entry(pmap_t pmap, boolean_t try)
2432{
2433	static const struct timeval printinterval = { 60, 0 };
2434	static struct timeval lastprint;
2435	int bit, field;
2436	pv_entry_t pv;
2437	struct pv_chunk *pc;
2438	vm_page_t m;
2439
2440	rw_assert(&pvh_global_lock, RA_WLOCKED);
2441	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2442	PV_STAT(pv_entry_allocs++);
2443	pv_entry_count++;
2444	if (pv_entry_count > pv_entry_high_water)
2445		if (ratecheck(&lastprint, &printinterval))
2446			printf("Approaching the limit on PV entries, consider "
2447			    "increasing either the vm.pmap.shpgperproc or the "
2448			    "vm.pmap.pv_entry_max tunable.\n");
2449retry:
2450	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2451	if (pc != NULL) {
2452		for (field = 0; field < _NPCM; field++) {
2453			if (pc->pc_map[field]) {
2454				bit = bsfl(pc->pc_map[field]);
2455				break;
2456			}
2457		}
2458		if (field < _NPCM) {
2459			pv = &pc->pc_pventry[field * 32 + bit];
2460			pc->pc_map[field] &= ~(1ul << bit);
2461			/* If that was the chunk's last free entry, move it to the tail. */
2462			for (field = 0; field < _NPCM; field++)
2463				if (pc->pc_map[field] != 0) {
2464					PV_STAT(pv_entry_spare--);
2465					return (pv);	/* not full, return */
2466				}
2467			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2468			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2469			PV_STAT(pv_entry_spare--);
2470			return (pv);
2471		}
2472	}
2473	/*
2474	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2475	 * global lock.  If "pv_vafree" is currently non-empty, it will
2476	 * remain non-empty until pmap_ptelist_alloc() completes.
2477	 */
2478	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2479	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2480		if (try) {
2481			pv_entry_count--;
2482			PV_STAT(pc_chunk_tryfail++);
2483			return (NULL);
2484		}
2485		m = pmap_pv_reclaim(pmap);
2486		if (m == NULL)
2487			goto retry;
2488	}
2489	PV_STAT(pc_chunk_count++);
2490	PV_STAT(pc_chunk_allocs++);
2491	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2492	pmap_qenter((vm_offset_t)pc, &m, 1);
2493	pc->pc_pmap = pmap;
2494	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2495	for (field = 1; field < _NPCM; field++)
2496		pc->pc_map[field] = pc_freemask[field];
2497	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2498	pv = &pc->pc_pventry[0];
2499	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2500	PV_STAT(pv_entry_spare += _NPCPV - 1);
2501	return (pv);
2502}
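/*
 * Allocation within a chunk, sketched: bsfl() finds the lowest set bit
 * in the first non-zero map word, i.e. the lowest-numbered free entry;
 * clearing that bit claims the entry.  A chunk whose map words are all
 * zero is completely in use and is moved to the tail of pm_pvchunk so
 * that chunks with free entries are found first on the next call.
 */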
2503
2504static __inline pv_entry_t
2505pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2506{
2507	pv_entry_t pv;
2508
2509	rw_assert(&pvh_global_lock, RA_WLOCKED);
2510	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2511		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2512			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2513			break;
2514		}
2515	}
2516	return (pv);
2517}
2518
2519static void
2520pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2521{
2522	struct md_page *pvh;
2523	pv_entry_t pv;
2524	vm_offset_t va_last;
2525	vm_page_t m;
2526
2527	rw_assert(&pvh_global_lock, RA_WLOCKED);
2528	KASSERT((pa & PDRMASK) == 0,
2529	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2530
2531	/*
2532	 * Transfer the 4mpage's pv entry for this mapping to the first
2533	 * page's pv list.
2534	 */
2535	pvh = pa_to_pvh(pa);
2536	va = trunc_4mpage(va);
2537	pv = pmap_pvh_remove(pvh, pmap, va);
2538	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2539	m = PHYS_TO_VM_PAGE(pa);
2540	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2541	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2542	va_last = va + NBPDR - PAGE_SIZE;
2543	do {
2544		m++;
2545		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2546		    ("pmap_pv_demote_pde: page %p is not managed", m));
2547		va += PAGE_SIZE;
2548		pmap_insert_entry(pmap, va, m);
2549	} while (va < va_last);
2550}
2551
2552static void
2553pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2554{
2555	struct md_page *pvh;
2556	pv_entry_t pv;
2557	vm_offset_t va_last;
2558	vm_page_t m;
2559
2560	rw_assert(&pvh_global_lock, RA_WLOCKED);
2561	KASSERT((pa & PDRMASK) == 0,
2562	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2563
2564	/*
2565	 * Transfer the first page's pv entry for this mapping to the
2566	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2567	 * to get_pv_entry(), a transfer avoids the possibility that
2568	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2569	 * removes one of the mappings that is being promoted.
2570	 */
2571	m = PHYS_TO_VM_PAGE(pa);
2572	va = trunc_4mpage(va);
2573	pv = pmap_pvh_remove(&m->md, pmap, va);
2574	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2575	pvh = pa_to_pvh(pa);
2576	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2577	/* Free the remaining NPTEPG - 1 pv entries. */
2578	va_last = va + NBPDR - PAGE_SIZE;
2579	do {
2580		m++;
2581		va += PAGE_SIZE;
2582		pmap_pvh_free(&m->md, pmap, va);
2583	} while (va < va_last);
2584}
2585
2586static void
2587pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2588{
2589	pv_entry_t pv;
2590
2591	pv = pmap_pvh_remove(pvh, pmap, va);
2592	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2593	free_pv_entry(pmap, pv);
2594}
2595
2596static void
2597pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2598{
2599	struct md_page *pvh;
2600
2601	rw_assert(&pvh_global_lock, RA_WLOCKED);
2602	pmap_pvh_free(&m->md, pmap, va);
2603	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2604		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2605		if (TAILQ_EMPTY(&pvh->pv_list))
2606			vm_page_aflag_clear(m, PGA_WRITEABLE);
2607	}
2608}
2609
2610/*
2611 * Create a pv entry for the given managed page, mapped at
2612 * (pmap, va).
2613 */
2614static void
2615pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2616{
2617	pv_entry_t pv;
2618
2619	rw_assert(&pvh_global_lock, RA_WLOCKED);
2620	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2621	pv = get_pv_entry(pmap, FALSE);
2622	pv->pv_va = va;
2623	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2624}
2625
2626/*
2627 * Conditionally create a pv entry.
2628 */
2629static boolean_t
2630pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2631{
2632	pv_entry_t pv;
2633
2634	rw_assert(&pvh_global_lock, RA_WLOCKED);
2635	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2636	if (pv_entry_count < pv_entry_high_water &&
2637	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2638		pv->pv_va = va;
2639		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2640		return (TRUE);
2641	} else
2642		return (FALSE);
2643}
2644
2645/*
2646 * Create the pv entries for each of the pages within a superpage.
2647 */
2648static boolean_t
2649pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2650{
2651	struct md_page *pvh;
2652	pv_entry_t pv;
2653
2654	rw_assert(&pvh_global_lock, RA_WLOCKED);
2655	if (pv_entry_count < pv_entry_high_water &&
2656	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2657		pv->pv_va = va;
2658		pvh = pa_to_pvh(pa);
2659		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2660		return (TRUE);
2661	} else
2662		return (FALSE);
2663}
2664
2665/*
2666 * Fills a page table page with mappings to consecutive physical pages.
2667 */
2668static void
2669pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2670{
2671	pt_entry_t *pte;
2672
2673	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2674		*pte = newpte;
2675		newpte += PAGE_SIZE;
2676	}
2677}
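/*
 * pmap_fill_ptp() is used by pmap_demote_pde() to populate a freshly
 * allocated (or reused) page table page: the first PTE carries the
 * superpage's attributes and base frame, and each subsequent PTE simply
 * advances the physical address by PAGE_SIZE, so the NPTEPG 4KB
 * mappings together cover exactly the memory the superpage mapped.
 */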
2678
2679/*
2680 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2681 * 2- or 4MB page mapping is invalidated.
2682 */
2683static boolean_t
2684pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2685{
2686	pd_entry_t newpde, oldpde;
2687	pt_entry_t *firstpte, newpte;
2688	vm_paddr_t mptepa;
2689	vm_page_t free, mpte;
2690
2691	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2692	oldpde = *pde;
2693	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2694	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2695	mpte = pmap_lookup_pt_page(pmap, va);
2696	if (mpte != NULL)
2697		pmap_remove_pt_page(pmap, mpte);
2698	else {
2699		KASSERT((oldpde & PG_W) == 0,
2700		    ("pmap_demote_pde: page table page for a wired mapping"
2701		    " is missing"));
2702
2703		/*
2704		 * Invalidate the 2- or 4MB page mapping and return
2705		 * "failure" if the mapping was never accessed or the
2706		 * allocation of the new page table page fails.
2707		 */
2708		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2709		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2710		    VM_ALLOC_WIRED)) == NULL) {
2711			free = NULL;
2712			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2713			pmap_invalidate_page(pmap, trunc_4mpage(va));
2714			pmap_free_zero_pages(free);
2715			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2716			    " in pmap %p", va, pmap);
2717			return (FALSE);
2718		}
2719		if (va < VM_MAXUSER_ADDRESS)
2720			pmap->pm_stats.resident_count++;
2721	}
2722	mptepa = VM_PAGE_TO_PHYS(mpte);
2723
2724	/*
2725	 * If the page mapping is in the kernel's address space, then the
2726	 * KPTmap can provide access to the page table page.  Otherwise,
2727	 * temporarily map the page table page (mpte) into the kernel's
2728	 * address space at either PADDR1 or PADDR2.
2729	 */
2730	if (va >= KERNBASE)
2731		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2732	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2733		if ((*PMAP1 & PG_FRAME) != mptepa) {
2734			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2735#ifdef SMP
2736			PMAP1cpu = PCPU_GET(cpuid);
2737#endif
2738			invlcaddr(PADDR1);
2739			PMAP1changed++;
2740		} else
2741#ifdef SMP
2742		if (PMAP1cpu != PCPU_GET(cpuid)) {
2743			PMAP1cpu = PCPU_GET(cpuid);
2744			invlcaddr(PADDR1);
2745			PMAP1changedcpu++;
2746		} else
2747#endif
2748			PMAP1unchanged++;
2749		firstpte = PADDR1;
2750	} else {
2751		mtx_lock(&PMAP2mutex);
2752		if ((*PMAP2 & PG_FRAME) != mptepa) {
2753			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2754			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2755		}
2756		firstpte = PADDR2;
2757	}
2758	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2759	KASSERT((oldpde & PG_A) != 0,
2760	    ("pmap_demote_pde: oldpde is missing PG_A"));
2761	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2762	    ("pmap_demote_pde: oldpde is missing PG_M"));
2763	newpte = oldpde & ~PG_PS;
2764	if ((newpte & PG_PDE_PAT) != 0)
2765		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
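	/*
	 * (The XOR above works because the PAT selector lives in a
	 * different bit position in each format: PG_PDE_PAT in a 2- or
	 * 4MB PDE versus PG_PTE_PAT in a 4KB PTE.  When the PDE's PAT
	 * bit is set, the XOR clears it and sets the PTE's PAT bit
	 * instead, preserving the cache mode across the demotion; PG_PS
	 * was cleared just above, so the PTE bit starts out clear.)
	 */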
2766
2767	/*
2768	 * If the page table page is new, initialize it.
2769	 */
2770	if (mpte->wire_count == 1) {
2771		mpte->wire_count = NPTEPG;
2772		pmap_fill_ptp(firstpte, newpte);
2773	}
2774	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2775	    ("pmap_demote_pde: firstpte and newpte map different physical"
2776	    " addresses"));
2777
2778	/*
2779	 * If the mapping has changed attributes, update the page table
2780	 * entries.
2781	 */
2782	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2783		pmap_fill_ptp(firstpte, newpte);
2784
2785	/*
2786	 * Demote the mapping.  This pmap is locked.  The old PDE has
2787	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2788	 * set.  Thus, there is no danger of a race with another
2789	 * processor changing the setting of PG_A and/or PG_M between
2790	 * the read above and the store below.
2791	 */
2792	if (workaround_erratum383)
2793		pmap_update_pde(pmap, va, pde, newpde);
2794	else if (pmap == kernel_pmap)
2795		pmap_kenter_pde(va, newpde);
2796	else
2797		pde_store(pde, newpde);
2798	if (firstpte == PADDR2)
2799		mtx_unlock(&PMAP2mutex);
2800
2801	/*
2802	 * Invalidate the recursive mapping of the page table page.
2803	 */
2804	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2805
2806	/*
2807	 * Demote the pv entry.  This depends on the earlier demotion
2808	 * of the mapping.  Specifically, the (re)creation of a per-
2809	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2810	 * which might reclaim a newly (re)created per-page pv entry
2811	 * and destroy the associated mapping.  In order to destroy
2812	 * the mapping, the PDE must have already changed from mapping
2813	 * the 2mpage to referencing the page table page.
2814	 */
2815	if ((oldpde & PG_MANAGED) != 0)
2816		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2817
2818	pmap_pde_demotions++;
2819	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2820	    " in pmap %p", va, pmap);
2821	return (TRUE);
2822}
2823
2824/*
2825 * pmap_remove_pde: unmap a 2- or 4MB superpage from a process
2826 */
2827static void
2828pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2829    vm_page_t *free)
2830{
2831	struct md_page *pvh;
2832	pd_entry_t oldpde;
2833	vm_offset_t eva, va;
2834	vm_page_t m, mpte;
2835
2836	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2837	KASSERT((sva & PDRMASK) == 0,
2838	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2839	oldpde = pte_load_clear(pdq);
2840	if (oldpde & PG_W)
2841		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2842
2843	/*
2844	 * Machines that don't support invlpg also don't support
2845	 * PG_G.
2846	 */
2847	if (oldpde & PG_G)
2848		pmap_invalidate_page(kernel_pmap, sva);
2849	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2850	if (oldpde & PG_MANAGED) {
2851		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2852		pmap_pvh_free(pvh, pmap, sva);
2853		eva = sva + NBPDR;
2854		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2855		    va < eva; va += PAGE_SIZE, m++) {
2856			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2857				vm_page_dirty(m);
2858			if (oldpde & PG_A)
2859				vm_page_aflag_set(m, PGA_REFERENCED);
2860			if (TAILQ_EMPTY(&m->md.pv_list) &&
2861			    TAILQ_EMPTY(&pvh->pv_list))
2862				vm_page_aflag_clear(m, PGA_WRITEABLE);
2863		}
2864	}
2865	if (pmap == kernel_pmap) {
2866		if (!pmap_demote_pde(pmap, pdq, sva))
2867			panic("pmap_remove_pde: failed demotion");
2868	} else {
2869		mpte = pmap_lookup_pt_page(pmap, sva);
2870		if (mpte != NULL) {
2871			pmap_remove_pt_page(pmap, mpte);
2872			pmap->pm_stats.resident_count--;
2873			KASSERT(mpte->wire_count == NPTEPG,
2874			    ("pmap_remove_pde: pte page wire count error"));
2875			mpte->wire_count = 0;
2876			pmap_add_delayed_free_list(mpte, free, FALSE);
2877			atomic_subtract_int(&cnt.v_wire_count, 1);
2878		}
2879	}
2880}
2881
2882/*
2883 * pmap_remove_pte: unmap a single 4KB page from a process
2884 */
2885static int
2886pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2887{
2888	pt_entry_t oldpte;
2889	vm_page_t m;
2890
2891	rw_assert(&pvh_global_lock, RA_WLOCKED);
2892	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2893	oldpte = pte_load_clear(ptq);
2894	KASSERT(oldpte != 0,
2895	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2896	if (oldpte & PG_W)
2897		pmap->pm_stats.wired_count -= 1;
2898	/*
2899	 * Machines that don't support invlpg also don't support
2900	 * PG_G.
2901	 */
2902	if (oldpte & PG_G)
2903		pmap_invalidate_page(kernel_pmap, va);
2904	pmap->pm_stats.resident_count -= 1;
2905	if (oldpte & PG_MANAGED) {
2906		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2907		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2908			vm_page_dirty(m);
2909		if (oldpte & PG_A)
2910			vm_page_aflag_set(m, PGA_REFERENCED);
2911		pmap_remove_entry(pmap, m, va);
2912	}
2913	return (pmap_unuse_pt(pmap, va, free));
2914}
2915
2916/*
2917 * Remove a single page from a process's address space.
2918 */
2919static void
2920pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2921{
2922	pt_entry_t *pte;
2923
2924	rw_assert(&pvh_global_lock, RA_WLOCKED);
2925	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2926	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2927	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2928		return;
2929	pmap_remove_pte(pmap, pte, va, free);
2930	pmap_invalidate_page(pmap, va);
2931}
2932
2933/*
2934 *	Remove the given range of addresses from the specified map.
2935 *
2936 *	It is assumed that the start and end are properly
2937 *	rounded to the page size.
2938 */
2939void
2940pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2941{
2942	vm_offset_t pdnxt;
2943	pd_entry_t ptpaddr;
2944	pt_entry_t *pte;
2945	vm_page_t free = NULL;
2946	int anyvalid;
2947
2948	/*
2949	 * Perform an unsynchronized read.  This is, however, safe.
2950	 */
2951	if (pmap->pm_stats.resident_count == 0)
2952		return;
2953
2954	anyvalid = 0;
2955
2956	rw_wlock(&pvh_global_lock);
2957	sched_pin();
2958	PMAP_LOCK(pmap);
2959
2960	/*
2961	 * Special handling for removing a single page: this is a very
2962	 * common operation, so it is worth short-circuiting the general
2963	 * code below.
2964	 */
2965	if ((sva + PAGE_SIZE == eva) &&
2966	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2967		pmap_remove_page(pmap, sva, &free);
2968		goto out;
2969	}
2970
2971	for (; sva < eva; sva = pdnxt) {
2972		u_int pdirindex;
2973
2974		/*
2975		 * Calculate index for next page table.
2976		 */
2977		pdnxt = (sva + NBPDR) & ~PDRMASK;
2978		if (pdnxt < sva)
2979			pdnxt = eva;
2980		if (pmap->pm_stats.resident_count == 0)
2981			break;
2982
2983		pdirindex = sva >> PDRSHIFT;
2984		ptpaddr = pmap->pm_pdir[pdirindex];
2985
2986		/*
2987		 * Weed out invalid mappings. Note: we assume that the page
2988		 * directory table is always allocated, and in kernel virtual.
2989		 */
2990		if (ptpaddr == 0)
2991			continue;
2992
2993		/*
2994		 * Check for large page.
2995		 */
2996		if ((ptpaddr & PG_PS) != 0) {
2997			/*
2998			 * Are we removing the entire large page?  If not,
2999			 * demote the mapping and fall through.
3000			 */
3001			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3002				/*
3003				 * The TLB entry for a PG_G mapping is
3004				 * invalidated by pmap_remove_pde().
3005				 */
3006				if ((ptpaddr & PG_G) == 0)
3007					anyvalid = 1;
3008				pmap_remove_pde(pmap,
3009				    &pmap->pm_pdir[pdirindex], sva, &free);
3010				continue;
3011			} else if (!pmap_demote_pde(pmap,
3012			    &pmap->pm_pdir[pdirindex], sva)) {
3013				/* The large page mapping was destroyed. */
3014				continue;
3015			}
3016		}
3017
3018		/*
3019		 * Limit our scan to either the end of the va represented
3020		 * by the current page table page, or to the end of the
3021		 * range being removed.
3022		 */
3023		if (pdnxt > eva)
3024			pdnxt = eva;
3025
3026		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3027		    sva += PAGE_SIZE) {
3028			if (*pte == 0)
3029				continue;
3030
3031			/*
3032			 * The TLB entry for a PG_G mapping is invalidated
3033			 * by pmap_remove_pte().
3034			 */
3035			if ((*pte & PG_G) == 0)
3036				anyvalid = 1;
3037			if (pmap_remove_pte(pmap, pte, sva, &free))
3038				break;
3039		}
3040	}
3041out:
3042	sched_unpin();
3043	if (anyvalid)
3044		pmap_invalidate_all(pmap);
3045	rw_wunlock(&pvh_global_lock);
3046	PMAP_UNLOCK(pmap);
3047	pmap_free_zero_pages(free);
3048}
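/*
 * A note on the "anyvalid" bookkeeping above: pmap_invalidate_all()
 * typically reloads %cr3, which does not flush global (PG_G) TLB
 * entries, so mappings with PG_G set are invalidated page by page as
 * they are removed and do not cause "anyvalid" to be set.  Only the
 * remaining, non-global mappings are covered by the single
 * pmap_invalidate_all() at the end.
 */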
3049
3050/*
3051 *	Routine:	pmap_remove_all
3052 *	Function:
3053 *		Removes this physical page from
3054 *		all physical maps in which it resides.
3055 *		Reflects back modify bits to the pager.
3056 *
3057 *	Notes:
3058 *		Original versions of this routine were very
3059 *		inefficient because they iteratively called
3060 *		pmap_remove (slow...)
3061 */
3062
3063void
3064pmap_remove_all(vm_page_t m)
3065{
3066	struct md_page *pvh;
3067	pv_entry_t pv;
3068	pmap_t pmap;
3069	pt_entry_t *pte, tpte;
3070	pd_entry_t *pde;
3071	vm_offset_t va;
3072	vm_page_t free;
3073
3074	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3075	    ("pmap_remove_all: page %p is not managed", m));
3076	free = NULL;
3077	rw_wlock(&pvh_global_lock);
3078	sched_pin();
3079	if ((m->flags & PG_FICTITIOUS) != 0)
3080		goto small_mappings;
3081	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3082	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3083		va = pv->pv_va;
3084		pmap = PV_PMAP(pv);
3085		PMAP_LOCK(pmap);
3086		pde = pmap_pde(pmap, va);
3087		(void)pmap_demote_pde(pmap, pde, va);
3088		PMAP_UNLOCK(pmap);
3089	}
3090small_mappings:
3091	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3092		pmap = PV_PMAP(pv);
3093		PMAP_LOCK(pmap);
3094		pmap->pm_stats.resident_count--;
3095		pde = pmap_pde(pmap, pv->pv_va);
3096		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3097		    " a 4mpage in page %p's pv list", m));
3098		pte = pmap_pte_quick(pmap, pv->pv_va);
3099		tpte = pte_load_clear(pte);
3100		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3101		    pmap, pv->pv_va));
3102		if (tpte & PG_W)
3103			pmap->pm_stats.wired_count--;
3104		if (tpte & PG_A)
3105			vm_page_aflag_set(m, PGA_REFERENCED);
3106
3107		/*
3108		 * Update the vm_page_t clean and reference bits.
3109		 */
3110		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3111			vm_page_dirty(m);
3112		pmap_unuse_pt(pmap, pv->pv_va, &free);
3113		pmap_invalidate_page(pmap, pv->pv_va);
3114		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3115		free_pv_entry(pmap, pv);
3116		PMAP_UNLOCK(pmap);
3117	}
3118	vm_page_aflag_clear(m, PGA_WRITEABLE);
3119	sched_unpin();
3120	rw_wunlock(&pvh_global_lock);
3121	pmap_free_zero_pages(free);
3122}
3123
3124/*
3125 * pmap_protect_pde: apply the requested protection to a 4mpage in a process
3126 */
3127static boolean_t
3128pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3129{
3130	pd_entry_t newpde, oldpde;
3131	vm_offset_t eva, va;
3132	vm_page_t m;
3133	boolean_t anychanged;
3134
3135	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3136	KASSERT((sva & PDRMASK) == 0,
3137	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3138	anychanged = FALSE;
3139retry:
3140	oldpde = newpde = *pde;
3141	if (oldpde & PG_MANAGED) {
3142		eva = sva + NBPDR;
3143		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3144		    va < eva; va += PAGE_SIZE, m++)
3145			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3146				vm_page_dirty(m);
3147	}
3148	if ((prot & VM_PROT_WRITE) == 0)
3149		newpde &= ~(PG_RW | PG_M);
3150#ifdef PAE
3151	if ((prot & VM_PROT_EXECUTE) == 0)
3152		newpde |= pg_nx;
3153#endif
3154	if (newpde != oldpde) {
3155		if (!pde_cmpset(pde, oldpde, newpde))
3156			goto retry;
3157		if (oldpde & PG_G)
3158			pmap_invalidate_page(pmap, sva);
3159		else
3160			anychanged = TRUE;
3161	}
3162	return (anychanged);
3163}
3164
3165/*
3166 *	Set the physical protection on the
3167 *	specified range of this map as requested.
3168 */
3169void
3170pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3171{
3172	vm_offset_t pdnxt;
3173	pd_entry_t ptpaddr;
3174	pt_entry_t *pte;
3175	boolean_t anychanged, pv_lists_locked;
3176
3177	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3178		pmap_remove(pmap, sva, eva);
3179		return;
3180	}
3181
3182#ifdef PAE
3183	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3184	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3185		return;
3186#else
3187	if (prot & VM_PROT_WRITE)
3188		return;
3189#endif
3190
3191	if (pmap_is_current(pmap))
3192		pv_lists_locked = FALSE;
3193	else {
3194		pv_lists_locked = TRUE;
3195resume:
3196		rw_wlock(&pvh_global_lock);
3197		sched_pin();
3198	}
3199	anychanged = FALSE;
3200
3201	PMAP_LOCK(pmap);
3202	for (; sva < eva; sva = pdnxt) {
3203		pt_entry_t obits, pbits;
3204		u_int pdirindex;
3205
3206		pdnxt = (sva + NBPDR) & ~PDRMASK;
3207		if (pdnxt < sva)
3208			pdnxt = eva;
3209
3210		pdirindex = sva >> PDRSHIFT;
3211		ptpaddr = pmap->pm_pdir[pdirindex];
3212
3213		/*
3214		 * Weed out invalid mappings. Note: we assume that the page
3215		 * directory table is always allocated, and in kernel virtual.
3216		 */
3217		if (ptpaddr == 0)
3218			continue;
3219
3220		/*
3221		 * Check for large page.
3222		 */
3223		if ((ptpaddr & PG_PS) != 0) {
3224			/*
3225			 * Are we protecting the entire large page?  If not,
3226			 * demote the mapping and fall through.
3227			 */
3228			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3229				/*
3230				 * The TLB entry for a PG_G mapping is
3231				 * invalidated by pmap_protect_pde().
3232				 */
3233				if (pmap_protect_pde(pmap,
3234				    &pmap->pm_pdir[pdirindex], sva, prot))
3235					anychanged = TRUE;
3236				continue;
3237			} else {
3238				if (!pv_lists_locked) {
3239					pv_lists_locked = TRUE;
3240					if (!rw_try_wlock(&pvh_global_lock)) {
3241						if (anychanged)
3242							pmap_invalidate_all(
3243							    pmap);
3244						PMAP_UNLOCK(pmap);
3245						goto resume;
3246					}
3247					sched_pin();
3248				}
3249				if (!pmap_demote_pde(pmap,
3250				    &pmap->pm_pdir[pdirindex], sva)) {
3251					/*
3252					 * The large page mapping was
3253					 * destroyed.
3254					 */
3255					continue;
3256				}
3257			}
3258		}
3259
3260		if (pdnxt > eva)
3261			pdnxt = eva;
3262
3263		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3264		    sva += PAGE_SIZE) {
3265			vm_page_t m;
3266
3267retry:
3268			/*
3269			 * Regardless of whether a pte is 32 or 64 bits in
3270			 * size, PG_RW, PG_A, and PG_M are among the least
3271			 * significant 32 bits.
3272			 */
3273			obits = pbits = *pte;
3274			if ((pbits & PG_V) == 0)
3275				continue;
3276
3277			if ((prot & VM_PROT_WRITE) == 0) {
3278				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3279				    (PG_MANAGED | PG_M | PG_RW)) {
3280					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3281					vm_page_dirty(m);
3282				}
3283				pbits &= ~(PG_RW | PG_M);
3284			}
3285#ifdef PAE
3286			if ((prot & VM_PROT_EXECUTE) == 0)
3287				pbits |= pg_nx;
3288#endif
3289
3290			if (pbits != obits) {
3291#ifdef PAE
3292				if (!atomic_cmpset_64(pte, obits, pbits))
3293					goto retry;
3294#else
3295				if (!atomic_cmpset_int((u_int *)pte, obits,
3296				    pbits))
3297					goto retry;
3298#endif
3299				if (obits & PG_G)
3300					pmap_invalidate_page(pmap, sva);
3301				else
3302					anychanged = TRUE;
3303			}
3304		}
3305	}
3306	if (anychanged)
3307		pmap_invalidate_all(pmap);
3308	if (pv_lists_locked) {
3309		sched_unpin();
3310		rw_wunlock(&pvh_global_lock);
3311	}
3312	PMAP_UNLOCK(pmap);
3313}
3314
3315/*
3316 * Tries to promote the 512 or 1,024 contiguous 4KB page mappings that are
3317 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3318 * For promotion to occur, two conditions must be met: (1) the 4KB page
3319 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3320 * mappings must have identical characteristics.
3321 *
3322 * Managed (PG_MANAGED) mappings within the kernel address space are not
3323 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3324 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3325 * pmap.
3326 */
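/*
 * (The two figures above correspond to the two page table formats: with
 * PAE enabled a page table page holds 512 64-bit PTEs and a superpage is
 * 2MB, while without PAE it holds 1,024 32-bit PTEs and a superpage is
 * 4MB; NPTEPG reflects whichever format is compiled in.)
 */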
3327static void
3328pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3329{
3330	pd_entry_t newpde;
3331	pt_entry_t *firstpte, oldpte, pa, *pte;
3332	vm_offset_t oldpteva;
3333	vm_page_t mpte;
3334
3335	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3336
3337	/*
3338	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3339	 * either invalid, unused, or does not map the first 4KB physical page
3340	 * within a 2- or 4MB page.
3341	 */
3342	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3343setpde:
3344	newpde = *firstpte;
3345	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3346		pmap_pde_p_failures++;
3347		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3348		    " in pmap %p", va, pmap);
3349		return;
3350	}
3351	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3352		pmap_pde_p_failures++;
3353		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3354		    " in pmap %p", va, pmap);
3355		return;
3356	}
3357	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3358		/*
3359		 * When PG_M is already clear, PG_RW can be cleared without
3360		 * a TLB invalidation.
3361		 */
3362		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3363		    ~PG_RW))
3364			goto setpde;
3365		newpde &= ~PG_RW;
3366	}
3367
3368	/*
3369	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3370	 * PTE maps an unexpected 4KB physical page or does not have identical
3371	 * characteristics to the first PTE.
3372	 */
3373	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3374	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3375setpte:
3376		oldpte = *pte;
3377		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3378			pmap_pde_p_failures++;
3379			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3380			    " in pmap %p", va, pmap);
3381			return;
3382		}
3383		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3384			/*
3385			 * When PG_M is already clear, PG_RW can be cleared
3386			 * without a TLB invalidation.
3387			 */
3388			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3389			    oldpte & ~PG_RW))
3390				goto setpte;
3391			oldpte &= ~PG_RW;
3392			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3393			    (va & ~PDRMASK);
3394			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3395			    " in pmap %p", oldpteva, pmap);
3396		}
3397		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3398			pmap_pde_p_failures++;
3399			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3400			    " in pmap %p", va, pmap);
3401			return;
3402		}
3403		pa -= PAGE_SIZE;
3404	}
3405
3406	/*
3407	 * Save the page table page in its current state until the PDE
3408	 * mapping the superpage is demoted by pmap_demote_pde() or
3409	 * destroyed by pmap_remove_pde().
3410	 */
3411	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3412	KASSERT(mpte >= vm_page_array &&
3413	    mpte < &vm_page_array[vm_page_array_size],
3414	    ("pmap_promote_pde: page table page is out of range"));
3415	KASSERT(mpte->pindex == va >> PDRSHIFT,
3416	    ("pmap_promote_pde: page table page's pindex is wrong"));
3417	pmap_insert_pt_page(pmap, mpte);
3418
3419	/*
3420	 * Promote the pv entries.
3421	 */
3422	if ((newpde & PG_MANAGED) != 0)
3423		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3424
3425	/*
3426	 * Propagate the PAT index to its proper position.
3427	 */
3428	if ((newpde & PG_PTE_PAT) != 0)
3429		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3430
3431	/*
3432	 * Map the superpage.
3433	 */
3434	if (workaround_erratum383)
3435		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3436	else if (pmap == kernel_pmap)
3437		pmap_kenter_pde(va, PG_PS | newpde);
3438	else
3439		pde_store(pde, PG_PS | newpde);
3440
3441	pmap_pde_promotions++;
3442	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3443	    " in pmap %p", va, pmap);
3444}
3445
3446/*
3447 *	Insert the given physical page (m) at
3448 *	the specified virtual address (va) in the
3449 *	target physical map with the protection requested.
3450 *
3451 *	If specified, the page will be wired down, meaning
3452 *	that the related pte cannot be reclaimed.
3453 *
3454 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3455 *	or lose information.  That is, this routine must actually
3456 *	insert this page into the given map NOW.
3457 */
3458void
3459pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3460    vm_prot_t prot, boolean_t wired)
3461{
3462	pd_entry_t *pde;
3463	pt_entry_t *pte;
3464	pt_entry_t newpte, origpte;
3465	pv_entry_t pv;
3466	vm_paddr_t opa, pa;
3467	vm_page_t mpte, om;
3468	boolean_t invlva;
3469
3470	va = trunc_page(va);
3471	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3472	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3473	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3474	    va));
3475	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3476	    VM_OBJECT_LOCKED(m->object),
3477	    ("pmap_enter: page %p is not busy", m));
3478
3479	mpte = NULL;
3480
3481	rw_wlock(&pvh_global_lock);
3482	PMAP_LOCK(pmap);
3483	sched_pin();
3484
3485	/*
3486	 * In the case that a page table page is not
3487	 * resident, we are creating it here.
3488	 */
3489	if (va < VM_MAXUSER_ADDRESS) {
3490		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3491	}
3492
3493	pde = pmap_pde(pmap, va);
3494	if ((*pde & PG_PS) != 0)
3495		panic("pmap_enter: attempted pmap_enter on 4MB page");
3496	pte = pmap_pte_quick(pmap, va);
3497
3498	/*
3499	 * The page table page must be resident here, or the page directory is corrupt.
3500	 */
3501	if (pte == NULL) {
3502		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3503			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3504	}
3505
3506	pa = VM_PAGE_TO_PHYS(m);
3507	om = NULL;
3508	origpte = *pte;
3509	opa = origpte & PG_FRAME;
3510
3511	/*
3512	 * Mapping has not changed, must be protection or wiring change.
3513	 */
3514	if (origpte && (opa == pa)) {
3515		/*
3516		 * Wiring change, just update stats. We don't worry about
3517		 * wiring PT pages as they remain resident as long as there
3518		 * are valid mappings in them. Hence, if a user page is wired,
3519		 * the PT page will be also.
3520		 */
3521		if (wired && ((origpte & PG_W) == 0))
3522			pmap->pm_stats.wired_count++;
3523		else if (!wired && (origpte & PG_W))
3524			pmap->pm_stats.wired_count--;
3525
3526		/*
3527		 * Remove extra pte reference
3528		 */
3529		if (mpte)
3530			mpte->wire_count--;
3531
3532		if (origpte & PG_MANAGED) {
3533			om = m;
3534			pa |= PG_MANAGED;
3535		}
3536		goto validate;
3537	}
3538
3539	pv = NULL;
3540
3541	/*
3542	 * Mapping has changed, invalidate old range and fall through to
3543	 * handle validating new mapping.
3544	 */
3545	if (opa) {
3546		if (origpte & PG_W)
3547			pmap->pm_stats.wired_count--;
3548		if (origpte & PG_MANAGED) {
3549			om = PHYS_TO_VM_PAGE(opa);
3550			pv = pmap_pvh_remove(&om->md, pmap, va);
3551		}
3552		if (mpte != NULL) {
3553			mpte->wire_count--;
3554			KASSERT(mpte->wire_count > 0,
3555			    ("pmap_enter: missing reference to page table page,"
3556			     " va: 0x%x", va));
3557		}
3558	} else
3559		pmap->pm_stats.resident_count++;
3560
3561	/*
3562	 * Enter on the PV list if part of our managed memory.
3563	 */
3564	if ((m->oflags & VPO_UNMANAGED) == 0) {
3565		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3566		    ("pmap_enter: managed mapping within the clean submap"));
3567		if (pv == NULL)
3568			pv = get_pv_entry(pmap, FALSE);
3569		pv->pv_va = va;
3570		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3571		pa |= PG_MANAGED;
3572	} else if (pv != NULL)
3573		free_pv_entry(pmap, pv);
3574
3575	/*
3576	 * Increment counters
3577	 */
3578	if (wired)
3579		pmap->pm_stats.wired_count++;
3580
3581validate:
3582	/*
3583	 * Now validate mapping with desired protection/wiring.
3584	 */
3585	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3586	if ((prot & VM_PROT_WRITE) != 0) {
3587		newpte |= PG_RW;
3588		if ((newpte & PG_MANAGED) != 0)
3589			vm_page_aflag_set(m, PGA_WRITEABLE);
3590	}
3591#ifdef PAE
3592	if ((prot & VM_PROT_EXECUTE) == 0)
3593		newpte |= pg_nx;
3594#endif
3595	if (wired)
3596		newpte |= PG_W;
3597	if (va < VM_MAXUSER_ADDRESS)
3598		newpte |= PG_U;
3599	if (pmap == kernel_pmap)
3600		newpte |= pgeflag;
3601
3602	/*
3603	 * if the mapping or permission bits are different, we need
3604	 * to update the pte.
3605	 */
3606	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3607		newpte |= PG_A;
3608		if ((access & VM_PROT_WRITE) != 0)
3609			newpte |= PG_M;
3610		if (origpte & PG_V) {
3611			invlva = FALSE;
3612			origpte = pte_load_store(pte, newpte);
3613			if (origpte & PG_A) {
3614				if (origpte & PG_MANAGED)
3615					vm_page_aflag_set(om, PGA_REFERENCED);
3616				if (opa != VM_PAGE_TO_PHYS(m))
3617					invlva = TRUE;
3618#ifdef PAE
3619				if ((origpte & PG_NX) == 0 &&
3620				    (newpte & PG_NX) != 0)
3621					invlva = TRUE;
3622#endif
3623			}
3624			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3625				if ((origpte & PG_MANAGED) != 0)
3626					vm_page_dirty(om);
3627				if ((prot & VM_PROT_WRITE) == 0)
3628					invlva = TRUE;
3629			}
3630			if ((origpte & PG_MANAGED) != 0 &&
3631			    TAILQ_EMPTY(&om->md.pv_list) &&
3632			    ((om->flags & PG_FICTITIOUS) != 0 ||
3633			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3634				vm_page_aflag_clear(om, PGA_WRITEABLE);
3635			if (invlva)
3636				pmap_invalidate_page(pmap, va);
3637		} else
3638			pte_store(pte, newpte);
3639	}
3640
3641	/*
3642	 * If both the page table page and the reservation are fully
3643	 * populated, then attempt promotion.
3644	 */
3645	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3646	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3647	    vm_reserv_level_iffullpop(m) == 0)
3648		pmap_promote_pde(pmap, pde, va);
3649
3650	sched_unpin();
3651	rw_wunlock(&pvh_global_lock);
3652	PMAP_UNLOCK(pmap);
3653}
3654
3655/*
3656 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3657 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3658 * blocking, (2) a mapping already exists at the specified virtual address, or
3659 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3660 */
3661static boolean_t
3662pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3663{
3664	pd_entry_t *pde, newpde;
3665
3666	rw_assert(&pvh_global_lock, RA_WLOCKED);
3667	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3668	pde = pmap_pde(pmap, va);
3669	if (*pde != 0) {
3670		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3671		    " in pmap %p", va, pmap);
3672		return (FALSE);
3673	}
3674	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3675	    PG_PS | PG_V;
3676	if ((m->oflags & VPO_UNMANAGED) == 0) {
3677		newpde |= PG_MANAGED;
3678
3679		/*
3680		 * Abort this mapping if its PV entry could not be created.
3681		 */
3682		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3683			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3684			    " in pmap %p", va, pmap);
3685			return (FALSE);
3686		}
3687	}
3688#ifdef PAE
3689	if ((prot & VM_PROT_EXECUTE) == 0)
3690		newpde |= pg_nx;
3691#endif
3692	if (va < VM_MAXUSER_ADDRESS)
3693		newpde |= PG_U;
3694
3695	/*
3696	 * Increment counters.
3697	 */
3698	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3699
3700	/*
3701	 * Map the superpage.
3702	 */
3703	pde_store(pde, newpde);
3704
3705	pmap_pde_mappings++;
3706	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3707	    " in pmap %p", va, pmap);
3708	return (TRUE);
3709}
3710
3711/*
3712 * Maps a sequence of resident pages belonging to the same object.
3713 * The sequence begins with the given page m_start.  This page is
3714 * mapped at the given virtual address start.  Each subsequent page is
3715 * mapped at a virtual address that is offset from start by the same
3716 * amount as the page is offset from m_start within the object.  The
3717 * last page in the sequence is the page with the largest offset from
3718 * m_start that can be mapped at a virtual address less than the given
3719 * virtual address end.  Not every virtual page between start and end
3720 * is mapped; only those for which a resident page exists with the
3721 * corresponding offset from m_start are mapped.
3722 */
3723void
3724pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3725    vm_page_t m_start, vm_prot_t prot)
3726{
3727	vm_offset_t va;
3728	vm_page_t m, mpte;
3729	vm_pindex_t diff, psize;
3730
3731	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3732	psize = atop(end - start);
3733	mpte = NULL;
3734	m = m_start;
3735	rw_wlock(&pvh_global_lock);
3736	PMAP_LOCK(pmap);
3737	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3738		va = start + ptoa(diff);
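		/*
		 * Create a 2/4MB page mapping when the virtual and physical
		 * addresses are superpage aligned, the mapping fits below
		 * "end", superpages are enabled, the reservation is fully
		 * populated, and pmap_enter_pde() succeeds; otherwise fall
		 * back to a 4KB page mapping.
		 */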
3739		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3740		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3741		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3742		    pmap_enter_pde(pmap, va, m, prot))
3743			m = &m[NBPDR / PAGE_SIZE - 1];
3744		else
3745			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3746			    mpte);
3747		m = TAILQ_NEXT(m, listq);
3748	}
3749	rw_wunlock(&pvh_global_lock);
3750	PMAP_UNLOCK(pmap);
3751}
3752
3753/*
3754 * This code makes some *MAJOR* assumptions:
3755 * 1. The pmap is the current pmap and it exists.
3756 * 2. The mapping is not wired.
3757 * 3. Only read access is required.
3758 * 4. No page table pages.
3759 * but it is *MUCH* faster than pmap_enter...
3760 */
3761
3762void
3763pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3764{
3765
3766	rw_wlock(&pvh_global_lock);
3767	PMAP_LOCK(pmap);
3768	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3769	rw_wunlock(&pvh_global_lock);
3770	PMAP_UNLOCK(pmap);
3771}
3772
3773static vm_page_t
3774pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3775    vm_prot_t prot, vm_page_t mpte)
3776{
3777	pt_entry_t *pte;
3778	vm_paddr_t pa;
3779	vm_page_t free;
3780
3781	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3782	    (m->oflags & VPO_UNMANAGED) != 0,
3783	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3784	rw_assert(&pvh_global_lock, RA_WLOCKED);
3785	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3786
3787	/*
3788	 * In the case that a page table page is not
3789	 * resident, we are creating it here.
3790	 */
3791	if (va < VM_MAXUSER_ADDRESS) {
3792		u_int ptepindex;
3793		pd_entry_t ptepa;
3794
3795		/*
3796		 * Calculate pagetable page index
3797		 */
3798		ptepindex = va >> PDRSHIFT;
3799		if (mpte && (mpte->pindex == ptepindex)) {
3800			mpte->wire_count++;
3801		} else {
3802			/*
3803			 * Get the page directory entry
3804			 */
3805			ptepa = pmap->pm_pdir[ptepindex];
3806
3807			/*
3808			 * If the page table page is mapped, we just increment
3809			 * the hold count, and activate it.
3810			 */
3811			if (ptepa) {
3812				if (ptepa & PG_PS)
3813					return (NULL);
3814				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3815				mpte->wire_count++;
3816			} else {
3817				mpte = _pmap_allocpte(pmap, ptepindex,
3818				    M_NOWAIT);
3819				if (mpte == NULL)
3820					return (mpte);
3821			}
3822		}
3823	} else {
3824		mpte = NULL;
3825	}
3826
3827	/*
3828	 * This call to vtopte makes the assumption that we are
3829	 * entering the page into the current pmap.  In order to support
3830	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3831	 * But that isn't as quick as vtopte.
3832	 */
3833	pte = vtopte(va);
3834	if (*pte) {
3835		if (mpte != NULL) {
3836			mpte->wire_count--;
3837			mpte = NULL;
3838		}
3839		return (mpte);
3840	}
3841
3842	/*
3843	 * Enter on the PV list if part of our managed memory.
3844	 */
3845	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3846	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3847		if (mpte != NULL) {
3848			free = NULL;
3849			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3850				pmap_invalidate_page(pmap, va);
3851				pmap_free_zero_pages(free);
3852			}
3853
3854			mpte = NULL;
3855		}
3856		return (mpte);
3857	}
3858
3859	/*
3860	 * Increment counters
3861	 */
3862	pmap->pm_stats.resident_count++;
3863
3864	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3865#ifdef PAE
3866	if ((prot & VM_PROT_EXECUTE) == 0)
3867		pa |= pg_nx;
3868#endif
3869
3870	/*
3871	 * Now validate mapping with RO protection
3872	 */
3873	if ((m->oflags & VPO_UNMANAGED) != 0)
3874		pte_store(pte, pa | PG_V | PG_U);
3875	else
3876		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3877	return (mpte);
3878}
3879
3880/*
3881 * Make a temporary mapping for a physical address.  This is only intended
3882 * to be used for panic dumps.
3883 */
3884void *
3885pmap_kenter_temporary(vm_paddr_t pa, int i)
3886{
3887	vm_offset_t va;
3888
3889	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3890	pmap_kenter(va, pa);
3891	invlpg(va);
3892	return ((void *)crashdumpmap);
3893}
3894
3895/*
3896 * This code maps large physical mmap regions into the
3897 * processor address space.  Note that some shortcuts
3898 * are taken, but the code works.
3899 */
3900void
3901pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3902    vm_pindex_t pindex, vm_size_t size)
3903{
3904	pd_entry_t *pde;
3905	vm_paddr_t pa, ptepa;
3906	vm_page_t p;
3907	int pat_mode;
3908
3909	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3910	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3911	    ("pmap_object_init_pt: non-device object"));
3912	if (pseflag &&
3913	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3914		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3915			return;
3916		p = vm_page_lookup(object, pindex);
3917		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3918		    ("pmap_object_init_pt: invalid page %p", p));
3919		pat_mode = p->md.pat_mode;
3920
3921		/*
3922		 * Abort the mapping if the first page is not physically
3923		 * aligned to a 2/4MB page boundary.
3924		 */
3925		ptepa = VM_PAGE_TO_PHYS(p);
3926		if (ptepa & (NBPDR - 1))
3927			return;
3928
3929		/*
3930		 * Skip the first page.  Abort the mapping if the rest of
3931		 * the pages are not physically contiguous or have differing
3932		 * memory attributes.
3933		 */
3934		p = TAILQ_NEXT(p, listq);
3935		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3936		    pa += PAGE_SIZE) {
3937			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3938			    ("pmap_object_init_pt: invalid page %p", p));
3939			if (pa != VM_PAGE_TO_PHYS(p) ||
3940			    pat_mode != p->md.pat_mode)
3941				return;
3942			p = TAILQ_NEXT(p, listq);
3943		}
3944
3945		/*
3946		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3947		 * "size" is a multiple of 2/4M, adding the PAT setting to
3948		 * "pa" will not affect the termination of this loop.
3949		 */
3950		PMAP_LOCK(pmap);
3951		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3952		    size; pa += NBPDR) {
3953			pde = pmap_pde(pmap, addr);
3954			if (*pde == 0) {
3955				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3956				    PG_U | PG_RW | PG_V);
3957				pmap->pm_stats.resident_count += NBPDR /
3958				    PAGE_SIZE;
3959				pmap_pde_mappings++;
3960			}
3961			/* Else continue on if the PDE is already valid. */
3962			addr += NBPDR;
3963		}
3964		PMAP_UNLOCK(pmap);
3965	}
3966}
3967
3968/*
3969 *	Routine:	pmap_change_wiring
3970 *	Function:	Change the wiring attribute for a map/virtual-address
3971 *			pair.
3972 *	In/out conditions:
3973 *			The mapping must already exist in the pmap.
3974 */
3975void
3976pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3977{
3978	pd_entry_t *pde;
3979	pt_entry_t *pte;
3980	boolean_t are_queues_locked;
3981
3982	are_queues_locked = FALSE;
3983retry:
3984	PMAP_LOCK(pmap);
3985	pde = pmap_pde(pmap, va);
3986	if ((*pde & PG_PS) != 0) {
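		/*
		 * The requested wiring differs from the 2/4MB page's current
		 * wiring, so demote the mapping to 4KB pages before updating
		 * the wired bit on the single 4KB page below.
		 */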
3987		if (!wired != ((*pde & PG_W) == 0)) {
3988			if (!are_queues_locked) {
3989				are_queues_locked = TRUE;
3990				if (!rw_try_wlock(&pvh_global_lock)) {
3991					PMAP_UNLOCK(pmap);
3992					rw_wlock(&pvh_global_lock);
3993					goto retry;
3994				}
3995			}
3996			if (!pmap_demote_pde(pmap, pde, va))
3997				panic("pmap_change_wiring: demotion failed");
3998		} else
3999			goto out;
4000	}
4001	pte = pmap_pte(pmap, va);
4002
4003	if (wired && !pmap_pte_w(pte))
4004		pmap->pm_stats.wired_count++;
4005	else if (!wired && pmap_pte_w(pte))
4006		pmap->pm_stats.wired_count--;
4007
4008	/*
4009	 * Wiring is not a hardware characteristic so there is no need to
4010	 * invalidate TLB.
4011	 */
4012	pmap_pte_set_w(pte, wired);
4013	pmap_pte_release(pte);
4014out:
4015	if (are_queues_locked)
4016		rw_wunlock(&pvh_global_lock);
4017	PMAP_UNLOCK(pmap);
4018}
4019
4020
4021
4022/*
4023 *	Copy the range specified by src_addr/len
4024 *	from the source map to the range dst_addr/len
4025 *	in the destination map.
4026 *
4027 *	This routine is only advisory and need not do anything.
4028 */
4029
4030void
4031pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4032    vm_offset_t src_addr)
4033{
4034	vm_page_t   free;
4035	vm_offset_t addr;
4036	vm_offset_t end_addr = src_addr + len;
4037	vm_offset_t pdnxt;
4038
4039	if (dst_addr != src_addr)
4040		return;
4041
4042	if (!pmap_is_current(src_pmap))
4043		return;
4044
4045	rw_wlock(&pvh_global_lock);
4046	if (dst_pmap < src_pmap) {
4047		PMAP_LOCK(dst_pmap);
4048		PMAP_LOCK(src_pmap);
4049	} else {
4050		PMAP_LOCK(src_pmap);
4051		PMAP_LOCK(dst_pmap);
4052	}
4053	sched_pin();
4054	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4055		pt_entry_t *src_pte, *dst_pte;
4056		vm_page_t dstmpte, srcmpte;
4057		pd_entry_t srcptepaddr;
4058		u_int ptepindex;
4059
4060		KASSERT(addr < UPT_MIN_ADDRESS,
4061		    ("pmap_copy: invalid to pmap_copy page tables"));
4062
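		/*
		 * Compute the start of the next 2/4MB page's virtual address
		 * range, guarding against wraparound at the top of the
		 * address space.
		 */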
4063		pdnxt = (addr + NBPDR) & ~PDRMASK;
4064		if (pdnxt < addr)
4065			pdnxt = end_addr;
4066		ptepindex = addr >> PDRSHIFT;
4067
4068		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4069		if (srcptepaddr == 0)
4070			continue;
4071
4072		if (srcptepaddr & PG_PS) {
4073			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4074			    ((srcptepaddr & PG_MANAGED) == 0 ||
4075			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4076			    PG_PS_FRAME))) {
4077				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4078				    ~PG_W;
4079				dst_pmap->pm_stats.resident_count +=
4080				    NBPDR / PAGE_SIZE;
4081			}
4082			continue;
4083		}
4084
4085		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4086		KASSERT(srcmpte->wire_count > 0,
4087		    ("pmap_copy: source page table page is unused"));
4088
4089		if (pdnxt > end_addr)
4090			pdnxt = end_addr;
4091
4092		src_pte = vtopte(addr);
4093		while (addr < pdnxt) {
4094			pt_entry_t ptetemp;
4095			ptetemp = *src_pte;
4096			/*
4097			 * we only virtual copy managed pages
4098			 */
4099			if ((ptetemp & PG_MANAGED) != 0) {
4100				dstmpte = pmap_allocpte(dst_pmap, addr,
4101				    M_NOWAIT);
4102				if (dstmpte == NULL)
4103					goto out;
4104				dst_pte = pmap_pte_quick(dst_pmap, addr);
4105				if (*dst_pte == 0 &&
4106				    pmap_try_insert_pv_entry(dst_pmap, addr,
4107				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4108					/*
4109					 * Clear the wired, modified, and
4110					 * accessed (referenced) bits
4111					 * during the copy.
4112					 */
4113					*dst_pte = ptetemp & ~(PG_W | PG_M |
4114					    PG_A);
4115					dst_pmap->pm_stats.resident_count++;
4116				} else {
4117					free = NULL;
4118					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4119					    &free)) {
4120						pmap_invalidate_page(dst_pmap,
4121						    addr);
4122						pmap_free_zero_pages(free);
4123					}
4124					goto out;
4125				}
4126				if (dstmpte->wire_count >= srcmpte->wire_count)
4127					break;
4128			}
4129			addr += PAGE_SIZE;
4130			src_pte++;
4131		}
4132	}
4133out:
4134	sched_unpin();
4135	rw_wunlock(&pvh_global_lock);
4136	PMAP_UNLOCK(src_pmap);
4137	PMAP_UNLOCK(dst_pmap);
4138}
4139
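/*
 * Zero the page mapped at "page".  On 686-class CPUs prefer the SSE2 or
 * i686 zeroing routines when they are available; otherwise fall back to
 * bzero().
 */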
4140static __inline void
4141pagezero(void *page)
4142{
4143#if defined(I686_CPU)
4144	if (cpu_class == CPUCLASS_686) {
4145#if defined(CPU_ENABLE_SSE)
4146		if (cpu_feature & CPUID_SSE2)
4147			sse2_pagezero(page);
4148		else
4149#endif
4150			i686_pagezero(page);
4151	} else
4152#endif
4153		bzero(page, PAGE_SIZE);
4154}
4155
4156/*
4157 *	pmap_zero_page zeros the specified hardware page by mapping
4158 *	the page into KVM and using bzero to clear its contents.
4159 */
4160void
4161pmap_zero_page(vm_page_t m)
4162{
4163	struct sysmaps *sysmaps;
4164
4165	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4166	mtx_lock(&sysmaps->lock);
4167	if (*sysmaps->CMAP2)
4168		panic("pmap_zero_page: CMAP2 busy");
4169	sched_pin();
4170	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4171	    pmap_cache_bits(m->md.pat_mode, 0);
4172	invlcaddr(sysmaps->CADDR2);
4173	pagezero(sysmaps->CADDR2);
4174	*sysmaps->CMAP2 = 0;
4175	sched_unpin();
4176	mtx_unlock(&sysmaps->lock);
4177}
4178
4179/*
4180 *	pmap_zero_page_area zeros the specified hardware page by mapping
4181 *	the page into KVM and using bzero to clear its contents.
4182 *
4183 *	off and size may not cover an area beyond a single hardware page.
4184 */
4185void
4186pmap_zero_page_area(vm_page_t m, int off, int size)
4187{
4188	struct sysmaps *sysmaps;
4189
4190	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4191	mtx_lock(&sysmaps->lock);
4192	if (*sysmaps->CMAP2)
4193		panic("pmap_zero_page_area: CMAP2 busy");
4194	sched_pin();
4195	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4196	    pmap_cache_bits(m->md.pat_mode, 0);
4197	invlcaddr(sysmaps->CADDR2);
4198	if (off == 0 && size == PAGE_SIZE)
4199		pagezero(sysmaps->CADDR2);
4200	else
4201		bzero((char *)sysmaps->CADDR2 + off, size);
4202	*sysmaps->CMAP2 = 0;
4203	sched_unpin();
4204	mtx_unlock(&sysmaps->lock);
4205}
4206
4207/*
4208 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4209 *	the page into KVM and using bzero to clear its contents.  This
4210 *	is intended to be called from the vm_pagezero process only and
4211 *	outside of Giant.
4212 */
4213void
4214pmap_zero_page_idle(vm_page_t m)
4215{
4216
4217	if (*CMAP3)
4218		panic("pmap_zero_page_idle: CMAP3 busy");
4219	sched_pin();
4220	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4221	    pmap_cache_bits(m->md.pat_mode, 0);
4222	invlcaddr(CADDR3);
4223	pagezero(CADDR3);
4224	*CMAP3 = 0;
4225	sched_unpin();
4226}
4227
4228/*
4229 *	pmap_copy_page copies the specified (machine independent)
4230 *	page by mapping the page into virtual memory and using
4231 *	bcopy to copy the page, one machine dependent page at a
4232 *	time.
4233 */
4234void
4235pmap_copy_page(vm_page_t src, vm_page_t dst)
4236{
4237	struct sysmaps *sysmaps;
4238
4239	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4240	mtx_lock(&sysmaps->lock);
4241	if (*sysmaps->CMAP1)
4242		panic("pmap_copy_page: CMAP1 busy");
4243	if (*sysmaps->CMAP2)
4244		panic("pmap_copy_page: CMAP2 busy");
4245	sched_pin();
4246	invlpg((u_int)sysmaps->CADDR1);
4247	invlpg((u_int)sysmaps->CADDR2);
4248	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4249	    pmap_cache_bits(src->md.pat_mode, 0);
4250	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4251	    pmap_cache_bits(dst->md.pat_mode, 0);
4252	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4253	*sysmaps->CMAP1 = 0;
4254	*sysmaps->CMAP2 = 0;
4255	sched_unpin();
4256	mtx_unlock(&sysmaps->lock);
4257}
4258
4259int unmapped_buf_allowed = 1;
4260
4261void
4262pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4263    vm_offset_t b_offset, int xfersize)
4264{
4265	struct sysmaps *sysmaps;
4266	vm_page_t a_pg, b_pg;
4267	char *a_cp, *b_cp;
4268	vm_offset_t a_pg_offset, b_pg_offset;
4269	int cnt;
4270
4271	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4272	mtx_lock(&sysmaps->lock);
4273	if (*sysmaps->CMAP1 != 0)
4274		panic("pmap_copy_pages: CMAP1 busy");
4275	if (*sysmaps->CMAP2 != 0)
4276		panic("pmap_copy_pages: CMAP2 busy");
4277	sched_pin();
4278	while (xfersize > 0) {
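		/*
		 * Map the current source and destination pages at CADDR1 and
		 * CADDR2, respectively, and copy the largest run that stays
		 * within both pages.
		 */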
4279		invlpg((u_int)sysmaps->CADDR1);
4280		invlpg((u_int)sysmaps->CADDR2);
4281		a_pg = ma[a_offset >> PAGE_SHIFT];
4282		a_pg_offset = a_offset & PAGE_MASK;
4283		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4284		b_pg = mb[b_offset >> PAGE_SHIFT];
4285		b_pg_offset = b_offset & PAGE_MASK;
4286		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4287		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4288		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4289		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4290		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4291		a_cp = sysmaps->CADDR1 + a_pg_offset;
4292		b_cp = sysmaps->CADDR2 + b_pg_offset;
4293		bcopy(a_cp, b_cp, cnt);
4294		a_offset += cnt;
4295		b_offset += cnt;
4296		xfersize -= cnt;
4297	}
4298	*sysmaps->CMAP1 = 0;
4299	*sysmaps->CMAP2 = 0;
4300	sched_unpin();
4301	mtx_unlock(&sysmaps->lock);
4302}
4303
4304/*
4305 * Returns true if the pmap's pv is one of the first
4306 * 16 pvs linked to from this page.  This count may
4307 * be changed upwards or downwards in the future; it
4308 * is only necessary that true be returned for a small
4309 * subset of pmaps for proper page aging.
4310 */
4311boolean_t
4312pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4313{
4314	struct md_page *pvh;
4315	pv_entry_t pv;
4316	int loops = 0;
4317	boolean_t rv;
4318
4319	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4320	    ("pmap_page_exists_quick: page %p is not managed", m));
4321	rv = FALSE;
4322	rw_wlock(&pvh_global_lock);
4323	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4324		if (PV_PMAP(pv) == pmap) {
4325			rv = TRUE;
4326			break;
4327		}
4328		loops++;
4329		if (loops >= 16)
4330			break;
4331	}
4332	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4333		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4334		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4335			if (PV_PMAP(pv) == pmap) {
4336				rv = TRUE;
4337				break;
4338			}
4339			loops++;
4340			if (loops >= 16)
4341				break;
4342		}
4343	}
4344	rw_wunlock(&pvh_global_lock);
4345	return (rv);
4346}
4347
4348/*
4349 *	pmap_page_wired_mappings:
4350 *
4351 *	Return the number of managed mappings to the given physical page
4352 *	that are wired.
4353 */
4354int
4355pmap_page_wired_mappings(vm_page_t m)
4356{
4357	int count;
4358
4359	count = 0;
4360	if ((m->oflags & VPO_UNMANAGED) != 0)
4361		return (count);
4362	rw_wlock(&pvh_global_lock);
4363	count = pmap_pvh_wired_mappings(&m->md, count);
4364	if ((m->flags & PG_FICTITIOUS) == 0) {
4365		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4366		    count);
4367	}
4368	rw_wunlock(&pvh_global_lock);
4369	return (count);
4370}
4371
4372/*
4373 *	pmap_pvh_wired_mappings:
4374 *
4375 *	Return the updated number "count" of managed mappings that are wired.
4376 */
4377static int
4378pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4379{
4380	pmap_t pmap;
4381	pt_entry_t *pte;
4382	pv_entry_t pv;
4383
4384	rw_assert(&pvh_global_lock, RA_WLOCKED);
4385	sched_pin();
4386	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4387		pmap = PV_PMAP(pv);
4388		PMAP_LOCK(pmap);
4389		pte = pmap_pte_quick(pmap, pv->pv_va);
4390		if ((*pte & PG_W) != 0)
4391			count++;
4392		PMAP_UNLOCK(pmap);
4393	}
4394	sched_unpin();
4395	return (count);
4396}
4397
4398/*
4399 * Returns TRUE if the given page is mapped individually or as part of
4400 * a 4mpage.  Otherwise, returns FALSE.
4401 */
4402boolean_t
4403pmap_page_is_mapped(vm_page_t m)
4404{
4405	boolean_t rv;
4406
4407	if ((m->oflags & VPO_UNMANAGED) != 0)
4408		return (FALSE);
4409	rw_wlock(&pvh_global_lock);
4410	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4411	    ((m->flags & PG_FICTITIOUS) == 0 &&
4412	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4413	rw_wunlock(&pvh_global_lock);
4414	return (rv);
4415}
4416
4417/*
4418 * Remove all pages from the specified address space;
4419 * this aids process exit speeds.  Also, this code
4420 * is special cased for the current process only, but
4421 * can have the more generic (and slightly slower)
4422 * mode enabled.  This is much faster than pmap_remove
4423 * in the case of running down an entire address space.
4424 */
4425void
4426pmap_remove_pages(pmap_t pmap)
4427{
4428	pt_entry_t *pte, tpte;
4429	vm_page_t free = NULL;
4430	vm_page_t m, mpte, mt;
4431	pv_entry_t pv;
4432	struct md_page *pvh;
4433	struct pv_chunk *pc, *npc;
4434	int field, idx;
4435	int32_t bit;
4436	uint32_t inuse, bitmask;
4437	int allfree;
4438
4439	if (pmap != PCPU_GET(curpmap)) {
4440		printf("warning: pmap_remove_pages called with non-current pmap\n");
4441		return;
4442	}
4443	rw_wlock(&pvh_global_lock);
4444	PMAP_LOCK(pmap);
4445	sched_pin();
4446	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4447		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4448		    pc->pc_pmap));
4449		allfree = 1;
4450		for (field = 0; field < _NPCM; field++) {
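			/*
			 * A clear bit in pc_map identifies a pv entry that is
			 * still in use; visit each such entry.
			 */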
4451			inuse = ~pc->pc_map[field] & pc_freemask[field];
4452			while (inuse != 0) {
4453				bit = bsfl(inuse);
4454				bitmask = 1UL << bit;
4455				idx = field * 32 + bit;
4456				pv = &pc->pc_pventry[idx];
4457				inuse &= ~bitmask;
4458
4459				pte = pmap_pde(pmap, pv->pv_va);
4460				tpte = *pte;
4461				if ((tpte & PG_PS) == 0) {
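					/*
					 * PG_PTE_PAT occupies the same bit
					 * position as PG_PS, so clear it to
					 * keep the PG_PS tests below from
					 * misreading a 4KB mapping's PAT bit.
					 */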
4462					pte = vtopte(pv->pv_va);
4463					tpte = *pte & ~PG_PTE_PAT;
4464				}
4465
4466				if (tpte == 0) {
4467					printf(
4468					    "TPTE at %p  IS ZERO @ VA %08x\n",
4469					    pte, pv->pv_va);
4470					panic("bad pte");
4471				}
4472
4473/*
4474 * We cannot remove wired pages from a process' mapping at this time
4475 */
4476				if (tpte & PG_W) {
4477					allfree = 0;
4478					continue;
4479				}
4480
4481				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4482				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4483				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4484				    m, (uintmax_t)m->phys_addr,
4485				    (uintmax_t)tpte));
4486
4487				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4488				    m < &vm_page_array[vm_page_array_size],
4489				    ("pmap_remove_pages: bad tpte %#jx",
4490				    (uintmax_t)tpte));
4491
4492				pte_clear(pte);
4493
4494				/*
4495				 * Update the vm_page_t clean/reference bits.
4496				 */
4497				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4498					if ((tpte & PG_PS) != 0) {
4499						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4500							vm_page_dirty(mt);
4501					} else
4502						vm_page_dirty(m);
4503				}
4504
4505				/* Mark free */
4506				PV_STAT(pv_entry_frees++);
4507				PV_STAT(pv_entry_spare++);
4508				pv_entry_count--;
4509				pc->pc_map[field] |= bitmask;
4510				if ((tpte & PG_PS) != 0) {
4511					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4512					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4513					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4514					if (TAILQ_EMPTY(&pvh->pv_list)) {
4515						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4516							if (TAILQ_EMPTY(&mt->md.pv_list))
4517								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4518					}
4519					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4520					if (mpte != NULL) {
4521						pmap_remove_pt_page(pmap, mpte);
4522						pmap->pm_stats.resident_count--;
4523						KASSERT(mpte->wire_count == NPTEPG,
4524						    ("pmap_remove_pages: pte page wire count error"));
4525						mpte->wire_count = 0;
4526						pmap_add_delayed_free_list(mpte, &free, FALSE);
4527						atomic_subtract_int(&cnt.v_wire_count, 1);
4528					}
4529				} else {
4530					pmap->pm_stats.resident_count--;
4531					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4532					if (TAILQ_EMPTY(&m->md.pv_list) &&
4533					    (m->flags & PG_FICTITIOUS) == 0) {
4534						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4535						if (TAILQ_EMPTY(&pvh->pv_list))
4536							vm_page_aflag_clear(m, PGA_WRITEABLE);
4537					}
4538					pmap_unuse_pt(pmap, pv->pv_va, &free);
4539				}
4540			}
4541		}
4542		if (allfree) {
4543			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4544			free_pv_chunk(pc);
4545		}
4546	}
4547	sched_unpin();
4548	pmap_invalidate_all(pmap);
4549	rw_wunlock(&pvh_global_lock);
4550	PMAP_UNLOCK(pmap);
4551	pmap_free_zero_pages(free);
4552}
4553
4554/*
4555 *	pmap_is_modified:
4556 *
4557 *	Return whether or not the specified physical page was modified
4558 *	in any physical maps.
4559 */
4560boolean_t
4561pmap_is_modified(vm_page_t m)
4562{
4563	boolean_t rv;
4564
4565	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4566	    ("pmap_is_modified: page %p is not managed", m));
4567
4568	/*
4569	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4570	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4571	 * is clear, no PTEs can have PG_M set.
4572	 */
4573	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4574	if ((m->oflags & VPO_BUSY) == 0 &&
4575	    (m->aflags & PGA_WRITEABLE) == 0)
4576		return (FALSE);
4577	rw_wlock(&pvh_global_lock);
4578	rv = pmap_is_modified_pvh(&m->md) ||
4579	    ((m->flags & PG_FICTITIOUS) == 0 &&
4580	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4581	rw_wunlock(&pvh_global_lock);
4582	return (rv);
4583}
4584
4585/*
4586 * Returns TRUE if any of the given mappings were used to modify
4587 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4588 * mappings are supported.
4589 */
4590static boolean_t
4591pmap_is_modified_pvh(struct md_page *pvh)
4592{
4593	pv_entry_t pv;
4594	pt_entry_t *pte;
4595	pmap_t pmap;
4596	boolean_t rv;
4597
4598	rw_assert(&pvh_global_lock, RA_WLOCKED);
4599	rv = FALSE;
4600	sched_pin();
4601	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4602		pmap = PV_PMAP(pv);
4603		PMAP_LOCK(pmap);
4604		pte = pmap_pte_quick(pmap, pv->pv_va);
4605		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4606		PMAP_UNLOCK(pmap);
4607		if (rv)
4608			break;
4609	}
4610	sched_unpin();
4611	return (rv);
4612}
4613
4614/*
4615 *	pmap_is_prefaultable:
4616 *
4617 *	Return whether or not the specified virtual address is eligible
4618 *	for prefault.
4619 */
4620boolean_t
4621pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4622{
4623	pd_entry_t *pde;
4624	pt_entry_t *pte;
4625	boolean_t rv;
4626
4627	rv = FALSE;
4628	PMAP_LOCK(pmap);
4629	pde = pmap_pde(pmap, addr);
4630	if (*pde != 0 && (*pde & PG_PS) == 0) {
4631		pte = vtopte(addr);
4632		rv = *pte == 0;
4633	}
4634	PMAP_UNLOCK(pmap);
4635	return (rv);
4636}
4637
4638/*
4639 *	pmap_is_referenced:
4640 *
4641 *	Return whether or not the specified physical page was referenced
4642 *	in any physical maps.
4643 */
4644boolean_t
4645pmap_is_referenced(vm_page_t m)
4646{
4647	boolean_t rv;
4648
4649	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4650	    ("pmap_is_referenced: page %p is not managed", m));
4651	rw_wlock(&pvh_global_lock);
4652	rv = pmap_is_referenced_pvh(&m->md) ||
4653	    ((m->flags & PG_FICTITIOUS) == 0 &&
4654	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4655	rw_wunlock(&pvh_global_lock);
4656	return (rv);
4657}
4658
4659/*
4660 * Returns TRUE if any of the given mappings were referenced and FALSE
4661 * otherwise.  Both page and 4mpage mappings are supported.
4662 */
4663static boolean_t
4664pmap_is_referenced_pvh(struct md_page *pvh)
4665{
4666	pv_entry_t pv;
4667	pt_entry_t *pte;
4668	pmap_t pmap;
4669	boolean_t rv;
4670
4671	rw_assert(&pvh_global_lock, RA_WLOCKED);
4672	rv = FALSE;
4673	sched_pin();
4674	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4675		pmap = PV_PMAP(pv);
4676		PMAP_LOCK(pmap);
4677		pte = pmap_pte_quick(pmap, pv->pv_va);
4678		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4679		PMAP_UNLOCK(pmap);
4680		if (rv)
4681			break;
4682	}
4683	sched_unpin();
4684	return (rv);
4685}
4686
4687/*
4688 * Clear the write and modified bits in each of the given page's mappings.
4689 */
4690void
4691pmap_remove_write(vm_page_t m)
4692{
4693	struct md_page *pvh;
4694	pv_entry_t next_pv, pv;
4695	pmap_t pmap;
4696	pd_entry_t *pde;
4697	pt_entry_t oldpte, *pte;
4698	vm_offset_t va;
4699
4700	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4701	    ("pmap_remove_write: page %p is not managed", m));
4702
4703	/*
4704	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4705	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4706	 * is clear, no page table entries need updating.
4707	 */
4708	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4709	if ((m->oflags & VPO_BUSY) == 0 &&
4710	    (m->aflags & PGA_WRITEABLE) == 0)
4711		return;
4712	rw_wlock(&pvh_global_lock);
4713	sched_pin();
4714	if ((m->flags & PG_FICTITIOUS) != 0)
4715		goto small_mappings;
4716	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4717	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4718		va = pv->pv_va;
4719		pmap = PV_PMAP(pv);
4720		PMAP_LOCK(pmap);
4721		pde = pmap_pde(pmap, va);
4722		if ((*pde & PG_RW) != 0)
4723			(void)pmap_demote_pde(pmap, pde, va);
4724		PMAP_UNLOCK(pmap);
4725	}
4726small_mappings:
4727	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4728		pmap = PV_PMAP(pv);
4729		PMAP_LOCK(pmap);
4730		pde = pmap_pde(pmap, pv->pv_va);
4731		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4732		    " a 4mpage in page %p's pv list", m));
4733		pte = pmap_pte_quick(pmap, pv->pv_va);
4734retry:
4735		oldpte = *pte;
4736		if ((oldpte & PG_RW) != 0) {
4737			/*
4738			 * Regardless of whether a pte is 32 or 64 bits
4739			 * in size, PG_RW and PG_M are among the least
4740			 * significant 32 bits.
4741			 */
4742			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4743			    oldpte & ~(PG_RW | PG_M)))
4744				goto retry;
4745			if ((oldpte & PG_M) != 0)
4746				vm_page_dirty(m);
4747			pmap_invalidate_page(pmap, pv->pv_va);
4748		}
4749		PMAP_UNLOCK(pmap);
4750	}
4751	vm_page_aflag_clear(m, PGA_WRITEABLE);
4752	sched_unpin();
4753	rw_wunlock(&pvh_global_lock);
4754}
4755
4756/*
4757 *	pmap_ts_referenced:
4758 *
4759 *	Return a count of reference bits for a page, clearing those bits.
4760 *	It is not necessary for every reference bit to be cleared, but it
4761 *	is necessary that 0 only be returned when there are truly no
4762 *	reference bits set.
4763 *
4764 *	XXX: The exact number of bits to check and clear is a matter that
4765 *	should be tested and standardized at some point in the future for
4766 *	optimal aging of shared pages.
4767 */
4768int
4769pmap_ts_referenced(vm_page_t m)
4770{
4771	struct md_page *pvh;
4772	pv_entry_t pv, pvf, pvn;
4773	pmap_t pmap;
4774	pd_entry_t oldpde, *pde;
4775	pt_entry_t *pte;
4776	vm_offset_t va;
4777	int rtval = 0;
4778
4779	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4780	    ("pmap_ts_referenced: page %p is not managed", m));
4781	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4782	rw_wlock(&pvh_global_lock);
4783	sched_pin();
4784	if ((m->flags & PG_FICTITIOUS) != 0)
4785		goto small_mappings;
4786	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4787		va = pv->pv_va;
4788		pmap = PV_PMAP(pv);
4789		PMAP_LOCK(pmap);
4790		pde = pmap_pde(pmap, va);
4791		oldpde = *pde;
4792		if ((oldpde & PG_A) != 0) {
4793			if (pmap_demote_pde(pmap, pde, va)) {
4794				if ((oldpde & PG_W) == 0) {
4795					/*
4796					 * Remove the mapping to a single page
4797					 * so that a subsequent access may
4798					 * repromote.  Since the underlying
4799					 * page table page is fully populated,
4800					 * this removal never frees a page
4801					 * table page.
4802					 */
4803					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4804					    PG_PS_FRAME);
4805					pmap_remove_page(pmap, va, NULL);
4806					rtval++;
4807					if (rtval > 4) {
4808						PMAP_UNLOCK(pmap);
4809						goto out;
4810					}
4811				}
4812			}
4813		}
4814		PMAP_UNLOCK(pmap);
4815	}
4816small_mappings:
4817	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4818		pvf = pv;
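		/*
		 * Rotate each examined pv entry to the tail of the list so
		 * that repeated calls start with different mappings.  Stop
		 * after one full pass or once more than four references have
		 * been counted.
		 */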
4819		do {
4820			pvn = TAILQ_NEXT(pv, pv_list);
4821			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4822			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4823			pmap = PV_PMAP(pv);
4824			PMAP_LOCK(pmap);
4825			pde = pmap_pde(pmap, pv->pv_va);
4826			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4827			    " found a 4mpage in page %p's pv list", m));
4828			pte = pmap_pte_quick(pmap, pv->pv_va);
4829			if ((*pte & PG_A) != 0) {
4830				atomic_clear_int((u_int *)pte, PG_A);
4831				pmap_invalidate_page(pmap, pv->pv_va);
4832				rtval++;
4833				if (rtval > 4)
4834					pvn = NULL;
4835			}
4836			PMAP_UNLOCK(pmap);
4837		} while ((pv = pvn) != NULL && pv != pvf);
4838	}
4839out:
4840	sched_unpin();
4841	rw_wunlock(&pvh_global_lock);
4842	return (rtval);
4843}
4844
4845/*
4846 *	Clear the modify bits on the specified physical page.
4847 */
4848void
4849pmap_clear_modify(vm_page_t m)
4850{
4851	struct md_page *pvh;
4852	pv_entry_t next_pv, pv;
4853	pmap_t pmap;
4854	pd_entry_t oldpde, *pde;
4855	pt_entry_t oldpte, *pte;
4856	vm_offset_t va;
4857
4858	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4859	    ("pmap_clear_modify: page %p is not managed", m));
4860	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4861	KASSERT((m->oflags & VPO_BUSY) == 0,
4862	    ("pmap_clear_modify: page %p is busy", m));
4863
4864	/*
4865	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4866	 * If the object containing the page is locked and the page is not
4867	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4868	 */
4869	if ((m->aflags & PGA_WRITEABLE) == 0)
4870		return;
4871	rw_wlock(&pvh_global_lock);
4872	sched_pin();
4873	if ((m->flags & PG_FICTITIOUS) != 0)
4874		goto small_mappings;
4875	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4876	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4877		va = pv->pv_va;
4878		pmap = PV_PMAP(pv);
4879		PMAP_LOCK(pmap);
4880		pde = pmap_pde(pmap, va);
4881		oldpde = *pde;
4882		if ((oldpde & PG_RW) != 0) {
4883			if (pmap_demote_pde(pmap, pde, va)) {
4884				if ((oldpde & PG_W) == 0) {
4885					/*
4886					 * Write protect the mapping to a
4887					 * single page so that a subsequent
4888					 * write access may repromote.
4889					 */
4890					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4891					    PG_PS_FRAME);
4892					pte = pmap_pte_quick(pmap, va);
4893					oldpte = *pte;
4894					if ((oldpte & PG_V) != 0) {
4895						/*
4896						 * Regardless of whether a pte is 32 or 64 bits
4897						 * in size, PG_RW and PG_M are among the least
4898						 * significant 32 bits.
4899						 */
4900						while (!atomic_cmpset_int((u_int *)pte,
4901						    oldpte,
4902						    oldpte & ~(PG_M | PG_RW)))
4903							oldpte = *pte;
4904						vm_page_dirty(m);
4905						pmap_invalidate_page(pmap, va);
4906					}
4907				}
4908			}
4909		}
4910		PMAP_UNLOCK(pmap);
4911	}
4912small_mappings:
4913	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4914		pmap = PV_PMAP(pv);
4915		PMAP_LOCK(pmap);
4916		pde = pmap_pde(pmap, pv->pv_va);
4917		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4918		    " a 4mpage in page %p's pv list", m));
4919		pte = pmap_pte_quick(pmap, pv->pv_va);
4920		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4921			/*
4922			 * Regardless of whether a pte is 32 or 64 bits
4923			 * in size, PG_M is among the least significant
4924			 * 32 bits.
4925			 */
4926			atomic_clear_int((u_int *)pte, PG_M);
4927			pmap_invalidate_page(pmap, pv->pv_va);
4928		}
4929		PMAP_UNLOCK(pmap);
4930	}
4931	sched_unpin();
4932	rw_wunlock(&pvh_global_lock);
4933}
4934
4935/*
4936 *	pmap_clear_reference:
4937 *
4938 *	Clear the reference bit on the specified physical page.
4939 */
4940void
4941pmap_clear_reference(vm_page_t m)
4942{
4943	struct md_page *pvh;
4944	pv_entry_t next_pv, pv;
4945	pmap_t pmap;
4946	pd_entry_t oldpde, *pde;
4947	pt_entry_t *pte;
4948	vm_offset_t va;
4949
4950	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4951	    ("pmap_clear_reference: page %p is not managed", m));
4952	rw_wlock(&pvh_global_lock);
4953	sched_pin();
4954	if ((m->flags & PG_FICTITIOUS) != 0)
4955		goto small_mappings;
4956	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4957	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4958		va = pv->pv_va;
4959		pmap = PV_PMAP(pv);
4960		PMAP_LOCK(pmap);
4961		pde = pmap_pde(pmap, va);
4962		oldpde = *pde;
4963		if ((oldpde & PG_A) != 0) {
4964			if (pmap_demote_pde(pmap, pde, va)) {
4965				/*
4966				 * Remove the mapping to a single page so
4967				 * that a subsequent access may repromote.
4968				 * Since the underlying page table page is
4969				 * fully populated, this removal never frees
4970				 * a page table page.
4971				 */
4972				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4973				    PG_PS_FRAME);
4974				pmap_remove_page(pmap, va, NULL);
4975			}
4976		}
4977		PMAP_UNLOCK(pmap);
4978	}
4979small_mappings:
4980	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4981		pmap = PV_PMAP(pv);
4982		PMAP_LOCK(pmap);
4983		pde = pmap_pde(pmap, pv->pv_va);
4984		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4985		    " a 4mpage in page %p's pv list", m));
4986		pte = pmap_pte_quick(pmap, pv->pv_va);
4987		if ((*pte & PG_A) != 0) {
4988			/*
4989			 * Regardless of whether a pte is 32 or 64 bits
4990			 * in size, PG_A is among the least significant
4991			 * 32 bits.
4992			 */
4993			atomic_clear_int((u_int *)pte, PG_A);
4994			pmap_invalidate_page(pmap, pv->pv_va);
4995		}
4996		PMAP_UNLOCK(pmap);
4997	}
4998	sched_unpin();
4999	rw_wunlock(&pvh_global_lock);
5000}
5001
5002/*
5003 * Miscellaneous support routines follow
5004 */
5005
5006/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5007static __inline void
5008pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5009{
5010	u_int opte, npte;
5011
5012	/*
5013	 * The cache mode bits are all in the low 32-bits of the
5014	 * PTE, so we can just spin on updating the low 32-bits.
5015	 */
5016	do {
5017		opte = *(u_int *)pte;
5018		npte = opte & ~PG_PTE_CACHE;
5019		npte |= cache_bits;
5020	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5021}
5022
5023/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5024static __inline void
5025pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5026{
5027	u_int opde, npde;
5028
5029	/*
5030	 * The cache mode bits are all in the low 32-bits of the
5031	 * PDE, so we can just spin on updating the low 32-bits.
5032	 */
5033	do {
5034		opde = *(u_int *)pde;
5035		npde = opde & ~PG_PDE_CACHE;
5036		npde |= cache_bits;
5037	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5038}
5039
5040/*
5041 * Map a set of physical memory pages into the kernel virtual
5042 * address space. Return a pointer to where it is mapped. This
5043 * routine is intended to be used for mapping device memory,
5044 * NOT real memory.
5045 */
5046void *
5047pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5048{
5049	vm_offset_t va, offset;
5050	vm_size_t tmpsize;
5051
5052	offset = pa & PAGE_MASK;
5053	size = roundup(offset + size, PAGE_SIZE);
5054	pa = pa & PG_FRAME;
5055
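	/*
	 * Physical addresses below KERNLOAD already have a permanent
	 * mapping at KERNBASE + pa, so reuse it rather than allocating
	 * new kernel virtual address space.
	 */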
5056	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5057		va = KERNBASE + pa;
5058	else
5059		va = kmem_alloc_nofault(kernel_map, size);
5060	if (!va)
5061		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5062
5063	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5064		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5065	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5066	pmap_invalidate_cache_range(va, va + size);
5067	return ((void *)(va + offset));
5068}
5069
5070void *
5071pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5072{
5073
5074	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5075}
5076
5077void *
5078pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5079{
5080
5081	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5082}
5083
5084void
5085pmap_unmapdev(vm_offset_t va, vm_size_t size)
5086{
5087	vm_offset_t base, offset;
5088
5089	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5090		return;
5091	base = trunc_page(va);
5092	offset = va & PAGE_MASK;
5093	size = roundup(offset + size, PAGE_SIZE);
5094	kmem_free(kernel_map, base, size);
5095}
5096
5097/*
5098 * Sets the memory attribute for the specified page.
5099 */
5100void
5101pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5102{
5103
5104	m->md.pat_mode = ma;
5105	if ((m->flags & PG_FICTITIOUS) != 0)
5106		return;
5107
5108	/*
5109	 * If "m" is a normal page, flush it from the cache.
5110	 * See pmap_invalidate_cache_range().
5111	 *
5112	 * First, try to find an existing sf buffer mapping of the
5113	 * page.  sf_buf_invalidate_cache() modifies the mapping and
5114	 * flushes the cache.
5115	 */
5116	if (sf_buf_invalidate_cache(m))
5117		return;
5118
5119	/*
5120	 * If the page is not mapped by an sf buffer and the CPU does
5121	 * not support self snoop, map the page transiently and do the
5122	 * invalidation.  In the worst case, the whole cache is flushed
5123	 * by pmap_invalidate_cache_range().
5124	 */
5125	if ((cpu_feature & CPUID_SS) == 0)
5126		pmap_flush_page(m);
5127}
5128
5129static void
5130pmap_flush_page(vm_page_t m)
5131{
5132	struct sysmaps *sysmaps;
5133	vm_offset_t sva, eva;
5134
5135	if ((cpu_feature & CPUID_CLFSH) != 0) {
5136		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5137		mtx_lock(&sysmaps->lock);
5138		if (*sysmaps->CMAP2)
5139			panic("pmap_flush_page: CMAP2 busy");
5140		sched_pin();
5141		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5142		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5143		invlcaddr(sysmaps->CADDR2);
5144		sva = (vm_offset_t)sysmaps->CADDR2;
5145		eva = sva + PAGE_SIZE;
5146
5147		/*
5148		 * Use mfence despite the ordering implied by
5149		 * mtx_{un,}lock() because clflush is not guaranteed
5150		 * to be ordered by any other instruction.
5151		 */
5152		mfence();
5153		for (; sva < eva; sva += cpu_clflush_line_size)
5154			clflush(sva);
5155		mfence();
5156		*sysmaps->CMAP2 = 0;
5157		sched_unpin();
5158		mtx_unlock(&sysmaps->lock);
5159	} else
5160		pmap_invalidate_cache();
5161}
5162
5163/*
5164 * Changes the specified virtual address range's memory type to that given by
5165 * the parameter "mode".  The specified virtual address range must be
5166 * completely contained within the kernel map.
5167 *
5168 * Returns zero if the change completed successfully, and either EINVAL or
5169 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5170 * of the virtual address range was not mapped, and ENOMEM is returned if
5171 * there was insufficient memory available to complete the change.
5172 */
5173int
5174pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5175{
5176	vm_offset_t base, offset, tmpva;
5177	pd_entry_t *pde;
5178	pt_entry_t *pte;
5179	int cache_bits_pte, cache_bits_pde;
5180	boolean_t changed;
5181
5182	base = trunc_page(va);
5183	offset = va & PAGE_MASK;
5184	size = roundup(offset + size, PAGE_SIZE);
5185
5186	/*
5187	 * Only supported on kernel virtual addresses above the recursive map.
5188	 */
5189	if (base < VM_MIN_KERNEL_ADDRESS)
5190		return (EINVAL);
5191
5192	cache_bits_pde = pmap_cache_bits(mode, 1);
5193	cache_bits_pte = pmap_cache_bits(mode, 0);
5194	changed = FALSE;
5195
5196	/*
5197	 * Pages that aren't mapped aren't supported.  Also break down
5198	 * 2/4MB pages into 4KB pages if required.
5199	 */
5200	PMAP_LOCK(kernel_pmap);
5201	for (tmpva = base; tmpva < base + size; ) {
5202		pde = pmap_pde(kernel_pmap, tmpva);
5203		if (*pde == 0) {
5204			PMAP_UNLOCK(kernel_pmap);
5205			return (EINVAL);
5206		}
5207		if (*pde & PG_PS) {
5208			/*
5209			 * If the current 2/4MB page already has
5210			 * the required memory type, then we need not
5211			 * demote this page.  Just increment tmpva to
5212			 * the next 2/4MB page frame.
5213			 */
5214			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5215				tmpva = trunc_4mpage(tmpva) + NBPDR;
5216				continue;
5217			}
5218
5219			/*
5220			 * If the current offset aligns with a 2/4MB
5221			 * page frame and there is at least 2/4MB left
5222			 * within the range, then we need not break
5223			 * down this page into 4KB pages.
5224			 */
5225			if ((tmpva & PDRMASK) == 0 &&
5226			    tmpva + PDRMASK < base + size) {
5227				tmpva += NBPDR;
5228				continue;
5229			}
5230			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5231				PMAP_UNLOCK(kernel_pmap);
5232				return (ENOMEM);
5233			}
5234		}
5235		pte = vtopte(tmpva);
5236		if (*pte == 0) {
5237			PMAP_UNLOCK(kernel_pmap);
5238			return (EINVAL);
5239		}
5240		tmpva += PAGE_SIZE;
5241	}
5242	PMAP_UNLOCK(kernel_pmap);
5243
5244	/*
5245	 * Ok, all the pages exist, so run through them updating their
5246	 * cache mode if required.
5247	 */
5248	for (tmpva = base; tmpva < base + size; ) {
5249		pde = pmap_pde(kernel_pmap, tmpva);
5250		if (*pde & PG_PS) {
5251			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5252				pmap_pde_attr(pde, cache_bits_pde);
5253				changed = TRUE;
5254			}
5255			tmpva = trunc_4mpage(tmpva) + NBPDR;
5256		} else {
5257			pte = vtopte(tmpva);
5258			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5259				pmap_pte_attr(pte, cache_bits_pte);
5260				changed = TRUE;
5261			}
5262			tmpva += PAGE_SIZE;
5263		}
5264	}
5265
5266	/*
5267	 * Flush the CPU caches to make sure that no data remains cached
5268	 * with the old memory attributes.
5269	 */
5270	if (changed) {
5271		pmap_invalidate_range(kernel_pmap, base, tmpva);
5272		pmap_invalidate_cache_range(base, tmpva);
5273	}
5274	return (0);
5275}
5276
5277/*
5278 * perform the pmap work for mincore
5279 */
5280int
5281pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5282{
5283	pd_entry_t *pdep;
5284	pt_entry_t *ptep, pte;
5285	vm_paddr_t pa;
5286	int val;
5287
5288	PMAP_LOCK(pmap);
5289retry:
5290	pdep = pmap_pde(pmap, addr);
5291	if (*pdep != 0) {
5292		if (*pdep & PG_PS) {
5293			pte = *pdep;
5294			/* Compute the physical address of the 4KB page. */
5295			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5296			    PG_FRAME;
5297			val = MINCORE_SUPER;
5298		} else {
5299			ptep = pmap_pte(pmap, addr);
5300			pte = *ptep;
5301			pmap_pte_release(ptep);
5302			pa = pte & PG_FRAME;
5303			val = 0;
5304		}
5305	} else {
5306		pte = 0;
5307		pa = 0;
5308		val = 0;
5309	}
5310	if ((pte & PG_V) != 0) {
5311		val |= MINCORE_INCORE;
5312		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5313			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5314		if ((pte & PG_A) != 0)
5315			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5316	}
5317	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5318	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5319	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5320		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5321		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5322			goto retry;
5323	} else
5324		PA_UNLOCK_COND(*locked_pa);
5325	PMAP_UNLOCK(pmap);
5326	return (val);
5327}
5328
5329void
5330pmap_activate(struct thread *td)
5331{
5332	pmap_t	pmap, oldpmap;
5333	u_int	cpuid;
5334	u_int32_t  cr3;
5335
5336	critical_enter();
5337	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5338	oldpmap = PCPU_GET(curpmap);
5339	cpuid = PCPU_GET(cpuid);
5340#if defined(SMP)
5341	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5342	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5343#else
5344	CPU_CLR(cpuid, &oldpmap->pm_active);
5345	CPU_SET(cpuid, &pmap->pm_active);
5346#endif
5347#ifdef PAE
5348	cr3 = vtophys(pmap->pm_pdpt);
5349#else
5350	cr3 = vtophys(pmap->pm_pdir);
5351#endif
5352	/*
5353	 * pmap_activate is for the current thread on the current cpu
5354	 */
5355	td->td_pcb->pcb_cr3 = cr3;
5356	load_cr3(cr3);
5357	PCPU_SET(curpmap, pmap);
5358	critical_exit();
5359}
5360
5361void
5362pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5363{
5364}
5365
5366/*
5367 *	Increase the starting virtual address of the given mapping if a
5368 *	different alignment might result in more superpage mappings.
5369 */
5370void
5371pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5372    vm_offset_t *addr, vm_size_t size)
5373{
5374	vm_offset_t superpage_offset;
5375
5376	if (size < NBPDR)
5377		return;
5378	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5379		offset += ptoa(object->pg_color);
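	/*
	 * Give "*addr" the same offset within a superpage that the backing
	 * physical pages are expected to have, so that aligned chunks of the
	 * mapping can later be promoted.  If less than a full superpage
	 * would remain, leave "*addr" unchanged.
	 */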
5380	superpage_offset = offset & PDRMASK;
5381	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5382	    (*addr & PDRMASK) == superpage_offset)
5383		return;
5384	if ((*addr & PDRMASK) < superpage_offset)
5385		*addr = (*addr & ~PDRMASK) + superpage_offset;
5386	else
5387		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5388}
5389
5390
5391#if defined(PMAP_DEBUG)
5392int pmap_pid_dump(int pid)
5393{
5394	pmap_t pmap;
5395	struct proc *p;
5396	int npte = 0;
5397	int index;
5398
5399	sx_slock(&allproc_lock);
5400	FOREACH_PROC_IN_SYSTEM(p) {
5401		if (p->p_pid != pid)
5402			continue;
5403
5404		if (p->p_vmspace) {
5405			int i,j;
5406			index = 0;
5407			pmap = vmspace_pmap(p->p_vmspace);
5408			for (i = 0; i < NPDEPTD; i++) {
5409				pd_entry_t *pde;
5410				pt_entry_t *pte;
5411				vm_offset_t base = i << PDRSHIFT;
5412
5413				pde = &pmap->pm_pdir[i];
5414				if (pde && pmap_pde_v(pde)) {
5415					for (j = 0; j < NPTEPG; j++) {
5416						vm_offset_t va = base + (j << PAGE_SHIFT);
5417						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5418							if (index) {
5419								index = 0;
5420								printf("\n");
5421							}
5422							sx_sunlock(&allproc_lock);
5423							return (npte);
5424						}
5425						pte = pmap_pte(pmap, va);
5426						if (pte && pmap_pte_v(pte)) {
5427							pt_entry_t pa;
5428							vm_page_t m;
5429							pa = *pte;
5430							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5431							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5432								va, pa, m->hold_count, m->wire_count, m->flags);
5433							npte++;
5434							index++;
5435							if (index >= 2) {
5436								index = 0;
5437								printf("\n");
5438							} else {
5439								printf(" ");
5440							}
5441						}
5442					}
5443				}
5444			}
5445		}
5446	}
5447	sx_sunlock(&allproc_lock);
5448	return (npte);
5449}
5450#endif
5451
5452#if defined(DEBUG)
5453
5454static void	pads(pmap_t pm);
5455void		pmap_pvdump(vm_paddr_t pa);
5456
5457/* print address space of pmap*/
5458static void
5459pads(pmap_t pm)
5460{
5461	int i, j;
5462	vm_offset_t va;
5463	pt_entry_t *ptep;
5464
5465	if (pm == kernel_pmap)
5466		return;
5467	for (i = 0; i < NPDEPTD; i++)
5468		if (pm->pm_pdir[i])
5469			for (j = 0; j < NPTEPG; j++) {
5470				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5471				if (pm == kernel_pmap && va < KERNBASE)
5472					continue;
5473				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5474					continue;
5475				ptep = pmap_pte(pm, va);
5476				if (pmap_pte_v(ptep))
5477					printf("%x:%x ", va, *ptep);
5478			}
5479
5480}
5481
5482void
5483pmap_pvdump(vm_paddr_t pa)
5484{
5485	pv_entry_t pv;
5486	pmap_t pmap;
5487	vm_page_t m;
5488
5489	printf("pa %x", pa);
5490	m = PHYS_TO_VM_PAGE(pa);
5491	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5492		pmap = PV_PMAP(pv);
5493		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5494		pads(pmap);
5495	}
5496	printf(" ");
5497}
5498#endif
5499