1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD$");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	In addition to hardware address maps, this
84 *	module is called upon to provide software-use-only
85 *	maps which may or may not be stored in the same
86 *	form as hardware maps.  These pseudo-maps are
87 *	used to store intermediate results from copy
88 *	operations to and from address spaces.
89 *
90 *	Since the information managed by this module is
91 *	also stored by the logical address mapping module,
92 *	this module may throw away valid virtual-to-physical
93 *	mappings at almost any time.  However, invalidations
94 *	of virtual-to-physical mappings must be done as
95 *	requested.
96 *
97 *	In order to cope with hardware architectures which
98 *	make virtual-to-physical map invalidates expensive,
99 *	this module may delay invalidation or protection-
100 *	reduction operations until such time as they are
101 *	actually necessary.  This module is given full information
102 *	as to which processors are currently using which maps,
103 *	and as to when physical maps must be made correct.
104 */
105
106#include "opt_apic.h"
107#include "opt_cpu.h"
108#include "opt_pmap.h"
109#include "opt_smp.h"
110#include "opt_xbox.h"
111
112#include <sys/param.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/rwlock.h>
123#include <sys/sf_buf.h>
124#include <sys/sx.h>
125#include <sys/vmmeter.h>
126#include <sys/sched.h>
127#include <sys/sysctl.h>
128#ifdef SMP
129#include <sys/smp.h>
130#else
131#include <sys/cpuset.h>
132#endif
133
134#include <vm/vm.h>
135#include <vm/vm_param.h>
136#include <vm/vm_kern.h>
137#include <vm/vm_page.h>
138#include <vm/vm_map.h>
139#include <vm/vm_object.h>
140#include <vm/vm_extern.h>
141#include <vm/vm_pageout.h>
142#include <vm/vm_pager.h>
143#include <vm/vm_reserv.h>
144#include <vm/uma.h>
145
146#ifdef DEV_APIC
147#include <sys/bus.h>
148#include <machine/intr_machdep.h>
149#include <machine/apicvar.h>
150#endif
151#include <machine/cpu.h>
152#include <machine/cputypes.h>
153#include <machine/md_var.h>
154#include <machine/pcb.h>
155#include <machine/specialreg.h>
156#ifdef SMP
157#include <machine/smp.h>
158#endif
159
160#ifdef XBOX
161#include <machine/xbox.h>
162#endif
163
164#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
165#define CPU_ENABLE_SSE
166#endif
167
168#ifndef PMAP_SHPGPERPROC
169#define PMAP_SHPGPERPROC 200
170#endif
171
172#if !defined(DIAGNOSTIC)
173#ifdef __GNUC_GNU_INLINE__
174#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
175#else
176#define PMAP_INLINE	extern inline
177#endif
178#else
179#define PMAP_INLINE
180#endif
181
182#ifdef PV_STATS
183#define PV_STAT(x)	do { x ; } while (0)
184#else
185#define PV_STAT(x)	do { } while (0)
186#endif
187
188#define	pa_index(pa)	((pa) >> PDRSHIFT)
189#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
190
191/*
192 * Get PDEs and PTEs for user/kernel address space
193 */
194#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
195#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
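/*
 * Example (illustrative only, assuming a non-PAE configuration where
 * PDRSHIFT is 22): pmap_pde(pmap, 0xbfc01234) indexes pm_pdir with
 * 0xbfc01234 >> 22 = 0x2ff, and the low 22 bits of the address select
 * the page table entry and byte offset within that 4MB region.
 */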
196
197#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
198#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
199#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
200#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
201#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
202
203#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
204    atomic_clear_int((u_int *)(pte), PG_W))
205#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
206
207struct pmap kernel_pmap_store;
208LIST_HEAD(pmaplist, pmap);
209static struct pmaplist allpmaps;
210static struct mtx allpmaps_lock;
211
212vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
213vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
214int pgeflag = 0;		/* PG_G or-in */
215int pseflag = 0;		/* PG_PS or-in */
216
217static int nkpt = NKPT;
218vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
219extern u_int32_t KERNend;
220extern u_int32_t KPTphys;
221
222#ifdef PAE
223pt_entry_t pg_nx;
224static uma_zone_t pdptzone;
225#endif
226
227static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
228
229static int pat_works = 1;
230SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
231    "Is page attribute table fully functional?");
232
233static int pg_ps_enabled = 1;
234SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
235    "Are large page mappings enabled?");
236
237#define	PAT_INDEX_SIZE	8
238static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
239
240/*
241 * Isolate the global pv list lock from data and other locks to prevent false
242 * sharing within the cache.
243 */
244static struct {
245	struct rwlock	lock;
246	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
247} pvh_global __aligned(CACHE_LINE_SIZE);
248
249#define	pvh_global_lock	pvh_global.lock
250
251/*
252 * Data for the pv entry allocation mechanism
253 */
254static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
255static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
256static struct md_page *pv_table;
257static int shpgperproc = PMAP_SHPGPERPROC;
258
259struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
260int pv_maxchunks;			/* How many chunks we have KVA for */
261vm_offset_t pv_vafree;			/* freelist stored in the PTE */
262
263/*
264 * All those kernel PT submaps that BSD is so fond of
265 */
266struct sysmaps {
267	struct	mtx lock;
268	pt_entry_t *CMAP1;
269	pt_entry_t *CMAP2;
270	caddr_t	CADDR1;
271	caddr_t	CADDR2;
272};
273static struct sysmaps sysmaps_pcpu[MAXCPU];
274pt_entry_t *CMAP1 = 0;
275static pt_entry_t *CMAP3;
276static pd_entry_t *KPTD;
277caddr_t CADDR1 = 0, ptvmmap = 0;
278static caddr_t CADDR3;
279struct msgbuf *msgbufp = 0;
280
281/*
282 * Crashdump maps.
283 */
284static caddr_t crashdumpmap;
285
286static pt_entry_t *PMAP1 = 0, *PMAP2;
287static pt_entry_t *PADDR1 = 0, *PADDR2;
288#ifdef SMP
289static int PMAP1cpu;
290static int PMAP1changedcpu;
291SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
292	   &PMAP1changedcpu, 0,
293	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
294#endif
295static int PMAP1changed;
296SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
297	   &PMAP1changed, 0,
298	   "Number of times pmap_pte_quick changed PMAP1");
299static int PMAP1unchanged;
300SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
301	   &PMAP1unchanged, 0,
302	   "Number of times pmap_pte_quick didn't change PMAP1");
303static struct mtx PMAP2mutex;
304
305static void	free_pv_chunk(struct pv_chunk *pc);
306static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
307static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
308static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
309static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
310static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
311static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
312static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
313		    vm_offset_t va);
314static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
315
316static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
317static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
318    vm_prot_t prot);
319static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
320    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
321static void pmap_flush_page(vm_page_t m);
322static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
323static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
324static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
325static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
326static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
327static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
328static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
329static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
330static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
331static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
332    vm_prot_t prot);
333static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
334static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
335    vm_page_t *free);
336static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
337    vm_page_t *free);
338static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
339static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
340    vm_page_t *free);
341static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
342					vm_offset_t va);
343static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
344static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
345    vm_page_t m);
346static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
347    pd_entry_t newpde);
348static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
349
350static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
351
352static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
353static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
354static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
355static void pmap_pte_release(pt_entry_t *pte);
356static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
357#ifdef PAE
358static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
359#endif
360static void pmap_set_pg(void);
361
362static __inline void pagezero(void *page);
363
364CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
365CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
366
367/*
368 * If you get an error here, then you set KVA_PAGES wrong! See the
369 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
370 *	a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
371 */
372CTASSERT(KERNBASE % (1 << 24) == 0);
373
374/*
375 *	Bootstrap the system enough to run with virtual memory.
376 *
377 *	On the i386 this is called after mapping has already been enabled
378 *	and just syncs the pmap module with what has already been done.
379 *	[We can't call it easily with mapping off since the kernel is not
380 *	mapped with PA == VA, hence we would have to relocate every address
381 *	from the linked base (virtual) address "KERNBASE" to the actual
382 *	(physical) address starting relative to 0]
383 */
384void
385pmap_bootstrap(vm_paddr_t firstaddr)
386{
387	vm_offset_t va;
388	pt_entry_t *pte, *unused;
389	struct sysmaps *sysmaps;
390	int i;
391
392	/*
393	 * Initialize the first available kernel virtual address.  However,
394	 * using "firstaddr" may waste a few pages of the kernel virtual
395	 * address space, because locore may not have mapped every physical
396	 * page that it allocated.  Preferably, locore would provide a first
397	 * unused virtual address in addition to "firstaddr".
398	 */
399	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
400
401	virtual_end = VM_MAX_KERNEL_ADDRESS;
402
403	/*
404	 * Initialize the kernel pmap (which is statically allocated).
405	 */
406	PMAP_LOCK_INIT(kernel_pmap);
407	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
408#ifdef PAE
409	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
410#endif
411	kernel_pmap->pm_root = NULL;
412	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
413	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
414
415 	/*
416	 * Initialize the global pv list lock.
417	 */
418	rw_init(&pvh_global_lock, "pmap pv global");
419
420	LIST_INIT(&allpmaps);
421
422	/*
423	 * Request a spin mutex so that changes to allpmaps cannot be
424	 * preempted by smp_rendezvous_cpus().  Otherwise,
425	 * pmap_update_pde_kernel() could access allpmaps while it is
426	 * being changed.
427	 */
428	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
429	mtx_lock_spin(&allpmaps_lock);
430	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
431	mtx_unlock_spin(&allpmaps_lock);
432
433	/*
434	 * Reserve some special page table entries/VA space for temporary
435	 * mapping of pages.
436	 */
437#define	SYSMAP(c, p, v, n)	\
438	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
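/*
 * For reference, a minimal sketch of what one SYSMAP() use expands to
 * (illustrative only):
 *
 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 * becomes
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 *
 * Each invocation carves "n" pages of KVA out of "va" and hands back both
 * the virtual address and a pointer to the corresponding PTE slot(s).
 */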
439
440	va = virtual_avail;
441	pte = vtopte(va);
442
443	/*
444	 * CMAP1/CMAP2 are used for zeroing and copying pages.
445	 * CMAP3 is used for the idle process page zeroing.
446	 */
447	for (i = 0; i < MAXCPU; i++) {
448		sysmaps = &sysmaps_pcpu[i];
449		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
450		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
451		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
452	}
453	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
454	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
455
456	/*
457	 * Crashdump maps.
458	 */
459	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
460
461	/*
462	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
463	 */
464	SYSMAP(caddr_t, unused, ptvmmap, 1)
465
466	/*
467	 * msgbufp is used to map the system message buffer.
468	 */
469	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
470
471	/*
472	 * KPTmap is used by pmap_kextract().
473	 *
474	 * KPTmap is first initialized by locore.  However, that initial
475	 * KPTmap can only support NKPT page table pages.  Here, a larger
476	 * KPTmap is created that can support KVA_PAGES page table pages.
477	 */
478	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
479
480	for (i = 0; i < NKPT; i++)
481		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
482
483	/*
484	 * Adjust the start of the KPTD and KPTmap so that the implementation
485	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
486	 */
487	KPTD -= KPTDI;
488	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
489
490	/*
491	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
492	 * respectively.
493	 */
494	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
495	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
496
497	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
498
499	virtual_avail = va;
500
501	/*
502	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
503	 * physical memory region that is used by the ACPI wakeup code.  This
504	 * mapping must not have PG_G set.
505	 */
506#ifdef XBOX
507	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
508	 * an early stage, we cannot yet neatly map video memory ... :-(
509	 * Better fixes are very welcome! */
510	if (!arch_i386_is_xbox)
511#endif
512	for (i = 1; i < NKPT; i++)
513		PTD[i] = 0;
514
515	/* Initialize the PAT MSR if present. */
516	pmap_init_pat();
517
518	/* Turn on PG_G on kernel page(s) */
519	pmap_set_pg();
520}
521
522/*
523 * Setup the PAT MSR.
524 */
525void
526pmap_init_pat(void)
527{
528	int pat_table[PAT_INDEX_SIZE];
529	uint64_t pat_msr;
530	u_long cr0, cr4;
531	int i;
532
533	/* Set default PAT index table. */
534	for (i = 0; i < PAT_INDEX_SIZE; i++)
535		pat_table[i] = -1;
536	pat_table[PAT_WRITE_BACK] = 0;
537	pat_table[PAT_WRITE_THROUGH] = 1;
538	pat_table[PAT_UNCACHEABLE] = 3;
539	pat_table[PAT_WRITE_COMBINING] = 3;
540	pat_table[PAT_WRITE_PROTECTED] = 3;
541	pat_table[PAT_UNCACHED] = 3;
542
543	/* Bail if this CPU doesn't implement PAT. */
544	if ((cpu_feature & CPUID_PAT) == 0) {
545		for (i = 0; i < PAT_INDEX_SIZE; i++)
546			pat_index[i] = pat_table[i];
547		pat_works = 0;
548		return;
549	}
550
551	/*
552	 * Due to some Intel errata, we can only safely use the lower 4
553	 * PAT entries.
554	 *
555	 *   Intel Pentium III Processor Specification Update
556	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
557	 * or Mode C Paging)
558	 *
559	 *   Intel Pentium IV  Processor Specification Update
560	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
561	 */
562	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
563	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
564		pat_works = 0;
565
566	/* Initialize default PAT entries. */
567	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
568	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
569	    PAT_VALUE(2, PAT_UNCACHED) |
570	    PAT_VALUE(3, PAT_UNCACHEABLE) |
571	    PAT_VALUE(4, PAT_WRITE_BACK) |
572	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
573	    PAT_VALUE(6, PAT_UNCACHED) |
574	    PAT_VALUE(7, PAT_UNCACHEABLE);
575
576	if (pat_works) {
577		/*
578		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
579		 * Program 5 and 6 as WP and WC.
580		 * Leave 4 and 7 as WB and UC.
581		 */
582		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
583		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
584		    PAT_VALUE(6, PAT_WRITE_COMBINING);
585		pat_table[PAT_UNCACHED] = 2;
586		pat_table[PAT_WRITE_PROTECTED] = 5;
587		pat_table[PAT_WRITE_COMBINING] = 6;
588	} else {
589		/*
590		 * Just replace PAT Index 2 with WC instead of UC-.
591		 */
592		pat_msr &= ~PAT_MASK(2);
593		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
594		pat_table[PAT_WRITE_COMBINING] = 2;
595	}
596
597	/* Disable PGE. */
598	cr4 = rcr4();
599	load_cr4(cr4 & ~CR4_PGE);
600
601	/* Disable caches (CD = 1, NW = 0). */
602	cr0 = rcr0();
603	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
604
605	/* Flushes caches and TLBs. */
606	wbinvd();
607	invltlb();
608
609	/* Update PAT and index table. */
610	wrmsr(MSR_PAT, pat_msr);
611	for (i = 0; i < PAT_INDEX_SIZE; i++)
612		pat_index[i] = pat_table[i];
613
614	/* Flush caches and TLBs again. */
615	wbinvd();
616	invltlb();
617
618	/* Restore caches and PGE. */
619	load_cr0(cr0);
620	load_cr4(cr4);
621}
622
623/*
624 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
625 */
626static void
627pmap_set_pg(void)
628{
629	pt_entry_t *pte;
630	vm_offset_t va, endva;
631
632	if (pgeflag == 0)
633		return;
634
635	endva = KERNBASE + KERNend;
636
637	if (pseflag) {
638		va = KERNBASE + KERNLOAD;
639		while (va < endva) {
640			pdir_pde(PTD, va) |= pgeflag;
641			invltlb();	/* Play it safe, invltlb() every time */
642			va += NBPDR;
643		}
644	} else {
645		va = (vm_offset_t)btext;
646		while (va < endva) {
647			pte = vtopte(va);
648			if (*pte)
649				*pte |= pgeflag;
650			invltlb();	/* Play it safe, invltlb() every time */
651			va += PAGE_SIZE;
652		}
653	}
654}
655
656/*
657 * Initialize a vm_page's machine-dependent fields.
658 */
659void
660pmap_page_init(vm_page_t m)
661{
662
663	TAILQ_INIT(&m->md.pv_list);
664	m->md.pat_mode = PAT_WRITE_BACK;
665}
666
667#ifdef PAE
668static void *
669pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
670{
671
672	/* Inform UMA that this allocator uses kernel_map/object. */
673	*flags = UMA_SLAB_KERNEL;
674	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
675	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
676}
677#endif
678
679/*
680 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
681 * Requirements:
682 *  - Must deal with pages in order to ensure that none of the PG_* bits
683 *    are ever set, PG_V in particular.
684 *  - Assumes we can write to ptes without pte_store() atomic ops, even
685 *    on PAE systems.  This should be ok.
686 *  - Assumes nothing will ever test these addresses for 0 to indicate
687 *    no mapping instead of correctly checking PG_V.
688 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
689 * Because PG_V is never set, there can be no mappings to invalidate.
690 */
691static vm_offset_t
692pmap_ptelist_alloc(vm_offset_t *head)
693{
694	pt_entry_t *pte;
695	vm_offset_t va;
696
697	va = *head;
698	if (va == 0)
699		return (va);	/* Out of memory */
700	pte = vtopte(va);
701	*head = *pte;
702	if (*head & PG_V)
703		panic("pmap_ptelist_alloc: va with PG_V set!");
704	*pte = 0;
705	return (va);
706}
707
708static void
709pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
710{
711	pt_entry_t *pte;
712
713	if (va & PG_V)
714		panic("pmap_ptelist_free: freeing va with PG_V set!");
715	pte = vtopte(va);
716	*pte = *head;		/* virtual! PG_V is 0 though */
717	*head = va;
718}
719
720static void
721pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
722{
723	int i;
724	vm_offset_t va;
725
726	*head = 0;
727	for (i = npages - 1; i >= 0; i--) {
728		va = (vm_offset_t)base + i * PAGE_SIZE;
729		pmap_ptelist_free(head, va);
730	}
731}
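/*
 * Usage sketch (illustrative only): pmap_init() below seeds the pv
 * entry KVA freelist this way, and the pv chunk allocator later draws
 * single pages of KVA from it:
 *
 *	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 *	va = pmap_ptelist_alloc(&pv_vafree);	take one page of KVA
 *	pmap_ptelist_free(&pv_vafree, va);	return it to the freelist
 *
 * The freelist links live in the (invalid) PTEs that back the KVA range,
 * so no separate bookkeeping memory is required.
 */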
732
733
734/*
735 *	Initialize the pmap module.
736 *	Called by vm_init, to initialize any structures that the pmap
737 *	system needs to map virtual memory.
738 */
739void
740pmap_init(void)
741{
742	vm_page_t mpte;
743	vm_size_t s;
744	int i, pv_npg;
745
746	/*
747	 * Initialize the vm page array entries for the kernel pmap's
748	 * page table pages.
749	 */
750	for (i = 0; i < NKPT; i++) {
751		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
752		KASSERT(mpte >= vm_page_array &&
753		    mpte < &vm_page_array[vm_page_array_size],
754		    ("pmap_init: page table page is out of range"));
755		mpte->pindex = i + KPTDI;
756		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
757	}
758
759	/*
760	 * Initialize the address space (zone) for the pv entries.  Set a
761	 * high water mark so that the system can recover from excessive
762	 * numbers of pv entries.
763	 */
764	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
765	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
766	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
767	pv_entry_max = roundup(pv_entry_max, _NPCPV);
768	pv_entry_high_water = 9 * (pv_entry_max / 10);
769
770	/*
771	 * If the kernel is running on a virtual machine, then it must assume
772	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
773	 * be prepared for the hypervisor changing the vendor and family that
774	 * are reported by CPUID.  Consequently, the workaround for AMD Family
775	 * 10h Erratum 383 is enabled if the processor's feature set does not
776	 * include at least one feature that is only supported by older Intel
777	 * or newer AMD processors.
778	 */
779	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
780	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
781	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
782	    AMDID2_FMA4)) == 0)
783		workaround_erratum383 = 1;
784
785	/*
786	 * Are large page mappings supported and enabled?
787	 */
788	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
789	if (pseflag == 0)
790		pg_ps_enabled = 0;
791	else if (pg_ps_enabled) {
792		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
793		    ("pmap_init: can't assign to pagesizes[1]"));
794		pagesizes[1] = NBPDR;
795	}
796
797	/*
798	 * Calculate the size of the pv head table for superpages.
799	 */
800	for (i = 0; phys_avail[i + 1]; i += 2);
801	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
802
803	/*
804	 * Allocate memory for the pv head table for superpages.
805	 */
806	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
807	s = round_page(s);
808	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
809	for (i = 0; i < pv_npg; i++)
810		TAILQ_INIT(&pv_table[i].pv_list);
811
812	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
813	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
814	    PAGE_SIZE * pv_maxchunks);
815	if (pv_chunkbase == NULL)
816		panic("pmap_init: not enough kvm for pv chunks");
817	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
818#ifdef PAE
819	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
820	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
821	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
822	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
823#endif
824}
825
826
827SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
828	"Max number of PV entries");
829SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
830	"Page share factor per proc");
831
832static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
833    "2/4MB page mapping counters");
834
835static u_long pmap_pde_demotions;
836SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
837    &pmap_pde_demotions, 0, "2/4MB page demotions");
838
839static u_long pmap_pde_mappings;
840SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
841    &pmap_pde_mappings, 0, "2/4MB page mappings");
842
843static u_long pmap_pde_p_failures;
844SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
845    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
846
847static u_long pmap_pde_promotions;
848SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
849    &pmap_pde_promotions, 0, "2/4MB page promotions");
850
851/***************************************************
852 * Low level helper routines.....
853 ***************************************************/
854
855/*
856 * Determine the appropriate bits to set in a PTE or PDE for a specified
857 * caching mode.
858 */
859int
860pmap_cache_bits(int mode, boolean_t is_pde)
861{
862	int cache_bits, pat_flag, pat_idx;
863
864	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
865		panic("Unknown caching mode %d\n", mode);
866
867	/* The PAT bit is different for PTE's and PDE's. */
868	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
869
870	/* Map the caching mode to a PAT index. */
871	pat_idx = pat_index[mode];
872
873	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
874	cache_bits = 0;
875	if (pat_idx & 0x4)
876		cache_bits |= pat_flag;
877	if (pat_idx & 0x2)
878		cache_bits |= PG_NC_PCD;
879	if (pat_idx & 0x1)
880		cache_bits |= PG_NC_PWT;
881	return (cache_bits);
882}
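/*
 * A typical caller (see pmap_kenter_attr() below) simply ORs the result
 * into the new entry, for example (illustrative):
 *
 *	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
 */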
883
884/*
885 * The caller is responsible for maintaining TLB consistency.
886 */
887static void
888pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
889{
890	pd_entry_t *pde;
891	pmap_t pmap;
892	boolean_t PTD_updated;
893
894	PTD_updated = FALSE;
895	mtx_lock_spin(&allpmaps_lock);
896	LIST_FOREACH(pmap, &allpmaps, pm_list) {
897		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
898		    PG_FRAME))
899			PTD_updated = TRUE;
900		pde = pmap_pde(pmap, va);
901		pde_store(pde, newpde);
902	}
903	mtx_unlock_spin(&allpmaps_lock);
904	KASSERT(PTD_updated,
905	    ("pmap_kenter_pde: current page table is not in allpmaps"));
906}
907
908/*
909 * After changing the page size for the specified virtual address in the page
910 * table, flush the corresponding entries from the processor's TLB.  Only the
911 * calling processor's TLB is affected.
912 *
913 * The calling thread must be pinned to a processor.
914 */
915static void
916pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
917{
918	u_long cr4;
919
920	if ((newpde & PG_PS) == 0)
921		/* Demotion: flush a specific 2/4MB page mapping. */
922		invlpg(va);
923	else if ((newpde & PG_G) == 0)
924		/*
925		 * Promotion: flush every 4KB page mapping from the TLB
926		 * because there are too many to flush individually.
927		 */
928		invltlb();
929	else {
930		/*
931		 * Promotion: flush every 4KB page mapping from the TLB,
932		 * including any global (PG_G) mappings.
933		 */
934		cr4 = rcr4();
935		load_cr4(cr4 & ~CR4_PGE);
936		/*
937		 * Although preemption at this point could be detrimental to
938		 * performance, it would not lead to an error.  PG_G is simply
939		 * ignored if CR4.PGE is clear.  Moreover, in case this block
940		 * is re-entered, the load_cr4() either above or below will
941		 * modify CR4.PGE flushing the TLB.
942		 */
943		load_cr4(cr4 | CR4_PGE);
944	}
945}
946#ifdef SMP
947/*
948 * For SMP, these functions have to use the IPI mechanism for coherence.
949 *
950 * N.B.: Before calling any of the following TLB invalidation functions,
951 * the calling processor must ensure that all stores updating a non-
952 * kernel page table are globally performed.  Otherwise, another
953 * processor could cache an old, pre-update entry without being
954 * invalidated.  This can happen in one of two ways: (1) The pmap becomes
955 * active on another processor after its pm_active field is checked by
956 * one of the following functions but before a store updating the page
957 * table is globally performed. (2) The pmap becomes active on another
958 * processor before its pm_active field is checked but, due to
959 * speculative loads, one of the following functions still reads the
960 * pmap as inactive on the other processor.
961 *
962 * The kernel page table is exempt because its pm_active field is
963 * immutable.  The kernel page table is always active on every
964 * processor.
965 */
966void
967pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
968{
969	cpuset_t other_cpus;
970	u_int cpuid;
971
972	sched_pin();
973	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
974		invlpg(va);
975		smp_invlpg(va);
976	} else {
977		cpuid = PCPU_GET(cpuid);
978		other_cpus = all_cpus;
979		CPU_CLR(cpuid, &other_cpus);
980		if (CPU_ISSET(cpuid, &pmap->pm_active))
981			invlpg(va);
982		CPU_AND(&other_cpus, &pmap->pm_active);
983		if (!CPU_EMPTY(&other_cpus))
984			smp_masked_invlpg(other_cpus, va);
985	}
986	sched_unpin();
987}
988
989void
990pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
991{
992	cpuset_t other_cpus;
993	vm_offset_t addr;
994	u_int cpuid;
995
996	sched_pin();
997	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
998		for (addr = sva; addr < eva; addr += PAGE_SIZE)
999			invlpg(addr);
1000		smp_invlpg_range(sva, eva);
1001	} else {
1002		cpuid = PCPU_GET(cpuid);
1003		other_cpus = all_cpus;
1004		CPU_CLR(cpuid, &other_cpus);
1005		if (CPU_ISSET(cpuid, &pmap->pm_active))
1006			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1007				invlpg(addr);
1008		CPU_AND(&other_cpus, &pmap->pm_active);
1009		if (!CPU_EMPTY(&other_cpus))
1010			smp_masked_invlpg_range(other_cpus, sva, eva);
1011	}
1012	sched_unpin();
1013}
1014
1015void
1016pmap_invalidate_all(pmap_t pmap)
1017{
1018	cpuset_t other_cpus;
1019	u_int cpuid;
1020
1021	sched_pin();
1022	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1023		invltlb();
1024		smp_invltlb();
1025	} else {
1026		cpuid = PCPU_GET(cpuid);
1027		other_cpus = all_cpus;
1028		CPU_CLR(cpuid, &other_cpus);
1029		if (CPU_ISSET(cpuid, &pmap->pm_active))
1030			invltlb();
1031		CPU_AND(&other_cpus, &pmap->pm_active);
1032		if (!CPU_EMPTY(&other_cpus))
1033			smp_masked_invltlb(other_cpus);
1034	}
1035	sched_unpin();
1036}
1037
1038void
1039pmap_invalidate_cache(void)
1040{
1041
1042	sched_pin();
1043	wbinvd();
1044	smp_cache_flush();
1045	sched_unpin();
1046}
1047
1048struct pde_action {
1049	cpuset_t invalidate;	/* processors that invalidate their TLB */
1050	vm_offset_t va;
1051	pd_entry_t *pde;
1052	pd_entry_t newpde;
1053	u_int store;		/* processor that updates the PDE */
1054};
1055
1056static void
1057pmap_update_pde_kernel(void *arg)
1058{
1059	struct pde_action *act = arg;
1060	pd_entry_t *pde;
1061	pmap_t pmap;
1062
1063	if (act->store == PCPU_GET(cpuid)) {
1064
1065		/*
1066		 * Elsewhere, this operation requires allpmaps_lock for
1067		 * synchronization.  Here, it does not because it is being
1068		 * performed in the context of an all_cpus rendezvous.
1069		 */
1070		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1071			pde = pmap_pde(pmap, act->va);
1072			pde_store(pde, act->newpde);
1073		}
1074	}
1075}
1076
1077static void
1078pmap_update_pde_user(void *arg)
1079{
1080	struct pde_action *act = arg;
1081
1082	if (act->store == PCPU_GET(cpuid))
1083		pde_store(act->pde, act->newpde);
1084}
1085
1086static void
1087pmap_update_pde_teardown(void *arg)
1088{
1089	struct pde_action *act = arg;
1090
1091	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1092		pmap_update_pde_invalidate(act->va, act->newpde);
1093}
1094
1095/*
1096 * Change the page size for the specified virtual address in a way that
1097 * prevents any possibility of the TLB ever having two entries that map the
1098 * same virtual address using different page sizes.  This is the recommended
1099 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1100 * machine check exception for a TLB state that is improperly diagnosed as a
1101 * hardware error.
1102 */
1103static void
1104pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1105{
1106	struct pde_action act;
1107	cpuset_t active, other_cpus;
1108	u_int cpuid;
1109
1110	sched_pin();
1111	cpuid = PCPU_GET(cpuid);
1112	other_cpus = all_cpus;
1113	CPU_CLR(cpuid, &other_cpus);
1114	if (pmap == kernel_pmap)
1115		active = all_cpus;
1116	else
1117		active = pmap->pm_active;
1118	if (CPU_OVERLAP(&active, &other_cpus)) {
1119		act.store = cpuid;
1120		act.invalidate = active;
1121		act.va = va;
1122		act.pde = pde;
1123		act.newpde = newpde;
1124		CPU_SET(cpuid, &active);
1125		smp_rendezvous_cpus(active,
1126		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1127		    pmap_update_pde_kernel : pmap_update_pde_user,
1128		    pmap_update_pde_teardown, &act);
1129	} else {
1130		if (pmap == kernel_pmap)
1131			pmap_kenter_pde(va, newpde);
1132		else
1133			pde_store(pde, newpde);
1134		if (CPU_ISSET(cpuid, &active))
1135			pmap_update_pde_invalidate(va, newpde);
1136	}
1137	sched_unpin();
1138}
1139#else /* !SMP */
1140/*
1141 * Normal, non-SMP, 486+ invalidation functions.
1142 * We inline these within pmap.c for speed.
1143 */
1144PMAP_INLINE void
1145pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1146{
1147
1148	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1149		invlpg(va);
1150}
1151
1152PMAP_INLINE void
1153pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1154{
1155	vm_offset_t addr;
1156
1157	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1158		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1159			invlpg(addr);
1160}
1161
1162PMAP_INLINE void
1163pmap_invalidate_all(pmap_t pmap)
1164{
1165
1166	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1167		invltlb();
1168}
1169
1170PMAP_INLINE void
1171pmap_invalidate_cache(void)
1172{
1173
1174	wbinvd();
1175}
1176
1177static void
1178pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1179{
1180
1181	if (pmap == kernel_pmap)
1182		pmap_kenter_pde(va, newpde);
1183	else
1184		pde_store(pde, newpde);
1185	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1186		pmap_update_pde_invalidate(va, newpde);
1187}
1188#endif /* !SMP */
1189
1190#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1191
1192void
1193pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1194{
1195
1196	KASSERT((sva & PAGE_MASK) == 0,
1197	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1198	KASSERT((eva & PAGE_MASK) == 0,
1199	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1200
1201	if (cpu_feature & CPUID_SS)
1202		; /* If "Self Snoop" is supported, do nothing. */
1203	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1204	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1205
1206#ifdef DEV_APIC
1207		/*
1208		 * XXX: Some CPUs fault, hang, or trash the local APIC
1209		 * registers if we use CLFLUSH on the local APIC
1210		 * range.  The local APIC is always uncached, so we
1211		 * don't need to flush for that range anyway.
1212		 */
1213		if (pmap_kextract(sva) == lapic_paddr)
1214			return;
1215#endif
1216		/*
1217		 * Otherwise, do per-cache line flush.  Use the mfence
1218		 * instruction to ensure that previous stores are
1219		 * included in the write-back.  The processor
1220		 * propagates the flush to other processors in the cache
1221		 * coherence domain.
1222		 */
1223		mfence();
1224		for (; sva < eva; sva += cpu_clflush_line_size)
1225			clflush(sva);
1226		mfence();
1227	} else {
1228
1229		/*
1230		 * No targeted cache flush methods are supported by the CPU,
1231		 * or the supplied range is bigger than 2MB.
1232		 * Globally invalidate the cache.
1233		 */
1234		pmap_invalidate_cache();
1235	}
1236}
1237
1238void
1239pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1240{
1241	int i;
1242
1243	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1244	    (cpu_feature & CPUID_CLFSH) == 0) {
1245		pmap_invalidate_cache();
1246	} else {
1247		for (i = 0; i < count; i++)
1248			pmap_flush_page(pages[i]);
1249	}
1250}
1251
1252/*
1253 * Are we current address space or kernel?  N.B. We return FALSE when
1254 * a pmap's page table is in use because a kernel thread is borrowing
1255 * it.  The borrowed page table can change spontaneously, making any
1256 * dependence on its continued use subject to a race condition.
1257 */
1258static __inline int
1259pmap_is_current(pmap_t pmap)
1260{
1261
1262	return (pmap == kernel_pmap ||
1263	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1264	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1265}
1266
1267/*
1268 * If the given pmap is not the current or kernel pmap, the returned pte must
1269 * be released by passing it to pmap_pte_release().
1270 */
1271pt_entry_t *
1272pmap_pte(pmap_t pmap, vm_offset_t va)
1273{
1274	pd_entry_t newpf;
1275	pd_entry_t *pde;
1276
1277	pde = pmap_pde(pmap, va);
1278	if (*pde & PG_PS)
1279		return (pde);
1280	if (*pde != 0) {
1281		/* are we current address space or kernel? */
1282		if (pmap_is_current(pmap))
1283			return (vtopte(va));
1284		mtx_lock(&PMAP2mutex);
1285		newpf = *pde & PG_FRAME;
1286		if ((*PMAP2 & PG_FRAME) != newpf) {
1287			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1288			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1289		}
1290		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1291	}
1292	return (NULL);
1293}
1294
1295/*
1296 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1297 * being NULL.
1298 */
1299static __inline void
1300pmap_pte_release(pt_entry_t *pte)
1301{
1302
1303	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1304		mtx_unlock(&PMAP2mutex);
1305}
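/*
 * Typical usage of the pair, once the PDE is known to be valid
 * (illustrative; pmap_extract() below follows this pattern):
 *
 *	pte = pmap_pte(pmap, va);
 *	val = *pte;
 *	pmap_pte_release(pte);
 *
 * The release is a no-op when the pte came from vtopte(), and it drops
 * PMAP2mutex when PMAP2/PADDR2 was borrowed for a non-current pmap.
 */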
1306
1307static __inline void
1308invlcaddr(void *caddr)
1309{
1310
1311	invlpg((u_int)caddr);
1312}
1313
1314/*
1315 * Super fast pmap_pte routine best used when scanning
1316 * the pv lists.  This eliminates many coarse-grained
1317 * invltlb calls.  Note that many of the pv list
1318 * scans are across different pmaps.  It is very wasteful
1319 * to do an entire invltlb for checking a single mapping.
1320 *
1321 * If the given pmap is not the current pmap, pvh_global_lock
1322 * must be held and curthread pinned to a CPU.
1323 */
1324static pt_entry_t *
1325pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1326{
1327	pd_entry_t newpf;
1328	pd_entry_t *pde;
1329
1330	pde = pmap_pde(pmap, va);
1331	if (*pde & PG_PS)
1332		return (pde);
1333	if (*pde != 0) {
1334		/* are we current address space or kernel? */
1335		if (pmap_is_current(pmap))
1336			return (vtopte(va));
1337		rw_assert(&pvh_global_lock, RA_WLOCKED);
1338		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1339		newpf = *pde & PG_FRAME;
1340		if ((*PMAP1 & PG_FRAME) != newpf) {
1341			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1342#ifdef SMP
1343			PMAP1cpu = PCPU_GET(cpuid);
1344#endif
1345			invlcaddr(PADDR1);
1346			PMAP1changed++;
1347		} else
1348#ifdef SMP
1349		if (PMAP1cpu != PCPU_GET(cpuid)) {
1350			PMAP1cpu = PCPU_GET(cpuid);
1351			invlcaddr(PADDR1);
1352			PMAP1changedcpu++;
1353		} else
1354#endif
1355			PMAP1unchanged++;
1356		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1357	}
1358	return (0);
1359}
1360
1361/*
1362 *	Routine:	pmap_extract
1363 *	Function:
1364 *		Extract the physical page address associated
1365 *		with the given map/virtual_address pair.
1366 */
1367vm_paddr_t
1368pmap_extract(pmap_t pmap, vm_offset_t va)
1369{
1370	vm_paddr_t rtval;
1371	pt_entry_t *pte;
1372	pd_entry_t pde;
1373
1374	rtval = 0;
1375	PMAP_LOCK(pmap);
1376	pde = pmap->pm_pdir[va >> PDRSHIFT];
1377	if (pde != 0) {
1378		if ((pde & PG_PS) != 0)
1379			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1380		else {
1381			pte = pmap_pte(pmap, va);
1382			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1383			pmap_pte_release(pte);
1384		}
1385	}
1386	PMAP_UNLOCK(pmap);
1387	return (rtval);
1388}
1389
1390/*
1391 *	Routine:	pmap_extract_and_hold
1392 *	Function:
1393 *		Atomically extract and hold the physical page
1394 *		with the given pmap and virtual address pair
1395 *		if that mapping permits the given protection.
1396 */
1397vm_page_t
1398pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1399{
1400	pd_entry_t pde;
1401	pt_entry_t pte, *ptep;
1402	vm_page_t m;
1403	vm_paddr_t pa;
1404
1405	pa = 0;
1406	m = NULL;
1407	PMAP_LOCK(pmap);
1408retry:
1409	pde = *pmap_pde(pmap, va);
1410	if (pde != 0) {
1411		if (pde & PG_PS) {
1412			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1413				if (vm_page_pa_tryrelock(pmap, (pde &
1414				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1415					goto retry;
1416				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1417				    (va & PDRMASK));
1418				vm_page_hold(m);
1419			}
1420		} else {
1421			ptep = pmap_pte(pmap, va);
1422			pte = *ptep;
1423			pmap_pte_release(ptep);
1424			if (pte != 0 &&
1425			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1426				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1427				    &pa))
1428					goto retry;
1429				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1430				vm_page_hold(m);
1431			}
1432		}
1433	}
1434	PA_UNLOCK_COND(pa);
1435	PMAP_UNLOCK(pmap);
1436	return (m);
1437}
1438
1439/***************************************************
1440 * Low level mapping routines.....
1441 ***************************************************/
1442
1443/*
1444 * Add a wired page to the kva.
1445 * Note: not SMP coherent.
1446 *
1447 * This function may be used before pmap_bootstrap() is called.
1448 */
1449PMAP_INLINE void
1450pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1451{
1452	pt_entry_t *pte;
1453
1454	pte = vtopte(va);
1455	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1456}
1457
1458static __inline void
1459pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1460{
1461	pt_entry_t *pte;
1462
1463	pte = vtopte(va);
1464	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1465}
1466
1467/*
1468 * Remove a page from the kernel pagetables.
1469 * Note: not SMP coherent.
1470 *
1471 * This function may be used before pmap_bootstrap() is called.
1472 */
1473PMAP_INLINE void
1474pmap_kremove(vm_offset_t va)
1475{
1476	pt_entry_t *pte;
1477
1478	pte = vtopte(va);
1479	pte_clear(pte);
1480}
1481
1482/*
1483 *	Used to map a range of physical addresses into kernel
1484 *	virtual address space.
1485 *
1486 *	The value passed in '*virt' is a suggested virtual address for
1487 *	the mapping. Architectures which can support a direct-mapped
1488 *	physical to virtual region can return the appropriate address
1489 *	within that region, leaving '*virt' unchanged. Other
1490 *	architectures should map the pages starting at '*virt' and
1491 *	update '*virt' with the first usable address after the mapped
1492 *	region.
1493 */
1494vm_offset_t
1495pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1496{
1497	vm_offset_t va, sva;
1498	vm_paddr_t superpage_offset;
1499	pd_entry_t newpde;
1500
1501	va = *virt;
1502	/*
1503	 * Does the physical address range's size and alignment permit at
1504	 * least one superpage mapping to be created?
1505	 */
1506	superpage_offset = start & PDRMASK;
1507	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1508		/*
1509		 * Increase the starting virtual address so that its alignment
1510		 * does not preclude the use of superpage mappings.
1511		 */
1512		if ((va & PDRMASK) < superpage_offset)
1513			va = (va & ~PDRMASK) + superpage_offset;
1514		else if ((va & PDRMASK) > superpage_offset)
1515			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1516	}
1517	sva = va;
1518	while (start < end) {
1519		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1520		    pseflag) {
1521			KASSERT((va & PDRMASK) == 0,
1522			    ("pmap_map: misaligned va %#x", va));
1523			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1524			pmap_kenter_pde(va, newpde);
1525			va += NBPDR;
1526			start += NBPDR;
1527		} else {
1528			pmap_kenter(va, start);
1529			va += PAGE_SIZE;
1530			start += PAGE_SIZE;
1531		}
1532	}
1533	pmap_invalidate_range(kernel_pmap, sva, va);
1534	*virt = va;
1535	return (sva);
1536}
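/*
 * Illustrative call pattern (names are placeholders; the MI VM startup
 * code is the real consumer):
 *
 *	vm_offset_t va = kva_hint;
 *	vm_offset_t mapped = pmap_map(&va, start_pa, end_pa,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *
 * "mapped" is the start of the new mapping and "va" has been advanced
 * past it, as described in the block comment above.
 */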
1537
1538
1539/*
1540 * Add a list of wired pages to the kva.
1541 * This routine is only used for temporary
1542 * kernel mappings that do not need to have
1543 * page modification or references recorded.
1544 * Note that old mappings are simply written
1545 * over.  The page *must* be wired.
1546 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1547 */
1548void
1549pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1550{
1551	pt_entry_t *endpte, oldpte, pa, *pte;
1552	vm_page_t m;
1553
1554	oldpte = 0;
1555	pte = vtopte(sva);
1556	endpte = pte + count;
1557	while (pte < endpte) {
1558		m = *ma++;
1559		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1560		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1561			oldpte |= *pte;
1562			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1563		}
1564		pte++;
1565	}
1566	if (__predict_false((oldpte & PG_V) != 0))
1567		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1568		    PAGE_SIZE);
1569}
1570
1571/*
1572 * This routine tears out page mappings from the
1573 * kernel -- it is meant only for temporary mappings.
1574 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1575 */
1576void
1577pmap_qremove(vm_offset_t sva, int count)
1578{
1579	vm_offset_t va;
1580
1581	va = sva;
1582	while (count-- > 0) {
1583		pmap_kremove(va);
1584		va += PAGE_SIZE;
1585	}
1586	pmap_invalidate_range(kernel_pmap, sva, va);
1587}
1588
1589/***************************************************
1590 * Page table page management routines.....
1591 ***************************************************/
1592static __inline void
1593pmap_free_zero_pages(vm_page_t free)
1594{
1595	vm_page_t m;
1596
1597	while (free != NULL) {
1598		m = free;
1599		free = m->right;
1600		/* Preserve the page's PG_ZERO setting. */
1601		vm_page_free_toq(m);
1602	}
1603}
1604
1605/*
1606 * Schedule the specified unused page table page to be freed.  Specifically,
1607 * add the page to the specified list of pages that will be released to the
1608 * physical memory manager after the TLB has been updated.
1609 */
1610static __inline void
1611pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1612{
1613
1614	if (set_PG_ZERO)
1615		m->flags |= PG_ZERO;
1616	else
1617		m->flags &= ~PG_ZERO;
1618	m->right = *free;
1619	*free = m;
1620}
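/*
 * Expected caller pattern (illustrative sketch; the remove paths later
 * in this file follow it):
 *
 *	vm_page_t free = NULL;
 *	... removal code passes &free to the helpers above ...
 *	pmap_invalidate_page(pmap, va);		first the TLB shootdown
 *	pmap_free_zero_pages(free);		then release the pages
 */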
1621
1622/*
1623 * Inserts the specified page table page into the specified pmap's collection
1624 * of idle page table pages.  Each of a pmap's page table pages is responsible
1625 * for mapping a distinct range of virtual addresses.  The pmap's collection is
1626 * ordered by this virtual address range.
1627 */
1628static void
1629pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1630{
1631	vm_page_t root;
1632
1633	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1634	root = pmap->pm_root;
1635	if (root == NULL) {
1636		mpte->left = NULL;
1637		mpte->right = NULL;
1638	} else {
1639		root = vm_page_splay(mpte->pindex, root);
1640		if (mpte->pindex < root->pindex) {
1641			mpte->left = root->left;
1642			mpte->right = root;
1643			root->left = NULL;
1644		} else if (mpte->pindex == root->pindex)
1645			panic("pmap_insert_pt_page: pindex already inserted");
1646		else {
1647			mpte->right = root->right;
1648			mpte->left = root;
1649			root->right = NULL;
1650		}
1651	}
1652	pmap->pm_root = mpte;
1653}
1654
1655/*
1656 * Looks for a page table page mapping the specified virtual address in the
1657 * specified pmap's collection of idle page table pages.  Returns NULL if there
1658 * is no page table page corresponding to the specified virtual address.
1659 */
1660static vm_page_t
1661pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1662{
1663	vm_page_t mpte;
1664	vm_pindex_t pindex = va >> PDRSHIFT;
1665
1666	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1667	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1668		mpte = vm_page_splay(pindex, mpte);
1669		if ((pmap->pm_root = mpte)->pindex != pindex)
1670			mpte = NULL;
1671	}
1672	return (mpte);
1673}
1674
1675/*
1676 * Removes the specified page table page from the specified pmap's collection
1677 * of idle page table pages.  The specified page table page must be a member of
1678 * the pmap's collection.
1679 */
1680static void
1681pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1682{
1683	vm_page_t root;
1684
1685	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1686	if (mpte != pmap->pm_root)
1687		vm_page_splay(mpte->pindex, pmap->pm_root);
1688	if (mpte->left == NULL)
1689		root = mpte->right;
1690	else {
1691		root = vm_page_splay(mpte->pindex, mpte->left);
1692		root->right = mpte->right;
1693	}
1694	pmap->pm_root = root;
1695}
1696
1697/*
1698 * Decrements a page table page's wire count, which is used to record the
1699 * number of valid page table entries within the page.  If the wire count
1700 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1701 * page table page was unmapped and FALSE otherwise.
1702 */
1703static inline boolean_t
1704pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1705{
1706
1707	--m->wire_count;
1708	if (m->wire_count == 0) {
1709		_pmap_unwire_ptp(pmap, m, free);
1710		return (TRUE);
1711	} else
1712		return (FALSE);
1713}
1714
1715static void
1716_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1717{
1718	vm_offset_t pteva;
1719
1720	/*
1721	 * unmap the page table page
1722	 */
1723	pmap->pm_pdir[m->pindex] = 0;
1724	--pmap->pm_stats.resident_count;
1725
1726	/*
1727	 * This is a release store so that the ordinary store unmapping
1728	 * the page table page is globally performed before TLB shoot-
1729	 * down is begun.
1730	 */
1731	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1732
1733	/*
1734	 * Do an invltlb to make the invalidated mapping
1735	 * take effect immediately.
1736	 */
1737	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1738	pmap_invalidate_page(pmap, pteva);
1739
1740	/*
1741	 * Put page on a list so that it is released after
1742	 * *ALL* TLB shootdown is done
1743	 */
1744	pmap_add_delayed_free_list(m, free, TRUE);
1745}
1746
1747/*
1748 * After removing a page table entry, this routine is used to
1749 * conditionally free the page, and manage the hold/wire counts.
1750 */
1751static int
1752pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1753{
1754	pd_entry_t ptepde;
1755	vm_page_t mpte;
1756
1757	if (va >= VM_MAXUSER_ADDRESS)
1758		return (0);
1759	ptepde = *pmap_pde(pmap, va);
1760	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1761	return (pmap_unwire_ptp(pmap, mpte, free));
1762}
1763
1764/*
1765 * Initialize the pmap for the swapper process.
1766 */
1767void
1768pmap_pinit0(pmap_t pmap)
1769{
1770
1771	PMAP_LOCK_INIT(pmap);
1772	/*
1773	 * Since the page table directory is shared with the kernel pmap,
1774	 * which is already included in the list "allpmaps", this pmap does
1775	 * not need to be inserted into that list.
1776	 */
1777	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1778#ifdef PAE
1779	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1780#endif
1781	pmap->pm_root = NULL;
1782	CPU_ZERO(&pmap->pm_active);
1783	PCPU_SET(curpmap, pmap);
1784	TAILQ_INIT(&pmap->pm_pvchunk);
1785	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1786}
1787
1788/*
1789 * Initialize a preallocated and zeroed pmap structure,
1790 * such as one in a vmspace structure.
1791 */
1792int
1793pmap_pinit(pmap_t pmap)
1794{
1795	vm_page_t m, ptdpg[NPGPTD];
1796	vm_paddr_t pa;
1797	int i;
1798
1799	PMAP_LOCK_INIT(pmap);
1800
1801	/*
1802	 * No need to allocate page table space yet but we do need a valid
1803	 * page directory table.
1804	 */
1805	if (pmap->pm_pdir == NULL) {
1806		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1807		    NBPTD);
1808		if (pmap->pm_pdir == NULL) {
1809			PMAP_LOCK_DESTROY(pmap);
1810			return (0);
1811		}
1812#ifdef PAE
1813		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1814		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1815		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1816		    ("pmap_pinit: pdpt misaligned"));
1817		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1818		    ("pmap_pinit: pdpt above 4g"));
1819#endif
1820		pmap->pm_root = NULL;
1821	}
1822	KASSERT(pmap->pm_root == NULL,
1823	    ("pmap_pinit: pmap has reserved page table page(s)"));
1824
1825	/*
1826	 * allocate the page directory page(s)
1827	 */
1828	for (i = 0; i < NPGPTD;) {
1829		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1830		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1831		if (m == NULL)
1832			VM_WAIT;
1833		else {
1834			ptdpg[i++] = m;
1835		}
1836	}
1837
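	/* Map the page directory page(s) at the KVA reserved for pm_pdir. */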
1838	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1839
1840	for (i = 0; i < NPGPTD; i++)
1841		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1842			pagezero(pmap->pm_pdir + (i * NPDEPG));
1843
1844	mtx_lock_spin(&allpmaps_lock);
1845	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1846	/* Copy the kernel page table directory entries. */
1847	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1848	mtx_unlock_spin(&allpmaps_lock);
1849
	/* Install the self-referential address mapping entries. */
1851	for (i = 0; i < NPGPTD; i++) {
1852		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1853		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1854#ifdef PAE
1855		pmap->pm_pdpt[i] = pa | PG_V;
1856#endif
1857	}
1858
1859	CPU_ZERO(&pmap->pm_active);
1860	TAILQ_INIT(&pmap->pm_pvchunk);
1861	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1862
1863	return (1);
1864}
1865
/*
 * This routine is called when the page table page for the given page
 * table index is not resident.  It allocates and maps a new, zeroed
 * page table page.  If the allocation fails and M_WAITOK was
 * specified, it drops the locks, sleeps, and returns NULL so that the
 * caller can retry.
 */
1870static vm_page_t
1871_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
1872{
1873	vm_paddr_t ptepa;
1874	vm_page_t m;
1875
1876	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1877	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1878	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1879
1880	/*
1881	 * Allocate a page table page.
1882	 */
1883	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1884	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1885		if (flags & M_WAITOK) {
1886			PMAP_UNLOCK(pmap);
1887			rw_wunlock(&pvh_global_lock);
1888			VM_WAIT;
1889			rw_wlock(&pvh_global_lock);
1890			PMAP_LOCK(pmap);
1891		}
1892
1893		/*
1894		 * Indicate the need to retry.  While waiting, the page table
1895		 * page may have been allocated.
1896		 */
1897		return (NULL);
1898	}
1899	if ((m->flags & PG_ZERO) == 0)
1900		pmap_zero_page(m);
1901
	/*
	 * Map the page table page into the process address space, if
	 * it isn't already there.
	 */
1906
1907	pmap->pm_stats.resident_count++;
1908
1909	ptepa = VM_PAGE_TO_PHYS(m);
1910	pmap->pm_pdir[ptepindex] =
1911		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1912
1913	return (m);
1914}
1915
1916static vm_page_t
1917pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1918{
1919	u_int ptepindex;
1920	pd_entry_t ptepa;
1921	vm_page_t m;
1922
1923	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1924	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1925	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1926
	/*
	 * Calculate the page table page index.
	 */
1930	ptepindex = va >> PDRSHIFT;
1931retry:
1932	/*
1933	 * Get the page directory entry
1934	 */
1935	ptepa = pmap->pm_pdir[ptepindex];
1936
1937	/*
1938	 * This supports switching from a 4MB page to a
1939	 * normal 4K page.
1940	 */
1941	if (ptepa & PG_PS) {
1942		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1943		ptepa = pmap->pm_pdir[ptepindex];
1944	}
1945
	/*
	 * If the page table page is already mapped, just increment
	 * its wire count.
	 */
1950	if (ptepa) {
1951		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1952		m->wire_count++;
1953	} else {
		/*
		 * The page table page is not mapped or has been
		 * deallocated; allocate a new one.
		 */
1958		m = _pmap_allocpte(pmap, ptepindex, flags);
1959		if (m == NULL && (flags & M_WAITOK))
1960			goto retry;
1961	}
1962	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/
1969
1970#ifdef SMP
/*
 * Deal with an SMP shootdown of other users of the pmap that we are
 * trying to dispose of.  This can be a bit hairy.
 */
1975static cpuset_t *lazymask;
1976static u_int lazyptd;
1977static volatile u_int lazywait;
1978
1979void pmap_lazyfix_action(void);
1980
1981void
1982pmap_lazyfix_action(void)
1983{
1984
1985#ifdef COUNT_IPIS
1986	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1987#endif
1988	if (rcr3() == lazyptd)
1989		load_cr3(curpcb->pcb_cr3);
1990	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1991	atomic_store_rel_int(&lazywait, 1);
1992}
1993
1994static void
1995pmap_lazyfix_self(u_int cpuid)
1996{
1997
1998	if (rcr3() == lazyptd)
1999		load_cr3(curpcb->pcb_cr3);
2000	CPU_CLR_ATOMIC(cpuid, lazymask);
}

static void
2005pmap_lazyfix(pmap_t pmap)
2006{
2007	cpuset_t mymask, mask;
2008	u_int cpuid, spins;
2009	int lsb;
2010
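	/*
	 * Force every CPU still using this pmap to switch to its
	 * pcb's page tables, either directly for the local CPU or via
	 * an IPI, until pm_active becomes empty.
	 */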
2011	mask = pmap->pm_active;
2012	while (!CPU_EMPTY(&mask)) {
2013		spins = 50000000;
2014
2015		/* Find least significant set bit. */
2016		lsb = CPU_FFS(&mask);
2017		MPASS(lsb != 0);
2018		lsb--;
2019		CPU_SETOF(lsb, &mask);
2020		mtx_lock_spin(&smp_ipi_mtx);
2021#ifdef PAE
2022		lazyptd = vtophys(pmap->pm_pdpt);
2023#else
2024		lazyptd = vtophys(pmap->pm_pdir);
2025#endif
2026		cpuid = PCPU_GET(cpuid);
2027
2028		/* Use a cpuset just for having an easy check. */
2029		CPU_SETOF(cpuid, &mymask);
2030		if (!CPU_CMP(&mask, &mymask)) {
2031			lazymask = &pmap->pm_active;
2032			pmap_lazyfix_self(cpuid);
2033		} else {
2034			atomic_store_rel_int((u_int *)&lazymask,
2035			    (u_int)&pmap->pm_active);
2036			atomic_store_rel_int(&lazywait, 0);
2037			ipi_selected(mask, IPI_LAZYPMAP);
2038			while (lazywait == 0) {
2039				ia32_pause();
2040				if (--spins == 0)
2041					break;
2042			}
2043		}
2044		mtx_unlock_spin(&smp_ipi_mtx);
2045		if (spins == 0)
2046			printf("pmap_lazyfix: spun for 50000000\n");
2047		mask = pmap->pm_active;
2048	}
2049}
2050
2051#else	/* SMP */
2052
/*
 * Cleaning up on a uniprocessor is easy.  For various reasons, we are
 * unlikely to ever have to execute this code; one of them is that the
 * cleanup is deferred until the parent does a wait(2), which means
 * that another userland process has run.
 */
2059static void
2060pmap_lazyfix(pmap_t pmap)
2061{
2062	u_int cr3;
2063
2064	cr3 = vtophys(pmap->pm_pdir);
2065	if (cr3 == rcr3()) {
2066		load_cr3(curpcb->pcb_cr3);
2067		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
2068	}
2069}
2070#endif	/* SMP */
2071
2072/*
2073 * Release any resources held by the given physical map.
2074 * Called when a pmap initialized by pmap_pinit is being released.
2075 * Should only be called if the map contains no valid mappings.
2076 */
2077void
2078pmap_release(pmap_t pmap)
2079{
2080	vm_page_t m, ptdpg[NPGPTD];
2081	int i;
2082
2083	KASSERT(pmap->pm_stats.resident_count == 0,
2084	    ("pmap_release: pmap resident count %ld != 0",
2085	    pmap->pm_stats.resident_count));
2086	KASSERT(pmap->pm_root == NULL,
2087	    ("pmap_release: pmap has reserved page table page(s)"));
2088
2089	pmap_lazyfix(pmap);
2090	mtx_lock_spin(&allpmaps_lock);
2091	LIST_REMOVE(pmap, pm_list);
2092	mtx_unlock_spin(&allpmaps_lock);
2093
2094	for (i = 0; i < NPGPTD; i++)
2095		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2096		    PG_FRAME);
2097
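	/* Zero the recursive and kernel page directory entries. */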
2098	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2099	    sizeof(*pmap->pm_pdir));
2100
2101	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2102
2103	for (i = 0; i < NPGPTD; i++) {
2104		m = ptdpg[i];
2105#ifdef PAE
2106		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2107		    ("pmap_release: got wrong ptd page"));
2108#endif
2109		m->wire_count--;
2110		atomic_subtract_int(&cnt.v_wire_count, 1);
2111		vm_page_free_zero(m);
2112	}
2113	PMAP_LOCK_DESTROY(pmap);
2114}
2115
2116static int
2117kvm_size(SYSCTL_HANDLER_ARGS)
2118{
2119	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2120
2121	return (sysctl_handle_long(oidp, &ksize, 0, req));
2122}
2123SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2124    0, 0, kvm_size, "IU", "Size of KVM");
2125
2126static int
2127kvm_free(SYSCTL_HANDLER_ARGS)
2128{
2129	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2130
2131	return (sysctl_handle_long(oidp, &kfree, 0, req));
2132}
2133SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2134    0, 0, kvm_free, "IU", "Amount of KVM free");
2135
2136/*
2137 * grow the number of kernel page table entries, if needed
2138 */
2139void
2140pmap_growkernel(vm_offset_t addr)
2141{
2142	vm_paddr_t ptppaddr;
2143	vm_page_t nkpg;
2144	pd_entry_t newpdir;
2145
2146	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2147	addr = roundup2(addr, NBPDR);
2148	if (addr - 1 >= kernel_map->max_offset)
2149		addr = kernel_map->max_offset;
2150	while (kernel_vm_end < addr) {
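		/*
		 * Skip page directory entries that already map a page
		 * table page or a 4MB page.
		 */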
2151		if (pdir_pde(PTD, kernel_vm_end)) {
2152			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2153			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2154				kernel_vm_end = kernel_map->max_offset;
2155				break;
2156			}
2157			continue;
2158		}
2159
2160		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2161		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2162		    VM_ALLOC_ZERO);
2163		if (nkpg == NULL)
2164			panic("pmap_growkernel: no memory to grow kernel");
2165
2166		nkpt++;
2167
2168		if ((nkpg->flags & PG_ZERO) == 0)
2169			pmap_zero_page(nkpg);
2170		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2171		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2172		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2173
2174		pmap_kenter_pde(kernel_vm_end, newpdir);
2175		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2176		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2177			kernel_vm_end = kernel_map->max_offset;
2178			break;
2179		}
2180	}
}

/***************************************************
 * Page management routines.
 ***************************************************/
2187
2188CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2189CTASSERT(_NPCM == 11);
2190CTASSERT(_NPCPV == 336);
2191
2192static __inline struct pv_chunk *
2193pv_to_chunk(pv_entry_t pv)
2194{
2195
2196	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2197}
2198
2199#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2200
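/*
 * A pv_chunk occupies one page and holds _NPCPV (336) pv entries.  The
 * free entries are tracked by an _NPCM (11) word bitmap: the first ten
 * words are fully used (10 * 32 = 320 bits) and only the low 16 bits
 * of the last word are valid, for a total of 336.
 */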
2201#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2202#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2203
2204static const uint32_t pc_freemask[_NPCM] = {
2205	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2206	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2207	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2208	PC_FREE0_9, PC_FREE10
2209};
2210
2211SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2212	"Current number of pv entries");
2213
2214#ifdef PV_STATS
2215static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2216
2217SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2218	"Current number of pv entry chunks");
2219SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2220	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Number of pv entry chunk frees");
2223SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2224	"Number of times tried to get a chunk page but failed.");
2225
2226static long pv_entry_frees, pv_entry_allocs;
2227static int pv_entry_spare;
2228
2229SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2230	"Current number of pv entry frees");
2231SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2232	"Current number of pv entry allocs");
2233SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2234	"Current number of spare pv entries");
2235#endif
2236
2237/*
2238 * We are in a serious low memory condition.  Resort to
2239 * drastic measures to free some pages so we can allocate
2240 * another pv entry chunk.
2241 */
2242static vm_page_t
2243pmap_pv_reclaim(pmap_t locked_pmap)
2244{
2245	struct pch newtail;
2246	struct pv_chunk *pc;
2247	struct md_page *pvh;
2248	pd_entry_t *pde;
2249	pmap_t pmap;
2250	pt_entry_t *pte, tpte;
2251	pv_entry_t pv;
2252	vm_offset_t va;
2253	vm_page_t free, m, m_pc;
2254	uint32_t inuse;
2255	int bit, field, freed;
2256
2257	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2258	pmap = NULL;
2259	free = m_pc = NULL;
2260	TAILQ_INIT(&newtail);
2261	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2262	    free == NULL)) {
2263		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2264		if (pmap != pc->pc_pmap) {
2265			if (pmap != NULL) {
2266				pmap_invalidate_all(pmap);
2267				if (pmap != locked_pmap)
2268					PMAP_UNLOCK(pmap);
2269			}
2270			pmap = pc->pc_pmap;
2271			/* Avoid deadlock and lock recursion. */
2272			if (pmap > locked_pmap)
2273				PMAP_LOCK(pmap);
2274			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2275				pmap = NULL;
2276				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2277				continue;
2278			}
2279		}
2280
2281		/*
2282		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2283		 */
2284		freed = 0;
2285		for (field = 0; field < _NPCM; field++) {
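			/*
			 * Each bit set in "inuse" identifies an allocated
			 * pv entry within this bitmap word.
			 */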
2286			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2287			    inuse != 0; inuse &= ~(1UL << bit)) {
2288				bit = bsfl(inuse);
2289				pv = &pc->pc_pventry[field * 32 + bit];
2290				va = pv->pv_va;
2291				pde = pmap_pde(pmap, va);
2292				if ((*pde & PG_PS) != 0)
2293					continue;
2294				pte = pmap_pte(pmap, va);
2295				tpte = *pte;
2296				if ((tpte & PG_W) == 0)
2297					tpte = pte_load_clear(pte);
2298				pmap_pte_release(pte);
2299				if ((tpte & PG_W) != 0)
2300					continue;
2301				KASSERT(tpte != 0,
2302				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2303				    pmap, va));
2304				if ((tpte & PG_G) != 0)
2305					pmap_invalidate_page(pmap, va);
2306				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2307				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2308					vm_page_dirty(m);
2309				if ((tpte & PG_A) != 0)
2310					vm_page_aflag_set(m, PGA_REFERENCED);
2311				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2312				if (TAILQ_EMPTY(&m->md.pv_list) &&
2313				    (m->flags & PG_FICTITIOUS) == 0) {
2314					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2315					if (TAILQ_EMPTY(&pvh->pv_list)) {
2316						vm_page_aflag_clear(m,
2317						    PGA_WRITEABLE);
2318					}
2319				}
2320				pc->pc_map[field] |= 1UL << bit;
2321				pmap_unuse_pt(pmap, va, &free);
2322				freed++;
2323			}
2324		}
2325		if (freed == 0) {
2326			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2327			continue;
2328		}
2329		/* Every freed mapping is for a 4 KB page. */
2330		pmap->pm_stats.resident_count -= freed;
2331		PV_STAT(pv_entry_frees += freed);
2332		PV_STAT(pv_entry_spare += freed);
2333		pv_entry_count -= freed;
2334		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2335		for (field = 0; field < _NPCM; field++)
2336			if (pc->pc_map[field] != pc_freemask[field]) {
2337				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2338				    pc_list);
2339				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2340
2341				/*
2342				 * One freed pv entry in locked_pmap is
2343				 * sufficient.
2344				 */
2345				if (pmap == locked_pmap)
2346					goto out;
2347				break;
2348			}
2349		if (field == _NPCM) {
2350			PV_STAT(pv_entry_spare -= _NPCPV);
2351			PV_STAT(pc_chunk_count--);
2352			PV_STAT(pc_chunk_frees++);
2353			/* Entire chunk is free; return it. */
2354			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2355			pmap_qremove((vm_offset_t)pc, 1);
2356			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2357			break;
2358		}
2359	}
2360out:
2361	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2362	if (pmap != NULL) {
2363		pmap_invalidate_all(pmap);
2364		if (pmap != locked_pmap)
2365			PMAP_UNLOCK(pmap);
2366	}
2367	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
2368		m_pc = free;
2369		free = m_pc->right;
2370		/* Recycle a freed page table page. */
2371		m_pc->wire_count = 1;
2372		atomic_add_int(&cnt.v_wire_count, 1);
2373	}
2374	pmap_free_zero_pages(free);
2375	return (m_pc);
2376}
2377
2378/*
2379 * free the pv_entry back to the free list
2380 */
2381static void
2382free_pv_entry(pmap_t pmap, pv_entry_t pv)
2383{
2384	struct pv_chunk *pc;
2385	int idx, field, bit;
2386
2387	rw_assert(&pvh_global_lock, RA_WLOCKED);
2388	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2389	PV_STAT(pv_entry_frees++);
2390	PV_STAT(pv_entry_spare++);
2391	pv_entry_count--;
2392	pc = pv_to_chunk(pv);
2393	idx = pv - &pc->pc_pventry[0];
2394	field = idx / 32;
2395	bit = idx % 32;
2396	pc->pc_map[field] |= 1ul << bit;
2397	for (idx = 0; idx < _NPCM; idx++)
2398		if (pc->pc_map[idx] != pc_freemask[idx]) {
2399			/*
2400			 * 98% of the time, pc is already at the head of the
2401			 * list.  If it isn't already, move it to the head.
2402			 */
2403			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2404			    pc)) {
2405				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2406				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2407				    pc_list);
2408			}
2409			return;
2410		}
2411	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2412	free_pv_chunk(pc);
2413}
2414
2415static void
2416free_pv_chunk(struct pv_chunk *pc)
2417{
2418	vm_page_t m;
2419
2420 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2421	PV_STAT(pv_entry_spare -= _NPCPV);
2422	PV_STAT(pc_chunk_count--);
2423	PV_STAT(pc_chunk_frees++);
2424	/* entire chunk is free, return it */
2425	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2426	pmap_qremove((vm_offset_t)pc, 1);
2427	vm_page_unwire(m, 0);
2428	vm_page_free(m);
2429	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2430}
2431
2432/*
2433 * get a new pv_entry, allocating a block from the system
2434 * when needed.
2435 */
2436static pv_entry_t
2437get_pv_entry(pmap_t pmap, boolean_t try)
2438{
2439	static const struct timeval printinterval = { 60, 0 };
2440	static struct timeval lastprint;
2441	int bit, field;
2442	pv_entry_t pv;
2443	struct pv_chunk *pc;
2444	vm_page_t m;
2445
2446	rw_assert(&pvh_global_lock, RA_WLOCKED);
2447	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2448	PV_STAT(pv_entry_allocs++);
2449	pv_entry_count++;
2450	if (pv_entry_count > pv_entry_high_water)
2451		if (ratecheck(&lastprint, &printinterval))
2452			printf("Approaching the limit on PV entries, consider "
2453			    "increasing either the vm.pmap.shpgperproc or the "
2454			    "vm.pmap.pv_entry_max tunable.\n");
2455retry:
2456	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2457	if (pc != NULL) {
2458		for (field = 0; field < _NPCM; field++) {
2459			if (pc->pc_map[field]) {
2460				bit = bsfl(pc->pc_map[field]);
2461				break;
2462			}
2463		}
2464		if (field < _NPCM) {
2465			pv = &pc->pc_pventry[field * 32 + bit];
2466			pc->pc_map[field] &= ~(1ul << bit);
			/*
			 * If this was the chunk's last free entry, move the
			 * chunk to the tail of the list.
			 */
2468			for (field = 0; field < _NPCM; field++)
2469				if (pc->pc_map[field] != 0) {
2470					PV_STAT(pv_entry_spare--);
2471					return (pv);	/* not full, return */
2472				}
2473			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2474			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2475			PV_STAT(pv_entry_spare--);
2476			return (pv);
2477		}
2478	}
2479	/*
2480	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2481	 * global lock.  If "pv_vafree" is currently non-empty, it will
2482	 * remain non-empty until pmap_ptelist_alloc() completes.
2483	 */
2484	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2485	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2486		if (try) {
2487			pv_entry_count--;
2488			PV_STAT(pc_chunk_tryfail++);
2489			return (NULL);
2490		}
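		/*
		 * Reclaim pv entries.  Even if pmap_pv_reclaim() cannot
		 * return a free page, it may have freed pv entries in
		 * this pmap's chunks, so retry the allocation.
		 */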
2491		m = pmap_pv_reclaim(pmap);
2492		if (m == NULL)
2493			goto retry;
2494	}
2495	PV_STAT(pc_chunk_count++);
2496	PV_STAT(pc_chunk_allocs++);
2497	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2498	pmap_qenter((vm_offset_t)pc, &m, 1);
2499	pc->pc_pmap = pmap;
2500	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2501	for (field = 1; field < _NPCM; field++)
2502		pc->pc_map[field] = pc_freemask[field];
2503	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2504	pv = &pc->pc_pventry[0];
2505	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2506	PV_STAT(pv_entry_spare += _NPCPV - 1);
2507	return (pv);
2508}
2509
2510static __inline pv_entry_t
2511pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2512{
2513	pv_entry_t pv;
2514
2515	rw_assert(&pvh_global_lock, RA_WLOCKED);
2516	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2517		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2518			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2519			break;
2520		}
2521	}
2522	return (pv);
2523}
2524
2525static void
2526pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2527{
2528	struct md_page *pvh;
2529	pv_entry_t pv;
2530	vm_offset_t va_last;
2531	vm_page_t m;
2532
2533	rw_assert(&pvh_global_lock, RA_WLOCKED);
2534	KASSERT((pa & PDRMASK) == 0,
2535	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2536
2537	/*
2538	 * Transfer the 4mpage's pv entry for this mapping to the first
2539	 * page's pv list.
2540	 */
2541	pvh = pa_to_pvh(pa);
2542	va = trunc_4mpage(va);
2543	pv = pmap_pvh_remove(pvh, pmap, va);
2544	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2545	m = PHYS_TO_VM_PAGE(pa);
2546	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2547	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2548	va_last = va + NBPDR - PAGE_SIZE;
2549	do {
2550		m++;
2551		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2552		    ("pmap_pv_demote_pde: page %p is not managed", m));
2553		va += PAGE_SIZE;
2554		pmap_insert_entry(pmap, va, m);
2555	} while (va < va_last);
2556}
2557
2558static void
2559pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2560{
2561	struct md_page *pvh;
2562	pv_entry_t pv;
2563	vm_offset_t va_last;
2564	vm_page_t m;
2565
2566	rw_assert(&pvh_global_lock, RA_WLOCKED);
2567	KASSERT((pa & PDRMASK) == 0,
2568	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2569
	/*
	 * Transfer the first page's pv entry for this mapping to the
	 * 4mpage's pv list.  Aside from avoiding the cost of a call
	 * to get_pv_entry(), a transfer avoids the possibility that
	 * get_pv_entry() calls pmap_pv_reclaim() and that
	 * pmap_pv_reclaim() removes one of the mappings that is being
	 * promoted.
	 */
2577	m = PHYS_TO_VM_PAGE(pa);
2578	va = trunc_4mpage(va);
2579	pv = pmap_pvh_remove(&m->md, pmap, va);
2580	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2581	pvh = pa_to_pvh(pa);
2582	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2583	/* Free the remaining NPTEPG - 1 pv entries. */
2584	va_last = va + NBPDR - PAGE_SIZE;
2585	do {
2586		m++;
2587		va += PAGE_SIZE;
2588		pmap_pvh_free(&m->md, pmap, va);
2589	} while (va < va_last);
2590}
2591
2592static void
2593pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2594{
2595	pv_entry_t pv;
2596
2597	pv = pmap_pvh_remove(pvh, pmap, va);
2598	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2599	free_pv_entry(pmap, pv);
2600}
2601
2602static void
2603pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2604{
2605	struct md_page *pvh;
2606
2607	rw_assert(&pvh_global_lock, RA_WLOCKED);
2608	pmap_pvh_free(&m->md, pmap, va);
2609	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2610		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2611		if (TAILQ_EMPTY(&pvh->pv_list))
2612			vm_page_aflag_clear(m, PGA_WRITEABLE);
2613	}
2614}
2615
/*
 * Create a pv entry for the page "m", mapped at virtual address "va"
 * in "pmap".
 */
2620static void
2621pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2622{
2623	pv_entry_t pv;
2624
2625	rw_assert(&pvh_global_lock, RA_WLOCKED);
2626	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2627	pv = get_pv_entry(pmap, FALSE);
2628	pv->pv_va = va;
2629	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2630}
2631
2632/*
2633 * Conditionally create a pv entry.
2634 */
2635static boolean_t
2636pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2637{
2638	pv_entry_t pv;
2639
2640	rw_assert(&pvh_global_lock, RA_WLOCKED);
2641	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2642	if (pv_entry_count < pv_entry_high_water &&
2643	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2644		pv->pv_va = va;
2645		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2646		return (TRUE);
2647	} else
2648		return (FALSE);
2649}
2650
2651/*
2652 * Create the pv entries for each of the pages within a superpage.
2653 */
2654static boolean_t
2655pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2656{
2657	struct md_page *pvh;
2658	pv_entry_t pv;
2659
2660	rw_assert(&pvh_global_lock, RA_WLOCKED);
2661	if (pv_entry_count < pv_entry_high_water &&
2662	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2663		pv->pv_va = va;
2664		pvh = pa_to_pvh(pa);
2665		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2666		return (TRUE);
2667	} else
2668		return (FALSE);
2669}
2670
2671/*
2672 * Fills a page table page with mappings to consecutive physical pages.
2673 */
2674static void
2675pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2676{
2677	pt_entry_t *pte;
2678
2679	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2680		*pte = newpte;
2681		newpte += PAGE_SIZE;
2682	}
2683}
2684
2685/*
2686 * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2687 * 2- or 4MB page mapping is invalidated.
2688 */
2689static boolean_t
2690pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2691{
2692	pd_entry_t newpde, oldpde;
2693	pt_entry_t *firstpte, newpte;
2694	vm_paddr_t mptepa;
2695	vm_page_t free, mpte;
2696
2697	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2698	oldpde = *pde;
2699	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2700	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2701	mpte = pmap_lookup_pt_page(pmap, va);
2702	if (mpte != NULL)
2703		pmap_remove_pt_page(pmap, mpte);
2704	else {
2705		KASSERT((oldpde & PG_W) == 0,
2706		    ("pmap_demote_pde: page table page for a wired mapping"
2707		    " is missing"));
2708
2709		/*
2710		 * Invalidate the 2- or 4MB page mapping and return
2711		 * "failure" if the mapping was never accessed or the
2712		 * allocation of the new page table page fails.
2713		 */
2714		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2715		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2716		    VM_ALLOC_WIRED)) == NULL) {
2717			free = NULL;
2718			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2719			pmap_invalidate_page(pmap, trunc_4mpage(va));
2720			pmap_free_zero_pages(free);
2721			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2722			    " in pmap %p", va, pmap);
2723			return (FALSE);
2724		}
2725		if (va < VM_MAXUSER_ADDRESS)
2726			pmap->pm_stats.resident_count++;
2727	}
2728	mptepa = VM_PAGE_TO_PHYS(mpte);
2729
2730	/*
2731	 * If the page mapping is in the kernel's address space, then the
2732	 * KPTmap can provide access to the page table page.  Otherwise,
2733	 * temporarily map the page table page (mpte) into the kernel's
2734	 * address space at either PADDR1 or PADDR2.
2735	 */
2736	if (va >= KERNBASE)
2737		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2738	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2739		if ((*PMAP1 & PG_FRAME) != mptepa) {
2740			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2741#ifdef SMP
2742			PMAP1cpu = PCPU_GET(cpuid);
2743#endif
2744			invlcaddr(PADDR1);
2745			PMAP1changed++;
2746		} else
2747#ifdef SMP
2748		if (PMAP1cpu != PCPU_GET(cpuid)) {
2749			PMAP1cpu = PCPU_GET(cpuid);
2750			invlcaddr(PADDR1);
2751			PMAP1changedcpu++;
2752		} else
2753#endif
2754			PMAP1unchanged++;
2755		firstpte = PADDR1;
2756	} else {
2757		mtx_lock(&PMAP2mutex);
2758		if ((*PMAP2 & PG_FRAME) != mptepa) {
2759			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2760			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2761		}
2762		firstpte = PADDR2;
2763	}
2764	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2765	KASSERT((oldpde & PG_A) != 0,
2766	    ("pmap_demote_pde: oldpde is missing PG_A"));
2767	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2768	    ("pmap_demote_pde: oldpde is missing PG_M"));
2769	newpte = oldpde & ~PG_PS;
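	/*
	 * The PAT index bit occupies a different position in a PDE
	 * (PG_PDE_PAT) than in a PTE (PG_PTE_PAT), so relocate it.
	 */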
2770	if ((newpte & PG_PDE_PAT) != 0)
2771		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2772
2773	/*
2774	 * If the page table page is new, initialize it.
2775	 */
2776	if (mpte->wire_count == 1) {
2777		mpte->wire_count = NPTEPG;
2778		pmap_fill_ptp(firstpte, newpte);
2779	}
2780	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2781	    ("pmap_demote_pde: firstpte and newpte map different physical"
2782	    " addresses"));
2783
2784	/*
2785	 * If the mapping has changed attributes, update the page table
2786	 * entries.
2787	 */
2788	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2789		pmap_fill_ptp(firstpte, newpte);
2790
2791	/*
2792	 * Demote the mapping.  This pmap is locked.  The old PDE has
2793	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2794	 * set.  Thus, there is no danger of a race with another
2795	 * processor changing the setting of PG_A and/or PG_M between
2796	 * the read above and the store below.
2797	 */
2798	if (workaround_erratum383)
2799		pmap_update_pde(pmap, va, pde, newpde);
2800	else if (pmap == kernel_pmap)
2801		pmap_kenter_pde(va, newpde);
2802	else
2803		pde_store(pde, newpde);
2804	if (firstpte == PADDR2)
2805		mtx_unlock(&PMAP2mutex);
2806
2807	/*
2808	 * Invalidate the recursive mapping of the page table page.
2809	 */
2810	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2811
	/*
	 * Demote the pv entry.  This depends on the earlier demotion
	 * of the mapping.  Specifically, the (re)creation of a per-
	 * page pv entry might trigger the execution of
	 * pmap_pv_reclaim(), which might reclaim a newly (re)created
	 * per-page pv entry and destroy the associated mapping.  In
	 * order to destroy the mapping, the PDE must have already
	 * changed from mapping the superpage to referencing the page
	 * table page.
	 */
2821	if ((oldpde & PG_MANAGED) != 0)
2822		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2823
2824	pmap_pde_demotions++;
2825	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2826	    " in pmap %p", va, pmap);
2827	return (TRUE);
2828}
2829
/*
 * pmap_remove_pde: unmap a superpage (2- or 4MB page mapping) in a process
 */
2833static void
2834pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2835    vm_page_t *free)
2836{
2837	struct md_page *pvh;
2838	pd_entry_t oldpde;
2839	vm_offset_t eva, va;
2840	vm_page_t m, mpte;
2841
2842	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2843	KASSERT((sva & PDRMASK) == 0,
2844	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2845	oldpde = pte_load_clear(pdq);
2846	if (oldpde & PG_W)
2847		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2848
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
2853	if (oldpde & PG_G)
2854		pmap_invalidate_page(kernel_pmap, sva);
2855	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2856	if (oldpde & PG_MANAGED) {
2857		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2858		pmap_pvh_free(pvh, pmap, sva);
2859		eva = sva + NBPDR;
2860		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2861		    va < eva; va += PAGE_SIZE, m++) {
2862			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2863				vm_page_dirty(m);
2864			if (oldpde & PG_A)
2865				vm_page_aflag_set(m, PGA_REFERENCED);
2866			if (TAILQ_EMPTY(&m->md.pv_list) &&
2867			    TAILQ_EMPTY(&pvh->pv_list))
2868				vm_page_aflag_clear(m, PGA_WRITEABLE);
2869		}
2870	}
2871	if (pmap == kernel_pmap) {
2872		if (!pmap_demote_pde(pmap, pdq, sva))
2873			panic("pmap_remove_pde: failed demotion");
2874	} else {
2875		mpte = pmap_lookup_pt_page(pmap, sva);
2876		if (mpte != NULL) {
2877			pmap_remove_pt_page(pmap, mpte);
2878			pmap->pm_stats.resident_count--;
2879			KASSERT(mpte->wire_count == NPTEPG,
2880			    ("pmap_remove_pde: pte page wire count error"));
2881			mpte->wire_count = 0;
2882			pmap_add_delayed_free_list(mpte, free, FALSE);
2883			atomic_subtract_int(&cnt.v_wire_count, 1);
2884		}
2885	}
2886}
2887
/*
 * pmap_remove_pte: unmap a single 4KB page in a process
 */
2891static int
2892pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2893{
2894	pt_entry_t oldpte;
2895	vm_page_t m;
2896
2897	rw_assert(&pvh_global_lock, RA_WLOCKED);
2898	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2899	oldpte = pte_load_clear(ptq);
2900	KASSERT(oldpte != 0,
2901	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2902	if (oldpte & PG_W)
2903		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
2908	if (oldpte & PG_G)
2909		pmap_invalidate_page(kernel_pmap, va);
2910	pmap->pm_stats.resident_count -= 1;
2911	if (oldpte & PG_MANAGED) {
2912		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2913		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2914			vm_page_dirty(m);
2915		if (oldpte & PG_A)
2916			vm_page_aflag_set(m, PGA_REFERENCED);
2917		pmap_remove_entry(pmap, m, va);
2918	}
2919	return (pmap_unuse_pt(pmap, va, free));
2920}
2921
2922/*
2923 * Remove a single page from a process address space
2924 */
2925static void
2926pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2927{
2928	pt_entry_t *pte;
2929
2930	rw_assert(&pvh_global_lock, RA_WLOCKED);
2931	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2932	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2933	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2934		return;
2935	pmap_remove_pte(pmap, pte, va, free);
2936	pmap_invalidate_page(pmap, va);
2937}
2938
2939/*
2940 *	Remove the given range of addresses from the specified map.
2941 *
2942 *	It is assumed that the start and end are properly
2943 *	rounded to the page size.
2944 */
2945void
2946pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2947{
2948	vm_offset_t pdnxt;
2949	pd_entry_t ptpaddr;
2950	pt_entry_t *pte;
2951	vm_page_t free = NULL;
2952	int anyvalid;
2953
2954	/*
2955	 * Perform an unsynchronized read.  This is, however, safe.
2956	 */
2957	if (pmap->pm_stats.resident_count == 0)
2958		return;
2959
2960	anyvalid = 0;
2961
2962	rw_wlock(&pvh_global_lock);
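	/* Pin the thread: pmap_pte_quick() below requires it. */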
2963	sched_pin();
2964	PMAP_LOCK(pmap);
2965
	/*
	 * Special handling of removing one page.  It is a very common
	 * operation and it is easy to short-circuit some code.
	 */
2971	if ((sva + PAGE_SIZE == eva) &&
2972	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2973		pmap_remove_page(pmap, sva, &free);
2974		goto out;
2975	}
2976
2977	for (; sva < eva; sva = pdnxt) {
2978		u_int pdirindex;
2979
2980		/*
2981		 * Calculate index for next page table.
2982		 */
2983		pdnxt = (sva + NBPDR) & ~PDRMASK;
2984		if (pdnxt < sva)
2985			pdnxt = eva;
2986		if (pmap->pm_stats.resident_count == 0)
2987			break;
2988
2989		pdirindex = sva >> PDRSHIFT;
2990		ptpaddr = pmap->pm_pdir[pdirindex];
2991
2992		/*
2993		 * Weed out invalid mappings. Note: we assume that the page
2994		 * directory table is always allocated, and in kernel virtual.
2995		 */
2996		if (ptpaddr == 0)
2997			continue;
2998
2999		/*
3000		 * Check for large page.
3001		 */
3002		if ((ptpaddr & PG_PS) != 0) {
3003			/*
3004			 * Are we removing the entire large page?  If not,
3005			 * demote the mapping and fall through.
3006			 */
3007			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3008				/*
3009				 * The TLB entry for a PG_G mapping is
3010				 * invalidated by pmap_remove_pde().
3011				 */
3012				if ((ptpaddr & PG_G) == 0)
3013					anyvalid = 1;
3014				pmap_remove_pde(pmap,
3015				    &pmap->pm_pdir[pdirindex], sva, &free);
3016				continue;
3017			} else if (!pmap_demote_pde(pmap,
3018			    &pmap->pm_pdir[pdirindex], sva)) {
3019				/* The large page mapping was destroyed. */
3020				continue;
3021			}
3022		}
3023
3024		/*
3025		 * Limit our scan to either the end of the va represented
3026		 * by the current page table page, or to the end of the
3027		 * range being removed.
3028		 */
3029		if (pdnxt > eva)
3030			pdnxt = eva;
3031
3032		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3033		    sva += PAGE_SIZE) {
3034			if (*pte == 0)
3035				continue;
3036
3037			/*
3038			 * The TLB entry for a PG_G mapping is invalidated
3039			 * by pmap_remove_pte().
3040			 */
3041			if ((*pte & PG_G) == 0)
3042				anyvalid = 1;
3043			if (pmap_remove_pte(pmap, pte, sva, &free))
3044				break;
3045		}
3046	}
3047out:
3048	sched_unpin();
3049	if (anyvalid)
3050		pmap_invalidate_all(pmap);
3051	rw_wunlock(&pvh_global_lock);
3052	PMAP_UNLOCK(pmap);
3053	pmap_free_zero_pages(free);
3054}
3055
3056/*
3057 *	Routine:	pmap_remove_all
3058 *	Function:
3059 *		Removes this physical page from
3060 *		all physical maps in which it resides.
3061 *		Reflects back modify bits to the pager.
3062 *
3063 *	Notes:
3064 *		Original versions of this routine were very
3065 *		inefficient because they iteratively called
3066 *		pmap_remove (slow...)
3067 */
3068
3069void
3070pmap_remove_all(vm_page_t m)
3071{
3072	struct md_page *pvh;
3073	pv_entry_t pv;
3074	pmap_t pmap;
3075	pt_entry_t *pte, tpte;
3076	pd_entry_t *pde;
3077	vm_offset_t va;
3078	vm_page_t free;
3079
3080	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3081	    ("pmap_remove_all: page %p is not managed", m));
3082	free = NULL;
3083	rw_wlock(&pvh_global_lock);
3084	sched_pin();
3085	if ((m->flags & PG_FICTITIOUS) != 0)
3086		goto small_mappings;
3087	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3088	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3089		va = pv->pv_va;
3090		pmap = PV_PMAP(pv);
3091		PMAP_LOCK(pmap);
3092		pde = pmap_pde(pmap, va);
3093		(void)pmap_demote_pde(pmap, pde, va);
3094		PMAP_UNLOCK(pmap);
3095	}
3096small_mappings:
3097	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3098		pmap = PV_PMAP(pv);
3099		PMAP_LOCK(pmap);
3100		pmap->pm_stats.resident_count--;
3101		pde = pmap_pde(pmap, pv->pv_va);
3102		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3103		    " a 4mpage in page %p's pv list", m));
3104		pte = pmap_pte_quick(pmap, pv->pv_va);
3105		tpte = pte_load_clear(pte);
3106		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3107		    pmap, pv->pv_va));
3108		if (tpte & PG_W)
3109			pmap->pm_stats.wired_count--;
3110		if (tpte & PG_A)
3111			vm_page_aflag_set(m, PGA_REFERENCED);
3112
3113		/*
3114		 * Update the vm_page_t clean and reference bits.
3115		 */
3116		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3117			vm_page_dirty(m);
3118		pmap_unuse_pt(pmap, pv->pv_va, &free);
3119		pmap_invalidate_page(pmap, pv->pv_va);
3120		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3121		free_pv_entry(pmap, pv);
3122		PMAP_UNLOCK(pmap);
3123	}
3124	vm_page_aflag_clear(m, PGA_WRITEABLE);
3125	sched_unpin();
3126	rw_wunlock(&pvh_global_lock);
3127	pmap_free_zero_pages(free);
3128}
3129
/*
 * pmap_protect_pde: change the protection of a 4mpage in a process
 */
3133static boolean_t
3134pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3135{
3136	pd_entry_t newpde, oldpde;
3137	vm_offset_t eva, va;
3138	vm_page_t m;
3139	boolean_t anychanged;
3140
3141	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3142	KASSERT((sva & PDRMASK) == 0,
3143	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3144	anychanged = FALSE;
3145retry:
3146	oldpde = newpde = *pde;
3147	if (oldpde & PG_MANAGED) {
3148		eva = sva + NBPDR;
3149		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3150		    va < eva; va += PAGE_SIZE, m++)
3151			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3152				vm_page_dirty(m);
3153	}
3154	if ((prot & VM_PROT_WRITE) == 0)
3155		newpde &= ~(PG_RW | PG_M);
3156#ifdef PAE
3157	if ((prot & VM_PROT_EXECUTE) == 0)
3158		newpde |= pg_nx;
3159#endif
3160	if (newpde != oldpde) {
3161		if (!pde_cmpset(pde, oldpde, newpde))
3162			goto retry;
3163		if (oldpde & PG_G)
3164			pmap_invalidate_page(pmap, sva);
3165		else
3166			anychanged = TRUE;
3167	}
3168	return (anychanged);
3169}
3170
3171/*
3172 *	Set the physical protection on the
3173 *	specified range of this map as requested.
3174 */
3175void
3176pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3177{
3178	vm_offset_t pdnxt;
3179	pd_entry_t ptpaddr;
3180	pt_entry_t *pte;
3181	boolean_t anychanged, pv_lists_locked;
3182
3183	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3184		pmap_remove(pmap, sva, eva);
3185		return;
3186	}
3187
3188#ifdef PAE
3189	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3190	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3191		return;
3192#else
3193	if (prot & VM_PROT_WRITE)
3194		return;
3195#endif
3196
3197	if (pmap_is_current(pmap))
3198		pv_lists_locked = FALSE;
3199	else {
3200		pv_lists_locked = TRUE;
3201resume:
3202		rw_wlock(&pvh_global_lock);
3203		sched_pin();
3204	}
3205	anychanged = FALSE;
3206
3207	PMAP_LOCK(pmap);
3208	for (; sva < eva; sva = pdnxt) {
3209		pt_entry_t obits, pbits;
3210		u_int pdirindex;
3211
3212		pdnxt = (sva + NBPDR) & ~PDRMASK;
3213		if (pdnxt < sva)
3214			pdnxt = eva;
3215
3216		pdirindex = sva >> PDRSHIFT;
3217		ptpaddr = pmap->pm_pdir[pdirindex];
3218
3219		/*
3220		 * Weed out invalid mappings. Note: we assume that the page
3221		 * directory table is always allocated, and in kernel virtual.
3222		 */
3223		if (ptpaddr == 0)
3224			continue;
3225
3226		/*
3227		 * Check for large page.
3228		 */
3229		if ((ptpaddr & PG_PS) != 0) {
3230			/*
3231			 * Are we protecting the entire large page?  If not,
3232			 * demote the mapping and fall through.
3233			 */
3234			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3235				/*
3236				 * The TLB entry for a PG_G mapping is
3237				 * invalidated by pmap_protect_pde().
3238				 */
3239				if (pmap_protect_pde(pmap,
3240				    &pmap->pm_pdir[pdirindex], sva, prot))
3241					anychanged = TRUE;
3242				continue;
3243			} else {
3244				if (!pv_lists_locked) {
3245					pv_lists_locked = TRUE;
3246					if (!rw_try_wlock(&pvh_global_lock)) {
3247						if (anychanged)
3248							pmap_invalidate_all(
3249							    pmap);
3250						PMAP_UNLOCK(pmap);
3251						goto resume;
3252					}
3253					sched_pin();
3254				}
3255				if (!pmap_demote_pde(pmap,
3256				    &pmap->pm_pdir[pdirindex], sva)) {
3257					/*
3258					 * The large page mapping was
3259					 * destroyed.
3260					 */
3261					continue;
3262				}
3263			}
3264		}
3265
3266		if (pdnxt > eva)
3267			pdnxt = eva;
3268
3269		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3270		    sva += PAGE_SIZE) {
3271			vm_page_t m;
3272
3273retry:
3274			/*
3275			 * Regardless of whether a pte is 32 or 64 bits in
3276			 * size, PG_RW, PG_A, and PG_M are among the least
3277			 * significant 32 bits.
3278			 */
3279			obits = pbits = *pte;
3280			if ((pbits & PG_V) == 0)
3281				continue;
3282
3283			if ((prot & VM_PROT_WRITE) == 0) {
3284				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3285				    (PG_MANAGED | PG_M | PG_RW)) {
3286					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3287					vm_page_dirty(m);
3288				}
3289				pbits &= ~(PG_RW | PG_M);
3290			}
3291#ifdef PAE
3292			if ((prot & VM_PROT_EXECUTE) == 0)
3293				pbits |= pg_nx;
3294#endif
3295
3296			if (pbits != obits) {
3297#ifdef PAE
3298				if (!atomic_cmpset_64(pte, obits, pbits))
3299					goto retry;
3300#else
3301				if (!atomic_cmpset_int((u_int *)pte, obits,
3302				    pbits))
3303					goto retry;
3304#endif
3305				if (obits & PG_G)
3306					pmap_invalidate_page(pmap, sva);
3307				else
3308					anychanged = TRUE;
3309			}
3310		}
3311	}
3312	if (anychanged)
3313		pmap_invalidate_all(pmap);
3314	if (pv_lists_locked) {
3315		sched_unpin();
3316		rw_wunlock(&pvh_global_lock);
3317	}
3318	PMAP_UNLOCK(pmap);
3319}
3320
3321/*
 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3323 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3324 * For promotion to occur, two conditions must be met: (1) the 4KB page
3325 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3326 * mappings must have identical characteristics.
3327 *
3328 * Managed (PG_MANAGED) mappings within the kernel address space are not
3329 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3330 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3331 * pmap.
3332 */
3333static void
3334pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3335{
3336	pd_entry_t newpde;
3337	pt_entry_t *firstpte, oldpte, pa, *pte;
3338	vm_offset_t oldpteva;
3339	vm_page_t mpte;
3340
3341	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3342
3343	/*
3344	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3345	 * either invalid, unused, or does not map the first 4KB physical page
3346	 * within a 2- or 4MB page.
3347	 */
3348	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3349setpde:
3350	newpde = *firstpte;
3351	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3352		pmap_pde_p_failures++;
3353		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3354		    " in pmap %p", va, pmap);
3355		return;
3356	}
3357	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3358		pmap_pde_p_failures++;
3359		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3360		    " in pmap %p", va, pmap);
3361		return;
3362	}
3363	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3364		/*
3365		 * When PG_M is already clear, PG_RW can be cleared without
3366		 * a TLB invalidation.
3367		 */
3368		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3369		    ~PG_RW))
3370			goto setpde;
3371		newpde &= ~PG_RW;
3372	}
3373
3374	/*
3375	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3376	 * PTE maps an unexpected 4KB physical page or does not have identical
3377	 * characteristics to the first PTE.
3378	 */
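	/*
	 * "pa" is the expected value of the last PTE; PG_A and PG_V
	 * are folded in so that a single comparison also checks those
	 * bits.
	 */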
3379	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3380	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3381setpte:
3382		oldpte = *pte;
3383		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3384			pmap_pde_p_failures++;
3385			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3386			    " in pmap %p", va, pmap);
3387			return;
3388		}
3389		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3390			/*
3391			 * When PG_M is already clear, PG_RW can be cleared
3392			 * without a TLB invalidation.
3393			 */
3394			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3395			    oldpte & ~PG_RW))
3396				goto setpte;
3397			oldpte &= ~PG_RW;
3398			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3399			    (va & ~PDRMASK);
3400			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3401			    " in pmap %p", oldpteva, pmap);
3402		}
3403		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3404			pmap_pde_p_failures++;
3405			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3406			    " in pmap %p", va, pmap);
3407			return;
3408		}
3409		pa -= PAGE_SIZE;
3410	}
3411
3412	/*
3413	 * Save the page table page in its current state until the PDE
3414	 * mapping the superpage is demoted by pmap_demote_pde() or
3415	 * destroyed by pmap_remove_pde().
3416	 */
3417	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3418	KASSERT(mpte >= vm_page_array &&
3419	    mpte < &vm_page_array[vm_page_array_size],
3420	    ("pmap_promote_pde: page table page is out of range"));
3421	KASSERT(mpte->pindex == va >> PDRSHIFT,
3422	    ("pmap_promote_pde: page table page's pindex is wrong"));
3423	pmap_insert_pt_page(pmap, mpte);
3424
3425	/*
3426	 * Promote the pv entries.
3427	 */
3428	if ((newpde & PG_MANAGED) != 0)
3429		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3430
3431	/*
3432	 * Propagate the PAT index to its proper position.
3433	 */
3434	if ((newpde & PG_PTE_PAT) != 0)
3435		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3436
3437	/*
3438	 * Map the superpage.
3439	 */
3440	if (workaround_erratum383)
3441		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3442	else if (pmap == kernel_pmap)
3443		pmap_kenter_pde(va, PG_PS | newpde);
3444	else
3445		pde_store(pde, PG_PS | newpde);
3446
3447	pmap_pde_promotions++;
3448	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3449	    " in pmap %p", va, pmap);
3450}
3451
3452/*
3453 *	Insert the given physical page (p) at
3454 *	the specified virtual address (v) in the
3455 *	target physical map with the protection requested.
3456 *
3457 *	If specified, the page will be wired down, meaning
3458 *	that the related pte can not be reclaimed.
3459 *
3460 *	NB:  This is the only routine which MAY NOT lazy-evaluate
3461 *	or lose information.  That is, this routine must actually
3462 *	insert this page into the given map NOW.
3463 */
3464void
3465pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3466    vm_prot_t prot, boolean_t wired)
3467{
3468	pd_entry_t *pde;
3469	pt_entry_t *pte;
3470	pt_entry_t newpte, origpte;
3471	pv_entry_t pv;
3472	vm_paddr_t opa, pa;
3473	vm_page_t mpte, om;
3474	boolean_t invlva;
3475
3476	va = trunc_page(va);
3477	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3478	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3479	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3480	    va));
3481	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
3482	    VM_OBJECT_LOCKED(m->object),
3483	    ("pmap_enter: page %p is not busy", m));
3484
3485	mpte = NULL;
3486
3487	rw_wlock(&pvh_global_lock);
3488	PMAP_LOCK(pmap);
3489	sched_pin();
3490
3491	/*
3492	 * In the case that a page table page is not
3493	 * resident, we are creating it here.
3494	 */
3495	if (va < VM_MAXUSER_ADDRESS) {
3496		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3497	}
3498
3499	pde = pmap_pde(pmap, va);
3500	if ((*pde & PG_PS) != 0)
3501		panic("pmap_enter: attempted pmap_enter on 4MB page");
3502	pte = pmap_pte_quick(pmap, va);
3503
	/*
	 * The page directory table entry is not valid; we need a new PT
	 * page.
	 */
3507	if (pte == NULL) {
3508		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3509			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3510	}
3511
3512	pa = VM_PAGE_TO_PHYS(m);
3513	om = NULL;
3514	origpte = *pte;
3515	opa = origpte & PG_FRAME;
3516
	/*
	 * The mapping has not changed; this must be a protection or
	 * wiring change.
	 */
3520	if (origpte && (opa == pa)) {
3521		/*
3522		 * Wiring change, just update stats. We don't worry about
3523		 * wiring PT pages as they remain resident as long as there
3524		 * are valid mappings in them. Hence, if a user page is wired,
3525		 * the PT page will be also.
3526		 */
3527		if (wired && ((origpte & PG_W) == 0))
3528			pmap->pm_stats.wired_count++;
3529		else if (!wired && (origpte & PG_W))
3530			pmap->pm_stats.wired_count--;
3531
3532		/*
3533		 * Remove extra pte reference
3534		 */
3535		if (mpte)
3536			mpte->wire_count--;
3537
3538		if (origpte & PG_MANAGED) {
3539			om = m;
3540			pa |= PG_MANAGED;
3541		}
3542		goto validate;
3543	}
3544
3545	pv = NULL;
3546
	/*
	 * The mapping has changed; invalidate the old range and fall
	 * through to handle validating the new mapping.
	 */
3551	if (opa) {
3552		if (origpte & PG_W)
3553			pmap->pm_stats.wired_count--;
3554		if (origpte & PG_MANAGED) {
3555			om = PHYS_TO_VM_PAGE(opa);
3556			pv = pmap_pvh_remove(&om->md, pmap, va);
3557		}
3558		if (mpte != NULL) {
3559			mpte->wire_count--;
3560			KASSERT(mpte->wire_count > 0,
3561			    ("pmap_enter: missing reference to page table page,"
3562			     " va: 0x%x", va));
3563		}
3564	} else
3565		pmap->pm_stats.resident_count++;
3566
3567	/*
3568	 * Enter on the PV list if part of our managed memory.
3569	 */
3570	if ((m->oflags & VPO_UNMANAGED) == 0) {
3571		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3572		    ("pmap_enter: managed mapping within the clean submap"));
3573		if (pv == NULL)
3574			pv = get_pv_entry(pmap, FALSE);
3575		pv->pv_va = va;
3576		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3577		pa |= PG_MANAGED;
3578	} else if (pv != NULL)
3579		free_pv_entry(pmap, pv);
3580
3581	/*
3582	 * Increment counters
3583	 */
3584	if (wired)
3585		pmap->pm_stats.wired_count++;
3586
3587validate:
3588	/*
3589	 * Now validate mapping with desired protection/wiring.
3590	 */
3591	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3592	if ((prot & VM_PROT_WRITE) != 0) {
3593		newpte |= PG_RW;
3594		if ((newpte & PG_MANAGED) != 0)
3595			vm_page_aflag_set(m, PGA_WRITEABLE);
3596	}
3597#ifdef PAE
3598	if ((prot & VM_PROT_EXECUTE) == 0)
3599		newpte |= pg_nx;
3600#endif
3601	if (wired)
3602		newpte |= PG_W;
3603	if (va < VM_MAXUSER_ADDRESS)
3604		newpte |= PG_U;
3605	if (pmap == kernel_pmap)
3606		newpte |= pgeflag;
3607
3608	/*
3609	 * if the mapping or permission bits are different, we need
3610	 * to update the pte.
3611	 */
3612	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3613		newpte |= PG_A;
3614		if ((access & VM_PROT_WRITE) != 0)
3615			newpte |= PG_M;
3616		if (origpte & PG_V) {
3617			invlva = FALSE;
3618			origpte = pte_load_store(pte, newpte);
3619			if (origpte & PG_A) {
3620				if (origpte & PG_MANAGED)
3621					vm_page_aflag_set(om, PGA_REFERENCED);
3622				if (opa != VM_PAGE_TO_PHYS(m))
3623					invlva = TRUE;
3624#ifdef PAE
3625				if ((origpte & PG_NX) == 0 &&
3626				    (newpte & PG_NX) != 0)
3627					invlva = TRUE;
3628#endif
3629			}
3630			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3631				if ((origpte & PG_MANAGED) != 0)
3632					vm_page_dirty(om);
3633				if ((prot & VM_PROT_WRITE) == 0)
3634					invlva = TRUE;
3635			}
3636			if ((origpte & PG_MANAGED) != 0 &&
3637			    TAILQ_EMPTY(&om->md.pv_list) &&
3638			    ((om->flags & PG_FICTITIOUS) != 0 ||
3639			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3640				vm_page_aflag_clear(om, PGA_WRITEABLE);
3641			if (invlva)
3642				pmap_invalidate_page(pmap, va);
3643		} else
3644			pte_store(pte, newpte);
3645	}
3646
3647	/*
3648	 * If both the page table page and the reservation are fully
3649	 * populated, then attempt promotion.
3650	 */
3651	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3652	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3653	    vm_reserv_level_iffullpop(m) == 0)
3654		pmap_promote_pde(pmap, pde, va);
3655
3656	sched_unpin();
3657	rw_wunlock(&pvh_global_lock);
3658	PMAP_UNLOCK(pmap);
3659}
3660
3661/*
3662 * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3663 * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3664 * blocking, (2) a mapping already exists at the specified virtual address, or
3665 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3666 */
3667static boolean_t
3668pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3669{
3670	pd_entry_t *pde, newpde;
3671
3672	rw_assert(&pvh_global_lock, RA_WLOCKED);
3673	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3674	pde = pmap_pde(pmap, va);
3675	if (*pde != 0) {
3676		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3677		    " in pmap %p", va, pmap);
3678		return (FALSE);
3679	}
3680	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3681	    PG_PS | PG_V;
3682	if ((m->oflags & VPO_UNMANAGED) == 0) {
3683		newpde |= PG_MANAGED;
3684
3685		/*
3686		 * Abort this mapping if its PV entry could not be created.
3687		 */
3688		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3689			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3690			    " in pmap %p", va, pmap);
3691			return (FALSE);
3692		}
3693	}
3694#ifdef PAE
3695	if ((prot & VM_PROT_EXECUTE) == 0)
3696		newpde |= pg_nx;
3697#endif
3698	if (va < VM_MAXUSER_ADDRESS)
3699		newpde |= PG_U;
3700
3701	/*
3702	 * Increment counters.
3703	 */
3704	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3705
3706	/*
3707	 * Map the superpage.
3708	 */
3709	pde_store(pde, newpde);
3710
3711	pmap_pde_mappings++;
3712	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3713	    " in pmap %p", va, pmap);
3714	return (TRUE);
3715}
3716
3717/*
3718 * Maps a sequence of resident pages belonging to the same object.
3719 * The sequence begins with the given page m_start.  This page is
3720 * mapped at the given virtual address start.  Each subsequent page is
3721 * mapped at a virtual address that is offset from start by the same
3722 * amount as the page is offset from m_start within the object.  The
3723 * last page in the sequence is the page with the largest offset from
3724 * m_start that can be mapped at a virtual address less than the given
3725 * virtual address end.  Not every virtual page between start and end
3726 * is mapped; only those for which a resident page exists with the
3727 * corresponding offset from m_start are mapped.
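 *
 * For example, if m_start->pindex is 8 and start is 0x00400000, then a
 * resident page with pindex 11 (diff == 3) is mapped at 0x00400000 +
 * ptoa(3) == 0x00403000, while a nonresident page at pindex 9 simply
 * leaves 0x00401000 unmapped.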
3728 */
3729void
3730pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3731    vm_page_t m_start, vm_prot_t prot)
3732{
3733	vm_offset_t va;
3734	vm_page_t m, mpte;
3735	vm_pindex_t diff, psize;
3736
3737	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3738	psize = atop(end - start);
3739	mpte = NULL;
3740	m = m_start;
3741	rw_wlock(&pvh_global_lock);
3742	PMAP_LOCK(pmap);
3743	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3744		va = start + ptoa(diff);
3745		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3746		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3747		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3748		    pmap_enter_pde(pmap, va, m, prot))
3749			m = &m[NBPDR / PAGE_SIZE - 1];
3750		else
3751			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3752			    mpte);
3753		m = TAILQ_NEXT(m, listq);
3754	}
3755	rw_wunlock(&pvh_global_lock);
3756	PMAP_UNLOCK(pmap);
3757}
3758
3759/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the given pmap exist.
 * 2. The mapping is not wired.
 * 3. Read access only.
 * 4. No page table pages.
 * In exchange, it is *MUCH* faster than pmap_enter...
3766 */
3767
3768void
3769pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3770{
3771
3772	rw_wlock(&pvh_global_lock);
3773	PMAP_LOCK(pmap);
3774	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3775	rw_wunlock(&pvh_global_lock);
3776	PMAP_UNLOCK(pmap);
3777}
3778
3779static vm_page_t
3780pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3781    vm_prot_t prot, vm_page_t mpte)
3782{
3783	pt_entry_t *pte;
3784	vm_paddr_t pa;
3785	vm_page_t free;
3786
3787	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3788	    (m->oflags & VPO_UNMANAGED) != 0,
3789	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3790	rw_assert(&pvh_global_lock, RA_WLOCKED);
3791	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3792
3793	/*
3794	 * In the case that a page table page is not
3795	 * resident, we are creating it here.
3796	 */
3797	if (va < VM_MAXUSER_ADDRESS) {
3798		u_int ptepindex;
3799		pd_entry_t ptepa;
3800
3801		/*
3802		 * Calculate pagetable page index
3803		 */
3804		ptepindex = va >> PDRSHIFT;
3805		if (mpte && (mpte->pindex == ptepindex)) {
3806			mpte->wire_count++;
3807		} else {
3808			/*
3809			 * Get the page directory entry
3810			 */
3811			ptepa = pmap->pm_pdir[ptepindex];
3812
3813			/*
			 * If the page table page is mapped, we just increment
			 * the wire count, and activate it.
3816			 */
3817			if (ptepa) {
3818				if (ptepa & PG_PS)
3819					return (NULL);
3820				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3821				mpte->wire_count++;
3822			} else {
3823				mpte = _pmap_allocpte(pmap, ptepindex,
3824				    M_NOWAIT);
3825				if (mpte == NULL)
3826					return (mpte);
3827			}
3828		}
3829	} else {
3830		mpte = NULL;
3831	}
3832
3833	/*
3834	 * This call to vtopte makes the assumption that we are
3835	 * entering the page into the current pmap.  In order to support
3836	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3837	 * But that isn't as quick as vtopte.
3838	 */
3839	pte = vtopte(va);
3840	if (*pte) {
3841		if (mpte != NULL) {
3842			mpte->wire_count--;
3843			mpte = NULL;
3844		}
3845		return (mpte);
3846	}
3847
3848	/*
3849	 * Enter on the PV list if part of our managed memory.
3850	 */
3851	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3852	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3853		if (mpte != NULL) {
3854			free = NULL;
3855			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3856				pmap_invalidate_page(pmap, va);
3857				pmap_free_zero_pages(free);
3858			}
3859
3860			mpte = NULL;
3861		}
3862		return (mpte);
3863	}
3864
3865	/*
3866	 * Increment counters
3867	 */
3868	pmap->pm_stats.resident_count++;
3869
3870	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3871#ifdef PAE
3872	if ((prot & VM_PROT_EXECUTE) == 0)
3873		pa |= pg_nx;
3874#endif
3875
3876	/*
3877	 * Now validate mapping with RO protection
3878	 */
3879	if ((m->oflags & VPO_UNMANAGED) != 0)
3880		pte_store(pte, pa | PG_V | PG_U);
3881	else
3882		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3883	return (mpte);
3884}
3885
3886/*
3887 * Make a temporary mapping for a physical address.  This is only intended
3888 * to be used for panic dumps.
3889 */
3890void *
3891pmap_kenter_temporary(vm_paddr_t pa, int i)
3892{
3893	vm_offset_t va;
3894
3895	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3896	pmap_kenter(va, pa);
3897	invlpg(va);
3898	return ((void *)crashdumpmap);
3899}
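
/*
 * A minimal usage sketch for the routine above (hypothetical caller;
 * "dump_pa" and "buf" are placeholders, not names used in this file):
 *
 *	void *va;
 *
 *	va = pmap_kenter_temporary(trunc_page(dump_pa), 0);
 *	bcopy(va, buf, PAGE_SIZE);
 *
 * Note that a call with a nonzero "i" maps the page at crashdumpmap +
 * i * PAGE_SIZE but still returns crashdumpmap, so the caller indexes
 * into the returned window itself.
 */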
3900
3901/*
3902 * This code maps large physical mmap regions into the
3903 * processor address space.  Note that some shortcuts
3904 * are taken, but the code works.
3905 */
3906void
3907pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3908    vm_pindex_t pindex, vm_size_t size)
3909{
3910	pd_entry_t *pde;
3911	vm_paddr_t pa, ptepa;
3912	vm_page_t p;
3913	int pat_mode;
3914
3915	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3916	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3917	    ("pmap_object_init_pt: non-device object"));
3918	if (pseflag &&
3919	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3920		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3921			return;
3922		p = vm_page_lookup(object, pindex);
3923		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3924		    ("pmap_object_init_pt: invalid page %p", p));
3925		pat_mode = p->md.pat_mode;
3926
3927		/*
3928		 * Abort the mapping if the first page is not physically
3929		 * aligned to a 2/4MB page boundary.
3930		 */
3931		ptepa = VM_PAGE_TO_PHYS(p);
3932		if (ptepa & (NBPDR - 1))
3933			return;
3934
3935		/*
3936		 * Skip the first page.  Abort the mapping if the rest of
3937		 * the pages are not physically contiguous or have differing
3938		 * memory attributes.
3939		 */
3940		p = TAILQ_NEXT(p, listq);
3941		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3942		    pa += PAGE_SIZE) {
3943			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3944			    ("pmap_object_init_pt: invalid page %p", p));
3945			if (pa != VM_PAGE_TO_PHYS(p) ||
3946			    pat_mode != p->md.pat_mode)
3947				return;
3948			p = TAILQ_NEXT(p, listq);
3949		}
3950
3951		/*
3952		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3953		 * "size" is a multiple of 2/4M, adding the PAT setting to
3954		 * "pa" will not affect the termination of this loop.
3955		 */
3956		PMAP_LOCK(pmap);
3957		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3958		    size; pa += NBPDR) {
3959			pde = pmap_pde(pmap, addr);
3960			if (*pde == 0) {
3961				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3962				    PG_U | PG_RW | PG_V);
3963				pmap->pm_stats.resident_count += NBPDR /
3964				    PAGE_SIZE;
3965				pmap_pde_mappings++;
3966			}
3967			/* Else continue on if the PDE is already valid. */
3968			addr += NBPDR;
3969		}
3970		PMAP_UNLOCK(pmap);
3971	}
3972}
3973
3974/*
3975 *	Routine:	pmap_change_wiring
3976 *	Function:	Change the wiring attribute for a map/virtual-address
3977 *			pair.
3978 *	In/out conditions:
3979 *			The mapping must already exist in the pmap.
3980 */
3981void
3982pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3983{
3984	pd_entry_t *pde;
3985	pt_entry_t *pte;
3986	boolean_t are_queues_locked;
3987
3988	are_queues_locked = FALSE;
3989retry:
3990	PMAP_LOCK(pmap);
3991	pde = pmap_pde(pmap, va);
3992	if ((*pde & PG_PS) != 0) {
3993		if (!wired != ((*pde & PG_W) == 0)) {
3994			if (!are_queues_locked) {
3995				are_queues_locked = TRUE;
3996				if (!rw_try_wlock(&pvh_global_lock)) {
3997					PMAP_UNLOCK(pmap);
3998					rw_wlock(&pvh_global_lock);
3999					goto retry;
4000				}
4001			}
4002			if (!pmap_demote_pde(pmap, pde, va))
4003				panic("pmap_change_wiring: demotion failed");
4004		} else
4005			goto out;
4006	}
4007	pte = pmap_pte(pmap, va);
4008
4009	if (wired && !pmap_pte_w(pte))
4010		pmap->pm_stats.wired_count++;
4011	else if (!wired && pmap_pte_w(pte))
4012		pmap->pm_stats.wired_count--;
4013
4014	/*
4015	 * Wiring is not a hardware characteristic so there is no need to
4016	 * invalidate TLB.
4017	 */
4018	pmap_pte_set_w(pte, wired);
4019	pmap_pte_release(pte);
4020out:
4021	if (are_queues_locked)
4022		rw_wunlock(&pvh_global_lock);
4023	PMAP_UNLOCK(pmap);
4024}
4025
4026
4027
4028/*
4029 *	Copy the range specified by src_addr/len
4030 *	from the source map to the range dst_addr/len
4031 *	in the destination map.
4032 *
4033 *	This routine is only advisory and need not do anything.
4034 */
4035
4036void
4037pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4038    vm_offset_t src_addr)
4039{
4040	vm_page_t   free;
4041	vm_offset_t addr;
4042	vm_offset_t end_addr = src_addr + len;
4043	vm_offset_t pdnxt;
4044
4045	if (dst_addr != src_addr)
4046		return;
4047
4048	if (!pmap_is_current(src_pmap))
4049		return;
4050
4051	rw_wlock(&pvh_global_lock);
4052	if (dst_pmap < src_pmap) {
4053		PMAP_LOCK(dst_pmap);
4054		PMAP_LOCK(src_pmap);
4055	} else {
4056		PMAP_LOCK(src_pmap);
4057		PMAP_LOCK(dst_pmap);
4058	}
4059	sched_pin();
4060	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4061		pt_entry_t *src_pte, *dst_pte;
4062		vm_page_t dstmpte, srcmpte;
4063		pd_entry_t srcptepaddr;
4064		u_int ptepindex;
4065
4066		KASSERT(addr < UPT_MIN_ADDRESS,
4067		    ("pmap_copy: invalid to pmap_copy page tables"));
4068
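		/*
		 * pdnxt is the start of the next 2/4MB region, or end_addr
		 * on 32-bit overflow; each pass of this loop therefore
		 * covers at most one page directory entry.
		 */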
4069		pdnxt = (addr + NBPDR) & ~PDRMASK;
4070		if (pdnxt < addr)
4071			pdnxt = end_addr;
4072		ptepindex = addr >> PDRSHIFT;
4073
4074		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4075		if (srcptepaddr == 0)
4076			continue;
4077
4078		if (srcptepaddr & PG_PS) {
4079			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4080				continue;
4081			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4082			    ((srcptepaddr & PG_MANAGED) == 0 ||
4083			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4084			    PG_PS_FRAME))) {
4085				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4086				    ~PG_W;
4087				dst_pmap->pm_stats.resident_count +=
4088				    NBPDR / PAGE_SIZE;
4089			}
4090			continue;
4091		}
4092
4093		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4094		KASSERT(srcmpte->wire_count > 0,
4095		    ("pmap_copy: source page table page is unused"));
4096
4097		if (pdnxt > end_addr)
4098			pdnxt = end_addr;
4099
4100		src_pte = vtopte(addr);
4101		while (addr < pdnxt) {
4102			pt_entry_t ptetemp;
4103			ptetemp = *src_pte;
4104			/*
			 * Only mappings of managed pages are copied.
4106			 */
4107			if ((ptetemp & PG_MANAGED) != 0) {
4108				dstmpte = pmap_allocpte(dst_pmap, addr,
4109				    M_NOWAIT);
4110				if (dstmpte == NULL)
4111					goto out;
4112				dst_pte = pmap_pte_quick(dst_pmap, addr);
4113				if (*dst_pte == 0 &&
4114				    pmap_try_insert_pv_entry(dst_pmap, addr,
4115				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4116					/*
4117					 * Clear the wired, modified, and
4118					 * accessed (referenced) bits
4119					 * during the copy.
4120					 */
4121					*dst_pte = ptetemp & ~(PG_W | PG_M |
4122					    PG_A);
4123					dst_pmap->pm_stats.resident_count++;
4124	 			} else {
4125					free = NULL;
4126					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4127					    &free)) {
4128						pmap_invalidate_page(dst_pmap,
4129						    addr);
4130						pmap_free_zero_pages(free);
4131					}
4132					goto out;
4133				}
4134				if (dstmpte->wire_count >= srcmpte->wire_count)
4135					break;
4136			}
4137			addr += PAGE_SIZE;
4138			src_pte++;
4139		}
4140	}
4141out:
4142	sched_unpin();
4143	rw_wunlock(&pvh_global_lock);
4144	PMAP_UNLOCK(src_pmap);
4145	PMAP_UNLOCK(dst_pmap);
4146}
4147
4148static __inline void
4149pagezero(void *page)
4150{
4151#if defined(I686_CPU)
4152	if (cpu_class == CPUCLASS_686) {
4153#if defined(CPU_ENABLE_SSE)
4154		if (cpu_feature & CPUID_SSE2)
4155			sse2_pagezero(page);
4156		else
4157#endif
4158			i686_pagezero(page);
4159	} else
4160#endif
4161		bzero(page, PAGE_SIZE);
4162}
4163
4164/*
4165 *	pmap_zero_page zeros the specified hardware page by mapping
4166 *	the page into KVM and using bzero to clear its contents.
4167 */
4168void
4169pmap_zero_page(vm_page_t m)
4170{
4171	struct sysmaps *sysmaps;
4172
4173	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4174	mtx_lock(&sysmaps->lock);
4175	if (*sysmaps->CMAP2)
4176		panic("pmap_zero_page: CMAP2 busy");
4177	sched_pin();
4178	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4179	    pmap_cache_bits(m->md.pat_mode, 0);
4180	invlcaddr(sysmaps->CADDR2);
4181	pagezero(sysmaps->CADDR2);
4182	*sysmaps->CMAP2 = 0;
4183	sched_unpin();
4184	mtx_unlock(&sysmaps->lock);
4185}
4186
4187/*
 *	pmap_zero_page_area zeros the specified area of a hardware page by
 *	mapping the page into KVM and using bzero to clear its contents.
4190 *
4191 *	off and size may not cover an area beyond a single hardware page.
4192 */
4193void
4194pmap_zero_page_area(vm_page_t m, int off, int size)
4195{
4196	struct sysmaps *sysmaps;
4197
4198	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4199	mtx_lock(&sysmaps->lock);
4200	if (*sysmaps->CMAP2)
4201		panic("pmap_zero_page_area: CMAP2 busy");
4202	sched_pin();
4203	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4204	    pmap_cache_bits(m->md.pat_mode, 0);
4205	invlcaddr(sysmaps->CADDR2);
4206	if (off == 0 && size == PAGE_SIZE)
4207		pagezero(sysmaps->CADDR2);
4208	else
4209		bzero((char *)sysmaps->CADDR2 + off, size);
4210	*sysmaps->CMAP2 = 0;
4211	sched_unpin();
4212	mtx_unlock(&sysmaps->lock);
4213}
4214
4215/*
4216 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4217 *	the page into KVM and using bzero to clear its contents.  This
4218 *	is intended to be called from the vm_pagezero process only and
4219 *	outside of Giant.
4220 */
4221void
4222pmap_zero_page_idle(vm_page_t m)
4223{
4224
4225	if (*CMAP3)
4226		panic("pmap_zero_page_idle: CMAP3 busy");
4227	sched_pin();
4228	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4229	    pmap_cache_bits(m->md.pat_mode, 0);
4230	invlcaddr(CADDR3);
4231	pagezero(CADDR3);
4232	*CMAP3 = 0;
4233	sched_unpin();
4234}
4235
4236/*
4237 *	pmap_copy_page copies the specified (machine independent)
4238 *	page by mapping the page into virtual memory and using
4239 *	bcopy to copy the page, one machine dependent page at a
4240 *	time.
4241 */
4242void
4243pmap_copy_page(vm_page_t src, vm_page_t dst)
4244{
4245	struct sysmaps *sysmaps;
4246
4247	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4248	mtx_lock(&sysmaps->lock);
4249	if (*sysmaps->CMAP1)
4250		panic("pmap_copy_page: CMAP1 busy");
4251	if (*sysmaps->CMAP2)
4252		panic("pmap_copy_page: CMAP2 busy");
4253	sched_pin();
4254	invlpg((u_int)sysmaps->CADDR1);
4255	invlpg((u_int)sysmaps->CADDR2);
4256	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4257	    pmap_cache_bits(src->md.pat_mode, 0);
4258	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4259	    pmap_cache_bits(dst->md.pat_mode, 0);
4260	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4261	*sysmaps->CMAP1 = 0;
4262	*sysmaps->CMAP2 = 0;
4263	sched_unpin();
4264	mtx_unlock(&sysmaps->lock);
4265}
4266
4267int unmapped_buf_allowed = 1;
4268
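/*
 *	pmap_copy_pages copies "xfersize" bytes from the pages ma[] at
 *	offset "a_offset" to the pages mb[] at offset "b_offset", mapping
 *	one source and one destination page at a time through the per-CPU
 *	CMAP1 and CMAP2 windows.  Each iteration copies at most the
 *	remainder of the current source and destination pages; for
 *	example, with a_offset == 0x1f80, b_offset == 0, and a large
 *	xfersize, the first chunk is 0x80 bytes from the tail of ma[1]
 *	into the head of mb[0].
 */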
4269void
4270pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4271    vm_offset_t b_offset, int xfersize)
4272{
4273	struct sysmaps *sysmaps;
4274	vm_page_t a_pg, b_pg;
4275	char *a_cp, *b_cp;
4276	vm_offset_t a_pg_offset, b_pg_offset;
4277	int cnt;
4278
4279	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4280	mtx_lock(&sysmaps->lock);
4281	if (*sysmaps->CMAP1 != 0)
4282		panic("pmap_copy_pages: CMAP1 busy");
4283	if (*sysmaps->CMAP2 != 0)
4284		panic("pmap_copy_pages: CMAP2 busy");
4285	sched_pin();
4286	while (xfersize > 0) {
4287		invlpg((u_int)sysmaps->CADDR1);
4288		invlpg((u_int)sysmaps->CADDR2);
4289		a_pg = ma[a_offset >> PAGE_SHIFT];
4290		a_pg_offset = a_offset & PAGE_MASK;
4291		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4292		b_pg = mb[b_offset >> PAGE_SHIFT];
4293		b_pg_offset = b_offset & PAGE_MASK;
4294		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4295		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4297		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4298		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4299		a_cp = sysmaps->CADDR1 + a_pg_offset;
4300		b_cp = sysmaps->CADDR2 + b_pg_offset;
4301		bcopy(a_cp, b_cp, cnt);
4302		a_offset += cnt;
4303		b_offset += cnt;
4304		xfersize -= cnt;
4305	}
4306	*sysmaps->CMAP1 = 0;
4307	*sysmaps->CMAP2 = 0;
4308	sched_unpin();
4309	mtx_unlock(&sysmaps->lock);
4310}
4311
4312/*
4313 * Returns true if the pmap's pv is one of the first
4314 * 16 pvs linked to from this page.  This count may
4315 * be changed upwards or downwards in the future; it
4316 * is only necessary that true be returned for a small
4317 * subset of pmaps for proper page aging.
4318 */
4319boolean_t
4320pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4321{
4322	struct md_page *pvh;
4323	pv_entry_t pv;
4324	int loops = 0;
4325	boolean_t rv;
4326
4327	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4328	    ("pmap_page_exists_quick: page %p is not managed", m));
4329	rv = FALSE;
4330	rw_wlock(&pvh_global_lock);
4331	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4332		if (PV_PMAP(pv) == pmap) {
4333			rv = TRUE;
4334			break;
4335		}
4336		loops++;
4337		if (loops >= 16)
4338			break;
4339	}
4340	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4341		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4342		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4343			if (PV_PMAP(pv) == pmap) {
4344				rv = TRUE;
4345				break;
4346			}
4347			loops++;
4348			if (loops >= 16)
4349				break;
4350		}
4351	}
4352	rw_wunlock(&pvh_global_lock);
4353	return (rv);
4354}
4355
4356/*
4357 *	pmap_page_wired_mappings:
4358 *
4359 *	Return the number of managed mappings to the given physical page
4360 *	that are wired.
4361 */
4362int
4363pmap_page_wired_mappings(vm_page_t m)
4364{
4365	int count;
4366
4367	count = 0;
4368	if ((m->oflags & VPO_UNMANAGED) != 0)
4369		return (count);
4370	rw_wlock(&pvh_global_lock);
4371	count = pmap_pvh_wired_mappings(&m->md, count);
4372	if ((m->flags & PG_FICTITIOUS) == 0) {
		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
		    count);
4375	}
4376	rw_wunlock(&pvh_global_lock);
4377	return (count);
4378}
4379
4380/*
4381 *	pmap_pvh_wired_mappings:
4382 *
4383 *	Return the updated number "count" of managed mappings that are wired.
4384 */
4385static int
4386pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4387{
4388	pmap_t pmap;
4389	pt_entry_t *pte;
4390	pv_entry_t pv;
4391
4392	rw_assert(&pvh_global_lock, RA_WLOCKED);
4393	sched_pin();
4394	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4395		pmap = PV_PMAP(pv);
4396		PMAP_LOCK(pmap);
4397		pte = pmap_pte_quick(pmap, pv->pv_va);
4398		if ((*pte & PG_W) != 0)
4399			count++;
4400		PMAP_UNLOCK(pmap);
4401	}
4402	sched_unpin();
4403	return (count);
4404}
4405
4406/*
4407 * Returns TRUE if the given page is mapped individually or as part of
4408 * a 4mpage.  Otherwise, returns FALSE.
4409 */
4410boolean_t
4411pmap_page_is_mapped(vm_page_t m)
4412{
4413	boolean_t rv;
4414
4415	if ((m->oflags & VPO_UNMANAGED) != 0)
4416		return (FALSE);
4417	rw_wlock(&pvh_global_lock);
4418	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4419	    ((m->flags & PG_FICTITIOUS) == 0 &&
4420	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4421	rw_wunlock(&pvh_global_lock);
4422	return (rv);
4423}
4424
4425/*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  Also, this code is special cased for the current process
 * only, but can have the more generic (and slightly slower) mode
 * enabled.  This is much faster than pmap_remove in the case of running
 * down an entire address space.
4432 */
4433void
4434pmap_remove_pages(pmap_t pmap)
4435{
4436	pt_entry_t *pte, tpte;
4437	vm_page_t free = NULL;
4438	vm_page_t m, mpte, mt;
4439	pv_entry_t pv;
4440	struct md_page *pvh;
4441	struct pv_chunk *pc, *npc;
4442	int field, idx;
4443	int32_t bit;
4444	uint32_t inuse, bitmask;
4445	int allfree;
4446
4447	if (pmap != PCPU_GET(curpmap)) {
4448		printf("warning: pmap_remove_pages called with non-current pmap\n");
4449		return;
4450	}
4451	rw_wlock(&pvh_global_lock);
4452	PMAP_LOCK(pmap);
4453	sched_pin();
4454	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4455		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4456		    pc->pc_pmap));
4457		allfree = 1;
4458		for (field = 0; field < _NPCM; field++) {
4459			inuse = ~pc->pc_map[field] & pc_freemask[field];
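			/*
			 * Each clear bit in pc_map denotes an allocated
			 * pv entry, so inverting the map and masking it
			 * with pc_freemask yields the in-use entries;
			 * bsfl() then visits them lowest index first.
			 */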
4460			while (inuse != 0) {
4461				bit = bsfl(inuse);
4462				bitmask = 1UL << bit;
4463				idx = field * 32 + bit;
4464				pv = &pc->pc_pventry[idx];
4465				inuse &= ~bitmask;
4466
4467				pte = pmap_pde(pmap, pv->pv_va);
4468				tpte = *pte;
4469				if ((tpte & PG_PS) == 0) {
4470					pte = vtopte(pv->pv_va);
4471					tpte = *pte & ~PG_PTE_PAT;
4472				}
4473
4474				if (tpte == 0) {
4475					printf(
4476					    "TPTE at %p  IS ZERO @ VA %08x\n",
4477					    pte, pv->pv_va);
4478					panic("bad pte");
4479				}
4480
4481/*
4482 * We cannot remove wired pages from a process' mapping at this time
4483 */
4484				if (tpte & PG_W) {
4485					allfree = 0;
4486					continue;
4487				}
4488
4489				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4490				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4491				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4492				    m, (uintmax_t)m->phys_addr,
4493				    (uintmax_t)tpte));
4494
4495				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4496				    m < &vm_page_array[vm_page_array_size],
4497				    ("pmap_remove_pages: bad tpte %#jx",
4498				    (uintmax_t)tpte));
4499
4500				pte_clear(pte);
4501
4502				/*
4503				 * Update the vm_page_t clean/reference bits.
4504				 */
4505				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4506					if ((tpte & PG_PS) != 0) {
4507						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4508							vm_page_dirty(mt);
4509					} else
4510						vm_page_dirty(m);
4511				}
4512
4513				/* Mark free */
4514				PV_STAT(pv_entry_frees++);
4515				PV_STAT(pv_entry_spare++);
4516				pv_entry_count--;
4517				pc->pc_map[field] |= bitmask;
4518				if ((tpte & PG_PS) != 0) {
4519					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4520					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4521					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4522					if (TAILQ_EMPTY(&pvh->pv_list)) {
4523						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4524							if (TAILQ_EMPTY(&mt->md.pv_list))
4525								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4526					}
4527					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4528					if (mpte != NULL) {
4529						pmap_remove_pt_page(pmap, mpte);
4530						pmap->pm_stats.resident_count--;
4531						KASSERT(mpte->wire_count == NPTEPG,
4532						    ("pmap_remove_pages: pte page wire count error"));
4533						mpte->wire_count = 0;
4534						pmap_add_delayed_free_list(mpte, &free, FALSE);
4535						atomic_subtract_int(&cnt.v_wire_count, 1);
4536					}
4537				} else {
4538					pmap->pm_stats.resident_count--;
4539					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4540					if (TAILQ_EMPTY(&m->md.pv_list) &&
4541					    (m->flags & PG_FICTITIOUS) == 0) {
4542						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4543						if (TAILQ_EMPTY(&pvh->pv_list))
4544							vm_page_aflag_clear(m, PGA_WRITEABLE);
4545					}
4546					pmap_unuse_pt(pmap, pv->pv_va, &free);
4547				}
4548			}
4549		}
4550		if (allfree) {
4551			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4552			free_pv_chunk(pc);
4553		}
4554	}
4555	sched_unpin();
4556	pmap_invalidate_all(pmap);
4557	rw_wunlock(&pvh_global_lock);
4558	PMAP_UNLOCK(pmap);
4559	pmap_free_zero_pages(free);
4560}
4561
4562/*
4563 *	pmap_is_modified:
4564 *
4565 *	Return whether or not the specified physical page was modified
4566 *	in any physical maps.
4567 */
4568boolean_t
4569pmap_is_modified(vm_page_t m)
4570{
4571	boolean_t rv;
4572
4573	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4574	    ("pmap_is_modified: page %p is not managed", m));
4575
4576	/*
4577	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
4578	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4579	 * is clear, no PTEs can have PG_M set.
4580	 */
4581	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4582	if ((m->oflags & VPO_BUSY) == 0 &&
4583	    (m->aflags & PGA_WRITEABLE) == 0)
4584		return (FALSE);
4585	rw_wlock(&pvh_global_lock);
4586	rv = pmap_is_modified_pvh(&m->md) ||
4587	    ((m->flags & PG_FICTITIOUS) == 0 &&
4588	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4589	rw_wunlock(&pvh_global_lock);
4590	return (rv);
4591}
4592
4593/*
4594 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4596 * mappings are supported.
4597 */
4598static boolean_t
4599pmap_is_modified_pvh(struct md_page *pvh)
4600{
4601	pv_entry_t pv;
4602	pt_entry_t *pte;
4603	pmap_t pmap;
4604	boolean_t rv;
4605
4606	rw_assert(&pvh_global_lock, RA_WLOCKED);
4607	rv = FALSE;
4608	sched_pin();
4609	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4610		pmap = PV_PMAP(pv);
4611		PMAP_LOCK(pmap);
4612		pte = pmap_pte_quick(pmap, pv->pv_va);
4613		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4614		PMAP_UNLOCK(pmap);
4615		if (rv)
4616			break;
4617	}
4618	sched_unpin();
4619	return (rv);
4620}
4621
4622/*
4623 *	pmap_is_prefaultable:
4624 *
 *	Return whether or not the specified virtual address is eligible
4626 *	for prefault.
4627 */
4628boolean_t
4629pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4630{
4631	pd_entry_t *pde;
4632	pt_entry_t *pte;
4633	boolean_t rv;
4634
4635	rv = FALSE;
4636	PMAP_LOCK(pmap);
4637	pde = pmap_pde(pmap, addr);
4638	if (*pde != 0 && (*pde & PG_PS) == 0) {
4639		pte = vtopte(addr);
4640		rv = *pte == 0;
4641	}
4642	PMAP_UNLOCK(pmap);
4643	return (rv);
4644}
4645
4646/*
4647 *	pmap_is_referenced:
4648 *
4649 *	Return whether or not the specified physical page was referenced
4650 *	in any physical maps.
4651 */
4652boolean_t
4653pmap_is_referenced(vm_page_t m)
4654{
4655	boolean_t rv;
4656
4657	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4658	    ("pmap_is_referenced: page %p is not managed", m));
4659	rw_wlock(&pvh_global_lock);
4660	rv = pmap_is_referenced_pvh(&m->md) ||
4661	    ((m->flags & PG_FICTITIOUS) == 0 &&
4662	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4663	rw_wunlock(&pvh_global_lock);
4664	return (rv);
4665}
4666
4667/*
4668 * Returns TRUE if any of the given mappings were referenced and FALSE
4669 * otherwise.  Both page and 4mpage mappings are supported.
4670 */
4671static boolean_t
4672pmap_is_referenced_pvh(struct md_page *pvh)
4673{
4674	pv_entry_t pv;
4675	pt_entry_t *pte;
4676	pmap_t pmap;
4677	boolean_t rv;
4678
4679	rw_assert(&pvh_global_lock, RA_WLOCKED);
4680	rv = FALSE;
4681	sched_pin();
4682	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4683		pmap = PV_PMAP(pv);
4684		PMAP_LOCK(pmap);
4685		pte = pmap_pte_quick(pmap, pv->pv_va);
4686		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4687		PMAP_UNLOCK(pmap);
4688		if (rv)
4689			break;
4690	}
4691	sched_unpin();
4692	return (rv);
4693}
4694
4695/*
4696 * Clear the write and modified bits in each of the given page's mappings.
4697 */
4698void
4699pmap_remove_write(vm_page_t m)
4700{
4701	struct md_page *pvh;
4702	pv_entry_t next_pv, pv;
4703	pmap_t pmap;
4704	pd_entry_t *pde;
4705	pt_entry_t oldpte, *pte;
4706	vm_offset_t va;
4707
4708	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4709	    ("pmap_remove_write: page %p is not managed", m));
4710
4711	/*
4712	 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
4713	 * another thread while the object is locked.  Thus, if PGA_WRITEABLE
4714	 * is clear, no page table entries need updating.
4715	 */
4716	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4717	if ((m->oflags & VPO_BUSY) == 0 &&
4718	    (m->aflags & PGA_WRITEABLE) == 0)
4719		return;
4720	rw_wlock(&pvh_global_lock);
4721	sched_pin();
4722	if ((m->flags & PG_FICTITIOUS) != 0)
4723		goto small_mappings;
4724	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4725	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4726		va = pv->pv_va;
4727		pmap = PV_PMAP(pv);
4728		PMAP_LOCK(pmap);
4729		pde = pmap_pde(pmap, va);
4730		if ((*pde & PG_RW) != 0)
4731			(void)pmap_demote_pde(pmap, pde, va);
4732		PMAP_UNLOCK(pmap);
4733	}
4734small_mappings:
4735	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4736		pmap = PV_PMAP(pv);
4737		PMAP_LOCK(pmap);
4738		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4740		    " a 4mpage in page %p's pv list", m));
4741		pte = pmap_pte_quick(pmap, pv->pv_va);
4742retry:
4743		oldpte = *pte;
4744		if ((oldpte & PG_RW) != 0) {
4745			/*
4746			 * Regardless of whether a pte is 32 or 64 bits
4747			 * in size, PG_RW and PG_M are among the least
4748			 * significant 32 bits.
4749			 */
4750			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4751			    oldpte & ~(PG_RW | PG_M)))
4752				goto retry;
4753			if ((oldpte & PG_M) != 0)
4754				vm_page_dirty(m);
4755			pmap_invalidate_page(pmap, pv->pv_va);
4756		}
4757		PMAP_UNLOCK(pmap);
4758	}
4759	vm_page_aflag_clear(m, PGA_WRITEABLE);
4760	sched_unpin();
4761	rw_wunlock(&pvh_global_lock);
4762}
4763
4764/*
4765 *	pmap_ts_referenced:
4766 *
4767 *	Return a count of reference bits for a page, clearing those bits.
4768 *	It is not necessary for every reference bit to be cleared, but it
4769 *	is necessary that 0 only be returned when there are truly no
4770 *	reference bits set.
4771 *
4772 *	XXX: The exact number of bits to check and clear is a matter that
4773 *	should be tested and standardized at some point in the future for
4774 *	optimal aging of shared pages.
4775 */
4776int
4777pmap_ts_referenced(vm_page_t m)
4778{
4779	struct md_page *pvh;
4780	pv_entry_t pv, pvf, pvn;
4781	pmap_t pmap;
4782	pd_entry_t oldpde, *pde;
4783	pt_entry_t *pte;
4784	vm_offset_t va;
4785	int rtval = 0;
4786
4787	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4788	    ("pmap_ts_referenced: page %p is not managed", m));
4789	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4790	rw_wlock(&pvh_global_lock);
4791	sched_pin();
4792	if ((m->flags & PG_FICTITIOUS) != 0)
4793		goto small_mappings;
4794	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4795		va = pv->pv_va;
4796		pmap = PV_PMAP(pv);
4797		PMAP_LOCK(pmap);
4798		pde = pmap_pde(pmap, va);
4799		oldpde = *pde;
4800		if ((oldpde & PG_A) != 0) {
4801			if (pmap_demote_pde(pmap, pde, va)) {
4802				if ((oldpde & PG_W) == 0) {
4803					/*
4804					 * Remove the mapping to a single page
4805					 * so that a subsequent access may
4806					 * repromote.  Since the underlying
4807					 * page table page is fully populated,
4808					 * this removal never frees a page
4809					 * table page.
4810					 */
4811					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4812					    PG_PS_FRAME);
4813					pmap_remove_page(pmap, va, NULL);
4814					rtval++;
4815					if (rtval > 4) {
4816						PMAP_UNLOCK(pmap);
4817						goto out;
4818					}
4819				}
4820			}
4821		}
4822		PMAP_UNLOCK(pmap);
4823	}
4824small_mappings:
4825	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4826		pvf = pv;
4827		do {
4828			pvn = TAILQ_NEXT(pv, pv_list);
4829			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4830			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4831			pmap = PV_PMAP(pv);
4832			PMAP_LOCK(pmap);
4833			pde = pmap_pde(pmap, pv->pv_va);
4834			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4835			    " found a 4mpage in page %p's pv list", m));
4836			pte = pmap_pte_quick(pmap, pv->pv_va);
4837			if ((*pte & PG_A) != 0) {
4838				atomic_clear_int((u_int *)pte, PG_A);
4839				pmap_invalidate_page(pmap, pv->pv_va);
4840				rtval++;
4841				if (rtval > 4)
4842					pvn = NULL;
4843			}
4844			PMAP_UNLOCK(pmap);
4845		} while ((pv = pvn) != NULL && pv != pvf);
4846	}
4847out:
4848	sched_unpin();
4849	rw_wunlock(&pvh_global_lock);
4850	return (rtval);
4851}
4852
4853/*
4854 *	Clear the modify bits on the specified physical page.
4855 */
4856void
4857pmap_clear_modify(vm_page_t m)
4858{
4859	struct md_page *pvh;
4860	pv_entry_t next_pv, pv;
4861	pmap_t pmap;
4862	pd_entry_t oldpde, *pde;
4863	pt_entry_t oldpte, *pte;
4864	vm_offset_t va;
4865
4866	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4867	    ("pmap_clear_modify: page %p is not managed", m));
4868	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4869	KASSERT((m->oflags & VPO_BUSY) == 0,
4870	    ("pmap_clear_modify: page %p is busy", m));
4871
4872	/*
4873	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4874	 * If the object containing the page is locked and the page is not
4875	 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
4876	 */
4877	if ((m->aflags & PGA_WRITEABLE) == 0)
4878		return;
4879	rw_wlock(&pvh_global_lock);
4880	sched_pin();
4881	if ((m->flags & PG_FICTITIOUS) != 0)
4882		goto small_mappings;
4883	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4884	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4885		va = pv->pv_va;
4886		pmap = PV_PMAP(pv);
4887		PMAP_LOCK(pmap);
4888		pde = pmap_pde(pmap, va);
4889		oldpde = *pde;
4890		if ((oldpde & PG_RW) != 0) {
4891			if (pmap_demote_pde(pmap, pde, va)) {
4892				if ((oldpde & PG_W) == 0) {
4893					/*
4894					 * Write protect the mapping to a
4895					 * single page so that a subsequent
4896					 * write access may repromote.
4897					 */
4898					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4899					    PG_PS_FRAME);
4900					pte = pmap_pte_quick(pmap, va);
4901					oldpte = *pte;
4902					if ((oldpte & PG_V) != 0) {
4903						/*
4904						 * Regardless of whether a pte is 32 or 64 bits
4905						 * in size, PG_RW and PG_M are among the least
4906						 * significant 32 bits.
4907						 */
4908						while (!atomic_cmpset_int((u_int *)pte,
4909						    oldpte,
4910						    oldpte & ~(PG_M | PG_RW)))
4911							oldpte = *pte;
4912						vm_page_dirty(m);
4913						pmap_invalidate_page(pmap, va);
4914					}
4915				}
4916			}
4917		}
4918		PMAP_UNLOCK(pmap);
4919	}
4920small_mappings:
4921	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4922		pmap = PV_PMAP(pv);
4923		PMAP_LOCK(pmap);
4924		pde = pmap_pde(pmap, pv->pv_va);
4925		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4926		    " a 4mpage in page %p's pv list", m));
4927		pte = pmap_pte_quick(pmap, pv->pv_va);
4928		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4929			/*
4930			 * Regardless of whether a pte is 32 or 64 bits
4931			 * in size, PG_M is among the least significant
4932			 * 32 bits.
4933			 */
4934			atomic_clear_int((u_int *)pte, PG_M);
4935			pmap_invalidate_page(pmap, pv->pv_va);
4936		}
4937		PMAP_UNLOCK(pmap);
4938	}
4939	sched_unpin();
4940	rw_wunlock(&pvh_global_lock);
4941}
4942
4943/*
4944 *	pmap_clear_reference:
4945 *
4946 *	Clear the reference bit on the specified physical page.
4947 */
4948void
4949pmap_clear_reference(vm_page_t m)
4950{
4951	struct md_page *pvh;
4952	pv_entry_t next_pv, pv;
4953	pmap_t pmap;
4954	pd_entry_t oldpde, *pde;
4955	pt_entry_t *pte;
4956	vm_offset_t va;
4957
4958	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4959	    ("pmap_clear_reference: page %p is not managed", m));
4960	rw_wlock(&pvh_global_lock);
4961	sched_pin();
4962	if ((m->flags & PG_FICTITIOUS) != 0)
4963		goto small_mappings;
4964	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4965	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4966		va = pv->pv_va;
4967		pmap = PV_PMAP(pv);
4968		PMAP_LOCK(pmap);
4969		pde = pmap_pde(pmap, va);
4970		oldpde = *pde;
4971		if ((oldpde & PG_A) != 0) {
4972			if (pmap_demote_pde(pmap, pde, va)) {
4973				/*
4974				 * Remove the mapping to a single page so
4975				 * that a subsequent access may repromote.
4976				 * Since the underlying page table page is
4977				 * fully populated, this removal never frees
4978				 * a page table page.
4979				 */
4980				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4981				    PG_PS_FRAME);
4982				pmap_remove_page(pmap, va, NULL);
4983			}
4984		}
4985		PMAP_UNLOCK(pmap);
4986	}
4987small_mappings:
4988	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4989		pmap = PV_PMAP(pv);
4990		PMAP_LOCK(pmap);
4991		pde = pmap_pde(pmap, pv->pv_va);
4992		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4993		    " a 4mpage in page %p's pv list", m));
4994		pte = pmap_pte_quick(pmap, pv->pv_va);
4995		if ((*pte & PG_A) != 0) {
4996			/*
4997			 * Regardless of whether a pte is 32 or 64 bits
4998			 * in size, PG_A is among the least significant
4999			 * 32 bits.
5000			 */
5001			atomic_clear_int((u_int *)pte, PG_A);
5002			pmap_invalidate_page(pmap, pv->pv_va);
5003		}
5004		PMAP_UNLOCK(pmap);
5005	}
5006	sched_unpin();
5007	rw_wunlock(&pvh_global_lock);
5008}
5009
5010/*
5011 * Miscellaneous support routines follow
5012 */
5013
5014/* Adjust the cache mode for a 4KB page mapped via a PTE. */
5015static __inline void
5016pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5017{
5018	u_int opte, npte;
5019
5020	/*
5021	 * The cache mode bits are all in the low 32-bits of the
5022	 * PTE, so we can just spin on updating the low 32-bits.
5023	 */
5024	do {
5025		opte = *(u_int *)pte;
5026		npte = opte & ~PG_PTE_CACHE;
5027		npte |= cache_bits;
5028	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5029}
5030
5031/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5032static __inline void
5033pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5034{
5035	u_int opde, npde;
5036
5037	/*
5038	 * The cache mode bits are all in the low 32-bits of the
5039	 * PDE, so we can just spin on updating the low 32-bits.
5040	 */
5041	do {
5042		opde = *(u_int *)pde;
5043		npde = opde & ~PG_PDE_CACHE;
5044		npde |= cache_bits;
5045	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5046}
5047
5048/*
5049 * Map a set of physical memory pages into the kernel virtual
5050 * address space. Return a pointer to where it is mapped. This
5051 * routine is intended to be used for mapping device memory,
5052 * NOT real memory.
5053 */
5054void *
5055pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5056{
5057	vm_offset_t va, offset;
5058	vm_size_t tmpsize;
5059
5060	offset = pa & PAGE_MASK;
5061	size = roundup(offset + size, PAGE_SIZE);
5062	pa = pa & PG_FRAME;
5063
5064	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5065		va = KERNBASE + pa;
5066	else
5067		va = kmem_alloc_nofault(kernel_map, size);
5068	if (!va)
5069		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5070
5071	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5072		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5073	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5074	pmap_invalidate_cache_range(va, va + size);
5075	return ((void *)(va + offset));
5076}
5077
5078void *
5079pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5080{
5081
5082	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5083}
5084
5085void *
5086pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5087{
5088
5089	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5090}
5091
5092void
5093pmap_unmapdev(vm_offset_t va, vm_size_t size)
5094{
5095	vm_offset_t base, offset;
5096
5097	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5098		return;
5099	base = trunc_page(va);
5100	offset = va & PAGE_MASK;
5101	size = roundup(offset + size, PAGE_SIZE);
5102	kmem_free(kernel_map, base, size);
5103}
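
/*
 * A minimal usage sketch for the pmap_mapdev()/pmap_unmapdev() pair
 * (hypothetical driver code; "regs", "reg_pa", and "reg_size" are
 * placeholders):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(reg_pa, reg_size);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, reg_size);
 *
 * Most drivers reach these routines indirectly through bus_space(9)
 * resource mapping rather than by calling them directly.
 */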
5104
5105/*
5106 * Sets the memory attribute for the specified page.
5107 */
5108void
5109pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5110{
5111
5112	m->md.pat_mode = ma;
5113	if ((m->flags & PG_FICTITIOUS) != 0)
5114		return;
5115
5116	/*
5117	 * If "m" is a normal page, flush it from the cache.
5118	 * See pmap_invalidate_cache_range().
5119	 *
	 * First, try to find an existing mapping of the page by an sf
	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
	 * flushes the cache.
5123	 */
5124	if (sf_buf_invalidate_cache(m))
5125		return;
5126
5127	/*
	 * If the page is not mapped by an sf buffer, but the CPU does not
	 * support self snoop, map the page transiently and do the
	 * invalidation.  In the worst case, the whole cache is flushed by
	 * pmap_invalidate_cache_range().
5132	 */
5133	if ((cpu_feature & CPUID_SS) == 0)
5134		pmap_flush_page(m);
5135}
5136
5137static void
5138pmap_flush_page(vm_page_t m)
5139{
5140	struct sysmaps *sysmaps;
5141	vm_offset_t sva, eva;
5142
5143	if ((cpu_feature & CPUID_CLFSH) != 0) {
5144		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5145		mtx_lock(&sysmaps->lock);
5146		if (*sysmaps->CMAP2)
5147			panic("pmap_flush_page: CMAP2 busy");
5148		sched_pin();
5149		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5150		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5151		invlcaddr(sysmaps->CADDR2);
5152		sva = (vm_offset_t)sysmaps->CADDR2;
5153		eva = sva + PAGE_SIZE;
5154
5155		/*
5156		 * Use mfence despite the ordering implied by
5157		 * mtx_{un,}lock() because clflush is not guaranteed
5158		 * to be ordered by any other instruction.
5159		 */
5160		mfence();
5161		for (; sva < eva; sva += cpu_clflush_line_size)
5162			clflush(sva);
5163		mfence();
5164		*sysmaps->CMAP2 = 0;
5165		sched_unpin();
5166		mtx_unlock(&sysmaps->lock);
5167	} else
5168		pmap_invalidate_cache();
5169}
5170
5171/*
5172 * Changes the specified virtual address range's memory type to that given by
5173 * the parameter "mode".  The specified virtual address range must be
 * completely contained within the kernel map.
5175 *
5176 * Returns zero if the change completed successfully, and either EINVAL or
5177 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5178 * of the virtual address range was not mapped, and ENOMEM is returned if
5179 * there was insufficient memory available to complete the change.
5180 */
5181int
5182pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5183{
5184	vm_offset_t base, offset, tmpva;
5185	pd_entry_t *pde;
5186	pt_entry_t *pte;
5187	int cache_bits_pte, cache_bits_pde;
5188	boolean_t changed;
5189
5190	base = trunc_page(va);
5191	offset = va & PAGE_MASK;
5192	size = roundup(offset + size, PAGE_SIZE);
5193
5194	/*
5195	 * Only supported on kernel virtual addresses above the recursive map.
5196	 */
5197	if (base < VM_MIN_KERNEL_ADDRESS)
5198		return (EINVAL);
5199
5200	cache_bits_pde = pmap_cache_bits(mode, 1);
5201	cache_bits_pte = pmap_cache_bits(mode, 0);
5202	changed = FALSE;
5203
5204	/*
5205	 * Pages that aren't mapped aren't supported.  Also break down
5206	 * 2/4MB pages into 4KB pages if required.
5207	 */
5208	PMAP_LOCK(kernel_pmap);
5209	for (tmpva = base; tmpva < base + size; ) {
5210		pde = pmap_pde(kernel_pmap, tmpva);
5211		if (*pde == 0) {
5212			PMAP_UNLOCK(kernel_pmap);
5213			return (EINVAL);
5214		}
5215		if (*pde & PG_PS) {
5216			/*
5217			 * If the current 2/4MB page already has
5218			 * the required memory type, then we need not
5219			 * demote this page.  Just increment tmpva to
5220			 * the next 2/4MB page frame.
5221			 */
5222			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5223				tmpva = trunc_4mpage(tmpva) + NBPDR;
5224				continue;
5225			}
5226
5227			/*
5228			 * If the current offset aligns with a 2/4MB
5229			 * page frame and there is at least 2/4MB left
5230			 * within the range, then we need not break
5231			 * down this page into 4KB pages.
5232			 */
5233			if ((tmpva & PDRMASK) == 0 &&
5234			    tmpva + PDRMASK < base + size) {
5235				tmpva += NBPDR;
5236				continue;
5237			}
5238			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5239				PMAP_UNLOCK(kernel_pmap);
5240				return (ENOMEM);
5241			}
5242		}
5243		pte = vtopte(tmpva);
5244		if (*pte == 0) {
5245			PMAP_UNLOCK(kernel_pmap);
5246			return (EINVAL);
5247		}
5248		tmpva += PAGE_SIZE;
5249	}
5250	PMAP_UNLOCK(kernel_pmap);
5251
5252	/*
5253	 * Ok, all the pages exist, so run through them updating their
5254	 * cache mode if required.
5255	 */
5256	for (tmpva = base; tmpva < base + size; ) {
5257		pde = pmap_pde(kernel_pmap, tmpva);
5258		if (*pde & PG_PS) {
5259			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5260				pmap_pde_attr(pde, cache_bits_pde);
5261				changed = TRUE;
5262			}
5263			tmpva = trunc_4mpage(tmpva) + NBPDR;
5264		} else {
5265			pte = vtopte(tmpva);
5266			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5267				pmap_pte_attr(pte, cache_bits_pte);
5268				changed = TRUE;
5269			}
5270			tmpva += PAGE_SIZE;
5271		}
5272	}
5273
5274	/*
	 * Flush the CPU caches so that no data remains cached with the
	 * old memory type.
5277	 */
5278	if (changed) {
5279		pmap_invalidate_range(kernel_pmap, base, tmpva);
5280		pmap_invalidate_cache_range(base, tmpva);
5281	}
5282	return (0);
5283}
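
/*
 * A minimal usage sketch for pmap_change_attr() (hypothetical caller;
 * "fb_va" and "fb_size" are placeholders for an already mapped
 * framebuffer):
 *
 *	int error;
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *	if (error != 0)
 *		printf("write combining not enabled: %d\n", error);
 *
 * EINVAL indicates an unmapped portion of the range and ENOMEM a
 * failed 2/4MB page demotion, as described above.
 */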
5284
5285/*
5286 * perform the pmap work for mincore
5287 */
5288int
5289pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5290{
5291	pd_entry_t *pdep;
5292	pt_entry_t *ptep, pte;
5293	vm_paddr_t pa;
5294	int val;
5295
5296	PMAP_LOCK(pmap);
5297retry:
5298	pdep = pmap_pde(pmap, addr);
5299	if (*pdep != 0) {
5300		if (*pdep & PG_PS) {
5301			pte = *pdep;
5302			/* Compute the physical address of the 4KB page. */
5303			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5304			    PG_FRAME;
5305			val = MINCORE_SUPER;
5306		} else {
5307			ptep = pmap_pte(pmap, addr);
5308			pte = *ptep;
5309			pmap_pte_release(ptep);
5310			pa = pte & PG_FRAME;
5311			val = 0;
5312		}
5313	} else {
5314		pte = 0;
5315		pa = 0;
5316		val = 0;
5317	}
5318	if ((pte & PG_V) != 0) {
5319		val |= MINCORE_INCORE;
5320		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5321			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5322		if ((pte & PG_A) != 0)
5323			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5324	}
5325	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5326	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5327	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5328		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5329		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5330			goto retry;
5331	} else
5332		PA_UNLOCK_COND(*locked_pa);
5333	PMAP_UNLOCK(pmap);
5334	return (val);
5335}
5336
5337void
5338pmap_activate(struct thread *td)
5339{
5340	pmap_t	pmap, oldpmap;
5341	u_int	cpuid;
5342	u_int32_t  cr3;
5343
5344	critical_enter();
5345	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5346	oldpmap = PCPU_GET(curpmap);
5347	cpuid = PCPU_GET(cpuid);
5348#if defined(SMP)
5349	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5350	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5351#else
5352	CPU_CLR(cpuid, &oldpmap->pm_active);
5353	CPU_SET(cpuid, &pmap->pm_active);
5354#endif
5355#ifdef PAE
5356	cr3 = vtophys(pmap->pm_pdpt);
5357#else
5358	cr3 = vtophys(pmap->pm_pdir);
5359#endif
5360	/*
5361	 * pmap_activate is for the current thread on the current cpu
5362	 */
5363	td->td_pcb->pcb_cr3 = cr3;
5364	load_cr3(cr3);
5365	PCPU_SET(curpmap, pmap);
5366	critical_exit();
5367}
5368
5369void
5370pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5371{
5372}
5373
5374/*
5375 *	Increase the starting virtual address of the given mapping if a
5376 *	different alignment might result in more superpage mappings.
5377 */
5378void
5379pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5380    vm_offset_t *addr, vm_size_t size)
5381{
5382	vm_offset_t superpage_offset;
5383
5384	if (size < NBPDR)
5385		return;
5386	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5387		offset += ptoa(object->pg_color);
5388	superpage_offset = offset & PDRMASK;
5389	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5390	    (*addr & PDRMASK) == superpage_offset)
5391		return;
5392	if ((*addr & PDRMASK) < superpage_offset)
5393		*addr = (*addr & ~PDRMASK) + superpage_offset;
5394	else
5395		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5396}
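
/*
 * For example, with 4MB superpages (PDRMASK == 0x3fffff), an object
 * offset (adjusted for color) of 0x00123000 gives superpage_offset ==
 * 0x00123000.  A hint address of 0x20000000 and a sufficiently large
 * size then yield *addr == 0x20123000, so that the mapping's offset
 * within a 4MB virtual region matches the object's, which is what
 * later allows fully populated reservations to be promoted.
 */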
5397
5398
5399#if defined(PMAP_DEBUG)
int
pmap_pid_dump(int pid)
5401{
5402	pmap_t pmap;
5403	struct proc *p;
5404	int npte = 0;
5405	int index;
5406
5407	sx_slock(&allproc_lock);
5408	FOREACH_PROC_IN_SYSTEM(p) {
5409		if (p->p_pid != pid)
5410			continue;
5411
5412		if (p->p_vmspace) {
5413			int i,j;
5414			index = 0;
5415			pmap = vmspace_pmap(p->p_vmspace);
5416			for (i = 0; i < NPDEPTD; i++) {
5417				pd_entry_t *pde;
5418				pt_entry_t *pte;
5419				vm_offset_t base = i << PDRSHIFT;
5420
5421				pde = &pmap->pm_pdir[i];
5422				if (pde && pmap_pde_v(pde)) {
5423					for (j = 0; j < NPTEPG; j++) {
5424						vm_offset_t va = base + (j << PAGE_SHIFT);
5425						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5426							if (index) {
5427								index = 0;
5428								printf("\n");
5429							}
5430							sx_sunlock(&allproc_lock);
5431							return (npte);
5432						}
5433						pte = pmap_pte(pmap, va);
5434						if (pte && pmap_pte_v(pte)) {
5435							pt_entry_t pa;
5436							vm_page_t m;
5437							pa = *pte;
5438							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5439							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5440								va, pa, m->hold_count, m->wire_count, m->flags);
5441							npte++;
5442							index++;
5443							if (index >= 2) {
5444								index = 0;
5445								printf("\n");
5446							} else {
5447								printf(" ");
5448							}
5449						}
5450					}
5451				}
5452			}
5453		}
5454	}
5455	sx_sunlock(&allproc_lock);
5456	return (npte);
5457}
5458#endif
5459
5460#if defined(DEBUG)
5461
5462static void	pads(pmap_t pm);
5463void		pmap_pvdump(vm_paddr_t pa);
5464
/* print address space of pmap */
5466static void
5467pads(pmap_t pm)
5468{
5469	int i, j;
5470	vm_paddr_t va;
5471	pt_entry_t *ptep;
5472
5473	if (pm == kernel_pmap)
5474		return;
5475	for (i = 0; i < NPDEPTD; i++)
5476		if (pm->pm_pdir[i])
5477			for (j = 0; j < NPTEPG; j++) {
5478				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5479				if (pm == kernel_pmap && va < KERNBASE)
5480					continue;
5481				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5482					continue;
5483				ptep = pmap_pte(pm, va);
5484				if (pmap_pte_v(ptep))
5485					printf("%x:%x ", va, *ptep);
			}
5487
5488}
5489
5490void
5491pmap_pvdump(vm_paddr_t pa)
5492{
5493	pv_entry_t pv;
5494	pmap_t pmap;
5495	vm_page_t m;
5496
5497	printf("pa %x", pa);
5498	m = PHYS_TO_VM_PAGE(pa);
5499	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5500		pmap = PV_PMAP(pv);
5501		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5502		pads(pmap);
5503	}
5504	printf(" ");
5505}
5506#endif
5507