pmap.c revision 285830
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44 */
45/*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 *    notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 *    notice, this list of conditions and the following disclaimer in the
62 *    documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77#include <sys/cdefs.h>
78__FBSDID("$FreeBSD: releng/10.2/sys/i386/xen/pmap.c 273136 2014-10-15 14:07:24Z kib $");
79
80/*
81 *	Manages physical address maps.
82 *
83 *	Since the information managed by this module is
84 *	also stored by the logical address mapping module,
85 *	this module may throw away valid virtual-to-physical
86 *	mappings at almost any time.  However, invalidations
87 *	of virtual-to-physical mappings must be done as
88 *	requested.
89 *
90 *	In order to cope with hardware architectures which
91 *	make virtual-to-physical map invalidations expensive,
92 *	this module may delay invalidation or protection-reduction
93 *	operations until they are actually necessary.  This module
94 *	is given full information as to which processors are
95 *	currently using which maps, and when physical maps must be
96 *	made correct.
97 */
98
99#include "opt_cpu.h"
100#include "opt_pmap.h"
101#include "opt_smp.h"
102#include "opt_xbox.h"
103
104#include <sys/param.h>
105#include <sys/systm.h>
106#include <sys/kernel.h>
107#include <sys/ktr.h>
108#include <sys/lock.h>
109#include <sys/malloc.h>
110#include <sys/mman.h>
111#include <sys/msgbuf.h>
112#include <sys/mutex.h>
113#include <sys/proc.h>
114#include <sys/rwlock.h>
115#include <sys/sf_buf.h>
116#include <sys/sx.h>
117#include <sys/vmmeter.h>
118#include <sys/sched.h>
119#include <sys/sysctl.h>
120#ifdef SMP
121#include <sys/smp.h>
122#else
123#include <sys/cpuset.h>
124#endif
125
126#include <vm/vm.h>
127#include <vm/vm_param.h>
128#include <vm/vm_kern.h>
129#include <vm/vm_page.h>
130#include <vm/vm_map.h>
131#include <vm/vm_object.h>
132#include <vm/vm_extern.h>
133#include <vm/vm_pageout.h>
134#include <vm/vm_pager.h>
135#include <vm/uma.h>
136
137#include <machine/cpu.h>
138#include <machine/cputypes.h>
139#include <machine/md_var.h>
140#include <machine/pcb.h>
141#include <machine/specialreg.h>
142#ifdef SMP
143#include <machine/smp.h>
144#endif
145
146#ifdef XBOX
147#include <machine/xbox.h>
148#endif
149
150#include <xen/interface/xen.h>
151#include <xen/hypervisor.h>
152#include <machine/xen/hypercall.h>
153#include <machine/xen/xenvar.h>
154#include <machine/xen/xenfunc.h>
155
156#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
157#define CPU_ENABLE_SSE
158#endif
159
160#ifndef PMAP_SHPGPERPROC
161#define PMAP_SHPGPERPROC 200
162#endif
163
164#define DIAGNOSTIC
165
166#if !defined(DIAGNOSTIC)
167#ifdef __GNUC_GNU_INLINE__
168#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
169#else
170#define PMAP_INLINE	extern inline
171#endif
172#else
173#define PMAP_INLINE
174#endif
175
176#ifdef PV_STATS
177#define PV_STAT(x)	do { x ; } while (0)
178#else
179#define PV_STAT(x)	do { } while (0)
180#endif
181
182/*
183 * Get PDEs and PTEs for user/kernel address space
184 */
185#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
186#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
187
188#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
189#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
190#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
191#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
192#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
193
194#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
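/*
 * Example (a sketch, assuming the non-PAE i386 layout where PDRSHIFT is 22
 * and each PDE maps 4 MB): for va = 0xC0400000, pmap_pde(pmap, va) yields
 * &pm_pdir[0xC0400000 >> 22] = &pm_pdir[0x301], and the pmap_pde_v() and
 * pmap_pte_v() macros simply test PG_V in the referenced entry.  Under PAE
 * the shift and entry width differ, but the macros read the same way.
 */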
195
196#define HAMFISTED_LOCKING
197#ifdef HAMFISTED_LOCKING
198static struct mtx createdelete_lock;
199#endif
200
201struct pmap kernel_pmap_store;
202LIST_HEAD(pmaplist, pmap);
203static struct pmaplist allpmaps;
204static struct mtx allpmaps_lock;
205
206vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
207vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
208int pgeflag = 0;		/* PG_G or-in */
209int pseflag = 0;		/* PG_PS or-in */
210
211int nkpt;
212vm_offset_t kernel_vm_end;
213extern u_int32_t KERNend;
214
215#ifdef PAE
216pt_entry_t pg_nx;
217#endif
218
219static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
220
221static int pat_works;			/* Is page attribute table sane? */
222
223/*
224 * This lock is defined as static in other pmap implementations.  It cannot,
225 * however, be defined as static here, because it is (ab)used to serialize
226 * queued page table changes in other source files.
227 */
228struct rwlock pvh_global_lock;
229
230/*
231 * Data for the pv entry allocation mechanism
232 */
233static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
234static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
235static int shpgperproc = PMAP_SHPGPERPROC;
236
237struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
238int pv_maxchunks;			/* How many chunks we have KVA for */
239vm_offset_t pv_vafree;			/* freelist stored in the PTE */
240
241/*
242 * All those kernel PT submaps that BSD is so fond of
243 */
244struct sysmaps {
245	struct	mtx lock;
246	pt_entry_t *CMAP1;
247	pt_entry_t *CMAP2;
248	caddr_t	CADDR1;
249	caddr_t	CADDR2;
250};
251static struct sysmaps sysmaps_pcpu[MAXCPU];
252pt_entry_t *CMAP3;
253caddr_t ptvmmap = 0;
254caddr_t CADDR3;
255struct msgbuf *msgbufp = 0;
256
257/*
258 * Crashdump maps.
259 */
260static caddr_t crashdumpmap;
261
262static pt_entry_t *PMAP1 = 0, *PMAP2;
263static pt_entry_t *PADDR1 = 0, *PADDR2;
264#ifdef SMP
265static int PMAP1cpu;
266static int PMAP1changedcpu;
267SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
268	   &PMAP1changedcpu, 0,
269	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
270#endif
271static int PMAP1changed;
272SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
273	   &PMAP1changed, 0,
274	   "Number of times pmap_pte_quick changed PMAP1");
275static int PMAP1unchanged;
276SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
277	   &PMAP1unchanged, 0,
278	   "Number of times pmap_pte_quick didn't change PMAP1");
279static struct mtx PMAP2mutex;
280
281static void	free_pv_chunk(struct pv_chunk *pc);
282static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
283static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
284static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
285static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
286		    vm_offset_t va);
287
288static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va,
289    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
290static void pmap_flush_page(vm_page_t m);
291static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
292static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
293    vm_page_t *free);
294static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
295    vm_page_t *free);
296static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
297					vm_offset_t va);
298static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
299    vm_page_t m);
300
301static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
302
303static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
304static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
305static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
306static void pmap_pte_release(pt_entry_t *pte);
307static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
308static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr);
309
310static __inline void pagezero(void *page);
311
312CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
313CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
314
315/*
316 * If you get an error here, then you set KVA_PAGES wrong! See the
317 * description of KVA_PAGES in sys/i386/include/pmap.h.  It must be a
318 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
319 */
320CTASSERT(KERNBASE % (1 << 24) == 0);
321
322void
323pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type)
324{
325	vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]);
326
327	switch (type) {
328	case SH_PD_SET_VA:
329#if 0
330		xen_queue_pt_update(shadow_pdir_ma,
331				    xpmap_ptom(val & ~(PG_RW)));
332#endif
333		xen_queue_pt_update(pdir_ma,
334				    xpmap_ptom(val));
335		break;
336	case SH_PD_SET_VA_MA:
337#if 0
338		xen_queue_pt_update(shadow_pdir_ma,
339				    val & ~(PG_RW));
340#endif
341		xen_queue_pt_update(pdir_ma, val);
342		break;
343	case SH_PD_SET_VA_CLEAR:
344#if 0
345		xen_queue_pt_update(shadow_pdir_ma, 0);
346#endif
347		xen_queue_pt_update(pdir_ma, 0);
348		break;
349	}
350}
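/*
 * In all three cases above, pd_set() funnels the page directory update
 * through xen_queue_pt_update() on the machine address of the PDE slot;
 * the cases differ only in whether 'val' is a physical address that must
 * first be translated with xpmap_ptom() (SH_PD_SET_VA), already a machine
 * address (SH_PD_SET_VA_MA), or ignored because the entry is being cleared
 * (SH_PD_SET_VA_CLEAR).
 */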
351
352/*
353 *	Bootstrap the system enough to run with virtual memory.
354 *
355 *	On the i386 this is called after mapping has already been enabled
356 *	and just syncs the pmap module with what has already been done.
357 *	[We can't call it easily with mapping off since the kernel is not
358 *	mapped with PA == VA, hence we would have to relocate every address
359 *	from the linked base (virtual) address "KERNBASE" to the actual
360 *	(physical) address starting relative to 0]
361 */
362void
363pmap_bootstrap(vm_paddr_t firstaddr)
364{
365	vm_offset_t va;
366	pt_entry_t *pte, *unused;
367	struct sysmaps *sysmaps;
368	int i;
369
370	/*
371	 * Initialize the first available kernel virtual address.  However,
372	 * using "firstaddr" may waste a few pages of the kernel virtual
373	 * address space, because locore may not have mapped every physical
374	 * page that it allocated.  Preferably, locore would provide a first
375	 * unused virtual address in addition to "firstaddr".
376	 */
377	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
378
379	virtual_end = VM_MAX_KERNEL_ADDRESS;
380
381	/*
382	 * Initialize the kernel pmap (which is statically allocated).
383	 */
384	PMAP_LOCK_INIT(kernel_pmap);
385	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
386#ifdef PAE
387	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
388#endif
389	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
390	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
391
392 	/*
393	 * Initialize the global pv list lock.
394	 */
395	rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE);
396
397	LIST_INIT(&allpmaps);
398	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
399	mtx_lock_spin(&allpmaps_lock);
400	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
401	mtx_unlock_spin(&allpmaps_lock);
402	if (nkpt == 0)
403		nkpt = NKPT;
404
405	/*
406	 * Reserve some special page table entries/VA space for temporary
407	 * mapping of pages.
408	 */
409#define	SYSMAP(c, p, v, n)	\
410	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
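	/*
	 * SYSMAP(c, p, v, n) carves "n" pages out of the boot-time KVA
	 * cursor "va": it assigns the current "va" (cast to type "c") to
	 * the pointer "v", records the matching kernel PTE slot in "p",
	 * and advances both.  For example, SYSMAP(caddr_t, CMAP3, CADDR3, 1)
	 * below reserves one page of KVA at CADDR3 whose mapping is
	 * controlled through the PTE pointer CMAP3.
	 */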
411
412	va = virtual_avail;
413	pte = vtopte(va);
414
415	/*
416	 * CMAP1/CMAP2 are used for zeroing and copying pages.
417	 * CMAP3 is used for the idle process page zeroing.
418	 */
419	for (i = 0; i < MAXCPU; i++) {
420		sysmaps = &sysmaps_pcpu[i];
421		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
422		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
423		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
424		PT_SET_MA(sysmaps->CADDR1, 0);
425		PT_SET_MA(sysmaps->CADDR2, 0);
426	}
427	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
428	PT_SET_MA(CADDR3, 0);
429
430	/*
431	 * Crashdump maps.
432	 */
433	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
434
435	/*
436	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
437	 */
438	SYSMAP(caddr_t, unused, ptvmmap, 1)
439
440	/*
441	 * msgbufp is used to map the system message buffer.
442	 */
443	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
444
445	/*
446	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
447	 * respectively.
448	 */
449	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
450	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
451
452	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
453
454	virtual_avail = va;
455
456	/*
457	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
458	 * physical memory region that is used by the ACPI wakeup code.  This
459	 * mapping must not have PG_G set.
460	 */
461#ifndef XEN
462	/*
463	 * Deliberately left in place to show that this is not supported on Xen.
464	 */
465#ifdef XBOX
466	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
467	 * an early stage, we cannot yet neatly map video memory ... :-(
468	 * Better fixes are very welcome! */
469	if (!arch_i386_is_xbox)
470#endif
471	for (i = 1; i < NKPT; i++)
472		PTD[i] = 0;
473
474	/* Initialize the PAT MSR if present. */
475	pmap_init_pat();
476
477	/* Turn on PG_G on kernel page(s) */
478	pmap_set_pg();
479#endif
480
481#ifdef HAMFISTED_LOCKING
482	mtx_init(&createdelete_lock, "pmap create/delete", NULL, MTX_DEF);
483#endif
484}
485
486/*
487 * Setup the PAT MSR.
488 */
489void
490pmap_init_pat(void)
491{
492	uint64_t pat_msr;
493
494	/* Bail if this CPU doesn't implement PAT. */
495	if (!(cpu_feature & CPUID_PAT))
496		return;
497
498	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
499	    (CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) {
500		/*
501		 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
502		 * Program 4 and 5 as WP and WC.
503		 * Leave 6 and 7 as UC and UC-.
504		 */
505		pat_msr = rdmsr(MSR_PAT);
506		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
507		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
508		    PAT_VALUE(5, PAT_WRITE_COMBINING);
509		pat_works = 1;
510	} else {
511		/*
512		 * Due to some Intel errata, we can only safely use the lower 4
513		 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
514		 * of UC-.
515		 *
516		 *   Intel Pentium III Processor Specification Update
517		 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
518		 * or Mode C Paging)
519		 *
520		 *   Intel Pentium IV  Processor Specification Update
521		 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
522		 */
523		pat_msr = rdmsr(MSR_PAT);
524		pat_msr &= ~PAT_MASK(2);
525		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
526		pat_works = 0;
527	}
528	wrmsr(MSR_PAT, pat_msr);
529}
530
531/*
532 * Initialize a vm_page's machine-dependent fields.
533 */
534void
535pmap_page_init(vm_page_t m)
536{
537
538	TAILQ_INIT(&m->md.pv_list);
539	m->md.pat_mode = PAT_WRITE_BACK;
540}
541
542/*
543 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
544 * Requirements:
545 *  - Must deal with pages in order to ensure that none of the PG_* bits
546 *    are ever set, PG_V in particular.
547 *  - Assumes we can write to ptes without pte_store() atomic ops, even
548 *    on PAE systems.  This should be ok.
549 *  - Assumes nothing will ever test these addresses for 0 to indicate
550 *    no mapping instead of correctly checking PG_V.
551 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
552 * Because PG_V is never set, there can be no mappings to invalidate.
553 */
554static int ptelist_count = 0;
555static vm_offset_t
556pmap_ptelist_alloc(vm_offset_t *head)
557{
558	vm_offset_t va;
559	vm_offset_t *phead = (vm_offset_t *)*head;
560
561	if (ptelist_count == 0) {
562		printf("pmap_ptelist_alloc: out of PTE list KVA\n");
563		return (0);	/* Out of memory */
564	}
565	ptelist_count--;
566	va = phead[ptelist_count];
567	return (va);
568}
569
570static void
571pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
572{
573	vm_offset_t *phead = (vm_offset_t *)*head;
574
575	phead[ptelist_count++] = va;
576}
577
578static void
579pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
580{
581	int i, nstackpages;
582	vm_offset_t va;
583	vm_page_t m;
584
585	nstackpages = (npages + PAGE_SIZE / sizeof(vm_offset_t) - 1) / (PAGE_SIZE / sizeof(vm_offset_t));
586	for (i = 0; i < nstackpages; i++) {
587		va = (vm_offset_t)base + i * PAGE_SIZE;
588		m = vm_page_alloc(NULL, i,
589		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
590		    VM_ALLOC_ZERO);
591		pmap_qenter(va, &m, 1);
592	}
593
594	*head = (vm_offset_t)base;
595	for (i = npages - 1; i >= nstackpages; i--) {
596		va = (vm_offset_t)base + i * PAGE_SIZE;
597		pmap_ptelist_free(head, va);
598	}
599}
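/*
 * After pmap_ptelist_init() runs, the first nstackpages pages of "base"
 * are backed by real memory and hold the array of free VAs; every
 * remaining page of the range is pushed onto that freelist via
 * pmap_ptelist_free(), so ptelist_count is npages - nstackpages and
 * pmap_ptelist_alloc() simply pops addresses off the end of the array.
 */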
600
601
602/*
603 *	Initialize the pmap module.
604 *	Called by vm_init, to initialize any structures that the pmap
605 *	system needs to map virtual memory.
606 */
607void
608pmap_init(void)
609{
610
611	/*
612	 * Initialize the address space (zone) for the pv entries.  Set a
613	 * high water mark so that the system can recover from excessive
614	 * numbers of pv entries.
615	 */
616	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
617	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
618	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
619	pv_entry_max = roundup(pv_entry_max, _NPCPV);
620	pv_entry_high_water = 9 * (pv_entry_max / 10);
621
622	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
623	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
624	if (pv_chunkbase == NULL)
625		panic("pmap_init: not enough kvm for pv chunks");
626	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
627}
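/*
 * A rough sizing sketch (hypothetical numbers, not measured): with the
 * default shpgperproc of 200 and, say, maxproc = 1000, pv_entry_max starts
 * at 200 * 1000 + v_page_count, is rounded up to a multiple of _NPCPV
 * (336), and pv_entry_high_water becomes 90% of that.  pv_maxchunks is
 * then at least pv_entry_max / 336, and one page of KVA is reserved per
 * chunk in pv_chunkbase.
 */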
628
629
630SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
631	"Max number of PV entries");
632SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
633	"Page share factor per proc");
634
635static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
636    "2/4MB page mapping counters");
637
638static u_long pmap_pde_mappings;
639SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
640    &pmap_pde_mappings, 0, "2/4MB page mappings");
641
642/***************************************************
643 * Low level helper routines.....
644 ***************************************************/
645
646/*
647 * Determine the appropriate bits to set in a PTE or PDE for a specified
648 * caching mode.
649 */
650int
651pmap_cache_bits(int mode, boolean_t is_pde)
652{
653	int pat_flag, pat_index, cache_bits;
654
655	/* The PAT bit is different for PTE's and PDE's. */
656	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
657
658	/* If we don't support PAT, map extended modes to older ones. */
659	if (!(cpu_feature & CPUID_PAT)) {
660		switch (mode) {
661		case PAT_UNCACHEABLE:
662		case PAT_WRITE_THROUGH:
663		case PAT_WRITE_BACK:
664			break;
665		case PAT_UNCACHED:
666		case PAT_WRITE_COMBINING:
667		case PAT_WRITE_PROTECTED:
668			mode = PAT_UNCACHEABLE;
669			break;
670		}
671	}
672
673	/* Map the caching mode to a PAT index. */
674	if (pat_works) {
675		switch (mode) {
676			case PAT_UNCACHEABLE:
677				pat_index = 3;
678				break;
679			case PAT_WRITE_THROUGH:
680				pat_index = 1;
681				break;
682			case PAT_WRITE_BACK:
683				pat_index = 0;
684				break;
685			case PAT_UNCACHED:
686				pat_index = 2;
687				break;
688			case PAT_WRITE_COMBINING:
689				pat_index = 5;
690				break;
691			case PAT_WRITE_PROTECTED:
692				pat_index = 4;
693				break;
694			default:
695				panic("Unknown caching mode %d\n", mode);
696		}
697	} else {
698		switch (mode) {
699			case PAT_UNCACHED:
700			case PAT_UNCACHEABLE:
701			case PAT_WRITE_PROTECTED:
702				pat_index = 3;
703				break;
704			case PAT_WRITE_THROUGH:
705				pat_index = 1;
706				break;
707			case PAT_WRITE_BACK:
708				pat_index = 0;
709				break;
710			case PAT_WRITE_COMBINING:
711				pat_index = 2;
712				break;
713			default:
714				panic("Unknown caching mode %d\n", mode);
715		}
716	}
717
718	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
719	cache_bits = 0;
720	if (pat_index & 0x4)
721		cache_bits |= pat_flag;
722	if (pat_index & 0x2)
723		cache_bits |= PG_NC_PCD;
724	if (pat_index & 0x1)
725		cache_bits |= PG_NC_PWT;
726	return (cache_bits);
727}
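/*
 * Worked example (assuming a CPU where pmap_init_pat() set pat_works):
 * pmap_cache_bits(PAT_WRITE_COMBINING, FALSE) selects pat_index 5, i.e.
 * binary 101, which the bit tests above turn into PG_PTE_PAT | PG_NC_PWT;
 * for a PDE the same index would set PG_PDE_PAT instead of PG_PTE_PAT.
 */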
728#ifdef SMP
729/*
730 * For SMP, these functions have to use the IPI mechanism for coherence.
731 *
732 * N.B.: Before calling any of the following TLB invalidation functions,
733 * the calling processor must ensure that all stores updating a non-
734 * kernel page table are globally performed.  Otherwise, another
735 * processor could cache an old, pre-update entry without being
736 * invalidated.  This can happen one of two ways: (1) The pmap becomes
737 * active on another processor after its pm_active field is checked by
738 * one of the following functions but before a store updating the page
739 * table is globally performed. (2) The pmap becomes active on another
740 * processor before its pm_active field is checked but due to
741 * speculative loads one of the following functions still reads the
742 * pmap as inactive on the other processor.
743 *
744 * The kernel page table is exempt because its pm_active field is
745 * immutable.  The kernel page table is always active on every
746 * processor.
747 */
748void
749pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
750{
751	cpuset_t other_cpus;
752	u_int cpuid;
753
754	CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x",
755	    pmap, va);
756
757	sched_pin();
758	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
759		invlpg(va);
760		smp_invlpg(va);
761	} else {
762		cpuid = PCPU_GET(cpuid);
763		other_cpus = all_cpus;
764		CPU_CLR(cpuid, &other_cpus);
765		if (CPU_ISSET(cpuid, &pmap->pm_active))
766			invlpg(va);
767		CPU_AND(&other_cpus, &pmap->pm_active);
768		if (!CPU_EMPTY(&other_cpus))
769			smp_masked_invlpg(other_cpus, va);
770	}
771	sched_unpin();
772	PT_UPDATES_FLUSH();
773}
774
775void
776pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
777{
778	cpuset_t other_cpus;
779	vm_offset_t addr;
780	u_int cpuid;
781
782	CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x",
783	    pmap, sva, eva);
784
785	sched_pin();
786	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
787		for (addr = sva; addr < eva; addr += PAGE_SIZE)
788			invlpg(addr);
789		smp_invlpg_range(sva, eva);
790	} else {
791		cpuid = PCPU_GET(cpuid);
792		other_cpus = all_cpus;
793		CPU_CLR(cpuid, &other_cpus);
794		if (CPU_ISSET(cpuid, &pmap->pm_active))
795			for (addr = sva; addr < eva; addr += PAGE_SIZE)
796				invlpg(addr);
797		CPU_AND(&other_cpus, &pmap->pm_active);
798		if (!CPU_EMPTY(&other_cpus))
799			smp_masked_invlpg_range(other_cpus, sva, eva);
800	}
801	sched_unpin();
802	PT_UPDATES_FLUSH();
803}
804
805void
806pmap_invalidate_all(pmap_t pmap)
807{
808	cpuset_t other_cpus;
809	u_int cpuid;
810
811	CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap);
812
813	sched_pin();
814	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
815		invltlb();
816		smp_invltlb();
817	} else {
818		cpuid = PCPU_GET(cpuid);
819		other_cpus = all_cpus;
820		CPU_CLR(cpuid, &other_cpus);
821		if (CPU_ISSET(cpuid, &pmap->pm_active))
822			invltlb();
823		CPU_AND(&other_cpus, &pmap->pm_active);
824		if (!CPU_EMPTY(&other_cpus))
825			smp_masked_invltlb(other_cpus);
826	}
827	sched_unpin();
828}
829
830void
831pmap_invalidate_cache(void)
832{
833
834	sched_pin();
835	wbinvd();
836	smp_cache_flush();
837	sched_unpin();
838}
839#else /* !SMP */
840/*
841 * Normal, non-SMP, 486+ invalidation functions.
842 * We inline these within pmap.c for speed.
843 */
844PMAP_INLINE void
845pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
846{
847	CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x",
848	    pmap, va);
849
850	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
851		invlpg(va);
852	PT_UPDATES_FLUSH();
853}
854
855PMAP_INLINE void
856pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
857{
858	vm_offset_t addr;
859
860	if (eva - sva > PAGE_SIZE)
861		CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x",
862		    pmap, sva, eva);
863
864	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
865		for (addr = sva; addr < eva; addr += PAGE_SIZE)
866			invlpg(addr);
867	PT_UPDATES_FLUSH();
868}
869
870PMAP_INLINE void
871pmap_invalidate_all(pmap_t pmap)
872{
873
874	CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap);
875
876	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
877		invltlb();
878}
879
880PMAP_INLINE void
881pmap_invalidate_cache(void)
882{
883
884	wbinvd();
885}
886#endif /* !SMP */
887
888#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
889
890void
891pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
892{
893
894	if (force) {
895		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
896	} else {
897		KASSERT((sva & PAGE_MASK) == 0,
898		    ("pmap_invalidate_cache_range: sva not page-aligned"));
899		KASSERT((eva & PAGE_MASK) == 0,
900		    ("pmap_invalidate_cache_range: eva not page-aligned"));
901	}
902
903	if ((cpu_feature & CPUID_SS) != 0 && !force)
904		; /* If "Self Snoop" is supported, do nothing. */
905	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
906	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
907
908		/*
909		 * Otherwise, do a per-cache-line flush.  Use the mfence
910		 * instruction to ensure that previous stores are
911		 * included in the write-back.  The processor
912		 * propagates the flush to other processors in the cache
913		 * coherence domain.
914		 */
915		mfence();
916		for (; sva < eva; sva += cpu_clflush_line_size)
917			clflush(sva);
918		mfence();
919	} else {
920
921		/*
922		 * No targeted cache flush methods are supported by the CPU,
923		 * or the supplied range is bigger than 2MB.
924		 * Globally invalidate cache.
925		 */
926		pmap_invalidate_cache();
927	}
928}
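/*
 * For a sense of scale (a sketch, assuming cpu_clflush_line_size is 64 and
 * the clflush path above is taken, i.e. no self-snoop but CLFLUSH present):
 * flushing one 4 KB page issues 4096 / 64 = 64 clflush instructions
 * bracketed by two mfences, while a range of 2 MB (PMAP_CLFLUSH_THRESHOLD)
 * or more falls back to the global wbinvd in pmap_invalidate_cache().
 */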
929
930void
931pmap_invalidate_cache_pages(vm_page_t *pages, int count)
932{
933	int i;
934
935	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
936	    (cpu_feature & CPUID_CLFSH) == 0) {
937		pmap_invalidate_cache();
938	} else {
939		for (i = 0; i < count; i++)
940			pmap_flush_page(pages[i]);
941	}
942}
943
944/*
945 * Are we current address space or kernel?  N.B. We return FALSE when
946 * a pmap's page table is in use because a kernel thread is borrowing
947 * it.  The borrowed page table can change spontaneously, making any
948 * dependence on its continued use subject to a race condition.
949 */
950static __inline int
951pmap_is_current(pmap_t pmap)
952{
953
954	return (pmap == kernel_pmap ||
955	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
956	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
957}
958
959/*
960 * If the given pmap is not the current or kernel pmap, the returned pte must
961 * be released by passing it to pmap_pte_release().
962 */
963pt_entry_t *
964pmap_pte(pmap_t pmap, vm_offset_t va)
965{
966	pd_entry_t newpf;
967	pd_entry_t *pde;
968
969	pde = pmap_pde(pmap, va);
970	if (*pde & PG_PS)
971		return (pde);
972	if (*pde != 0) {
973		/* are we current address space or kernel? */
974		if (pmap_is_current(pmap))
975			return (vtopte(va));
976		mtx_lock(&PMAP2mutex);
977		newpf = *pde & PG_FRAME;
978		if ((*PMAP2 & PG_FRAME) != newpf) {
979			PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M);
980			CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x",
981			    pmap, va, (*PMAP2 & 0xffffffff));
982		}
983		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
984	}
985	return (NULL);
986}
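/*
 * Typical calling pattern for the not-current-pmap case (a sketch; see
 * pmap_extract() below for a real caller):
 *
 *	pte = pmap_pte(pmap, va);
 *	pteval = *pte;
 *	pmap_pte_release(pte);
 *
 * For the current or kernel pmap the returned pointer is simply
 * vtopte(va), and pmap_pte_release() on it is a no-op; otherwise the entry
 * is temporarily mapped through PMAP2/PADDR2 under PMAP2mutex, which
 * pmap_pte_release() drops.
 */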
987
988/*
989 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
990 * being NULL.
991 */
992static __inline void
993pmap_pte_release(pt_entry_t *pte)
994{
995
996	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) {
997		CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx",
998		    *PMAP2);
999		rw_wlock(&pvh_global_lock);
1000		PT_SET_VA(PMAP2, 0, TRUE);
1001		rw_wunlock(&pvh_global_lock);
1002		mtx_unlock(&PMAP2mutex);
1003	}
1004}
1005
1006static __inline void
1007invlcaddr(void *caddr)
1008{
1009
1010	invlpg((u_int)caddr);
1011	PT_UPDATES_FLUSH();
1012}
1013
1014/*
1015 * Super fast pmap_pte routine best used when scanning
1016 * the pv lists.  This eliminates many coarse-grained
1017 * invltlb calls.  Note that many of the pv list
1018 * scans are across different pmaps.  It is very wasteful
1019 * to do an entire invltlb for checking a single mapping.
1020 *
1021 * If the given pmap is not the current pmap, pvh_global_lock
1022 * must be held and curthread pinned to a CPU.
1023 */
1024static pt_entry_t *
1025pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1026{
1027	pd_entry_t newpf;
1028	pd_entry_t *pde;
1029
1030	pde = pmap_pde(pmap, va);
1031	if (*pde & PG_PS)
1032		return (pde);
1033	if (*pde != 0) {
1034		/* are we current address space or kernel? */
1035		if (pmap_is_current(pmap))
1036			return (vtopte(va));
1037		rw_assert(&pvh_global_lock, RA_WLOCKED);
1038		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1039		newpf = *pde & PG_FRAME;
1040		if ((*PMAP1 & PG_FRAME) != newpf) {
1041			PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M);
1042			CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x",
1043			    pmap, va, (u_long)*PMAP1);
1044
1045#ifdef SMP
1046			PMAP1cpu = PCPU_GET(cpuid);
1047#endif
1048			PMAP1changed++;
1049		} else
1050#ifdef SMP
1051		if (PMAP1cpu != PCPU_GET(cpuid)) {
1052			PMAP1cpu = PCPU_GET(cpuid);
1053			invlcaddr(PADDR1);
1054			PMAP1changedcpu++;
1055		} else
1056#endif
1057			PMAP1unchanged++;
1058		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1059	}
1060	return (0);
1061}
1062
1063/*
1064 *	Routine:	pmap_extract
1065 *	Function:
1066 *		Extract the physical page address associated
1067 *		with the given map/virtual_address pair.
1068 */
1069vm_paddr_t
1070pmap_extract(pmap_t pmap, vm_offset_t va)
1071{
1072	vm_paddr_t rtval;
1073	pt_entry_t *pte;
1074	pd_entry_t pde;
1075	pt_entry_t pteval;
1076
1077	rtval = 0;
1078	PMAP_LOCK(pmap);
1079	pde = pmap->pm_pdir[va >> PDRSHIFT];
1080	if (pde != 0) {
1081		if ((pde & PG_PS) != 0) {
1082			rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK);
1083			PMAP_UNLOCK(pmap);
1084			return (rtval);
1085		}
1086		pte = pmap_pte(pmap, va);
1087		pteval = *pte ? xpmap_mtop(*pte) : 0;
1088		rtval = (pteval & PG_FRAME) | (va & PAGE_MASK);
1089		pmap_pte_release(pte);
1090	}
1091	PMAP_UNLOCK(pmap);
1092	return (rtval);
1093}
1094
1095/*
1096 *	Routine:	pmap_extract_ma
1097 *	Function:
1098 *		Like pmap_extract, but returns machine address
1099 */
1100vm_paddr_t
1101pmap_extract_ma(pmap_t pmap, vm_offset_t va)
1102{
1103	vm_paddr_t rtval;
1104	pt_entry_t *pte;
1105	pd_entry_t pde;
1106
1107	rtval = 0;
1108	PMAP_LOCK(pmap);
1109	pde = pmap->pm_pdir[va >> PDRSHIFT];
1110	if (pde != 0) {
1111		if ((pde & PG_PS) != 0) {
1112			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
1113			PMAP_UNLOCK(pmap);
1114			return (rtval);
1115		}
1116		pte = pmap_pte(pmap, va);
1117		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1118		pmap_pte_release(pte);
1119	}
1120	PMAP_UNLOCK(pmap);
1121	return (rtval);
1122}
1123
1124/*
1125 *	Routine:	pmap_extract_and_hold
1126 *	Function:
1127 *		Atomically extract and hold the physical page
1128 *		with the given pmap and virtual address pair
1129 *		if that mapping permits the given protection.
1130 */
1131vm_page_t
1132pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1133{
1134	pd_entry_t pde;
1135	pt_entry_t pte, *ptep;
1136	vm_page_t m;
1137	vm_paddr_t pa;
1138
1139	pa = 0;
1140	m = NULL;
1141	PMAP_LOCK(pmap);
1142retry:
1143	pde = PT_GET(pmap_pde(pmap, va));
1144	if (pde != 0) {
1145		if (pde & PG_PS) {
1146			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1147				if (vm_page_pa_tryrelock(pmap, (pde &
1148				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1149					goto retry;
1150				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1151				    (va & PDRMASK));
1152				vm_page_hold(m);
1153			}
1154		} else {
1155			ptep = pmap_pte(pmap, va);
1156			pte = PT_GET(ptep);
1157			pmap_pte_release(ptep);
1158			if (pte != 0 &&
1159			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1160				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1161				    &pa))
1162					goto retry;
1163				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1164				vm_page_hold(m);
1165			}
1166		}
1167	}
1168	PA_UNLOCK_COND(pa);
1169	PMAP_UNLOCK(pmap);
1170	return (m);
1171}
1172
1173/***************************************************
1174 * Low level mapping routines.....
1175 ***************************************************/
1176
1177/*
1178 * Add a wired page to the kva.
1179 * Note: not SMP coherent.
1180 *
1181 * This function may be used before pmap_bootstrap() is called.
1182 */
1183void
1184pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1185{
1186
1187	PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag);
1188}
1189
1190void
1191pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma)
1192{
1193	pt_entry_t *pte;
1194
1195	pte = vtopte(va);
1196	pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag);
1197}
1198
1199static __inline void
1200pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1201{
1202
1203	PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1204}
1205
1206/*
1207 * Remove a page from the kernel pagetables.
1208 * Note: not SMP coherent.
1209 *
1210 * This function may be used before pmap_bootstrap() is called.
1211 */
1212PMAP_INLINE void
1213pmap_kremove(vm_offset_t va)
1214{
1215	pt_entry_t *pte;
1216
1217	pte = vtopte(va);
1218	PT_CLEAR_VA(pte, FALSE);
1219}
1220
1221/*
1222 *	Used to map a range of physical addresses into kernel
1223 *	virtual address space.
1224 *
1225 *	The value passed in '*virt' is a suggested virtual address for
1226 *	the mapping. Architectures which can support a direct-mapped
1227 *	physical to virtual region can return the appropriate address
1228 *	within that region, leaving '*virt' unchanged. Other
1229 *	architectures should map the pages starting at '*virt' and
1230 *	update '*virt' with the first usable address after the mapped
1231 *	region.
1232 */
1233vm_offset_t
1234pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1235{
1236	vm_offset_t va, sva;
1237
1238	va = sva = *virt;
1239	CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x",
1240	    va, start, end, prot);
1241	while (start < end) {
1242		pmap_kenter(va, start);
1243		va += PAGE_SIZE;
1244		start += PAGE_SIZE;
1245	}
1246	pmap_invalidate_range(kernel_pmap, sva, va);
1247	*virt = va;
1248	return (sva);
1249}
1250
1251
1252/*
1253 * Add a list of wired pages to the kva.
1254 * This routine is only used for temporary
1255 * kernel mappings that do not need to have
1256 * page modification or references recorded.
1257 * Note that old mappings are simply written
1258 * over.  The page *must* be wired.
1259 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1260 */
1261void
1262pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1263{
1264	pt_entry_t *endpte, *pte;
1265	vm_paddr_t pa;
1266	vm_offset_t va = sva;
1267	int mclcount = 0;
1268	multicall_entry_t mcl[16];
1269	multicall_entry_t *mclp = mcl;
1270	int error;
1271
1272	CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count);
1273	pte = vtopte(sva);
1274	endpte = pte + count;
1275	while (pte < endpte) {
1276		pa = VM_PAGE_TO_MACH(*ma) | pgeflag | PG_RW | PG_V | PG_M | PG_A;
1277
1278		mclp->op = __HYPERVISOR_update_va_mapping;
1279		mclp->args[0] = va;
1280		mclp->args[1] = (uint32_t)(pa & 0xffffffff);
1281		mclp->args[2] = (uint32_t)(pa >> 32);
1282		mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0;
1283
1284		va += PAGE_SIZE;
1285		pte++;
1286		ma++;
1287		mclp++;
1288		mclcount++;
1289		if (mclcount == 16) {
1290			error = HYPERVISOR_multicall(mcl, mclcount);
1291			mclp = mcl;
1292			mclcount = 0;
1293			KASSERT(error == 0, ("bad multicall %d", error));
1294		}
1295	}
1296	if (mclcount) {
1297		error = HYPERVISOR_multicall(mcl, mclcount);
1298		KASSERT(error == 0, ("bad multicall %d", error));
1299	}
1300
1301#ifdef INVARIANTS
1302	for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++)
1303		KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE));
1304#endif
1305}
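/*
 * pmap_qenter() above batches the Xen update_va_mapping hypercalls in
 * groups of up to 16 multicall entries: a full batch is flushed inside the
 * loop, and any remainder is flushed once after it, so mapping N pages
 * costs roughly ceil(N / 16) hypervisor transitions instead of N.
 */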
1306
1307/*
1308 * This routine tears out page mappings from the
1309 * kernel -- it is meant only for temporary mappings.
1310 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1311 */
1312void
1313pmap_qremove(vm_offset_t sva, int count)
1314{
1315	vm_offset_t va;
1316
1317	CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count);
1318	va = sva;
1319	rw_wlock(&pvh_global_lock);
1320	critical_enter();
1321	while (count-- > 0) {
1322		pmap_kremove(va);
1323		va += PAGE_SIZE;
1324	}
1325	PT_UPDATES_FLUSH();
1326	pmap_invalidate_range(kernel_pmap, sva, va);
1327	critical_exit();
1328	rw_wunlock(&pvh_global_lock);
1329}
1330
1331/***************************************************
1332 * Page table page management routines.....
1333 ***************************************************/
1334static __inline void
1335pmap_free_zero_pages(vm_page_t free)
1336{
1337	vm_page_t m;
1338
1339	while (free != NULL) {
1340		m = free;
1341		free = (void *)m->object;
1342		m->object = NULL;
1343		vm_page_free_zero(m);
1344	}
1345}
1346
1347/*
1348 * Decrements a page table page's wire count, which is used to record the
1349 * number of valid page table entries within the page.  If the wire count
1350 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1351 * page table page was unmapped and FALSE otherwise.
1352 */
1353static inline boolean_t
1354pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1355{
1356
1357	--m->wire_count;
1358	if (m->wire_count == 0) {
1359		_pmap_unwire_ptp(pmap, m, free);
1360		return (TRUE);
1361	} else
1362		return (FALSE);
1363}
1364
1365static void
1366_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
1367{
1368	vm_offset_t pteva;
1369
1370	PT_UPDATES_FLUSH();
1371	/*
1372	 * unmap the page table page
1373	 */
1374	xen_pt_unpin(pmap->pm_pdir[m->pindex]);
1375	/*
1376	 * page *might* contain residual mapping :-/
1377	 */
1378	PD_CLEAR_VA(pmap, m->pindex, TRUE);
1379	pmap_zero_page(m);
1380	--pmap->pm_stats.resident_count;
1381
1382	/*
1383	 * This is a release store so that the ordinary store unmapping
1384	 * the page table page is globally performed before TLB shoot-
1385	 * down is begun.
1386	 */
1387	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1388
1389	/*
1390	 * Do an invltlb to make the invalidated mapping
1391	 * take effect immediately.
1392	 */
1393	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1394	pmap_invalidate_page(pmap, pteva);
1395
1396	/*
1397	 * Put page on a list so that it is released after
1398	 * *ALL* TLB shootdown is done
1399	 */
1400	m->object = (void *)*free;
1401	*free = m;
1402}
1403
1404/*
1405 * After removing a page table entry, this routine is used to
1406 * conditionally free the page, and manage the hold/wire counts.
1407 */
1408static int
1409pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1410{
1411	pd_entry_t ptepde;
1412	vm_page_t mpte;
1413
1414	if (va >= VM_MAXUSER_ADDRESS)
1415		return (0);
1416	ptepde = PT_GET(pmap_pde(pmap, va));
1417	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1418	return (pmap_unwire_ptp(pmap, mpte, free));
1419}
1420
1421/*
1422 * Initialize the pmap for the swapper process.
1423 */
1424void
1425pmap_pinit0(pmap_t pmap)
1426{
1427
1428	PMAP_LOCK_INIT(pmap);
1429	/*
1430	 * Since the page table directory is shared with the kernel pmap,
1431	 * which is already included in the list "allpmaps", this pmap does
1432	 * not need to be inserted into that list.
1433	 */
1434	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1435#ifdef PAE
1436	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1437#endif
1438	CPU_ZERO(&pmap->pm_active);
1439	PCPU_SET(curpmap, pmap);
1440	TAILQ_INIT(&pmap->pm_pvchunk);
1441	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1442}
1443
1444/*
1445 * Initialize a preallocated and zeroed pmap structure,
1446 * such as one in a vmspace structure.
1447 */
1448int
1449pmap_pinit(pmap_t pmap)
1450{
1451	vm_page_t m, ptdpg[NPGPTD + 1];
1452	int npgptd = NPGPTD + 1;
1453	int i;
1454
1455#ifdef HAMFISTED_LOCKING
1456	mtx_lock(&createdelete_lock);
1457#endif
1458
1459	/*
1460	 * No need to allocate page table space yet but we do need a valid
1461	 * page directory table.
1462	 */
1463	if (pmap->pm_pdir == NULL) {
1464		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1465		if (pmap->pm_pdir == NULL) {
1466#ifdef HAMFISTED_LOCKING
1467			mtx_unlock(&createdelete_lock);
1468#endif
1469			return (0);
1470		}
1471#ifdef PAE
1472		pmap->pm_pdpt = (pd_entry_t *)kva_alloc(1);
1473#endif
1474	}
1475
1476	/*
1477	 * allocate the page directory page(s)
1478	 */
1479	for (i = 0; i < npgptd;) {
1480		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1481		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1482		if (m == NULL)
1483			VM_WAIT;
1484		else {
1485			ptdpg[i++] = m;
1486		}
1487	}
1488
1489	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1490
1491	for (i = 0; i < NPGPTD; i++)
1492		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1493			pagezero(pmap->pm_pdir + (i * NPDEPG));
1494
1495	mtx_lock_spin(&allpmaps_lock);
1496	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1497	/* Copy the kernel page table directory entries. */
1498	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1499	mtx_unlock_spin(&allpmaps_lock);
1500
1501#ifdef PAE
1502	pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1);
1503	if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0)
1504		bzero(pmap->pm_pdpt, PAGE_SIZE);
1505	for (i = 0; i < NPGPTD; i++) {
1506		vm_paddr_t ma;
1507
1508		ma = VM_PAGE_TO_MACH(ptdpg[i]);
1509		pmap->pm_pdpt[i] = ma | PG_V;
1510
1511	}
1512#endif
1513	for (i = 0; i < NPGPTD; i++) {
1514		pt_entry_t *pd;
1515		vm_paddr_t ma;
1516
1517		ma = VM_PAGE_TO_MACH(ptdpg[i]);
1518		pd = pmap->pm_pdir + (i * NPDEPG);
1519		PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW));
1520#if 0
1521		xen_pgd_pin(ma);
1522#endif
1523	}
1524
1525#ifdef PAE
1526	PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW);
1527#endif
1528	rw_wlock(&pvh_global_lock);
1529	xen_flush_queue();
1530	xen_pgdpt_pin(VM_PAGE_TO_MACH(ptdpg[NPGPTD]));
1531	for (i = 0; i < NPGPTD; i++) {
1532		vm_paddr_t ma = VM_PAGE_TO_MACH(ptdpg[i]);
1533		PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE);
1534	}
1535	xen_flush_queue();
1536	rw_wunlock(&pvh_global_lock);
1537	CPU_ZERO(&pmap->pm_active);
1538	TAILQ_INIT(&pmap->pm_pvchunk);
1539	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1540
1541#ifdef HAMFISTED_LOCKING
1542	mtx_unlock(&createdelete_lock);
1543#endif
1544	return (1);
1545}
1546
1547/*
1548 * this routine is called if the page table page is not
1549 * mapped correctly.
1550 */
1551static vm_page_t
1552_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1553{
1554	vm_paddr_t ptema;
1555	vm_page_t m;
1556
1557	/*
1558	 * Allocate a page table page.
1559	 */
1560	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1561	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1562		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1563			PMAP_UNLOCK(pmap);
1564			rw_wunlock(&pvh_global_lock);
1565			VM_WAIT;
1566			rw_wlock(&pvh_global_lock);
1567			PMAP_LOCK(pmap);
1568		}
1569
1570		/*
1571		 * Indicate the need to retry.  While waiting, the page table
1572		 * page may have been allocated.
1573		 */
1574		return (NULL);
1575	}
1576	if ((m->flags & PG_ZERO) == 0)
1577		pmap_zero_page(m);
1578
1579	/*
1580	 * Map the pagetable page into the process address space, if
1581	 * it isn't already there.
1582	 */
1583
1584	pmap->pm_stats.resident_count++;
1585
1586	ptema = VM_PAGE_TO_MACH(m);
1587	xen_pt_pin(ptema);
1588	PT_SET_VA_MA(&pmap->pm_pdir[ptepindex],
1589		(ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE);
1590
1591	KASSERT(pmap->pm_pdir[ptepindex],
1592	    ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex));
1593	return (m);
1594}
1595
1596static vm_page_t
1597pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1598{
1599	u_int ptepindex;
1600	pd_entry_t ptema;
1601	vm_page_t m;
1602
1603	/*
1604	 * Calculate pagetable page index
1605	 */
1606	ptepindex = va >> PDRSHIFT;
1607retry:
1608	/*
1609	 * Get the page directory entry
1610	 */
1611	ptema = pmap->pm_pdir[ptepindex];
1612
1613	/*
1614	 * This supports switching from a 4MB page to a
1615	 * normal 4K page.
1616	 */
1617	if (ptema & PG_PS) {
1618		/*
1619		 * XXX
1620		 */
1621		pmap->pm_pdir[ptepindex] = 0;
1622		ptema = 0;
1623		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1624		pmap_invalidate_all(kernel_pmap);
1625	}
1626
1627	/*
1628	 * If the page table page is mapped, we just increment the
1629	 * hold count, and activate it.
1630	 */
1631	if (ptema & PG_V) {
1632		m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME);
1633		m->wire_count++;
1634	} else {
1635		/*
1636		 * Here if the pte page isn't mapped, or if it has
1637		 * been deallocated.
1638		 */
1639		CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x",
1640		    pmap, va, flags);
1641		m = _pmap_allocpte(pmap, ptepindex, flags);
1642		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
1643			goto retry;
1644
1645		KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex));
1646	}
1647	return (m);
1648}
1649
1650
1651/***************************************************
1652 * Pmap allocation/deallocation routines.
1653 ***************************************************/
1654
1655#ifdef SMP
1656/*
1657 * Deal with a SMP shootdown of other users of the pmap that we are
1658 * trying to dispose of.  This can be a bit hairy.
1659 */
1660static cpuset_t *lazymask;
1661static u_int lazyptd;
1662static volatile u_int lazywait;
1663
1664void pmap_lazyfix_action(void);
1665
1666void
1667pmap_lazyfix_action(void)
1668{
1669
1670#ifdef COUNT_IPIS
1671	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1672#endif
1673	if (rcr3() == lazyptd)
1674		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1675	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
1676	atomic_store_rel_int(&lazywait, 1);
1677}
1678
1679static void
1680pmap_lazyfix_self(u_int cpuid)
1681{
1682
1683	if (rcr3() == lazyptd)
1684		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1685	CPU_CLR_ATOMIC(cpuid, lazymask);
1686}
1687
1688
1689static void
1690pmap_lazyfix(pmap_t pmap)
1691{
1692	cpuset_t mymask, mask;
1693	u_int cpuid, spins;
1694	int lsb;
1695
1696	mask = pmap->pm_active;
1697	while (!CPU_EMPTY(&mask)) {
1698		spins = 50000000;
1699
1700		/* Find least significant set bit. */
1701		lsb = CPU_FFS(&mask);
1702		MPASS(lsb != 0);
1703		lsb--;
1704		CPU_SETOF(lsb, &mask);
1705		mtx_lock_spin(&smp_ipi_mtx);
1706#ifdef PAE
1707		lazyptd = vtophys(pmap->pm_pdpt);
1708#else
1709		lazyptd = vtophys(pmap->pm_pdir);
1710#endif
1711		cpuid = PCPU_GET(cpuid);
1712
1713		/* Use a cpuset just for having an easy check. */
1714		CPU_SETOF(cpuid, &mymask);
1715		if (!CPU_CMP(&mask, &mymask)) {
1716			lazymask = &pmap->pm_active;
1717			pmap_lazyfix_self(cpuid);
1718		} else {
1719			atomic_store_rel_int((u_int *)&lazymask,
1720			    (u_int)&pmap->pm_active);
1721			atomic_store_rel_int(&lazywait, 0);
1722			ipi_selected(mask, IPI_LAZYPMAP);
1723			while (lazywait == 0) {
1724				ia32_pause();
1725				if (--spins == 0)
1726					break;
1727			}
1728		}
1729		mtx_unlock_spin(&smp_ipi_mtx);
1730		if (spins == 0)
1731			printf("pmap_lazyfix: spun for 50000000\n");
1732		mask = pmap->pm_active;
1733	}
1734}
1735
1736#else	/* SMP */
1737
1738/*
1739 * Cleaning up on uniprocessor is easy.  For various reasons, we're
1740 * unlikely ever to execute this code, one reason being that the
1741 * cleanup is deferred until the parent does a wait(2), which
1742 * means that another userland process has run.
1743 */
1744static void
1745pmap_lazyfix(pmap_t pmap)
1746{
1747	u_int cr3;
1748
1749	cr3 = vtophys(pmap->pm_pdir);
1750	if (cr3 == rcr3()) {
1751		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1752		CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active);
1753	}
1754}
1755#endif	/* SMP */
1756
1757/*
1758 * Release any resources held by the given physical map.
1759 * Called when a pmap initialized by pmap_pinit is being released.
1760 * Should only be called if the map contains no valid mappings.
1761 */
1762void
1763pmap_release(pmap_t pmap)
1764{
1765	vm_page_t m, ptdpg[2*NPGPTD+1];
1766	vm_paddr_t ma;
1767	int i;
1768#ifdef PAE
1769	int npgptd = NPGPTD + 1;
1770#else
1771	int npgptd = NPGPTD;
1772#endif
1773
1774	KASSERT(pmap->pm_stats.resident_count == 0,
1775	    ("pmap_release: pmap resident count %ld != 0",
1776	    pmap->pm_stats.resident_count));
1777	PT_UPDATES_FLUSH();
1778
1779#ifdef HAMFISTED_LOCKING
1780	mtx_lock(&createdelete_lock);
1781#endif
1782
1783	pmap_lazyfix(pmap);
1784	mtx_lock_spin(&allpmaps_lock);
1785	LIST_REMOVE(pmap, pm_list);
1786	mtx_unlock_spin(&allpmaps_lock);
1787
1788	for (i = 0; i < NPGPTD; i++)
1789		ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME);
1790	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1791#ifdef PAE
1792	ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt));
1793#endif
1794
1795	for (i = 0; i < npgptd; i++) {
1796		m = ptdpg[i];
1797		ma = VM_PAGE_TO_MACH(m);
1798		/* Unpinning L1 and L2 is treated the same. */
1799#if 0
1800		xen_pgd_unpin(ma);
1801#else
1802		if (i == NPGPTD)
1803			xen_pgd_unpin(ma);
1804#endif
1805#ifdef PAE
1806		if (i < NPGPTD)
1807			KASSERT(VM_PAGE_TO_MACH(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1808			    ("pmap_release: got wrong ptd page"));
1809#endif
1810		m->wire_count--;
1811		atomic_subtract_int(&cnt.v_wire_count, 1);
1812		vm_page_free(m);
1813	}
1814#ifdef PAE
1815	pmap_qremove((vm_offset_t)pmap->pm_pdpt, 1);
1816#endif
1817
1818#ifdef HAMFISTED_LOCKING
1819	mtx_unlock(&createdelete_lock);
1820#endif
1821}
1822
1823static int
1824kvm_size(SYSCTL_HANDLER_ARGS)
1825{
1826	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1827
1828	return (sysctl_handle_long(oidp, &ksize, 0, req));
1829}
1830SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1831    0, 0, kvm_size, "IU", "Size of KVM");
1832
1833static int
1834kvm_free(SYSCTL_HANDLER_ARGS)
1835{
1836	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1837
1838	return (sysctl_handle_long(oidp, &kfree, 0, req));
1839}
1840SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1841    0, 0, kvm_free, "IU", "Amount of KVM free");
1842
1843/*
1844 * grow the number of kernel page table entries, if needed
1845 */
1846void
1847pmap_growkernel(vm_offset_t addr)
1848{
1849	struct pmap *pmap;
1850	vm_paddr_t ptppaddr;
1851	vm_page_t nkpg;
1852	pd_entry_t newpdir;
1853
1854	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1855	if (kernel_vm_end == 0) {
1856		kernel_vm_end = KERNBASE;
1857		nkpt = 0;
1858		while (pdir_pde(PTD, kernel_vm_end)) {
1859			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1860			nkpt++;
1861			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1862				kernel_vm_end = kernel_map->max_offset;
1863				break;
1864			}
1865		}
1866	}
1867	addr = roundup2(addr, NBPDR);
1868	if (addr - 1 >= kernel_map->max_offset)
1869		addr = kernel_map->max_offset;
1870	while (kernel_vm_end < addr) {
1871		if (pdir_pde(PTD, kernel_vm_end)) {
1872			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1873			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1874				kernel_vm_end = kernel_map->max_offset;
1875				break;
1876			}
1877			continue;
1878		}
1879
1880		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
1881		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1882		    VM_ALLOC_ZERO);
1883		if (nkpg == NULL)
1884			panic("pmap_growkernel: no memory to grow kernel");
1885
1886		nkpt++;
1887
1888		if ((nkpg->flags & PG_ZERO) == 0)
1889			pmap_zero_page(nkpg);
1890		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1891		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1892		rw_wlock(&pvh_global_lock);
1893		PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE);
1894		mtx_lock_spin(&allpmaps_lock);
1895		LIST_FOREACH(pmap, &allpmaps, pm_list)
1896			PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE);
1897
1898		mtx_unlock_spin(&allpmaps_lock);
1899		rw_wunlock(&pvh_global_lock);
1900
1901		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1902		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1903			kernel_vm_end = kernel_map->max_offset;
1904			break;
1905		}
1906	}
1907}
1908
1909
1910/***************************************************
1911 * page management routines.
1912 ***************************************************/
1913
1914CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1915CTASSERT(_NPCM == 11);
1916CTASSERT(_NPCPV == 336);
1917
1918static __inline struct pv_chunk *
1919pv_to_chunk(pv_entry_t pv)
1920{
1921
1922	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1923}
1924
1925#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1926
1927#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
1928#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
1929
1930static const uint32_t pc_freemask[_NPCM] = {
1931	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1932	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1933	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1934	PC_FREE0_9, PC_FREE10
1935};
1936
1937SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1938	"Current number of pv entries");
1939
1940#ifdef PV_STATS
1941static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1942
1943SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1944	"Current number of pv entry chunks");
1945SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1946	"Total number of pv entry chunk allocations");
1947SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1948	"Current number of pv entry chunks frees");
1949	"Total number of pv entry chunk frees");
1950	"Number of times tried to get a chunk page but failed.");
1951
1952static long pv_entry_frees, pv_entry_allocs;
1953static int pv_entry_spare;
1954
1955SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1956	"Current number of pv entry frees");
1957SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1958	"Current number of pv entry allocs");
1959SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1960	"Current number of spare pv entries");
1961#endif
1962
1963/*
1964 * We are in a serious low memory condition.  Resort to
1965 * drastic measures to free some pages so we can allocate
1966 * another pv entry chunk.
1967 */
1968static vm_page_t
1969pmap_pv_reclaim(pmap_t locked_pmap)
1970{
1971	struct pch newtail;
1972	struct pv_chunk *pc;
1973	pmap_t pmap;
1974	pt_entry_t *pte, tpte;
1975	pv_entry_t pv;
1976	vm_offset_t va;
1977	vm_page_t free, m, m_pc;
1978	uint32_t inuse;
1979	int bit, field, freed;
1980
1981	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1982	pmap = NULL;
1983	free = m_pc = NULL;
1984	TAILQ_INIT(&newtail);
1985	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
1986	    free == NULL)) {
1987		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1988		if (pmap != pc->pc_pmap) {
1989			if (pmap != NULL) {
1990				pmap_invalidate_all(pmap);
1991				if (pmap != locked_pmap)
1992					PMAP_UNLOCK(pmap);
1993			}
1994			pmap = pc->pc_pmap;
1995			/* Avoid deadlock and lock recursion. */
1996			if (pmap > locked_pmap)
1997				PMAP_LOCK(pmap);
1998			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
1999				pmap = NULL;
2000				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2001				continue;
2002			}
2003		}
2004
2005		/*
2006		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2007		 */
2008		freed = 0;
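		/*
		 * Walk the chunk's allocation bitmap.  A clear bit in
		 * pc_map[] marks an allocated pv entry, so inverting and
		 * masking with pc_freemask[] yields the in-use set, which
		 * is consumed one bit at a time with bsfl().
		 */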
2009		for (field = 0; field < _NPCM; field++) {
2010			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2011			    inuse != 0; inuse &= ~(1UL << bit)) {
2012				bit = bsfl(inuse);
2013				pv = &pc->pc_pventry[field * 32 + bit];
2014				va = pv->pv_va;
2015				pte = pmap_pte(pmap, va);
2016				tpte = *pte;
2017				if ((tpte & PG_W) == 0)
2018					tpte = pte_load_clear(pte);
2019				pmap_pte_release(pte);
2020				if ((tpte & PG_W) != 0)
2021					continue;
2022				KASSERT(tpte != 0,
2023				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2024				    pmap, va));
2025				if ((tpte & PG_G) != 0)
2026					pmap_invalidate_page(pmap, va);
2027				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2028				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2029					vm_page_dirty(m);
2030				if ((tpte & PG_A) != 0)
2031					vm_page_aflag_set(m, PGA_REFERENCED);
2032				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2033				if (TAILQ_EMPTY(&m->md.pv_list))
2034					vm_page_aflag_clear(m, PGA_WRITEABLE);
2035				pc->pc_map[field] |= 1UL << bit;
2036				pmap_unuse_pt(pmap, va, &free);
2037				freed++;
2038			}
2039		}
2040		if (freed == 0) {
2041			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2042			continue;
2043		}
2044		/* Every freed mapping is for a 4 KB page. */
2045		pmap->pm_stats.resident_count -= freed;
2046		PV_STAT(pv_entry_frees += freed);
2047		PV_STAT(pv_entry_spare += freed);
2048		pv_entry_count -= freed;
2049		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2050		for (field = 0; field < _NPCM; field++)
2051			if (pc->pc_map[field] != pc_freemask[field]) {
2052				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2053				    pc_list);
2054				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2055
2056				/*
2057				 * One freed pv entry in locked_pmap is
2058				 * sufficient.
2059				 */
2060				if (pmap == locked_pmap)
2061					goto out;
2062				break;
2063			}
2064		if (field == _NPCM) {
2065			PV_STAT(pv_entry_spare -= _NPCPV);
2066			PV_STAT(pc_chunk_count--);
2067			PV_STAT(pc_chunk_frees++);
2068			/* Entire chunk is free; return it. */
2069			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2070			pmap_qremove((vm_offset_t)pc, 1);
2071			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2072			break;
2073		}
2074	}
2075out:
2076	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2077	if (pmap != NULL) {
2078		pmap_invalidate_all(pmap);
2079		if (pmap != locked_pmap)
2080			PMAP_UNLOCK(pmap);
2081	}
2082	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
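		/*
		 * The list of freed page table pages is threaded through
		 * the pages' object fields; unlink the head page and
		 * recycle it as the new chunk page.
		 */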
2083		m_pc = free;
2084		free = (void *)m_pc->object;
2085		/* Recycle a freed page table page. */
2086		m_pc->wire_count = 1;
2087		atomic_add_int(&cnt.v_wire_count, 1);
2088	}
2089	pmap_free_zero_pages(free);
2090	return (m_pc);
2091}
2092
2093/*
2094 * free the pv_entry back to the free list
2095 */
2096static void
2097free_pv_entry(pmap_t pmap, pv_entry_t pv)
2098{
2099	struct pv_chunk *pc;
2100	int idx, field, bit;
2101
2102	rw_assert(&pvh_global_lock, RA_WLOCKED);
2103	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2104	PV_STAT(pv_entry_frees++);
2105	PV_STAT(pv_entry_spare++);
2106	pv_entry_count--;
2107	pc = pv_to_chunk(pv);
2108	idx = pv - &pc->pc_pventry[0];
2109	field = idx / 32;
2110	bit = idx % 32;
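	/* Mark the entry free in the chunk's allocation bitmap. */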
2111	pc->pc_map[field] |= 1ul << bit;
2112	for (idx = 0; idx < _NPCM; idx++)
2113		if (pc->pc_map[idx] != pc_freemask[idx]) {
2114			/*
2115			 * 98% of the time, pc is already at the head of the
2116			 * list.  If it isn't already, move it to the head.
2117			 */
2118			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2119			    pc)) {
2120				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2121				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2122				    pc_list);
2123			}
2124			return;
2125		}
2126	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2127	free_pv_chunk(pc);
2128}
2129
2130static void
2131free_pv_chunk(struct pv_chunk *pc)
2132{
2133	vm_page_t m;
2134
2135	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2136	PV_STAT(pv_entry_spare -= _NPCPV);
2137	PV_STAT(pc_chunk_count--);
2138	PV_STAT(pc_chunk_frees++);
2139	/* Entire chunk is free; return it. */
2140	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2141	pmap_qremove((vm_offset_t)pc, 1);
2142	vm_page_unwire(m, 0);
2143	vm_page_free(m);
2144	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2145}
2146
2147/*
2148 * get a new pv_entry, allocating a block from the system
2149 * when needed.
2150 */
2151static pv_entry_t
2152get_pv_entry(pmap_t pmap, boolean_t try)
2153{
2154	static const struct timeval printinterval = { 60, 0 };
2155	static struct timeval lastprint;
2156	int bit, field;
2157	pv_entry_t pv;
2158	struct pv_chunk *pc;
2159	vm_page_t m;
2160
2161	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2162	rw_assert(&pvh_global_lock, RA_WLOCKED);
2163	PV_STAT(pv_entry_allocs++);
2164	pv_entry_count++;
2165	if (pv_entry_count > pv_entry_high_water)
2166		if (ratecheck(&lastprint, &printinterval))
2167			printf("Approaching the limit on PV entries, consider "
2168			    "increasing either the vm.pmap.shpgperproc or the "
2169			    "vm.pmap.pv_entry_max tunable.\n");
2170retry:
2171	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2172	if (pc != NULL) {
2173		for (field = 0; field < _NPCM; field++) {
2174			if (pc->pc_map[field]) {
2175				bit = bsfl(pc->pc_map[field]);
2176				break;
2177			}
2178		}
2179		if (field < _NPCM) {
2180			pv = &pc->pc_pventry[field * 32 + bit];
2181			pc->pc_map[field] &= ~(1ul << bit);
2182			/* If this was the last item, move it to tail */
2183			for (field = 0; field < _NPCM; field++)
2184				if (pc->pc_map[field] != 0) {
2185					PV_STAT(pv_entry_spare--);
2186					return (pv);	/* not full, return */
2187				}
2188			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2189			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2190			PV_STAT(pv_entry_spare--);
2191			return (pv);
2192		}
2193	}
2194	/*
2195	 * Access to the ptelist "pv_vafree" is synchronized by the
2196	 * pvh_global_lock.  If "pv_vafree" is currently non-empty, it will
2197	 * remain non-empty until pmap_ptelist_alloc() completes.
2198	 */
2199	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2200	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2201		if (try) {
2202			pv_entry_count--;
2203			PV_STAT(pc_chunk_tryfail++);
2204			return (NULL);
2205		}
2206		m = pmap_pv_reclaim(pmap);
2207		if (m == NULL)
2208			goto retry;
2209	}
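	/*
	 * A page was obtained, either freshly allocated or recycled by
	 * pmap_pv_reclaim().  Map it at a reserved KVA address taken from
	 * pv_vafree and carve it up into a new pv chunk.
	 */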
2210	PV_STAT(pc_chunk_count++);
2211	PV_STAT(pc_chunk_allocs++);
2212	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2213	pmap_qenter((vm_offset_t)pc, &m, 1);
2214	if ((m->flags & PG_ZERO) == 0)
2215		pagezero(pc);
2216	pc->pc_pmap = pmap;
2217	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2218	for (field = 1; field < _NPCM; field++)
2219		pc->pc_map[field] = pc_freemask[field];
2220	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2221	pv = &pc->pc_pventry[0];
2222	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2223	PV_STAT(pv_entry_spare += _NPCPV - 1);
2224	return (pv);
2225}
2226
2227static __inline pv_entry_t
2228pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2229{
2230	pv_entry_t pv;
2231
2232	rw_assert(&pvh_global_lock, RA_WLOCKED);
2233	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2234		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2235			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2236			break;
2237		}
2238	}
2239	return (pv);
2240}
2241
2242static void
2243pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2244{
2245	pv_entry_t pv;
2246
2247	pv = pmap_pvh_remove(pvh, pmap, va);
2248	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2249	free_pv_entry(pmap, pv);
2250}
2251
2252static void
2253pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2254{
2255
2256	rw_assert(&pvh_global_lock, RA_WLOCKED);
2257	pmap_pvh_free(&m->md, pmap, va);
2258	if (TAILQ_EMPTY(&m->md.pv_list))
2259		vm_page_aflag_clear(m, PGA_WRITEABLE);
2260}
2261
2262/*
2263 * Conditionally create a pv entry.
2264 */
2265static boolean_t
2266pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2267{
2268	pv_entry_t pv;
2269
2270	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2271	rw_assert(&pvh_global_lock, RA_WLOCKED);
2272	if (pv_entry_count < pv_entry_high_water &&
2273	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2274		pv->pv_va = va;
2275		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2276		return (TRUE);
2277	} else
2278		return (FALSE);
2279}
2280
2281/*
2282 * pmap_remove_pte: unmap a single page from a process's address space
2283 */
2284static int
2285pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2286{
2287	pt_entry_t oldpte;
2288	vm_page_t m;
2289
2290	CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x",
2291	    pmap, (u_long)*ptq, va);
2292
2293	rw_assert(&pvh_global_lock, RA_WLOCKED);
2294	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2295	oldpte = *ptq;
2296	PT_SET_VA_MA(ptq, 0, TRUE);
2297	KASSERT(oldpte != 0,
2298	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2299	if (oldpte & PG_W)
2300		pmap->pm_stats.wired_count -= 1;
2301	/*
2302	 * Machines that don't support invlpg also don't support
2303	 * PG_G.
2304	 */
2305	if (oldpte & PG_G)
2306		pmap_invalidate_page(kernel_pmap, va);
2307	pmap->pm_stats.resident_count -= 1;
2308	if (oldpte & PG_MANAGED) {
2309		m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME);
2310		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2311			vm_page_dirty(m);
2312		if (oldpte & PG_A)
2313			vm_page_aflag_set(m, PGA_REFERENCED);
2314		pmap_remove_entry(pmap, m, va);
2315	}
2316	return (pmap_unuse_pt(pmap, va, free));
2317}
2318
2319/*
2320 * Remove a single page from a process address space
2321 */
2322static void
2323pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2324{
2325	pt_entry_t *pte;
2326
2327	CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x",
2328	    pmap, va);
2329
2330	rw_assert(&pvh_global_lock, RA_WLOCKED);
2331	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2332	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2333	if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0)
2334		return;
2335	pmap_remove_pte(pmap, pte, va, free);
2336	pmap_invalidate_page(pmap, va);
2337	if (*PMAP1)
2338		PT_SET_MA(PADDR1, 0);
2339
2340}
2341
2342/*
2343 *	Remove the given range of addresses from the specified map.
2344 *
2345 *	It is assumed that the start and end are properly
2346 *	rounded to the page size.
2347 */
2348void
2349pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2350{
2351	vm_offset_t pdnxt;
2352	pd_entry_t ptpaddr;
2353	pt_entry_t *pte;
2354	vm_page_t free = NULL;
2355	int anyvalid;
2356
2357	CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x",
2358	    pmap, sva, eva);
2359
2360	/*
2361	 * Perform an unsynchronized read.  This is, however, safe.
2362	 */
2363	if (pmap->pm_stats.resident_count == 0)
2364		return;
2365
2366	anyvalid = 0;
2367
2368	rw_wlock(&pvh_global_lock);
2369	sched_pin();
2370	PMAP_LOCK(pmap);
2371
2372	/*
2373	 * Special handling for removing a single page: it is a very
2374	 * common operation, so it is worth short-circuiting the
2375	 * general removal loop below.
2376	 */
2377	if ((sva + PAGE_SIZE == eva) &&
2378	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2379		pmap_remove_page(pmap, sva, &free);
2380		goto out;
2381	}
2382
2383	for (; sva < eva; sva = pdnxt) {
2384		u_int pdirindex;
2385
2386		/*
2387		 * Calculate index for next page table.
2388		 */
2389		pdnxt = (sva + NBPDR) & ~PDRMASK;
2390		if (pdnxt < sva)
2391			pdnxt = eva;
2392		if (pmap->pm_stats.resident_count == 0)
2393			break;
2394
2395		pdirindex = sva >> PDRSHIFT;
2396		ptpaddr = pmap->pm_pdir[pdirindex];
2397
2398		/*
2399		 * Weed out invalid mappings. Note: we assume that the page
2400		 * directory table is always allocated, and in kernel virtual.
2401		 */
2402		if (ptpaddr == 0)
2403			continue;
2404
2405		/*
2406		 * Check for large page.
2407		 */
2408		if ((ptpaddr & PG_PS) != 0) {
2409			PD_CLEAR_VA(pmap, pdirindex, TRUE);
2410			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2411			anyvalid = 1;
2412			continue;
2413		}
2414
2415		/*
2416		 * Limit our scan to either the end of the va represented
2417		 * by the current page table page, or to the end of the
2418		 * range being removed.
2419		 */
2420		if (pdnxt > eva)
2421			pdnxt = eva;
2422
2423		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2424		    sva += PAGE_SIZE) {
2425			if ((*pte & PG_V) == 0)
2426				continue;
2427
2428			/*
2429			 * The TLB entry for a PG_G mapping is invalidated
2430			 * by pmap_remove_pte().
2431			 */
2432			if ((*pte & PG_G) == 0)
2433				anyvalid = 1;
2434			if (pmap_remove_pte(pmap, pte, sva, &free))
2435				break;
2436		}
2437	}
2438	PT_UPDATES_FLUSH();
2439	if (*PMAP1)
2440		PT_SET_VA_MA(PMAP1, 0, TRUE);
2441out:
2442	if (anyvalid)
2443		pmap_invalidate_all(pmap);
2444	sched_unpin();
2445	rw_wunlock(&pvh_global_lock);
2446	PMAP_UNLOCK(pmap);
2447	pmap_free_zero_pages(free);
2448}
2449
2450/*
2451 *	Routine:	pmap_remove_all
2452 *	Function:
2453 *		Removes this physical page from
2454 *		all physical maps in which it resides.
2455 *		Reflects back modify bits to the pager.
2456 *
2457 *	Notes:
2458 *		Original versions of this routine were very
2459 *		inefficient because they iteratively called
2460 *		pmap_remove (slow...)
2461 */
2462
2463void
2464pmap_remove_all(vm_page_t m)
2465{
2466	pv_entry_t pv;
2467	pmap_t pmap;
2468	pt_entry_t *pte, tpte;
2469	vm_page_t free;
2470
2471	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2472	    ("pmap_remove_all: page %p is not managed", m));
2473	free = NULL;
2474	rw_wlock(&pvh_global_lock);
2475	sched_pin();
2476	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2477		pmap = PV_PMAP(pv);
2478		PMAP_LOCK(pmap);
2479		pmap->pm_stats.resident_count--;
2480		pte = pmap_pte_quick(pmap, pv->pv_va);
2481		tpte = *pte;
2482		PT_SET_VA_MA(pte, 0, TRUE);
2483		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
2484		    pmap, pv->pv_va));
2485		if (tpte & PG_W)
2486			pmap->pm_stats.wired_count--;
2487		if (tpte & PG_A)
2488			vm_page_aflag_set(m, PGA_REFERENCED);
2489
2490		/*
2491		 * Update the vm_page_t clean and reference bits.
2492		 */
2493		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2494			vm_page_dirty(m);
2495		pmap_unuse_pt(pmap, pv->pv_va, &free);
2496		pmap_invalidate_page(pmap, pv->pv_va);
2497		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2498		free_pv_entry(pmap, pv);
2499		PMAP_UNLOCK(pmap);
2500	}
2501	vm_page_aflag_clear(m, PGA_WRITEABLE);
2502	PT_UPDATES_FLUSH();
2503	if (*PMAP1)
2504		PT_SET_MA(PADDR1, 0);
2505	sched_unpin();
2506	rw_wunlock(&pvh_global_lock);
2507	pmap_free_zero_pages(free);
2508}
2509
2510/*
2511 *	Set the physical protection on the
2512 *	specified range of this map as requested.
2513 */
2514void
2515pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2516{
2517	vm_offset_t pdnxt;
2518	pd_entry_t ptpaddr;
2519	pt_entry_t *pte;
2520	int anychanged;
2521
2522	CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x",
2523	    pmap, sva, eva, prot);
2524
2525	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2526		pmap_remove(pmap, sva, eva);
2527		return;
2528	}
2529
2530#ifdef PAE
2531	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2532	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2533		return;
2534#else
2535	if (prot & VM_PROT_WRITE)
2536		return;
2537#endif
2538
2539	anychanged = 0;
2540
2541	rw_wlock(&pvh_global_lock);
2542	sched_pin();
2543	PMAP_LOCK(pmap);
2544	for (; sva < eva; sva = pdnxt) {
2545		pt_entry_t obits, pbits;
2546		u_int pdirindex;
2547
2548		pdnxt = (sva + NBPDR) & ~PDRMASK;
2549		if (pdnxt < sva)
2550			pdnxt = eva;
2551
2552		pdirindex = sva >> PDRSHIFT;
2553		ptpaddr = pmap->pm_pdir[pdirindex];
2554
2555		/*
2556		 * Weed out invalid mappings. Note: we assume that the page
2557		 * directory table is always allocated, and in kernel virtual.
2558		 */
2559		if (ptpaddr == 0)
2560			continue;
2561
2562		/*
2563		 * Check for large page.
2564		 */
2565		if ((ptpaddr & PG_PS) != 0) {
2566			if ((prot & VM_PROT_WRITE) == 0)
2567				pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2568#ifdef PAE
2569			if ((prot & VM_PROT_EXECUTE) == 0)
2570				pmap->pm_pdir[pdirindex] |= pg_nx;
2571#endif
2572			anychanged = 1;
2573			continue;
2574		}
2575
2576		if (pdnxt > eva)
2577			pdnxt = eva;
2578
2579		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2580		    sva += PAGE_SIZE) {
2581			vm_page_t m;
2582
2583retry:
2584			/*
2585			 * Regardless of whether a pte is 32 or 64 bits in
2586			 * size, PG_RW, PG_A, and PG_M are among the least
2587			 * significant 32 bits.
2588			 */
2589			obits = pbits = *pte;
2590			if ((pbits & PG_V) == 0)
2591				continue;
2592
2593			if ((prot & VM_PROT_WRITE) == 0) {
2594				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
2595				    (PG_MANAGED | PG_M | PG_RW)) {
2596					m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) &
2597					    PG_FRAME);
2598					vm_page_dirty(m);
2599				}
2600				pbits &= ~(PG_RW | PG_M);
2601			}
2602#ifdef PAE
2603			if ((prot & VM_PROT_EXECUTE) == 0)
2604				pbits |= pg_nx;
2605#endif
2606
2607			if (pbits != obits) {
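				/*
				 * Re-read the current value, store the new
				 * bits, and retry from the top if the stored
				 * value does not stick, since the entry may
				 * be updated concurrently.
				 */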
2608				obits = *pte;
2609				PT_SET_VA_MA(pte, pbits, TRUE);
2610				if (*pte != pbits)
2611					goto retry;
2612				if (obits & PG_G)
2613					pmap_invalidate_page(pmap, sva);
2614				else
2615					anychanged = 1;
2616			}
2617		}
2618	}
2619	PT_UPDATES_FLUSH();
2620	if (*PMAP1)
2621		PT_SET_VA_MA(PMAP1, 0, TRUE);
2622	if (anychanged)
2623		pmap_invalidate_all(pmap);
2624	sched_unpin();
2625	rw_wunlock(&pvh_global_lock);
2626	PMAP_UNLOCK(pmap);
2627}
2628
2629/*
2630 *	Insert the given physical page (p) at
2631 *	the specified virtual address (v) in the
2632 *	target physical map with the protection requested.
2633 *
2634 *	If specified, the page will be wired down, meaning
2635 *	that the related pte can not be reclaimed.
2636 *
2637 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2638 *	or lose information.  That is, this routine must actually
2639 *	insert this page into the given map NOW.
2640 */
2641int
2642pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2643    u_int flags, int8_t psind __unused)
2644{
2645	pd_entry_t *pde;
2646	pt_entry_t *pte;
2647	pt_entry_t newpte, origpte;
2648	pv_entry_t pv;
2649	vm_paddr_t opa, pa;
2650	vm_page_t mpte, om;
2651	boolean_t invlva, wired;
2652
2653	CTR5(KTR_PMAP,
2654	    "pmap_enter: pmap=%08p va=0x%08x ma=0x%08x prot=0x%x flags=0x%x",
2655	    pmap, va, VM_PAGE_TO_MACH(m), prot, flags);
2656	va = trunc_page(va);
2657	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2658	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
2659	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
2660	    va));
2661	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
2662		VM_OBJECT_ASSERT_LOCKED(m->object);
2663
2664	mpte = NULL;
2665	wired = (flags & PMAP_ENTER_WIRED) != 0;
2666
2667	rw_wlock(&pvh_global_lock);
2668	PMAP_LOCK(pmap);
2669	sched_pin();
2670
2671	/*
2672	 * In the case that a page table page is not
2673	 * resident, we are creating it here.
2674	 */
2675	if (va < VM_MAXUSER_ADDRESS) {
2676		mpte = pmap_allocpte(pmap, va, flags);
2677		if (mpte == NULL) {
2678			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
2679			    ("pmap_allocpte failed with sleep allowed"));
2680			sched_unpin();
2681			rw_wunlock(&pvh_global_lock);
2682			PMAP_UNLOCK(pmap);
2683			return (KERN_RESOURCE_SHORTAGE);
2684		}
2685	}
2686
2687	pde = pmap_pde(pmap, va);
2688	if ((*pde & PG_PS) != 0)
2689		panic("pmap_enter: attempted pmap_enter on 4MB page");
2690	pte = pmap_pte_quick(pmap, va);
2691
2692	/*
2693	 * Page directory table entry is not valid; we need a new PT page.
2694	 */
2695	if (pte == NULL) {
2696		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
2697			(uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va);
2698	}
2699
2700	pa = VM_PAGE_TO_PHYS(m);
2701	om = NULL;
2702	opa = origpte = 0;
2703
2704#if 0
2705	KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx",
2706		pte, *pte));
2707#endif
2708	origpte = *pte;
2709	if (origpte)
2710		origpte = xpmap_mtop(origpte);
2711	opa = origpte & PG_FRAME;
2712
2713	/*
2714	 * Mapping has not changed, must be protection or wiring change.
2715	 */
2716	if (origpte && (opa == pa)) {
2717		/*
2718		 * Wiring change, just update stats. We don't worry about
2719		 * wiring PT pages as they remain resident as long as there
2720		 * are valid mappings in them. Hence, if a user page is wired,
2721		 * the PT page will be also.
2722		 */
2723		if (wired && ((origpte & PG_W) == 0))
2724			pmap->pm_stats.wired_count++;
2725		else if (!wired && (origpte & PG_W))
2726			pmap->pm_stats.wired_count--;
2727
2728		/*
2729		 * Remove extra pte reference
2730		 */
2731		if (mpte)
2732			mpte->wire_count--;
2733
2734		if (origpte & PG_MANAGED) {
2735			om = m;
2736			pa |= PG_MANAGED;
2737		}
2738		goto validate;
2739	}
2740
2741	pv = NULL;
2742
2743	/*
2744	 * Mapping has changed, invalidate old range and fall through to
2745	 * handle validating new mapping.
2746	 */
2747	if (opa) {
2748		if (origpte & PG_W)
2749			pmap->pm_stats.wired_count--;
2750		if (origpte & PG_MANAGED) {
2751			om = PHYS_TO_VM_PAGE(opa);
2752			pv = pmap_pvh_remove(&om->md, pmap, va);
2753		} else if (va < VM_MAXUSER_ADDRESS)
2754			printf("va=0x%x is unmanaged :-( \n", va);
2755
2756		if (mpte != NULL) {
2757			mpte->wire_count--;
2758			KASSERT(mpte->wire_count > 0,
2759			    ("pmap_enter: missing reference to page table page,"
2760			     " va: 0x%x", va));
2761		}
2762	} else
2763		pmap->pm_stats.resident_count++;
2764
2765	/*
2766	 * Enter on the PV list if part of our managed memory.
2767	 */
2768	if ((m->oflags & VPO_UNMANAGED) == 0) {
2769		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2770		    ("pmap_enter: managed mapping within the clean submap"));
2771		if (pv == NULL)
2772			pv = get_pv_entry(pmap, FALSE);
2773		pv->pv_va = va;
2774		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2775		pa |= PG_MANAGED;
2776	} else if (pv != NULL)
2777		free_pv_entry(pmap, pv);
2778
2779	/*
2780	 * Increment counters
2781	 */
2782	if (wired)
2783		pmap->pm_stats.wired_count++;
2784
2785validate:
2786	/*
2787	 * Now validate mapping with desired protection/wiring.
2788	 */
2789	newpte = (pt_entry_t)(pa | PG_V);
2790	if ((prot & VM_PROT_WRITE) != 0) {
2791		newpte |= PG_RW;
2792		if ((newpte & PG_MANAGED) != 0)
2793			vm_page_aflag_set(m, PGA_WRITEABLE);
2794	}
2795#ifdef PAE
2796	if ((prot & VM_PROT_EXECUTE) == 0)
2797		newpte |= pg_nx;
2798#endif
2799	if (wired)
2800		newpte |= PG_W;
2801	if (va < VM_MAXUSER_ADDRESS)
2802		newpte |= PG_U;
2803	if (pmap == kernel_pmap)
2804		newpte |= pgeflag;
2805
2806	critical_enter();
2807	/*
2808	 * if the mapping or permission bits are different, we need
2809	 * to update the pte.
2810	 */
2811	if ((origpte & ~(PG_M|PG_A)) != newpte) {
2812		if (origpte) {
2813			invlva = FALSE;
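			/*
			 * Decide whether the old mapping must be flushed
			 * from the TLB: a previously accessed (PG_A) or
			 * dirty (PG_M | PG_RW) entry may be cached and must
			 * be invalidated when the physical page changes or
			 * the permissions become more restrictive.
			 */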
2814			origpte = *pte;
2815			PT_SET_VA(pte, newpte | PG_A, FALSE);
2816			if (origpte & PG_A) {
2817				if (origpte & PG_MANAGED)
2818					vm_page_aflag_set(om, PGA_REFERENCED);
2819				if (opa != VM_PAGE_TO_PHYS(m))
2820					invlva = TRUE;
2821#ifdef PAE
2822				if ((origpte & PG_NX) == 0 &&
2823				    (newpte & PG_NX) != 0)
2824					invlva = TRUE;
2825#endif
2826			}
2827			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2828				if ((origpte & PG_MANAGED) != 0)
2829					vm_page_dirty(om);
2830				if ((prot & VM_PROT_WRITE) == 0)
2831					invlva = TRUE;
2832			}
2833			if ((origpte & PG_MANAGED) != 0 &&
2834			    TAILQ_EMPTY(&om->md.pv_list))
2835				vm_page_aflag_clear(om, PGA_WRITEABLE);
2836			if (invlva)
2837				pmap_invalidate_page(pmap, va);
2838		} else {
2839			PT_SET_VA(pte, newpte | PG_A, FALSE);
2840		}
2841
2842	}
2843	PT_UPDATES_FLUSH();
2844	critical_exit();
2845	if (*PMAP1)
2846		PT_SET_VA_MA(PMAP1, 0, TRUE);
2847	sched_unpin();
2848	rw_wunlock(&pvh_global_lock);
2849	PMAP_UNLOCK(pmap);
2850	return (KERN_SUCCESS);
2851}
2852
2853/*
2854 * Maps a sequence of resident pages belonging to the same object.
2855 * The sequence begins with the given page m_start.  This page is
2856 * mapped at the given virtual address start.  Each subsequent page is
2857 * mapped at a virtual address that is offset from start by the same
2858 * amount as the page is offset from m_start within the object.  The
2859 * last page in the sequence is the page with the largest offset from
2860 * m_start that can be mapped at a virtual address less than the given
2861 * virtual address end.  Not every virtual page between start and end
2862 * is mapped; only those for which a resident page exists with the
2863 * corresponding offset from m_start are mapped.
2864 */
2865void
2866pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2867    vm_page_t m_start, vm_prot_t prot)
2868{
2869	vm_page_t m, mpte;
2870	vm_pindex_t diff, psize;
2871	multicall_entry_t mcl[16];
2872	multicall_entry_t *mclp = mcl;
2873	int error, count = 0;
2874
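	/*
	 * Mappings are accumulated as update_va_mapping multicall entries
	 * and flushed to the hypervisor in batches of up to 16, reducing
	 * the number of hypercalls needed for the range.
	 */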
2875	VM_OBJECT_ASSERT_LOCKED(m_start->object);
2876
2877	psize = atop(end - start);
2878	mpte = NULL;
2879	m = m_start;
2880	rw_wlock(&pvh_global_lock);
2881	PMAP_LOCK(pmap);
2882	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2883		mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m,
2884		    prot, mpte);
2885		m = TAILQ_NEXT(m, listq);
2886		if (count == 16) {
2887			error = HYPERVISOR_multicall(mcl, count);
2888			KASSERT(error == 0, ("bad multicall %d", error));
2889			mclp = mcl;
2890			count = 0;
2891		}
2892	}
2893	if (count) {
2894		error = HYPERVISOR_multicall(mcl, count);
2895		KASSERT(error == 0, ("bad multicall %d", error));
2896	}
2897	rw_wunlock(&pvh_global_lock);
2898	PMAP_UNLOCK(pmap);
2899}
2900
2901/*
2902 * This code makes some *MAJOR* assumptions:
2903 * 1. The target pmap is the current pmap and it exists.
2904 * 2. Not wired.
2905 * 3. Read access.
2906 * 4. No page table pages.
2907 * but is *MUCH* faster than pmap_enter...
2908 */
2909
2910void
2911pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2912{
2913	multicall_entry_t mcl, *mclp;
2914	int count = 0;
2915	mclp = &mcl;
2916
2917	CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x",
2918	    pmap, va, m, prot);
2919
2920	rw_wlock(&pvh_global_lock);
2921	PMAP_LOCK(pmap);
2922	(void)pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL);
2923	if (count)
2924		HYPERVISOR_multicall(&mcl, count);
2925	rw_wunlock(&pvh_global_lock);
2926	PMAP_UNLOCK(pmap);
2927}
2928
2929#ifdef notyet
2930void
2931pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count)
2932{
2933	int i, error, index = 0;
2934	multicall_entry_t mcl[16];
2935	multicall_entry_t *mclp = mcl;
2936
2937	PMAP_LOCK(pmap);
2938	for (i = 0; i < count; i++, addrs++, pages++, prots++) {
2939		if (!pmap_is_prefaultable_locked(pmap, *addrs))
2940			continue;
2941
2942		(void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL);
2943		if (index == 16) {
2944			error = HYPERVISOR_multicall(mcl, index);
2945			mclp = mcl;
2946			index = 0;
2947			KASSERT(error == 0, ("bad multicall %d", error));
2948		}
2949	}
2950	if (index) {
2951		error = HYPERVISOR_multicall(mcl, index);
2952		KASSERT(error == 0, ("bad multicall %d", error));
2953	}
2954
2955	PMAP_UNLOCK(pmap);
2956}
2957#endif
2958
2959static vm_page_t
2960pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m,
2961    vm_prot_t prot, vm_page_t mpte)
2962{
2963	pt_entry_t *pte;
2964	vm_paddr_t pa;
2965	vm_page_t free;
2966	multicall_entry_t *mcl = *mclpp;
2967
2968	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2969	    (m->oflags & VPO_UNMANAGED) != 0,
2970	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2971	rw_assert(&pvh_global_lock, RA_WLOCKED);
2972	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2973
2974	/*
2975	 * In the case that a page table page is not
2976	 * resident, we are creating it here.
2977	 */
2978	if (va < VM_MAXUSER_ADDRESS) {
2979		u_int ptepindex;
2980		pd_entry_t ptema;
2981
2982		/*
2983		 * Calculate pagetable page index
2984		 */
2985		ptepindex = va >> PDRSHIFT;
2986		if (mpte && (mpte->pindex == ptepindex)) {
2987			mpte->wire_count++;
2988		} else {
2989			/*
2990			 * Get the page directory entry
2991			 */
2992			ptema = pmap->pm_pdir[ptepindex];
2993
2994			/*
2995			 * If the page table page is mapped, we just increment
2996			 * the hold count, and activate it.
2997			 */
2998			if (ptema & PG_V) {
2999				if (ptema & PG_PS)
3000					panic("pmap_enter_quick: unexpected mapping into 4MB page");
3001				mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME);
3002				mpte->wire_count++;
3003			} else {
3004				mpte = _pmap_allocpte(pmap, ptepindex,
3005				    PMAP_ENTER_NOSLEEP);
3006				if (mpte == NULL)
3007					return (mpte);
3008			}
3009		}
3010	} else {
3011		mpte = NULL;
3012	}
3013
3014	/*
3015	 * This call to vtopte makes the assumption that we are
3016	 * entering the page into the current pmap.  In order to support
3017	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3018	 * But that isn't as quick as vtopte.
3019	 */
3020	KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap"));
3021	pte = vtopte(va);
3022	if (*pte & PG_V) {
3023		if (mpte != NULL) {
3024			mpte->wire_count--;
3025			mpte = NULL;
3026		}
3027		return (mpte);
3028	}
3029
3030	/*
3031	 * Enter on the PV list if part of our managed memory.
3032	 */
3033	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3034	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3035		if (mpte != NULL) {
3036			free = NULL;
3037			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3038				pmap_invalidate_page(pmap, va);
3039				pmap_free_zero_pages(free);
3040			}
3041
3042			mpte = NULL;
3043		}
3044		return (mpte);
3045	}
3046
3047	/*
3048	 * Increment counters
3049	 */
3050	pmap->pm_stats.resident_count++;
3051
3052	pa = VM_PAGE_TO_PHYS(m);
3053#ifdef PAE
3054	if ((prot & VM_PROT_EXECUTE) == 0)
3055		pa |= pg_nx;
3056#endif
3057
3058#if 0
3059	/*
3060	 * Now validate mapping with RO protection
3061	 */
3062	if ((m->oflags & VPO_UNMANAGED) != 0)
3063		pte_store(pte, pa | PG_V | PG_U);
3064	else
3065		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3066#else
3067	/*
3068	 * Now validate mapping with RO protection
3069	 */
3070	if ((m->oflags & VPO_UNMANAGED) != 0)
3071		pa = xpmap_ptom(pa | PG_V | PG_U);
3072	else
3073		pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED);
3074
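	/*
	 * Instead of storing the PTE directly, queue an update_va_mapping
	 * multicall entry; the caller submits the batch to the hypervisor.
	 */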
3075	mcl->op = __HYPERVISOR_update_va_mapping;
3076	mcl->args[0] = va;
3077	mcl->args[1] = (uint32_t)(pa & 0xffffffff);
3078	mcl->args[2] = (uint32_t)(pa >> 32);
3079	mcl->args[3] = 0;
3080	*mclpp = mcl + 1;
3081	*count = *count + 1;
3082#endif
3083	return (mpte);
3084}
3085
3086/*
3087 * Make a temporary mapping for a physical address.  This is only intended
3088 * to be used for panic dumps.
3089 */
3090void *
3091pmap_kenter_temporary(vm_paddr_t pa, int i)
3092{
3093	vm_offset_t va;
3094	vm_paddr_t ma = xpmap_ptom(pa);
3095
3096	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3097	PT_SET_MA(va, (ma & ~PAGE_MASK) | PG_V | pgeflag);
3098	invlpg(va);
3099	return ((void *)crashdumpmap);
3100}
3101
3102/*
3103 * This code maps large physical mmap regions into the
3104 * processor address space.  Note that some shortcuts
3105 * are taken, but the code works.
3106 */
3107void
3108pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3109    vm_pindex_t pindex, vm_size_t size)
3110{
3111	pd_entry_t *pde;
3112	vm_paddr_t pa, ptepa;
3113	vm_page_t p;
3114	int pat_mode;
3115
3116	VM_OBJECT_ASSERT_WLOCKED(object);
3117	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3118	    ("pmap_object_init_pt: non-device object"));
3119	if (pseflag &&
3120	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3121		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3122			return;
3123		p = vm_page_lookup(object, pindex);
3124		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3125		    ("pmap_object_init_pt: invalid page %p", p));
3126		pat_mode = p->md.pat_mode;
3127
3128		/*
3129		 * Abort the mapping if the first page is not physically
3130		 * aligned to a 2/4MB page boundary.
3131		 */
3132		ptepa = VM_PAGE_TO_PHYS(p);
3133		if (ptepa & (NBPDR - 1))
3134			return;
3135
3136		/*
3137		 * Skip the first page.  Abort the mapping if the rest of
3138		 * the pages are not physically contiguous or have differing
3139		 * memory attributes.
3140		 */
3141		p = TAILQ_NEXT(p, listq);
3142		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3143		    pa += PAGE_SIZE) {
3144			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3145			    ("pmap_object_init_pt: invalid page %p", p));
3146			if (pa != VM_PAGE_TO_PHYS(p) ||
3147			    pat_mode != p->md.pat_mode)
3148				return;
3149			p = TAILQ_NEXT(p, listq);
3150		}
3151
3152		/*
3153		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3154		 * "size" is a multiple of 2/4M, adding the PAT setting to
3155		 * "pa" will not affect the termination of this loop.
3156		 */
3157		PMAP_LOCK(pmap);
3158		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3159		    size; pa += NBPDR) {
3160			pde = pmap_pde(pmap, addr);
3161			if (*pde == 0) {
3162				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3163				    PG_U | PG_RW | PG_V);
3164				pmap->pm_stats.resident_count += NBPDR /
3165				    PAGE_SIZE;
3166				pmap_pde_mappings++;
3167			}
3168			/* Else continue on if the PDE is already valid. */
3169			addr += NBPDR;
3170		}
3171		PMAP_UNLOCK(pmap);
3172	}
3173}
3174
3175/*
3176 *	Clear the wired attribute from the mappings for the specified range of
3177 *	addresses in the given pmap.  Every valid mapping within that range
3178 *	must have the wired attribute set.  In contrast, invalid mappings
3179 *	cannot have the wired attribute set, so they are ignored.
3180 *
3181 *	The wired attribute of the page table entry is not a hardware feature,
3182 *	so there is no need to invalidate any TLB entries.
3183 */
3184void
3185pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3186{
3187	vm_offset_t pdnxt;
3188	pd_entry_t *pde;
3189	pt_entry_t *pte;
3190
3191	CTR3(KTR_PMAP, "pmap_unwire: pmap=%p sva=0x%x eva=0x%x", pmap, sva,
3192	    eva);
3193	rw_wlock(&pvh_global_lock);
3194	sched_pin();
3195	PMAP_LOCK(pmap);
3196	for (; sva < eva; sva = pdnxt) {
3197		pdnxt = (sva + NBPDR) & ~PDRMASK;
3198		if (pdnxt < sva)
3199			pdnxt = eva;
3200		pde = pmap_pde(pmap, sva);
3201		if ((*pde & PG_V) == 0)
3202			continue;
3203		if ((*pde & PG_PS) != 0)
3204			panic("pmap_unwire: unexpected PG_PS in pde %#jx",
3205			    (uintmax_t)*pde);
3206		if (pdnxt > eva)
3207			pdnxt = eva;
3208		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3209		    sva += PAGE_SIZE) {
3210			if ((*pte & PG_V) == 0)
3211				continue;
3212			if ((*pte & PG_W) == 0)
3213				panic("pmap_unwire: pte %#jx is missing PG_W",
3214				    (uintmax_t)*pte);
3215			PT_SET_VA_MA(pte, *pte & ~PG_W, FALSE);
3216			pmap->pm_stats.wired_count--;
3217		}
3218	}
3219	if (*PMAP1)
3220		PT_CLEAR_VA(PMAP1, FALSE);
3221	PT_UPDATES_FLUSH();
3222	sched_unpin();
3223	rw_wunlock(&pvh_global_lock);
3224	PMAP_UNLOCK(pmap);
3225}
3226
3227
3228/*
3229 *	Copy the range specified by src_addr/len
3230 *	from the source map to the range dst_addr/len
3231 *	in the destination map.
3232 *
3233 *	This routine is only advisory and need not do anything.
3234 */
3235
3236void
3237pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3238    vm_offset_t src_addr)
3239{
3240	vm_page_t   free;
3241	vm_offset_t addr;
3242	vm_offset_t end_addr = src_addr + len;
3243	vm_offset_t pdnxt;
3244
3245	if (dst_addr != src_addr)
3246		return;
3247
3248	if (!pmap_is_current(src_pmap)) {
3249		CTR2(KTR_PMAP,
3250		    "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx",
3251		    (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME));
3252
3253		return;
3254	}
3255	CTR5(KTR_PMAP, "pmap_copy:  dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x",
3256	    dst_pmap, src_pmap, dst_addr, len, src_addr);
3257
3258#ifdef HAMFISTED_LOCKING
3259	mtx_lock(&createdelete_lock);
3260#endif
3261
3262	rw_wlock(&pvh_global_lock);
3263	if (dst_pmap < src_pmap) {
3264		PMAP_LOCK(dst_pmap);
3265		PMAP_LOCK(src_pmap);
3266	} else {
3267		PMAP_LOCK(src_pmap);
3268		PMAP_LOCK(dst_pmap);
3269	}
3270	sched_pin();
3271	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3272		pt_entry_t *src_pte, *dst_pte;
3273		vm_page_t dstmpte, srcmpte;
3274		pd_entry_t srcptepaddr;
3275		u_int ptepindex;
3276
3277		KASSERT(addr < UPT_MIN_ADDRESS,
3278		    ("pmap_copy: invalid to pmap_copy page tables"));
3279
3280		pdnxt = (addr + NBPDR) & ~PDRMASK;
3281		if (pdnxt < addr)
3282			pdnxt = end_addr;
3283		ptepindex = addr >> PDRSHIFT;
3284
3285		srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]);
3286		if (srcptepaddr == 0)
3287			continue;
3288
3289		if (srcptepaddr & PG_PS) {
3290			if (dst_pmap->pm_pdir[ptepindex] == 0) {
3291				PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE);
3292				dst_pmap->pm_stats.resident_count +=
3293				    NBPDR / PAGE_SIZE;
3294			}
3295			continue;
3296		}
3297
3298		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3299		KASSERT(srcmpte->wire_count > 0,
3300		    ("pmap_copy: source page table page is unused"));
3301
3302		if (pdnxt > end_addr)
3303			pdnxt = end_addr;
3304
3305		src_pte = vtopte(addr);
3306		while (addr < pdnxt) {
3307			pt_entry_t ptetemp;
3308			ptetemp = *src_pte;
3309			/*
3310			 * We only virtually copy managed pages.
3311			 */
3312			if ((ptetemp & PG_MANAGED) != 0) {
3313				dstmpte = pmap_allocpte(dst_pmap, addr,
3314				    PMAP_ENTER_NOSLEEP);
3315				if (dstmpte == NULL)
3316					goto out;
3317				dst_pte = pmap_pte_quick(dst_pmap, addr);
3318				if (*dst_pte == 0 &&
3319				    pmap_try_insert_pv_entry(dst_pmap, addr,
3320				    PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) {
3321					/*
3322					 * Clear the wired, modified, and
3323					 * accessed (referenced) bits
3324					 * during the copy.
3325					 */
3326					KASSERT(ptetemp != 0, ("src_pte not set"));
3327					PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */);
3328					KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)),
3329					    ("no pmap copy expected: 0x%jx saw: 0x%jx",
3330						ptetemp &  ~(PG_W | PG_M | PG_A), *dst_pte));
3331					dst_pmap->pm_stats.resident_count++;
3332	 			} else {
3333					free = NULL;
3334					if (pmap_unwire_ptp(dst_pmap, dstmpte,
3335					    &free)) {
3336						pmap_invalidate_page(dst_pmap,
3337						    addr);
3338						pmap_free_zero_pages(free);
3339					}
3340					goto out;
3341				}
3342				if (dstmpte->wire_count >= srcmpte->wire_count)
3343					break;
3344			}
3345			addr += PAGE_SIZE;
3346			src_pte++;
3347		}
3348	}
3349out:
3350	PT_UPDATES_FLUSH();
3351	sched_unpin();
3352	rw_wunlock(&pvh_global_lock);
3353	PMAP_UNLOCK(src_pmap);
3354	PMAP_UNLOCK(dst_pmap);
3355
3356#ifdef HAMFISTED_LOCKING
3357	mtx_unlock(&createdelete_lock);
3358#endif
3359}
3360
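/*
 * Zero a page using the best routine available: SSE2-based zeroing when
 * the CPU supports it, the i686-optimized routine on other 686-class
 * CPUs, and plain bzero() otherwise.
 */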
3361static __inline void
3362pagezero(void *page)
3363{
3364#if defined(I686_CPU)
3365	if (cpu_class == CPUCLASS_686) {
3366#if defined(CPU_ENABLE_SSE)
3367		if (cpu_feature & CPUID_SSE2)
3368			sse2_pagezero(page);
3369		else
3370#endif
3371			i686_pagezero(page);
3372	} else
3373#endif
3374		bzero(page, PAGE_SIZE);
3375}
3376
3377/*
3378 *	pmap_zero_page zeros the specified hardware page by mapping
3379 *	the page into KVM and using bzero to clear its contents.
3380 */
3381void
3382pmap_zero_page(vm_page_t m)
3383{
3384	struct sysmaps *sysmaps;
3385
3386	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
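	/*
	 * Use this CPU's private CMAP2/CADDR2 mapping window; sched_pin()
	 * keeps the thread on this CPU while the temporary mapping is in
	 * place.
	 */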
3387	mtx_lock(&sysmaps->lock);
3388	if (*sysmaps->CMAP2)
3389		panic("pmap_zero_page: CMAP2 busy");
3390	sched_pin();
3391	PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M);
3392	pagezero(sysmaps->CADDR2);
3393	PT_SET_MA(sysmaps->CADDR2, 0);
3394	sched_unpin();
3395	mtx_unlock(&sysmaps->lock);
3396}
3397
3398/*
3399 *	pmap_zero_page_area zeros the specified hardware page by mapping
3400 *	the page into KVM and using bzero to clear its contents.
3401 *
3402 *	off and size may not cover an area beyond a single hardware page.
3403 */
3404void
3405pmap_zero_page_area(vm_page_t m, int off, int size)
3406{
3407	struct sysmaps *sysmaps;
3408
3409	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3410	mtx_lock(&sysmaps->lock);
3411	if (*sysmaps->CMAP2)
3412		panic("pmap_zero_page_area: CMAP2 busy");
3413	sched_pin();
3414	PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M);
3415
3416	if (off == 0 && size == PAGE_SIZE)
3417		pagezero(sysmaps->CADDR2);
3418	else
3419		bzero((char *)sysmaps->CADDR2 + off, size);
3420	PT_SET_MA(sysmaps->CADDR2, 0);
3421	sched_unpin();
3422	mtx_unlock(&sysmaps->lock);
3423}
3424
3425/*
3426 *	pmap_zero_page_idle zeros the specified hardware page by mapping
3427 *	the page into KVM and using bzero to clear its contents.  This
3428 *	is intended to be called from the vm_pagezero process only and
3429 *	outside of Giant.
3430 */
3431void
3432pmap_zero_page_idle(vm_page_t m)
3433{
3434
3435	if (*CMAP3)
3436		panic("pmap_zero_page_idle: CMAP3 busy");
3437	sched_pin();
3438	PT_SET_MA(CADDR3, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M);
3439	pagezero(CADDR3);
3440	PT_SET_MA(CADDR3, 0);
3441	sched_unpin();
3442}
3443
3444/*
3445 *	pmap_copy_page copies the specified (machine independent)
3446 *	page by mapping the page into virtual memory and using
3447 *	bcopy to copy the page, one machine dependent page at a
3448 *	time.
3449 */
3450void
3451pmap_copy_page(vm_page_t src, vm_page_t dst)
3452{
3453	struct sysmaps *sysmaps;
3454
3455	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3456	mtx_lock(&sysmaps->lock);
3457	if (*sysmaps->CMAP1)
3458		panic("pmap_copy_page: CMAP1 busy");
3459	if (*sysmaps->CMAP2)
3460		panic("pmap_copy_page: CMAP2 busy");
3461	sched_pin();
3462	PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(src) | PG_A);
3463	PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(dst) | PG_A | PG_M);
3464	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
3465	PT_SET_MA(sysmaps->CADDR1, 0);
3466	PT_SET_MA(sysmaps->CADDR2, 0);
3467	sched_unpin();
3468	mtx_unlock(&sysmaps->lock);
3469}
3470
3471int unmapped_buf_allowed = 1;
3472
3473void
3474pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3475    vm_offset_t b_offset, int xfersize)
3476{
3477	struct sysmaps *sysmaps;
3478	vm_page_t a_pg, b_pg;
3479	char *a_cp, *b_cp;
3480	vm_offset_t a_pg_offset, b_pg_offset;
3481	int cnt;
3482
3483	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3484	mtx_lock(&sysmaps->lock);
3485	if (*sysmaps->CMAP1 != 0)
3486		panic("pmap_copy_pages: CMAP1 busy");
3487	if (*sysmaps->CMAP2 != 0)
3488		panic("pmap_copy_pages: CMAP2 busy");
3489	sched_pin();
3490	while (xfersize > 0) {
3491		a_pg = ma[a_offset >> PAGE_SHIFT];
3492		a_pg_offset = a_offset & PAGE_MASK;
3493		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3494		b_pg = mb[b_offset >> PAGE_SHIFT];
3495		b_pg_offset = b_offset & PAGE_MASK;
3496		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3497		PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(a_pg) | PG_A);
3498		PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW |
3499		    VM_PAGE_TO_MACH(b_pg) | PG_A | PG_M);
3500		a_cp = sysmaps->CADDR1 + a_pg_offset;
3501		b_cp = sysmaps->CADDR2 + b_pg_offset;
3502		bcopy(a_cp, b_cp, cnt);
3503		a_offset += cnt;
3504		b_offset += cnt;
3505		xfersize -= cnt;
3506	}
3507	PT_SET_MA(sysmaps->CADDR1, 0);
3508	PT_SET_MA(sysmaps->CADDR2, 0);
3509	sched_unpin();
3510	mtx_unlock(&sysmaps->lock);
3511}
3512
3513/*
3514 * Returns true if the pmap's pv is one of the first
3515 * 16 pvs linked to from this page.  This count may
3516 * be changed upwards or downwards in the future; it
3517 * is only necessary that true be returned for a small
3518 * subset of pmaps for proper page aging.
3519 */
3520boolean_t
3521pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3522{
3523	pv_entry_t pv;
3524	int loops = 0;
3525	boolean_t rv;
3526
3527	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3528	    ("pmap_page_exists_quick: page %p is not managed", m));
3529	rv = FALSE;
3530	rw_wlock(&pvh_global_lock);
3531	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3532		if (PV_PMAP(pv) == pmap) {
3533			rv = TRUE;
3534			break;
3535		}
3536		loops++;
3537		if (loops >= 16)
3538			break;
3539	}
3540	rw_wunlock(&pvh_global_lock);
3541	return (rv);
3542}
3543
3544/*
3545 *	pmap_page_wired_mappings:
3546 *
3547 *	Return the number of managed mappings to the given physical page
3548 *	that are wired.
3549 */
3550int
3551pmap_page_wired_mappings(vm_page_t m)
3552{
3553	pv_entry_t pv;
3554	pt_entry_t *pte;
3555	pmap_t pmap;
3556	int count;
3557
3558	count = 0;
3559	if ((m->oflags & VPO_UNMANAGED) != 0)
3560		return (count);
3561	rw_wlock(&pvh_global_lock);
3562	sched_pin();
3563	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3564		pmap = PV_PMAP(pv);
3565		PMAP_LOCK(pmap);
3566		pte = pmap_pte_quick(pmap, pv->pv_va);
3567		if ((*pte & PG_W) != 0)
3568			count++;
3569		PMAP_UNLOCK(pmap);
3570	}
3571	sched_unpin();
3572	rw_wunlock(&pvh_global_lock);
3573	return (count);
3574}
3575
3576/*
3577 * Returns TRUE if the given page is mapped.  Otherwise, returns FALSE.
3578 */
3579boolean_t
3580pmap_page_is_mapped(vm_page_t m)
3581{
3582
3583	if ((m->oflags & VPO_UNMANAGED) != 0)
3584		return (FALSE);
3585	return (!TAILQ_EMPTY(&m->md.pv_list));
3586}
3587
3588/*
3589 * Remove all pages from the specified address space;
3590 * this aids process exit speeds.  Also, this code
3591 * is special cased for current process only, but
3592 * can have the more generic (and slightly slower)
3593 * mode enabled.  This is much faster than pmap_remove
3594 * in the case of running down an entire address space.
3595 */
3596void
3597pmap_remove_pages(pmap_t pmap)
3598{
3599	pt_entry_t *pte, tpte;
3600	vm_page_t m, free = NULL;
3601	pv_entry_t pv;
3602	struct pv_chunk *pc, *npc;
3603	int field, idx;
3604	int32_t bit;
3605	uint32_t inuse, bitmask;
3606	int allfree;
3607
3608	CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap);
3609
3610	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3611		printf("warning: pmap_remove_pages called with non-current pmap\n");
3612		return;
3613	}
3614	rw_wlock(&pvh_global_lock);
3615	KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap"));
3616	PMAP_LOCK(pmap);
3617	sched_pin();
3618	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3619		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
3620		    pc->pc_pmap));
3621		allfree = 1;
3622		for (field = 0; field < _NPCM; field++) {
3623			inuse = ~pc->pc_map[field] & pc_freemask[field];
3624			while (inuse != 0) {
3625				bit = bsfl(inuse);
3626				bitmask = 1UL << bit;
3627				idx = field * 32 + bit;
3628				pv = &pc->pc_pventry[idx];
3629				inuse &= ~bitmask;
3630
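				/*
				 * PTEs hold machine addresses under Xen;
				 * convert to a physical address before
				 * deriving the vm_page.
				 */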
3631				pte = vtopte(pv->pv_va);
3632				tpte = *pte ? xpmap_mtop(*pte) : 0;
3633
3634				if (tpte == 0) {
3635					printf(
3636					    "TPTE at %p  IS ZERO @ VA %08x\n",
3637					    pte, pv->pv_va);
3638					panic("bad pte");
3639				}
3640
3641/*
3642 * We cannot remove wired pages from a process' mapping at this time
3643 */
3644				if (tpte & PG_W) {
3645					allfree = 0;
3646					continue;
3647				}
3648
3649				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3650				KASSERT(m->phys_addr == (tpte & PG_FRAME),
3651				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3652				    m, (uintmax_t)m->phys_addr,
3653				    (uintmax_t)tpte));
3654
3655				KASSERT(m < &vm_page_array[vm_page_array_size],
3656					("pmap_remove_pages: bad tpte %#jx",
3657					(uintmax_t)tpte));
3658
3659
3660				PT_CLEAR_VA(pte, FALSE);
3661
3662				/*
3663				 * Update the vm_page_t clean/reference bits.
3664				 */
3665				if (tpte & PG_M)
3666					vm_page_dirty(m);
3667
3668				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3669				if (TAILQ_EMPTY(&m->md.pv_list))
3670					vm_page_aflag_clear(m, PGA_WRITEABLE);
3671
3672				pmap_unuse_pt(pmap, pv->pv_va, &free);
3673
3674				/* Mark free */
3675				PV_STAT(pv_entry_frees++);
3676				PV_STAT(pv_entry_spare++);
3677				pv_entry_count--;
3678				pc->pc_map[field] |= bitmask;
3679				pmap->pm_stats.resident_count--;
3680			}
3681		}
3682		PT_UPDATES_FLUSH();
3683		if (allfree) {
3684			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3685			free_pv_chunk(pc);
3686		}
3687	}
3688	PT_UPDATES_FLUSH();
3689	if (*PMAP1)
3690		PT_SET_MA(PADDR1, 0);
3691
3692	sched_unpin();
3693	pmap_invalidate_all(pmap);
3694	rw_wunlock(&pvh_global_lock);
3695	PMAP_UNLOCK(pmap);
3696	pmap_free_zero_pages(free);
3697}
3698
3699/*
3700 *	pmap_is_modified:
3701 *
3702 *	Return whether or not the specified physical page was modified
3703 *	in any physical maps.
3704 */
3705boolean_t
3706pmap_is_modified(vm_page_t m)
3707{
3708	pv_entry_t pv;
3709	pt_entry_t *pte;
3710	pmap_t pmap;
3711	boolean_t rv;
3712
3713	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3714	    ("pmap_is_modified: page %p is not managed", m));
3715	rv = FALSE;
3716
3717	/*
3718	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3719	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
3720	 * is clear, no PTEs can have PG_M set.
3721	 */
3722	VM_OBJECT_ASSERT_WLOCKED(m->object);
3723	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3724		return (rv);
3725	rw_wlock(&pvh_global_lock);
3726	sched_pin();
3727	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3728		pmap = PV_PMAP(pv);
3729		PMAP_LOCK(pmap);
3730		pte = pmap_pte_quick(pmap, pv->pv_va);
3731		rv = (*pte & PG_M) != 0;
3732		PMAP_UNLOCK(pmap);
3733		if (rv)
3734			break;
3735	}
3736	if (*PMAP1)
3737		PT_SET_MA(PADDR1, 0);
3738	sched_unpin();
3739	rw_wunlock(&pvh_global_lock);
3740	return (rv);
3741}
3742
3743/*
3744 *	pmap_is_prefaultable:
3745 *
3746 *	Return whether or not the specified virtual address is eligible
3747 *	for prefault.
3748 */
3749static boolean_t
3750pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr)
3751{
3752	pt_entry_t *pte;
3753	boolean_t rv = FALSE;
3754
3755	return (rv);
3756
3757	if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) {
3758		pte = vtopte(addr);
3759		rv = (*pte == 0);
3760	}
3761	return (rv);
3762}
3763
3764boolean_t
3765pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3766{
3767	boolean_t rv;
3768
3769	PMAP_LOCK(pmap);
3770	rv = pmap_is_prefaultable_locked(pmap, addr);
3771	PMAP_UNLOCK(pmap);
3772	return (rv);
3773}
3774
3775boolean_t
3776pmap_is_referenced(vm_page_t m)
3777{
3778	pv_entry_t pv;
3779	pt_entry_t *pte;
3780	pmap_t pmap;
3781	boolean_t rv;
3782
3783	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3784	    ("pmap_is_referenced: page %p is not managed", m));
3785	rv = FALSE;
3786	rw_wlock(&pvh_global_lock);
3787	sched_pin();
3788	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3789		pmap = PV_PMAP(pv);
3790		PMAP_LOCK(pmap);
3791		pte = pmap_pte_quick(pmap, pv->pv_va);
3792		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
3793		PMAP_UNLOCK(pmap);
3794		if (rv)
3795			break;
3796	}
3797	if (*PMAP1)
3798		PT_SET_MA(PADDR1, 0);
3799	sched_unpin();
3800	rw_wunlock(&pvh_global_lock);
3801	return (rv);
3802}
3803
3804void
3805pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len)
3806{
3807	int i, npages = round_page(len) >> PAGE_SHIFT;
3808	for (i = 0; i < npages; i++) {
3809		pt_entry_t *pte;
3810		pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE));
3811		rw_wlock(&pvh_global_lock);
3812		pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M)));
3813		rw_wunlock(&pvh_global_lock);
3814		PMAP_MARK_PRIV(xpmap_mtop(*pte));
3815		pmap_pte_release(pte);
3816	}
3817}
3818
3819void
3820pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len)
3821{
3822	int i, npages = round_page(len) >> PAGE_SHIFT;
3823	for (i = 0; i < npages; i++) {
3824		pt_entry_t *pte;
3825		pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE));
3826		PMAP_MARK_UNPRIV(xpmap_mtop(*pte));
3827		rw_wlock(&pvh_global_lock);
3828		pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M));
3829		rw_wunlock(&pvh_global_lock);
3830		pmap_pte_release(pte);
3831	}
3832}
3833
3834/*
3835 * Clear the write and modified bits in each of the given page's mappings.
3836 */
3837void
3838pmap_remove_write(vm_page_t m)
3839{
3840	pv_entry_t pv;
3841	pmap_t pmap;
3842	pt_entry_t oldpte, *pte;
3843
3844	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3845	    ("pmap_remove_write: page %p is not managed", m));
3846
3847	/*
3848	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3849	 * set by another thread while the object is locked.  Thus,
3850	 * if PGA_WRITEABLE is clear, no page table entries need updating.
3851	 */
3852	VM_OBJECT_ASSERT_WLOCKED(m->object);
3853	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3854		return;
3855	rw_wlock(&pvh_global_lock);
3856	sched_pin();
3857	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3858		pmap = PV_PMAP(pv);
3859		PMAP_LOCK(pmap);
3860		pte = pmap_pte_quick(pmap, pv->pv_va);
3861retry:
3862		oldpte = *pte;
3863		if ((oldpte & PG_RW) != 0) {
3864			vm_paddr_t newpte = oldpte & ~(PG_RW | PG_M);
3865
3866			/*
3867			 * Regardless of whether a pte is 32 or 64 bits
3868			 * in size, PG_RW and PG_M are among the least
3869			 * significant 32 bits.
3870			 */
3871			PT_SET_VA_MA(pte, newpte, TRUE);
3872			if (*pte != newpte)
3873				goto retry;
3874
3875			if ((oldpte & PG_M) != 0)
3876				vm_page_dirty(m);
3877			pmap_invalidate_page(pmap, pv->pv_va);
3878		}
3879		PMAP_UNLOCK(pmap);
3880	}
3881	vm_page_aflag_clear(m, PGA_WRITEABLE);
3882	PT_UPDATES_FLUSH();
3883	if (*PMAP1)
3884		PT_SET_MA(PADDR1, 0);
3885	sched_unpin();
3886	rw_wunlock(&pvh_global_lock);
3887}
3888
3889/*
3890 *	pmap_ts_referenced:
3891 *
3892 *	Return a count of reference bits for a page, clearing those bits.
3893 *	It is not necessary for every reference bit to be cleared, but it
3894 *	is necessary that 0 only be returned when there are truly no
3895 *	reference bits set.
3896 *
3897 *	XXX: The exact number of bits to check and clear is a matter that
3898 *	should be tested and standardized at some point in the future for
3899 *	optimal aging of shared pages.
3900 */
3901int
3902pmap_ts_referenced(vm_page_t m)
3903{
3904	pv_entry_t pv, pvf, pvn;
3905	pmap_t pmap;
3906	pt_entry_t *pte;
3907	int rtval = 0;
3908
3909	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3910	    ("pmap_ts_referenced: page %p is not managed", m));
3911	rw_wlock(&pvh_global_lock);
3912	sched_pin();
3913	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3914		pvf = pv;
3915		do {
3916			pvn = TAILQ_NEXT(pv, pv_next);
3917			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3918			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3919			pmap = PV_PMAP(pv);
3920			PMAP_LOCK(pmap);
3921			pte = pmap_pte_quick(pmap, pv->pv_va);
3922			if ((*pte & PG_A) != 0) {
3923				PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE);
3924				pmap_invalidate_page(pmap, pv->pv_va);
3925				rtval++;
3926				if (rtval > 4)
3927					pvn = NULL;
3928			}
3929			PMAP_UNLOCK(pmap);
3930		} while ((pv = pvn) != NULL && pv != pvf);
3931	}
3932	PT_UPDATES_FLUSH();
3933	if (*PMAP1)
3934		PT_SET_MA(PADDR1, 0);
3935	sched_unpin();
3936	rw_wunlock(&pvh_global_lock);
3937	return (rtval);
3938}
3939
3940/*
3941 *	Apply the given advice to the specified range of addresses within the
3942 *	given pmap.  Depending on the advice, clear the referenced and/or
3943 *	modified flags in each mapping and set the mapped page's dirty field.
3944 */
3945void
3946pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
3947{
3948	pd_entry_t oldpde;
3949	pt_entry_t *pte;
3950	vm_offset_t pdnxt;
3951	vm_page_t m;
3952	boolean_t anychanged;
3953
3954	if (advice != MADV_DONTNEED && advice != MADV_FREE)
3955		return;
3956	anychanged = FALSE;
3957	rw_wlock(&pvh_global_lock);
3958	sched_pin();
3959	PMAP_LOCK(pmap);
3960	for (; sva < eva; sva = pdnxt) {
3961		pdnxt = (sva + NBPDR) & ~PDRMASK;
3962		if (pdnxt < sva)
3963			pdnxt = eva;
3964		oldpde = pmap->pm_pdir[sva >> PDRSHIFT];
3965		if ((oldpde & (PG_PS | PG_V)) != PG_V)
3966			continue;
3967		if (pdnxt > eva)
3968			pdnxt = eva;
3969		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3970		    sva += PAGE_SIZE) {
3971			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
3972			    PG_V))
3973				continue;
3974			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3975				if (advice == MADV_DONTNEED) {
3976					/*
3977					 * Future calls to pmap_is_modified()
3978					 * can be avoided by making the page
3979					 * dirty now.
3980					 */
3981					m = PHYS_TO_VM_PAGE(xpmap_mtop(*pte) &
3982					    PG_FRAME);
3983					vm_page_dirty(m);
3984				}
3985				PT_SET_VA_MA(pte, *pte & ~(PG_M | PG_A), TRUE);
3986			} else if ((*pte & PG_A) != 0)
3987				PT_SET_VA_MA(pte, *pte & ~PG_A, TRUE);
3988			else
3989				continue;
3990			if ((*pte & PG_G) != 0)
3991				pmap_invalidate_page(pmap, sva);
3992			else
3993				anychanged = TRUE;
3994		}
3995	}
3996	PT_UPDATES_FLUSH();
3997	if (*PMAP1)
3998		PT_SET_VA_MA(PMAP1, 0, TRUE);
3999	if (anychanged)
4000		pmap_invalidate_all(pmap);
4001	sched_unpin();
4002	rw_wunlock(&pvh_global_lock);
4003	PMAP_UNLOCK(pmap);
4004}
4005
4006/*
4007 *	Clear the modify bits on the specified physical page.
4008 */
4009void
4010pmap_clear_modify(vm_page_t m)
4011{
4012	pv_entry_t pv;
4013	pmap_t pmap;
4014	pt_entry_t *pte;
4015
4016	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4017	    ("pmap_clear_modify: page %p is not managed", m));
4018	VM_OBJECT_ASSERT_WLOCKED(m->object);
4019	KASSERT(!vm_page_xbusied(m),
4020	    ("pmap_clear_modify: page %p is exclusive busied", m));
4021
4022	/*
4023	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4024	 * If the object containing the page is locked and the page is not
4025	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4026	 */
4027	if ((m->aflags & PGA_WRITEABLE) == 0)
4028		return;
4029	rw_wlock(&pvh_global_lock);
4030	sched_pin();
4031	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4032		pmap = PV_PMAP(pv);
4033		PMAP_LOCK(pmap);
4034		pte = pmap_pte_quick(pmap, pv->pv_va);
4035		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4036			/*
4037			 * Regardless of whether a pte is 32 or 64 bits
4038			 * in size, PG_M is among the least significant
4039			 * 32 bits.
4040			 */
4041			PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE);
4042			pmap_invalidate_page(pmap, pv->pv_va);
4043		}
4044		PMAP_UNLOCK(pmap);
4045	}
4046	sched_unpin();
4047	rw_wunlock(&pvh_global_lock);
4048}
4049
4050/*
4051 * Miscellaneous support routines follow
4052 */
4053
4054/*
4055 * Map a set of physical memory pages into the kernel virtual
4056 * address space. Return a pointer to where it is mapped. This
4057 * routine is intended to be used for mapping device memory,
4058 * NOT real memory.
4059 */
4060void *
4061pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4062{
4063	vm_offset_t va, offset;
4064	vm_size_t tmpsize;
4065
4066	offset = pa & PAGE_MASK;
4067	size = round_page(offset + size);
4068	pa = pa & PG_FRAME;
4069
4070	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4071		va = KERNBASE + pa;
4072	else
4073		va = kva_alloc(size);
4074	if (!va)
4075		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4076
4077	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4078		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4079	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4080	pmap_invalidate_cache_range(va, va + size, FALSE);
4081	return ((void *)(va + offset));
4082}
4083
4084void *
4085pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4086{
4087
4088	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4089}
4090
4091void *
4092pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4093{
4094
4095	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4096}
4097
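/*
 * Undo a mapping created by pmap_mapdev() or pmap_mapdev_attr() by
 * freeing the kernel virtual address range.  Addresses within the
 * statically mapped KERNBASE region were never allocated from the
 * kernel map and are left alone.
 */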
4098void
4099pmap_unmapdev(vm_offset_t va, vm_size_t size)
4100{
4101	vm_offset_t base, offset;
4102
4103	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4104		return;
4105	base = trunc_page(va);
4106	offset = va & PAGE_MASK;
4107	size = round_page(offset + size);
4108	kva_free(base, size);
4109}
4110
4111/*
4112 * Sets the memory attribute for the specified page.
4113 */
4114void
4115pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4116{
4117
4118	m->md.pat_mode = ma;
4119	if ((m->flags & PG_FICTITIOUS) != 0)
4120		return;
4121
4122	/*
4123	 * If "m" is a normal page, flush it from the cache.
4124	 * See pmap_invalidate_cache_range().
4125	 *
4126	 * First, try to find an existing mapping of the page by sf
4127	 * buffer.  sf_buf_invalidate_cache() modifies the mapping and
4128	 * flushes the cache.
4129	 */
4130	if (sf_buf_invalidate_cache(m))
4131		return;
4132
4133	/*
4134	 * If the page is not mapped by an sf buffer but the CPU does not
4135	 * support self-snoop, map the page transiently and perform the
4136	 * invalidation.  In the worst case, the whole cache is flushed by
4137	 * pmap_invalidate_cache_range().
4138	 */
4139	if ((cpu_feature & CPUID_SS) == 0)
4140		pmap_flush_page(m);
4141}
4142
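/*
 * Flush the given page's contents from the CPU caches.  When CLFLUSH is
 * available, the page is mapped through the per-CPU CMAP2 slot and
 * flushed one cache line at a time; otherwise the entire cache is
 * flushed via pmap_invalidate_cache().
 */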
4143static void
4144pmap_flush_page(vm_page_t m)
4145{
4146	struct sysmaps *sysmaps;
4147	vm_offset_t sva, eva;
4148
4149	if ((cpu_feature & CPUID_CLFSH) != 0) {
4150		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4151		mtx_lock(&sysmaps->lock);
4152		if (*sysmaps->CMAP2)
4153			panic("pmap_flush_page: CMAP2 busy");
4154		sched_pin();
4155		PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW |
4156		    VM_PAGE_TO_MACH(m) | PG_A | PG_M |
4157		    pmap_cache_bits(m->md.pat_mode, 0));
4158		invlcaddr(sysmaps->CADDR2);
4159		sva = (vm_offset_t)sysmaps->CADDR2;
4160		eva = sva + PAGE_SIZE;
4161
4162		/*
4163		 * Use mfence despite the ordering implied by
4164		 * mtx_{un,}lock() because clflush is not guaranteed
4165		 * to be ordered by any other instruction.
4166		 */
4167		mfence();
4168		for (; sva < eva; sva += cpu_clflush_line_size)
4169			clflush(sva);
4170		mfence();
4171		PT_SET_MA(sysmaps->CADDR2, 0);
4172		sched_unpin();
4173		mtx_unlock(&sysmaps->lock);
4174	} else
4175		pmap_invalidate_cache();
4176}
4177
4178/*
4179 * Changes the specified virtual address range's memory type to that given by
4180 * the parameter "mode".  The specified virtual address range must be
4181 * completely contained within the kernel map.
4182 *
4183 * Returns zero if the change completed successfully, and either EINVAL or
4184 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4185 * of the virtual address range was not mapped, and ENOMEM is returned if
4186 * there was insufficient memory available to complete the change.
4187 */
4188int
4189pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4190{
4191	vm_offset_t base, offset, tmpva;
4192	pt_entry_t *pte;
4193	u_int opte, npte;
4194	pd_entry_t *pde;
4195	boolean_t changed;
4196
4197	base = trunc_page(va);
4198	offset = va & PAGE_MASK;
4199	size = round_page(offset + size);
4200
4201	/* Only supported on kernel virtual addresses. */
4202	if (base <= VM_MAXUSER_ADDRESS)
4203		return (EINVAL);
4204
4205	/* 4MB pages and pages that aren't mapped aren't supported. */
4206	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
4207		pde = pmap_pde(kernel_pmap, tmpva);
4208		if (*pde & PG_PS)
4209			return (EINVAL);
4210		if ((*pde & PG_V) == 0)
4211			return (EINVAL);
4212		pte = vtopte(tmpva);
4213		if ((*pte & PG_V) == 0)
4214			return (EINVAL);
4215	}
4216
4217	changed = FALSE;
4218
4219	/*
4220	 * Ok, all the pages exist and are 4k, so run through them updating
4221	 * their cache mode.
4222	 */
4223	for (tmpva = base; size > 0; ) {
4224		pte = vtopte(tmpva);
4225
4226		/*
4227		 * The cache mode bits are all in the low 32-bits of the
4228		 * PTE, so we can just spin on updating the low 32-bits.
4229		 */
4230		do {
4231			opte = *(u_int *)pte;
4232			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
4233			npte |= pmap_cache_bits(mode, 0);
4234			PT_SET_VA_MA(pte, npte, TRUE);
4235		} while (npte != opte && (*pte != npte));
4236		if (npte != opte)
4237			changed = TRUE;
4238		tmpva += PAGE_SIZE;
4239		size -= PAGE_SIZE;
4240	}
4241
4242	/*
4243	 * Flush the CPU caches so that no stale data remains cached
4244	 * with the old memory attributes.
4245	 */
4246	if (changed) {
4247		pmap_invalidate_range(kernel_pmap, base, tmpva);
4248		pmap_invalidate_cache_range(base, tmpva, FALSE);
4249	}
4250	return (0);
4251}
4252
4253/*
4254 * perform the pmap work for mincore
4255 */
4256int
4257pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
4258{
4259	pt_entry_t *ptep, pte;
4260	vm_paddr_t pa;
4261	int val;
4262
4263	PMAP_LOCK(pmap);
4264retry:
4265	ptep = pmap_pte(pmap, addr);
4266	pte = (ptep != NULL) ? PT_GET(ptep) : 0;
4267	pmap_pte_release(ptep);
4268	val = 0;
4269	if ((pte & PG_V) != 0) {
4270		val |= MINCORE_INCORE;
4271		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4272			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4273		if ((pte & PG_A) != 0)
4274			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4275	}
4276	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4277	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
4278	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
4279		pa = pte & PG_FRAME;
4280		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
4281		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
4282			goto retry;
4283	} else
4284		PA_UNLOCK_COND(*locked_pa);
4285	PMAP_UNLOCK(pmap);
4286	return (val);
4287}
4288
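/*
 * Activate the given thread's pmap on the current CPU: update the old
 * and new pmaps' active CPU sets, record the new page directory in the
 * PCB's %cr3 value, load it, and update curpmap.
 */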
4289void
4290pmap_activate(struct thread *td)
4291{
4292	pmap_t	pmap, oldpmap;
4293	u_int	cpuid;
4294	u_int32_t  cr3;
4295
4296	critical_enter();
4297	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4298	oldpmap = PCPU_GET(curpmap);
4299	cpuid = PCPU_GET(cpuid);
4300#if defined(SMP)
4301	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
4302	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
4303#else
4304	CPU_CLR(cpuid, &oldpmap->pm_active);
4305	CPU_SET(cpuid, &pmap->pm_active);
4306#endif
4307#ifdef PAE
4308	cr3 = vtophys(pmap->pm_pdpt);
4309#else
4310	cr3 = vtophys(pmap->pm_pdir);
4311#endif
4312	/*
4313	 * pmap_activate is for the current thread on the current cpu
4314	 */
4315	td->td_pcb->pcb_cr3 = cr3;
4316	PT_UPDATES_FLUSH();
4317	load_cr3(cr3);
4318	PCPU_SET(curpmap, pmap);
4319	critical_exit();
4320}
4321
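/*
 * On x86 the instruction cache is kept coherent with memory writes by
 * the hardware, so no explicit synchronization is needed; this routine
 * is intentionally empty.
 */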
4322void
4323pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
4324{
4325}
4326
4327/*
4328 *	Increase the starting virtual address of the given mapping if a
4329 *	different alignment might result in more superpage mappings.
4330 */
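/*
 * For example, if the (color-adjusted) object offset places the start
 * of the mapping N bytes past a superpage boundary, where N = offset &
 * PDRMASK, then "*addr" is advanced to the next address for which
 * (*addr & PDRMASK) == N, provided the mapping is still large enough to
 * contain at least one full superpage.
 */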
4331void
4332pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4333    vm_offset_t *addr, vm_size_t size)
4334{
4335	vm_offset_t superpage_offset;
4336
4337	if (size < NBPDR)
4338		return;
4339	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4340		offset += ptoa(object->pg_color);
4341	superpage_offset = offset & PDRMASK;
4342	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4343	    (*addr & PDRMASK) == superpage_offset)
4344		return;
4345	if ((*addr & PDRMASK) < superpage_offset)
4346		*addr = (*addr & ~PDRMASK) + superpage_offset;
4347	else
4348		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4349}
4350
4351void
4352pmap_suspend(void)
4353{
4354	pmap_t pmap;
4355	int i, pdir, offset;
4356	vm_paddr_t pdirma;
4357	mmu_update_t mu[4];
4358
4359	/*
4360	 * We need to remove the recursive mapping structure from all
4361	 * our pmaps so that Xen doesn't get confused when it restores
4362	 * the page tables. The recursive map lives at page directory
4363	 * index PTDPTDI. We assume that the suspend code has stopped
4364	 * the other vcpus (if any).
4365	 */
4366	LIST_FOREACH(pmap, &allpmaps, pm_list) {
4367		for (i = 0; i < 4; i++) {
4368			/*
4369			 * Figure out which page directory (L2) page
4370			 * contains this bit of the recursive map and
4371			 * the offset within that page of the map
4372			 * entry
4373			 */
4374			pdir = (PTDPTDI + i) / NPDEPG;
4375			offset = (PTDPTDI + i) % NPDEPG;
4376			pdirma = pmap->pm_pdpt[pdir] & PG_FRAME;
4377			mu[i].ptr = pdirma + offset * sizeof(pd_entry_t);
4378			mu[i].val = 0;
4379		}
4380		HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF);
4381	}
4382}
4383
4384void
4385pmap_resume(void)
4386{
4387	pmap_t pmap;
4388	int i, pdir, offset;
4389	vm_paddr_t pdirma;
4390	mmu_update_t mu[4];
4391
4392	/*
4393	 * Restore the recursive map that we removed on suspend.
4394	 */
4395	LIST_FOREACH(pmap, &allpmaps, pm_list) {
4396		for (i = 0; i < 4; i++) {
4397			/*
4398			 * Figure out which page directory (L2) page
4399			 * contains this bit of the recursive map and
4400			 * the offset within that page of the map
4401			 * entry
4402			 */
4403			pdir = (PTDPTDI + i) / NPDEPG;
4404			offset = (PTDPTDI + i) % NPDEPG;
4405			pdirma = pmap->pm_pdpt[pdir] & PG_FRAME;
4406			mu[i].ptr = pdirma + offset * sizeof(pd_entry_t);
4407			mu[i].val = (pmap->pm_pdpt[i] & PG_FRAME) | PG_V;
4408		}
4409		HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF);
4410	}
4411}
4412
4413#if defined(PMAP_DEBUG)
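/*
 * Debugging aid: print every valid user-level mapping of the process
 * with the given pid and return the number of PTEs printed.
 */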
int
4414pmap_pid_dump(int pid)
4415{
4416	pmap_t pmap;
4417	struct proc *p;
4418	int npte = 0;
4419	int index;
4420
4421	sx_slock(&allproc_lock);
4422	FOREACH_PROC_IN_SYSTEM(p) {
4423		if (p->p_pid != pid)
4424			continue;
4425
4426		if (p->p_vmspace) {
4427			int i,j;
4428			index = 0;
4429			pmap = vmspace_pmap(p->p_vmspace);
4430			for (i = 0; i < NPDEPTD; i++) {
4431				pd_entry_t *pde;
4432				pt_entry_t *pte;
4433				vm_offset_t base = i << PDRSHIFT;
4434
4435				pde = &pmap->pm_pdir[i];
4436				if (pde && pmap_pde_v(pde)) {
4437					for (j = 0; j < NPTEPG; j++) {
4438						vm_offset_t va = base + (j << PAGE_SHIFT);
4439						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
4440							if (index) {
4441								index = 0;
4442								printf("\n");
4443							}
4444							sx_sunlock(&allproc_lock);
4445							return (npte);
4446						}
4447						pte = pmap_pte(pmap, va);
4448						if (pte && pmap_pte_v(pte)) {
4449							pt_entry_t pa;
4450							vm_page_t m;
4451							pa = PT_GET(pte);
4452							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
4453							printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
4454								va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
4455							npte++;
4456							index++;
4457							if (index >= 2) {
4458								index = 0;
4459								printf("\n");
4460							} else {
4461								printf(" ");
4462							}
4463						}
4464					}
4465				}
4466			}
4467		}
4468	}
4469	sx_sunlock(&allproc_lock);
4470	return (npte);
4471}
4472#endif
4473
4474#if defined(DEBUG)
4475
4476static void	pads(pmap_t pm);
4477void		pmap_pvdump(vm_paddr_t pa);
4478
4479/* print address space of pmap */
4480static void
4481pads(pmap_t pm)
4482{
4483	int i, j;
4484	vm_paddr_t va;
4485	vm_offset_t va;
4486
4487	if (pm == kernel_pmap)
4488		return;
4489	for (i = 0; i < NPDEPTD; i++)
4490		if (pm->pm_pdir[i])
4491			for (j = 0; j < NPTEPG; j++) {
4492				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
4493				if (pm == kernel_pmap && va < KERNBASE)
4494					continue;
4495				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
4496					continue;
4497				ptep = pmap_pte(pm, va);
4498				if (pmap_pte_v(ptep))
4499				printf("%x:%jx ", va, (uintmax_t)*ptep);
4500			}
4501
4502}
4503
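/*
 * Debugging aid: print each mapping of the given physical page and then
 * dump the address space of every pmap that maps it.
 */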
4504void
4505pmap_pvdump(vm_paddr_t pa)
4506{
4507	pv_entry_t pv;
4508	pmap_t pmap;
4509	vm_page_t m;
4510
4511	printf("pa %jx", (uintmax_t)pa);
4512	m = PHYS_TO_VM_PAGE(pa);
4513	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4514		pmap = PV_PMAP(pv);
4515		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
4516		pads(pmap);
4517	}
4518	printf(" ");
4519}
4520#endif
4521