1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 *    notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 *    notice, this list of conditions and the following disclaimer in the
31 *    documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 *    must display the following acknowledgement:
34 *	This product includes software developed by the University of
35 *	California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 *    may be used to endorse or promote products derived from this software
38 *    without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53 */
54/*-
55 * Copyright (c) 2003 Networks Associates Technology, Inc.
56 * All rights reserved.
57 *
58 * This software was developed for the FreeBSD Project by Jake Burkholder,
59 * Safeport Network Services, and Network Associates Laboratories, the
60 * Security Research Division of Network Associates, Inc. under
61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62 * CHATS research program.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 *    notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 *    notice, this list of conditions and the following disclaimer in the
71 *    documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86#include <sys/cdefs.h>
87__FBSDID("$FreeBSD: stable/11/sys/arm64/arm64/pmap.c 336711 2018-07-25 15:40:27Z markj $");
88
89/*
90 *	Manages physical address maps.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidation or reduced-protection
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include "opt_vm.h"
109
110#include <sys/param.h>
111#include <sys/bitstring.h>
112#include <sys/bus.h>
113#include <sys/systm.h>
114#include <sys/kernel.h>
115#include <sys/ktr.h>
116#include <sys/lock.h>
117#include <sys/malloc.h>
118#include <sys/mman.h>
119#include <sys/msgbuf.h>
120#include <sys/mutex.h>
121#include <sys/proc.h>
122#include <sys/rwlock.h>
123#include <sys/sx.h>
124#include <sys/vmem.h>
125#include <sys/vmmeter.h>
126#include <sys/sched.h>
127#include <sys/sysctl.h>
128#include <sys/_unrhdr.h>
129#include <sys/smp.h>
130
131#include <vm/vm.h>
132#include <vm/vm_param.h>
133#include <vm/vm_kern.h>
134#include <vm/vm_page.h>
135#include <vm/vm_map.h>
136#include <vm/vm_object.h>
137#include <vm/vm_extern.h>
138#include <vm/vm_pageout.h>
139#include <vm/vm_pager.h>
140#include <vm/vm_phys.h>
141#include <vm/vm_radix.h>
142#include <vm/vm_reserv.h>
143#include <vm/uma.h>
144
145#include <machine/machdep.h>
146#include <machine/md_var.h>
147#include <machine/pcb.h>
148
149#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
150#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
151#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
152#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
153
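/*
 * NUL0E, NUL1E and NUL2E are the total numbers of L0, L1 and L2 entries
 * reachable from a single root table.  They partition the pindex space of
 * page table pages: [0, NUL2E) are L3 tables, [NUL2E, NUL2E + NUL1E) are
 * L2 tables and pindexes at or above NUL2E + NUL1E are L1 tables.
 */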
154#define	NUL0E		L0_ENTRIES
155#define	NUL1E		(NUL0E * NL1PG)
156#define	NUL2E		(NUL1E * NL2PG)
157
158#if !defined(DIAGNOSTIC)
159#ifdef __GNUC_GNU_INLINE__
160#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
161#else
162#define PMAP_INLINE	extern inline
163#endif
164#else
165#define PMAP_INLINE
166#endif
167
168/*
169 * These are configured by the mair_el1 register. This is set up in locore.S
170 */
171#define	DEVICE_MEMORY	0
172#define	UNCACHED_MEMORY	1
173#define	CACHED_MEMORY	2
174
175
176#ifdef PV_STATS
177#define PV_STAT(x)	do { x ; } while (0)
178#else
179#define PV_STAT(x)	do { } while (0)
180#endif
181
182#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
183#define	pa_to_pvh(pa)		(&pv_table[pmap_l2_pindex(pa)])
184
185#define	NPV_LIST_LOCKS	MAXCPU
186
187#define	PHYS_TO_PV_LIST_LOCK(pa)	\
188			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
189
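/*
 * Switch the pv list lock held through *lockp to the lock covering the
 * given physical address, dropping any previously held pv list lock first.
 * This is a no-op if the correct lock is already held.
 */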
190#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
191	struct rwlock **_lockp = (lockp);		\
192	struct rwlock *_new_lock;			\
193							\
194	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
195	if (_new_lock != *_lockp) {			\
196		if (*_lockp != NULL)			\
197			rw_wunlock(*_lockp);		\
198		*_lockp = _new_lock;			\
199		rw_wlock(*_lockp);			\
200	}						\
201} while (0)
202
203#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
204			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
205
206#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
207	struct rwlock **_lockp = (lockp);		\
208							\
209	if (*_lockp != NULL) {				\
210		rw_wunlock(*_lockp);			\
211		*_lockp = NULL;				\
212	}						\
213} while (0)
214
215#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
216			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
217
218struct pmap kernel_pmap_store;
219
220vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
221vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
222vm_offset_t kernel_vm_end = 0;
223
224struct msgbuf *msgbufp = NULL;
225
226/*
227 * Data for the pv entry allocation mechanism.
228 * Updates to pv_invl_gen are protected by the pv_list_locks[]
229 * elements, but reads are not.
230 */
231static struct md_page *pv_table;
232static struct md_page pv_dummy;
233
234vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
235vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
236vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
237
238/* This code assumes all L1 DMAP entries will be used */
239CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
240CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
241
242#define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
243extern pt_entry_t pagetable_dmap[];
244
245static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
246
247static int superpages_enabled = 0;
248SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
249    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
250    "Are large page mappings enabled?");
251
252/*
253 * Data for the pv entry allocation mechanism
254 */
255static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
256static struct mtx pv_chunks_mutex;
257static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
258
259static void	free_pv_chunk(struct pv_chunk *pc);
260static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
261static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
262static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
263static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
264static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
265		    vm_offset_t va);
266
267static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
268static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
269static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
270static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
271    vm_offset_t va, struct rwlock **lockp);
272static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
273static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
274    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
275static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
276    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
277static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
278    vm_page_t m, struct rwlock **lockp);
279
280static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
281		struct rwlock **lockp);
282
283static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
284    struct spglist *free);
285static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
286
287/*
288 * These load the old table data and store the new value.
289 * They need to be atomic as the System MMU may write to the table at
290 * the same time as the CPU.
291 */
292#define	pmap_load_store(table, entry) atomic_swap_64(table, entry)
293#define	pmap_set(table, mask) atomic_set_64(table, mask)
294#define	pmap_load_clear(table) atomic_swap_64(table, 0)
295#define	pmap_load(table) (*table)
296
297/********************/
298/* Inline functions */
299/********************/
300
301static __inline void
302pagecopy(void *s, void *d)
303{
304
305	memcpy(d, s, PAGE_SIZE);
306}
307
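/*
 * With 4KB pages each table level holds 512 entries (see NL0PG above), so
 * each of the following macros extracts a 9-bit index field from the
 * virtual address.
 */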
308#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & L0_ADDR_MASK)
309#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
310#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
311#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
312
313static __inline pd_entry_t *
314pmap_l0(pmap_t pmap, vm_offset_t va)
315{
316
317	return (&pmap->pm_l0[pmap_l0_index(va)]);
318}
319
320static __inline pd_entry_t *
321pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
322{
323	pd_entry_t *l1;
324
325	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
326	return (&l1[pmap_l1_index(va)]);
327}
328
329static __inline pd_entry_t *
330pmap_l1(pmap_t pmap, vm_offset_t va)
331{
332	pd_entry_t *l0;
333
334	l0 = pmap_l0(pmap, va);
335	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
336		return (NULL);
337
338	return (pmap_l0_to_l1(l0, va));
339}
340
341static __inline pd_entry_t *
342pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
343{
344	pd_entry_t *l2;
345
346	l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
347	return (&l2[pmap_l2_index(va)]);
348}
349
350static __inline pd_entry_t *
351pmap_l2(pmap_t pmap, vm_offset_t va)
352{
353	pd_entry_t *l1;
354
355	l1 = pmap_l1(pmap, va);
356	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
357		return (NULL);
358
359	return (pmap_l1_to_l2(l1, va));
360}
361
362static __inline pt_entry_t *
363pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
364{
365	pt_entry_t *l3;
366
367	l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
368	return (&l3[pmap_l3_index(va)]);
369}
370
371/*
372 * Returns the lowest valid pde for a given virtual address.
373 * The next level may or may not point to a valid page or block.
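 * On success *level is the level of the returned entry (0, 1 or 2); if the
 * L0 entry is not a table, NULL is returned and *level is set to -1.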
374 */
375static __inline pd_entry_t *
376pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
377{
378	pd_entry_t *l0, *l1, *l2, desc;
379
380	l0 = pmap_l0(pmap, va);
381	desc = pmap_load(l0) & ATTR_DESCR_MASK;
382	if (desc != L0_TABLE) {
383		*level = -1;
384		return (NULL);
385	}
386
387	l1 = pmap_l0_to_l1(l0, va);
388	desc = pmap_load(l1) & ATTR_DESCR_MASK;
389	if (desc != L1_TABLE) {
390		*level = 0;
391		return (l0);
392	}
393
394	l2 = pmap_l1_to_l2(l1, va);
395	desc = pmap_load(l2) & ATTR_DESCR_MASK;
396	if (desc != L2_TABLE) {
397		*level = 1;
398		return (l1);
399	}
400
401	*level = 2;
402	return (l2);
403}
404
405/*
406 * Returns the lowest valid pte block or table entry for a given virtual
407 * address. If there are no valid entries return NULL and set the level to
408 * the first invalid level.
409 */
410static __inline pt_entry_t *
411pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
412{
413	pd_entry_t *l1, *l2, desc;
414	pt_entry_t *l3;
415
416	l1 = pmap_l1(pmap, va);
417	if (l1 == NULL) {
418		*level = 0;
419		return (NULL);
420	}
421	desc = pmap_load(l1) & ATTR_DESCR_MASK;
422	if (desc == L1_BLOCK) {
423		*level = 1;
424		return (l1);
425	}
426
427	if (desc != L1_TABLE) {
428		*level = 1;
429		return (NULL);
430	}
431
432	l2 = pmap_l1_to_l2(l1, va);
433	desc = pmap_load(l2) & ATTR_DESCR_MASK;
434	if (desc == L2_BLOCK) {
435		*level = 2;
436		return (l2);
437	}
438
439	if (desc != L2_TABLE) {
440		*level = 2;
441		return (NULL);
442	}
443
444	*level = 3;
445	l3 = pmap_l2_to_l3(l2, va);
446	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
447		return (NULL);
448
449	return (l3);
450}
451
452static inline bool
453pmap_superpages_enabled(void)
454{
455
456	return (superpages_enabled != 0);
457}
458
459bool
460pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
461    pd_entry_t **l2, pt_entry_t **l3)
462{
463	pd_entry_t *l0p, *l1p, *l2p;
464
465	if (pmap->pm_l0 == NULL)
466		return (false);
467
468	l0p = pmap_l0(pmap, va);
469	*l0 = l0p;
470
471	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
472		return (false);
473
474	l1p = pmap_l0_to_l1(l0p, va);
475	*l1 = l1p;
476
477	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
478		*l2 = NULL;
479		*l3 = NULL;
480		return (true);
481	}
482
483	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
484		return (false);
485
486	l2p = pmap_l1_to_l2(l1p, va);
487	*l2 = l2p;
488
489	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
490		*l3 = NULL;
491		return (true);
492	}
493
494	*l3 = pmap_l2_to_l3(l2p, va);
495
496	return (true);
497}
498
499static __inline int
500pmap_is_current(pmap_t pmap)
501{
502
503	return ((pmap == pmap_kernel()) ||
504	    (pmap == curthread->td_proc->p_vmspace->vm_map.pmap));
505}
506
507static __inline int
508pmap_l3_valid(pt_entry_t l3)
509{
510
511	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
512}
513
514
515/* Is a level 1 or 2 entry a valid block and cacheable */
516CTASSERT(L1_BLOCK == L2_BLOCK);
517static __inline int
518pmap_pte_valid_cacheable(pt_entry_t pte)
519{
520
521	return (((pte & ATTR_DESCR_MASK) == L1_BLOCK) &&
522	    ((pte & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY)));
523}
524
525static __inline int
526pmap_l3_valid_cacheable(pt_entry_t l3)
527{
528
529	return (((l3 & ATTR_DESCR_MASK) == L3_PAGE) &&
530	    ((l3 & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY)));
531}
532
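/*
 * Write a just-modified PTE back to memory so that observers that may not
 * snoop the data cache (e.g. the System MMU) see the update.
 */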
533#define	PTE_SYNC(pte)	cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte))
534
535/*
536 * Checks if the page is dirty. We currently lack proper tracking of this on
537 * arm64, so for now assume that a page mapped as rw and accessed is dirty.
538 */
539static inline int
540pmap_page_dirty(pt_entry_t pte)
541{
542
543	return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
544	    (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
545}
546
547static __inline void
548pmap_resident_count_inc(pmap_t pmap, int count)
549{
550
551	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
552	pmap->pm_stats.resident_count += count;
553}
554
555static __inline void
556pmap_resident_count_dec(pmap_t pmap, int count)
557{
558
559	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
560	KASSERT(pmap->pm_stats.resident_count >= count,
561	    ("pmap %p resident count underflow %ld %d", pmap,
562	    pmap->pm_stats.resident_count, count));
563	pmap->pm_stats.resident_count -= count;
564}
565
566static pt_entry_t *
567pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
568    u_int *l2_slot)
569{
570	pt_entry_t *l2;
571	pd_entry_t *l1;
572
573	l1 = (pd_entry_t *)l1pt;
574	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
575
576	/* Check that locore used an L1 table mapping */
577	KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
578	   ("Invalid bootstrap L1 table"));
579	/* Find the address of the L2 table */
580	l2 = (pt_entry_t *)init_pt_va;
581	*l2_slot = pmap_l2_index(va);
582
583	return (l2);
584}
585
586static vm_paddr_t
587pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
588{
589	u_int l1_slot, l2_slot;
590	pt_entry_t *l2;
591
592	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
593
594	return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
595}
596
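/*
 * Create the direct map: install L1 block entries covering [min_pa, max_pa)
 * starting at DMAP_MIN_ADDRESS and record the resulting dmap_phys_base,
 * dmap_phys_max and dmap_max_addr limits.
 */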
597static void
598pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
599{
600	vm_offset_t va;
601	vm_paddr_t pa;
602	u_int l1_slot;
603
604	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
605	va = DMAP_MIN_ADDRESS;
606	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
607	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
608		l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
609
610		pmap_load_store(&pagetable_dmap[l1_slot],
611		    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
612		    ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
613	}
614
615	/* Set the upper limit of the DMAP region */
616	dmap_phys_max = pa;
617	dmap_max_addr = va;
618
619	cpu_dcache_wb_range((vm_offset_t)pagetable_dmap,
620	    PAGE_SIZE * DMAP_TABLES);
621	cpu_tlb_flushID();
622}
623
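/*
 * Carve L2 page tables out of the memory starting at l2_start and link them
 * into the bootstrap L1 table for every L1 slot from va up to
 * VM_MAX_KERNEL_ADDRESS.  Returns the first free address after the new
 * tables.
 */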
624static vm_offset_t
625pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
626{
627	vm_offset_t l2pt;
628	vm_paddr_t pa;
629	pd_entry_t *l1;
630	u_int l1_slot;
631
632	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
633
634	l1 = (pd_entry_t *)l1pt;
635	l1_slot = pmap_l1_index(va);
636	l2pt = l2_start;
637
638	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
639		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
640
641		pa = pmap_early_vtophys(l1pt, l2pt);
642		pmap_load_store(&l1[l1_slot],
643		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
644		l2pt += PAGE_SIZE;
645	}
646
647	/* Clean the L2 page table */
648	memset((void *)l2_start, 0, l2pt - l2_start);
649	cpu_dcache_wb_range(l2_start, l2pt - l2_start);
650
651	/* Flush the l1 table to ram */
652	cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE);
653
654	return l2pt;
655}
656
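/*
 * As above, but carve L3 page tables out of l3_start and link them into the
 * kernel L2 table for every L2 slot from va up to VM_MAX_KERNEL_ADDRESS.
 * This is used for the early devmap region.
 */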
657static vm_offset_t
658pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
659{
660	vm_offset_t l2pt, l3pt;
661	vm_paddr_t pa;
662	pd_entry_t *l2;
663	u_int l2_slot;
664
665	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
666
667	l2 = pmap_l2(kernel_pmap, va);
668	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
669	l2pt = (vm_offset_t)l2;
670	l2_slot = pmap_l2_index(va);
671	l3pt = l3_start;
672
673	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
674		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
675
676		pa = pmap_early_vtophys(l1pt, l3pt);
677		pmap_load_store(&l2[l2_slot],
678		    (pa & ~Ln_TABLE_MASK) | L2_TABLE);
679		l3pt += PAGE_SIZE;
680	}
681
682	/* Clean the L3 page table */
683	memset((void *)l3_start, 0, l3pt - l3_start);
684	cpu_dcache_wb_range(l3_start, l3pt - l3_start);
685
686	cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE);
687
688	return l3pt;
689}
690
691/*
692 *	Bootstrap the system enough to run with virtual memory.
693 */
694void
695pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
696    vm_size_t kernlen)
697{
698	u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot;
699	uint64_t kern_delta;
700	pt_entry_t *l2;
701	vm_offset_t va, freemempos;
702	vm_offset_t dpcpu, msgbufpv;
703	vm_paddr_t pa, max_pa, min_pa;
704	int i;
705
706	kern_delta = KERNBASE - kernstart;
707	physmem = 0;
708
709	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
710	printf("%lx\n", l1pt);
711	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
712
713	/* Set this early so we can use the pagetable walking functions */
714	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
715	PMAP_LOCK_INIT(kernel_pmap);
716
717	/* Assume the address we were loaded to is a valid physical address */
718	min_pa = max_pa = KERNBASE - kern_delta;
719
720	/*
721	 * Find the minimum physical address. physmap is sorted,
722	 * but may contain empty ranges.
723	 */
724	for (i = 0; i < (physmap_idx * 2); i += 2) {
725		if (physmap[i] == physmap[i + 1])
726			continue;
727		if (physmap[i] <= min_pa)
728			min_pa = physmap[i];
729		if (physmap[i + 1] > max_pa)
730			max_pa = physmap[i + 1];
731	}
732
733	/* Create a direct map region early so we can use it for pa -> va */
734	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
735
736	va = KERNBASE;
737	pa = KERNBASE - kern_delta;
738
739	/*
740	 * Start to initialise phys_avail by copying from physmap
741	 * up to the physical address KERNBASE points at.
742	 */
743	map_slot = avail_slot = 0;
744	for (; map_slot < (physmap_idx * 2) &&
745	    avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) {
746		if (physmap[map_slot] == physmap[map_slot + 1])
747			continue;
748
749		if (physmap[map_slot] <= pa &&
750		    physmap[map_slot + 1] > pa)
751			break;
752
753		phys_avail[avail_slot] = physmap[map_slot];
754		phys_avail[avail_slot + 1] = physmap[map_slot + 1];
755		physmem += (phys_avail[avail_slot + 1] -
756		    phys_avail[avail_slot]) >> PAGE_SHIFT;
757		avail_slot += 2;
758	}
759
760	/* Add the memory before the kernel */
761	if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) {
762		phys_avail[avail_slot] = physmap[map_slot];
763		phys_avail[avail_slot + 1] = pa;
764		physmem += (phys_avail[avail_slot + 1] -
765		    phys_avail[avail_slot]) >> PAGE_SHIFT;
766		avail_slot += 2;
767	}
768	used_map_slot = map_slot;
769
770	/*
771	 * Read the page table to find out what is already mapped.
772	 * This assumes we have mapped a block of memory from KERNBASE
773	 * using a single L1 entry.
774	 */
775	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
776
777	/* Sanity check the index, KERNBASE should be the first VA */
778	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
779
780	/* Find how many pages we have mapped */
781	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
782		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
783			break;
784
785		/* Check locore used L2 blocks */
786		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
787		    ("Invalid bootstrap L2 table"));
788		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
789		    ("Incorrect PA in L2 table"));
790
791		va += L2_SIZE;
792		pa += L2_SIZE;
793	}
794
795	va = roundup2(va, L1_SIZE);
796
797	freemempos = KERNBASE + kernlen;
798	freemempos = roundup2(freemempos, PAGE_SIZE);
799	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
800	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
801	/* And the l3 tables for the early devmap */
802	freemempos = pmap_bootstrap_l3(l1pt,
803	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
804
805	cpu_tlb_flushID();
806
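/*
 * Bump-allocate 'np' zeroed, page-sized chunks for 'var' from the bootstrap
 * free memory pointer.
 */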
807#define alloc_pages(var, np)						\
808	(var) = freemempos;						\
809	freemempos += (np * PAGE_SIZE);					\
810	memset((char *)(var), 0, ((np) * PAGE_SIZE));
811
812	/* Allocate dynamic per-cpu area. */
813	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
814	dpcpu_init((void *)dpcpu, 0);
815
816	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
817	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
818	msgbufp = (void *)msgbufpv;
819
820	virtual_avail = roundup2(freemempos, L1_SIZE);
821	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
822	kernel_vm_end = virtual_avail;
823
824	pa = pmap_early_vtophys(l1pt, freemempos);
825
826	/* Finish initialising physmap */
827	map_slot = used_map_slot;
828	for (; avail_slot < (PHYS_AVAIL_SIZE - 2) &&
829	    map_slot < (physmap_idx * 2); map_slot += 2) {
830		if (physmap[map_slot] == physmap[map_slot + 1])
831			continue;
832
833		/* Have we used the current range? */
834		if (physmap[map_slot + 1] <= pa)
835			continue;
836
837		/* Do we need to split the entry? */
838		if (physmap[map_slot] < pa) {
839			phys_avail[avail_slot] = pa;
840			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
841		} else {
842			phys_avail[avail_slot] = physmap[map_slot];
843			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
844		}
845		physmem += (phys_avail[avail_slot + 1] -
846		    phys_avail[avail_slot]) >> PAGE_SHIFT;
847
848		avail_slot += 2;
849	}
850	phys_avail[avail_slot] = 0;
851	phys_avail[avail_slot + 1] = 0;
852
853	/*
854	 * Maxmem isn't the "maximum memory", it's one larger than the
855	 * highest page of the physical address space.  It should be
856	 * called something like "Maxphyspage".
857	 */
858	Maxmem = atop(phys_avail[avail_slot - 1]);
859
860	cpu_tlb_flushID();
861}
862
863/*
864 *	Initialize a vm_page's machine-dependent fields.
865 */
866void
867pmap_page_init(vm_page_t m)
868{
869
870	TAILQ_INIT(&m->md.pv_list);
871	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
872}
873
874/*
875 *	Initialize the pmap module.
876 *	Called by vm_init, to initialize any structures that the pmap
877 *	system needs to map virtual memory.
878 */
879void
880pmap_init(void)
881{
882	vm_size_t s;
883	int i, pv_npg;
884
885	/*
886	 * Are large page mappings enabled?
887	 */
888	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
889
890	/*
891	 * Initialize the pv chunk list mutex.
892	 */
893	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
894
895	/*
896	 * Initialize the pool of pv list locks.
897	 */
898	for (i = 0; i < NPV_LIST_LOCKS; i++)
899		rw_init(&pv_list_locks[i], "pmap pv list");
900
901	/*
902	 * Calculate the size of the pv head table for superpages.
903	 */
904	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
905
906	/*
907	 * Allocate memory for the pv head table for superpages.
908	 */
909	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
910	s = round_page(s);
911	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
912	    M_WAITOK | M_ZERO);
913	for (i = 0; i < pv_npg; i++)
914		TAILQ_INIT(&pv_table[i].pv_list);
915	TAILQ_INIT(&pv_dummy.pv_list);
916}
917
918static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
919    "2MB page mapping counters");
920
921static u_long pmap_l2_demotions;
922SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
923    &pmap_l2_demotions, 0, "2MB page demotions");
924
925static u_long pmap_l2_p_failures;
926SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
927    &pmap_l2_p_failures, 0, "2MB page promotion failures");
928
929static u_long pmap_l2_promotions;
930SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
931    &pmap_l2_promotions, 0, "2MB page promotions");
932
933/*
934 * Invalidate a single TLB entry.
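 *
 * The "vaae1is" form invalidates the VA for all ASIDs and is broadcast to
 * every CPU in the Inner Shareable domain, so no IPIs are needed and the
 * pmap argument is unused.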
935 */
936PMAP_INLINE void
937pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
938{
939
940	sched_pin();
941	__asm __volatile(
942	    "dsb  ishst		\n"
943	    "tlbi vaae1is, %0	\n"
944	    "dsb  ish		\n"
945	    "isb		\n"
946	    : : "r"(va >> PAGE_SHIFT));
947	sched_unpin();
948}
949
950PMAP_INLINE void
951pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
952{
953	vm_offset_t addr;
954
955	sched_pin();
956	dsb(ishst);
957	for (addr = sva; addr < eva; addr += PAGE_SIZE) {
958		__asm __volatile(
959		    "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
960	}
961	__asm __volatile(
962	    "dsb  ish	\n"
963	    "isb	\n");
964	sched_unpin();
965}
966
967PMAP_INLINE void
968pmap_invalidate_all(pmap_t pmap)
969{
970
971	sched_pin();
972	__asm __volatile(
973	    "dsb  ishst		\n"
974	    "tlbi vmalle1is	\n"
975	    "dsb  ish		\n"
976	    "isb		\n");
977	sched_unpin();
978}
979
980/*
981 *	Routine:	pmap_extract
982 *	Function:
983 *		Extract the physical page address associated
984 *		with the given map/virtual_address pair.
985 */
986vm_paddr_t
987pmap_extract(pmap_t pmap, vm_offset_t va)
988{
989	pt_entry_t *pte, tpte;
990	vm_paddr_t pa;
991	int lvl;
992
993	pa = 0;
994	PMAP_LOCK(pmap);
995	/*
996	 * Find the block or page map for this virtual address. pmap_pte
997	 * will return either a valid block/page entry, or NULL.
998	 */
999	pte = pmap_pte(pmap, va, &lvl);
1000	if (pte != NULL) {
1001		tpte = pmap_load(pte);
1002		pa = tpte & ~ATTR_MASK;
1003		switch(lvl) {
1004		case 1:
1005			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1006			    ("pmap_extract: Invalid L1 pte found: %lx",
1007			    tpte & ATTR_DESCR_MASK));
1008			pa |= (va & L1_OFFSET);
1009			break;
1010		case 2:
1011			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1012			    ("pmap_extract: Invalid L2 pte found: %lx",
1013			    tpte & ATTR_DESCR_MASK));
1014			pa |= (va & L2_OFFSET);
1015			break;
1016		case 3:
1017			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1018			    ("pmap_extract: Invalid L3 pte found: %lx",
1019			    tpte & ATTR_DESCR_MASK));
1020			pa |= (va & L3_OFFSET);
1021			break;
1022		}
1023	}
1024	PMAP_UNLOCK(pmap);
1025	return (pa);
1026}
1027
1028/*
1029 *	Routine:	pmap_extract_and_hold
1030 *	Function:
1031 *		Atomically extract and hold the physical page
1032 *		with the given pmap and virtual address pair
1033 *		if that mapping permits the given protection.
1034 */
1035vm_page_t
1036pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1037{
1038	pt_entry_t *pte, tpte;
1039	vm_offset_t off;
1040	vm_paddr_t pa;
1041	vm_page_t m;
1042	int lvl;
1043
1044	pa = 0;
1045	m = NULL;
1046	PMAP_LOCK(pmap);
1047retry:
1048	pte = pmap_pte(pmap, va, &lvl);
1049	if (pte != NULL) {
1050		tpte = pmap_load(pte);
1051
1052		KASSERT(lvl > 0 && lvl <= 3,
1053		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1054		CTASSERT(L1_BLOCK == L2_BLOCK);
1055		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1056		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1057		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1058		     tpte & ATTR_DESCR_MASK));
1059		if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
1060		    ((prot & VM_PROT_WRITE) == 0)) {
1061			switch(lvl) {
1062			case 1:
1063				off = va & L1_OFFSET;
1064				break;
1065			case 2:
1066				off = va & L2_OFFSET;
1067				break;
1068			case 3:
1069			default:
1070				off = 0;
1071			}
1072			if (vm_page_pa_tryrelock(pmap,
1073			    (tpte & ~ATTR_MASK) | off, &pa))
1074				goto retry;
1075			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1076			vm_page_hold(m);
1077		}
1078	}
1079	PA_UNLOCK_COND(pa);
1080	PMAP_UNLOCK(pmap);
1081	return (m);
1082}
1083
1084vm_paddr_t
1085pmap_kextract(vm_offset_t va)
1086{
1087	pt_entry_t *pte, tpte;
1088	vm_paddr_t pa;
1089	int lvl;
1090
1091	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1092		pa = DMAP_TO_PHYS(va);
1093	} else {
1094		pa = 0;
1095		pte = pmap_pte(kernel_pmap, va, &lvl);
1096		if (pte != NULL) {
1097			tpte = pmap_load(pte);
1098			pa = tpte & ~ATTR_MASK;
1099			switch(lvl) {
1100			case 1:
1101				KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1102				    ("pmap_kextract: Invalid L1 pte found: %lx",
1103				    tpte & ATTR_DESCR_MASK));
1104				pa |= (va & L1_OFFSET);
1105				break;
1106			case 2:
1107				KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1108				    ("pmap_kextract: Invalid L2 pte found: %lx",
1109				    tpte & ATTR_DESCR_MASK));
1110				pa |= (va & L2_OFFSET);
1111				break;
1112			case 3:
1113				KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1114				    ("pmap_kextract: Invalid L3 pte found: %lx",
1115				    tpte & ATTR_DESCR_MASK));
1116				pa |= (va & L3_OFFSET);
1117				break;
1118			}
1119		}
1120	}
1121	return (pa);
1122}
1123
1124/***************************************************
1125 * Low level mapping routines.....
1126 ***************************************************/
1127
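/*
 * Map a page-aligned range of physical memory into the kernel address space
 * at sva using 4KB (L3) pages and the given memory attribute index.
 */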
1128static void
1129pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1130{
1131	pd_entry_t *pde;
1132	pt_entry_t *pte, attr;
1133	vm_offset_t va;
1134	int lvl;
1135
1136	KASSERT((pa & L3_OFFSET) == 0,
1137	   ("pmap_kenter: Invalid physical address"));
1138	KASSERT((sva & L3_OFFSET) == 0,
1139	   ("pmap_kenter: Invalid virtual address"));
1140	KASSERT((size & PAGE_MASK) == 0,
1141	    ("pmap_kenter: Mapping is not page-sized"));
1142
1143	attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
1144	if (mode == DEVICE_MEMORY)
1145		attr |= ATTR_XN;
1146
1147	va = sva;
1148	while (size != 0) {
1149		pde = pmap_pde(kernel_pmap, va, &lvl);
1150		KASSERT(pde != NULL,
1151		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1152		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1153
1154		pte = pmap_l2_to_l3(pde, va);
1155		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1156		PTE_SYNC(pte);
1157
1158		va += PAGE_SIZE;
1159		pa += PAGE_SIZE;
1160		size -= PAGE_SIZE;
1161	}
1162	pmap_invalidate_range(kernel_pmap, sva, va);
1163}
1164
1165void
1166pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1167{
1168
1169	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
1170}
1171
1172/*
1173 * Remove a page from the kernel pagetables.
1174 */
1175PMAP_INLINE void
1176pmap_kremove(vm_offset_t va)
1177{
1178	pt_entry_t *pte;
1179	int lvl;
1180
1181	pte = pmap_pte(kernel_pmap, va, &lvl);
1182	KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1183	KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1184
1185	if (pmap_l3_valid_cacheable(pmap_load(pte)))
1186		cpu_dcache_wb_range(va, L3_SIZE);
1187	pmap_load_clear(pte);
1188	PTE_SYNC(pte);
1189	pmap_invalidate_page(kernel_pmap, va);
1190}
1191
1192void
1193pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1194{
1195	pt_entry_t *pte;
1196	vm_offset_t va;
1197	int lvl;
1198
1199	KASSERT((sva & L3_OFFSET) == 0,
1200	   ("pmap_kremove_device: Invalid virtual address"));
1201	KASSERT((size & PAGE_MASK) == 0,
1202	    ("pmap_kremove_device: Mapping is not page-sized"));
1203
1204	va = sva;
1205	while (size != 0) {
1206		pte = pmap_pte(kernel_pmap, va, &lvl);
1207		KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1208		KASSERT(lvl == 3,
1209		    ("Invalid device pagetable level: %d != 3", lvl));
1210		pmap_load_clear(pte);
1211		PTE_SYNC(pte);
1212
1213		va += PAGE_SIZE;
1214		size -= PAGE_SIZE;
1215	}
1216	pmap_invalidate_range(kernel_pmap, sva, va);
1217}
1218
1219/*
1220 *	Used to map a range of physical addresses into kernel
1221 *	virtual address space.
1222 *
1223 *	The value passed in '*virt' is a suggested virtual address for
1224 *	the mapping. Architectures which can support a direct-mapped
1225 *	physical to virtual region can return the appropriate address
1226 *	within that region, leaving '*virt' unchanged. Other
1227 *	architectures should map the pages starting at '*virt' and
1228 *	update '*virt' with the first usable address after the mapped
1229 *	region.
1230 */
1231vm_offset_t
1232pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1233{
1234	return PHYS_TO_DMAP(start);
1235}
1236
1237
1238/*
1239 * Add a list of wired pages to the kva.
1240 * This routine is only used for temporary
1241 * kernel mappings that do not need to have
1242 * page modification or references recorded.
1243 * Note that old mappings are simply written
1244 * over.  The page *must* be wired.
1245 * Note: SMP coherent.  Uses a ranged, broadcast TLB invalidation.
1246 */
1247void
1248pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1249{
1250	pd_entry_t *pde;
1251	pt_entry_t *pte, pa;
1252	vm_offset_t va;
1253	vm_page_t m;
1254	int i, lvl;
1255
1256	va = sva;
1257	for (i = 0; i < count; i++) {
1258		pde = pmap_pde(kernel_pmap, va, &lvl);
1259		KASSERT(pde != NULL,
1260		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1261		KASSERT(lvl == 2,
1262		    ("pmap_qenter: Invalid level %d", lvl));
1263
1264		m = ma[i];
1265		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
1266		    ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
1267		if (m->md.pv_memattr == DEVICE_MEMORY)
1268			pa |= ATTR_XN;
1269		pte = pmap_l2_to_l3(pde, va);
1270		pmap_load_store(pte, pa);
1271		PTE_SYNC(pte);
1272
1273		va += L3_SIZE;
1274	}
1275	pmap_invalidate_range(kernel_pmap, sva, va);
1276}
1277
1278/*
1279 * This routine tears out page mappings from the
1280 * kernel -- it is meant only for temporary mappings.
1281 */
1282void
1283pmap_qremove(vm_offset_t sva, int count)
1284{
1285	pt_entry_t *pte;
1286	vm_offset_t va;
1287	int lvl;
1288
1289	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1290
1291	va = sva;
1292	while (count-- > 0) {
1293		pte = pmap_pte(kernel_pmap, va, &lvl);
1294		KASSERT(lvl == 3,
1295		    ("Invalid pagetable level: %d != 3", lvl));
1296		if (pte != NULL) {
1297			if (pmap_l3_valid_cacheable(pmap_load(pte)))
1298				cpu_dcache_wb_range(va, L3_SIZE);
1299			pmap_load_clear(pte);
1300			PTE_SYNC(pte);
1301		}
1302
1303		va += PAGE_SIZE;
1304	}
1305	pmap_invalidate_range(kernel_pmap, sva, va);
1306}
1307
1308/***************************************************
1309 * Page table page management routines.....
1310 ***************************************************/
1311static __inline void
1312pmap_free_zero_pages(struct spglist *free)
1313{
1314	vm_page_t m;
1315
1316	while ((m = SLIST_FIRST(free)) != NULL) {
1317		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1318		/* Preserve the page's PG_ZERO setting. */
1319		vm_page_free_toq(m);
1320	}
1321}
1322
1323/*
1324 * Schedule the specified unused page table page to be freed.  Specifically,
1325 * add the page to the specified list of pages that will be released to the
1326 * physical memory manager after the TLB has been updated.
1327 */
1328static __inline void
1329pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1330    boolean_t set_PG_ZERO)
1331{
1332
1333	if (set_PG_ZERO)
1334		m->flags |= PG_ZERO;
1335	else
1336		m->flags &= ~PG_ZERO;
1337	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1338}
1339
1340/*
1341 * Decrements a page table page's wire count, which is used to record the
1342 * number of valid page table entries within the page.  If the wire count
1343 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1344 * page table page was unmapped and FALSE otherwise.
1345 */
1346static inline boolean_t
1347pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1348{
1349
1350	--m->wire_count;
1351	if (m->wire_count == 0) {
1352		_pmap_unwire_l3(pmap, va, m, free);
1353		return (TRUE);
1354	} else
1355		return (FALSE);
1356}
1357
1358static void
1359_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1360{
1361
1362	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1363	/*
1364	 * unmap the page table page
1365	 */
1366	if (m->pindex >= (NUL2E + NUL1E)) {
1367		/* l1 page */
1368		pd_entry_t *l0;
1369
1370		l0 = pmap_l0(pmap, va);
1371		pmap_load_clear(l0);
1372		PTE_SYNC(l0);
1373	} else if (m->pindex >= NUL2E) {
1374		/* l2 page */
1375		pd_entry_t *l1;
1376
1377		l1 = pmap_l1(pmap, va);
1378		pmap_load_clear(l1);
1379		PTE_SYNC(l1);
1380	} else {
1381		/* l3 page */
1382		pd_entry_t *l2;
1383
1384		l2 = pmap_l2(pmap, va);
1385		pmap_load_clear(l2);
1386		PTE_SYNC(l2);
1387	}
1388	pmap_resident_count_dec(pmap, 1);
1389	if (m->pindex < NUL2E) {
1390		/* We just released an l3, unhold the matching l2 */
1391		pd_entry_t *l1, tl1;
1392		vm_page_t l2pg;
1393
1394		l1 = pmap_l1(pmap, va);
1395		tl1 = pmap_load(l1);
1396		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1397		pmap_unwire_l3(pmap, va, l2pg, free);
1398	} else if (m->pindex < (NUL2E + NUL1E)) {
1399		/* We just released an l2, unhold the matching l1 */
1400		pd_entry_t *l0, tl0;
1401		vm_page_t l1pg;
1402
1403		l0 = pmap_l0(pmap, va);
1404		tl0 = pmap_load(l0);
1405		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1406		pmap_unwire_l3(pmap, va, l1pg, free);
1407	}
1408	pmap_invalidate_page(pmap, va);
1409
1410	/*
1411	 * This is a release store so that the ordinary store unmapping
1412	 * the page table page is globally performed before TLB shoot-
1413	 * down is begun.
1414	 */
1415	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1416
1417	/*
1418	 * Put page on a list so that it is released after
1419	 * *ALL* TLB shootdown is done
1420	 */
1421	pmap_add_delayed_free_list(m, free, TRUE);
1422}
1423
1424/*
1425 * After removing an l3 entry, this routine is used to
1426 * conditionally free the page, and manage the hold/wire counts.
1427 */
1428static int
1429pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1430    struct spglist *free)
1431{
1432	vm_page_t mpte;
1433
1434	if (va >= VM_MAXUSER_ADDRESS)
1435		return (0);
1436	KASSERT(ptepde != 0, ("pmap_unuse_l3: ptepde != 0"));
1437	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1438	return (pmap_unwire_l3(pmap, va, mpte, free));
1439}
1440
1441void
1442pmap_pinit0(pmap_t pmap)
1443{
1444
1445	PMAP_LOCK_INIT(pmap);
1446	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1447	pmap->pm_l0 = kernel_pmap->pm_l0;
1448	pmap->pm_root.rt_root = 0;
1449}
1450
1451int
1452pmap_pinit(pmap_t pmap)
1453{
1454	vm_paddr_t l0phys;
1455	vm_page_t l0pt;
1456
1457	/*
1458	 * allocate the l0 page
1459	 */
1460	while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1461	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1462		VM_WAIT;
1463
1464	l0phys = VM_PAGE_TO_PHYS(l0pt);
1465	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
1466
1467	if ((l0pt->flags & PG_ZERO) == 0)
1468		pagezero(pmap->pm_l0);
1469
1470	pmap->pm_root.rt_root = 0;
1471	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1472
1473	return (1);
1474}
1475
1476/*
1477 * This routine is called if the desired page table page does not exist.
1478 *
1479 * If page table page allocation fails, this routine may sleep before
1480 * returning NULL.  It sleeps only if a lock pointer was given.
1481 *
1482 * Note: If a page allocation fails at page table level two or three,
1483 * one or two pages may be held during the wait, only to be released
1484 * afterwards.  This conservative approach is easily argued to avoid
1485 * race conditions.
1486 */
1487static vm_page_t
1488_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1489{
1490	vm_page_t m, l1pg, l2pg;
1491
1492	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1493
1494	/*
1495	 * Allocate a page table page.
1496	 */
1497	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1498	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1499		if (lockp != NULL) {
1500			RELEASE_PV_LIST_LOCK(lockp);
1501			PMAP_UNLOCK(pmap);
1502			VM_WAIT;
1503			PMAP_LOCK(pmap);
1504		}
1505
1506		/*
1507		 * Indicate the need to retry.  While waiting, the page table
1508		 * page may have been allocated.
1509		 */
1510		return (NULL);
1511	}
1512	if ((m->flags & PG_ZERO) == 0)
1513		pmap_zero_page(m);
1514
1515	/*
1516	 * Map the pagetable page into the process address space, if
1517	 * it isn't already there.
1518	 */
1519
1520	if (ptepindex >= (NUL2E + NUL1E)) {
1521		pd_entry_t *l0;
1522		vm_pindex_t l0index;
1523
1524		l0index = ptepindex - (NUL2E + NUL1E);
1525		l0 = &pmap->pm_l0[l0index];
1526		pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
1527		PTE_SYNC(l0);
1528	} else if (ptepindex >= NUL2E) {
1529		vm_pindex_t l0index, l1index;
1530		pd_entry_t *l0, *l1;
1531		pd_entry_t tl0;
1532
1533		l1index = ptepindex - NUL2E;
1534		l0index = l1index >> L0_ENTRIES_SHIFT;
1535
1536		l0 = &pmap->pm_l0[l0index];
1537		tl0 = pmap_load(l0);
1538		if (tl0 == 0) {
1539			/* recurse for allocating page dir */
1540			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
1541			    lockp) == NULL) {
1542				--m->wire_count;
1543				/* XXX: release mem barrier? */
1544				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1545				vm_page_free_zero(m);
1546				return (NULL);
1547			}
1548		} else {
1549			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1550			l1pg->wire_count++;
1551		}
1552
1553		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
1554		l1 = &l1[ptepindex & Ln_ADDR_MASK];
1555		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1556		PTE_SYNC(l1);
1557	} else {
1558		vm_pindex_t l0index, l1index;
1559		pd_entry_t *l0, *l1, *l2;
1560		pd_entry_t tl0, tl1;
1561
1562		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
1563		l0index = l1index >> L0_ENTRIES_SHIFT;
1564
1565		l0 = &pmap->pm_l0[l0index];
1566		tl0 = pmap_load(l0);
1567		if (tl0 == 0) {
1568			/* recurse for allocating page dir */
1569			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1570			    lockp) == NULL) {
1571				--m->wire_count;
1572				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1573				vm_page_free_zero(m);
1574				return (NULL);
1575			}
1576			tl0 = pmap_load(l0);
1577			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1578			l1 = &l1[l1index & Ln_ADDR_MASK];
1579		} else {
1580			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1581			l1 = &l1[l1index & Ln_ADDR_MASK];
1582			tl1 = pmap_load(l1);
1583			if (tl1 == 0) {
1584				/* recurse for allocating page dir */
1585				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1586				    lockp) == NULL) {
1587					--m->wire_count;
1588					/* XXX: release mem barrier? */
1589					atomic_subtract_int(
1590					    &vm_cnt.v_wire_count, 1);
1591					vm_page_free_zero(m);
1592					return (NULL);
1593				}
1594			} else {
1595				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1596				l2pg->wire_count++;
1597			}
1598		}
1599
1600		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
1601		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1602		pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
1603		PTE_SYNC(l2);
1604	}
1605
1606	pmap_resident_count_inc(pmap, 1);
1607
1608	return (m);
1609}
1610
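/*
 * Ensure that the L3 page table page backing va exists, allocating it and
 * any missing intermediate tables via _pmap_alloc_l3().  On success the
 * page is returned with a wire count reference held for the new mapping.
 */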
1611static vm_page_t
1612pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1613{
1614	vm_pindex_t ptepindex;
1615	pd_entry_t *pde, tpde;
1616#ifdef INVARIANTS
1617	pt_entry_t *pte;
1618#endif
1619	vm_page_t m;
1620	int lvl;
1621
1622	/*
1623	 * Calculate pagetable page index
1624	 */
1625	ptepindex = pmap_l2_pindex(va);
1626retry:
1627	/*
1628	 * Get the page directory entry
1629	 */
1630	pde = pmap_pde(pmap, va, &lvl);
1631
1632	/*
1633	 * If the page table page is mapped, we just increment the hold count,
1634	 * and activate it. If we get a level 2 pde it will point to a level 3
1635	 * table.
1636	 */
1637	switch (lvl) {
1638	case -1:
1639		break;
1640	case 0:
1641#ifdef INVARIANTS
1642		pte = pmap_l0_to_l1(pde, va);
1643		KASSERT(pmap_load(pte) == 0,
1644		    ("pmap_alloc_l3: TODO: l0 superpages"));
1645#endif
1646		break;
1647	case 1:
1648#ifdef INVARIANTS
1649		pte = pmap_l1_to_l2(pde, va);
1650		KASSERT(pmap_load(pte) == 0,
1651		    ("pmap_alloc_l3: TODO: l1 superpages"));
1652#endif
1653		break;
1654	case 2:
1655		tpde = pmap_load(pde);
1656		if (tpde != 0) {
1657			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
1658			m->wire_count++;
1659			return (m);
1660		}
1661		break;
1662	default:
1663		panic("pmap_alloc_l3: Invalid level %d", lvl);
1664	}
1665
1666	/*
1667	 * Here if the pte page isn't mapped, or if it has been deallocated.
1668	 */
1669	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1670	if (m == NULL && lockp != NULL)
1671		goto retry;
1672
1673	return (m);
1674}
1675
1676
1677/***************************************************
1678 * Pmap allocation/deallocation routines.
1679 ***************************************************/
1680
1681/*
1682 * Release any resources held by the given physical map.
1683 * Called when a pmap initialized by pmap_pinit is being released.
1684 * Should only be called if the map contains no valid mappings.
1685 */
1686void
1687pmap_release(pmap_t pmap)
1688{
1689	vm_page_t m;
1690
1691	KASSERT(pmap->pm_stats.resident_count == 0,
1692	    ("pmap_release: pmap resident count %ld != 0",
1693	    pmap->pm_stats.resident_count));
1694	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1695	    ("pmap_release: pmap has reserved page table page(s)"));
1696
1697	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
1698
1699	m->wire_count--;
1700	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1701	vm_page_free_zero(m);
1702}
1703
1704static int
1705kvm_size(SYSCTL_HANDLER_ARGS)
1706{
1707	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1708
1709	return sysctl_handle_long(oidp, &ksize, 0, req);
1710}
1711SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1712    0, 0, kvm_size, "LU", "Size of KVM");
1713
1714static int
1715kvm_free(SYSCTL_HANDLER_ARGS)
1716{
1717	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1718
1719	return sysctl_handle_long(oidp, &kfree, 0, req);
1720}
1721SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1722    0, 0, kvm_free, "LU", "Amount of KVM free");
1723
1724/*
1725 * grow the number of kernel page table entries, if needed
1726 */
1727void
1728pmap_growkernel(vm_offset_t addr)
1729{
1730	vm_paddr_t paddr;
1731	vm_page_t nkpg;
1732	pd_entry_t *l0, *l1, *l2;
1733
1734	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1735
1736	addr = roundup2(addr, L2_SIZE);
1737	if (addr - 1 >= kernel_map->max_offset)
1738		addr = kernel_map->max_offset;
1739	while (kernel_vm_end < addr) {
1740		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
1741		KASSERT(pmap_load(l0) != 0,
1742		    ("pmap_growkernel: No level 0 kernel entry"));
1743
1744		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
1745		if (pmap_load(l1) == 0) {
1746			/* We need a new L2 page table */
1747			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1748			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1749			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1750			if (nkpg == NULL)
1751				panic("pmap_growkernel: no memory to grow kernel");
1752			if ((nkpg->flags & PG_ZERO) == 0)
1753				pmap_zero_page(nkpg);
1754			paddr = VM_PAGE_TO_PHYS(nkpg);
1755			pmap_load_store(l1, paddr | L1_TABLE);
1756			PTE_SYNC(l1);
1757			continue; /* try again */
1758		}
1759		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1760		if ((pmap_load(l2) & ATTR_AF) != 0) {
1761			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1762			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1763				kernel_vm_end = kernel_map->max_offset;
1764				break;
1765			}
1766			continue;
1767		}
1768
1769		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1770		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1771		    VM_ALLOC_ZERO);
1772		if (nkpg == NULL)
1773			panic("pmap_growkernel: no memory to grow kernel");
1774		if ((nkpg->flags & PG_ZERO) == 0)
1775			pmap_zero_page(nkpg);
1776		paddr = VM_PAGE_TO_PHYS(nkpg);
1777		pmap_load_store(l2, paddr | L2_TABLE);
1778		PTE_SYNC(l2);
1779		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1780
1781		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1782		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1783			kernel_vm_end = kernel_map->max_offset;
1784			break;
1785		}
1786	}
1787}
1788
1789
1790/***************************************************
1791 * page management routines.
1792 ***************************************************/
1793
1794CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1795CTASSERT(_NPCM == 3);
1796CTASSERT(_NPCPV == 168);
1797
1798static __inline struct pv_chunk *
1799pv_to_chunk(pv_entry_t pv)
1800{
1801
1802	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1803}
1804
1805#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1806
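/*
 * Each pv_chunk holds _NPCPV (168) pv entries whose allocation state is
 * tracked by a three-word bitmap; the final word only needs 168 - 128 = 40
 * bits, hence the shorter PC_FREE2 mask.
 */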
1807#define	PC_FREE0	0xfffffffffffffffful
1808#define	PC_FREE1	0xfffffffffffffffful
1809#define	PC_FREE2	0x000000fffffffffful
1810
1811static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1812
1813#if 0
1814#ifdef PV_STATS
1815static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1816
1817SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1818	"Current number of pv entry chunks");
1819SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1820	"Current number of pv entry chunks allocated");
1821SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1822	"Current number of pv entry chunks frees");
1823SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1824	"Number of times tried to get a chunk page but failed.");
1825
1826static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1827static int pv_entry_spare;
1828
1829SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1830	"Current number of pv entry frees");
1831SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1832	"Current number of pv entry allocs");
1833SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1834	"Current number of pv entries");
1835SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1836	"Current number of spare pv entries");
1837#endif
1838#endif /* 0 */
1839
1840/*
1841 * We are in a serious low memory condition.  Resort to
1842 * drastic measures to free some pages so we can allocate
1843 * another pv entry chunk.
1844 *
1845 * Returns NULL if PV entries were reclaimed from the specified pmap.
1846 *
1847 * We do not, however, unmap 2mpages because subsequent accesses will
1848 * allocate per-page pv entries until repromotion occurs, thereby
1849 * exacerbating the shortage of free pv entries.
1850 */
1851static vm_page_t
1852reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1853{
1854	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1855	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1856	struct md_page *pvh;
1857	pd_entry_t *pde;
1858	pmap_t next_pmap, pmap;
1859	pt_entry_t *pte, tpte;
1860	pv_entry_t pv;
1861	vm_offset_t va;
1862	vm_page_t m, m_pc;
1863	struct spglist free;
1864	uint64_t inuse;
1865	int bit, field, freed, lvl;
1866	static int active_reclaims = 0;
1867
1868	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1869	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1870
1871	pmap = NULL;
1872	m_pc = NULL;
1873	SLIST_INIT(&free);
1874	bzero(&pc_marker_b, sizeof(pc_marker_b));
1875	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1876	pc_marker = (struct pv_chunk *)&pc_marker_b;
1877	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1878
1879	mtx_lock(&pv_chunks_mutex);
1880	active_reclaims++;
1881	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1882	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1883	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1884	    SLIST_EMPTY(&free)) {
1885		next_pmap = pc->pc_pmap;
1886		if (next_pmap == NULL) {
1887			/*
1888			 * The next chunk is a marker.  However, it is
1889			 * not our marker, so active_reclaims must be
1890			 * > 1.  Consequently, the next_chunk code
1891			 * will not rotate the pv_chunks list.
1892			 */
1893			goto next_chunk;
1894		}
1895		mtx_unlock(&pv_chunks_mutex);
1896
1897		/*
1898		 * A pv_chunk can only be removed from the pc_lru list
1899		 * when both pv_chunks_mutex is owned and the
1900		 * corresponding pmap is locked.
1901		 */
1902		if (pmap != next_pmap) {
1903			if (pmap != NULL && pmap != locked_pmap)
1904				PMAP_UNLOCK(pmap);
1905			pmap = next_pmap;
1906			/* Avoid deadlock and lock recursion. */
1907			if (pmap > locked_pmap) {
1908				RELEASE_PV_LIST_LOCK(lockp);
1909				PMAP_LOCK(pmap);
1910				mtx_lock(&pv_chunks_mutex);
1911				continue;
1912			} else if (pmap != locked_pmap) {
1913				if (PMAP_TRYLOCK(pmap)) {
1914					mtx_lock(&pv_chunks_mutex);
1915					continue;
1916				} else {
1917					pmap = NULL; /* pmap is not locked */
1918					mtx_lock(&pv_chunks_mutex);
1919					pc = TAILQ_NEXT(pc_marker, pc_lru);
1920					if (pc == NULL ||
1921					    pc->pc_pmap != next_pmap)
1922						continue;
1923					goto next_chunk;
1924				}
1925			}
1926		}
1927
1928		/*
1929		 * Destroy every non-wired, 4 KB page mapping in the chunk.
1930		 */
1931		freed = 0;
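		/*
		 * A set bit in pc_map[] denotes a free pv entry slot, so
		 * ~pc_map[] masked with pc_freemask[] enumerates the entries
		 * that are currently in use within this chunk.
		 */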
1932		for (field = 0; field < _NPCM; field++) {
1933			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1934			    inuse != 0; inuse &= ~(1UL << bit)) {
1935				bit = ffsl(inuse) - 1;
1936				pv = &pc->pc_pventry[field * 64 + bit];
1937				va = pv->pv_va;
1938				pde = pmap_pde(pmap, va, &lvl);
1939				if (lvl != 2)
1940					continue;
1941				pte = pmap_l2_to_l3(pde, va);
1942				tpte = pmap_load(pte);
1943				if ((tpte & ATTR_SW_WIRED) != 0)
1944					continue;
1945				tpte = pmap_load_clear(pte);
1946				PTE_SYNC(pte);
1947				pmap_invalidate_page(pmap, va);
1948				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
1949				if (pmap_page_dirty(tpte))
1950					vm_page_dirty(m);
1951				if ((tpte & ATTR_AF) != 0)
1952					vm_page_aflag_set(m, PGA_REFERENCED);
1953				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1954				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
1955				m->md.pv_gen++;
1956				if (TAILQ_EMPTY(&m->md.pv_list) &&
1957				    (m->flags & PG_FICTITIOUS) == 0) {
1958					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1959					if (TAILQ_EMPTY(&pvh->pv_list)) {
1960						vm_page_aflag_clear(m,
1961						    PGA_WRITEABLE);
1962					}
1963				}
1964				pc->pc_map[field] |= 1UL << bit;
1965				pmap_unuse_l3(pmap, va, pmap_load(pde), &free);
1966				freed++;
1967			}
1968		}
1969		if (freed == 0) {
1970			mtx_lock(&pv_chunks_mutex);
1971			goto next_chunk;
1972		}
1973		/* Every freed mapping is for a 4 KB page. */
1974		pmap_resident_count_dec(pmap, freed);
1975		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1976		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1977		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1978		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1979		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
1980		    pc->pc_map[2] == PC_FREE2) {
1981			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1982			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1983			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1984			/* Entire chunk is free; return it. */
1985			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1986			dump_drop_page(m_pc->phys_addr);
1987			mtx_lock(&pv_chunks_mutex);
1988			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1989			break;
1990		}
1991		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1992		mtx_lock(&pv_chunks_mutex);
1993		/* One freed pv entry in locked_pmap is sufficient. */
1994		if (pmap == locked_pmap)
1995			break;
1996
1997next_chunk:
1998		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1999		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
2000		if (active_reclaims == 1 && pmap != NULL) {
2001			/*
2002			 * Rotate the pv chunks list so that we do not
2003			 * scan the same pv chunks that could not be
2004			 * freed (because they contained a wired
2005			 * and/or superpage mapping) on every
2006			 * invocation of reclaim_pv_chunk().
2007			 */
2008			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
2009				MPASS(pc->pc_pmap != NULL);
2010				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2011				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2012			}
2013		}
2014	}
2015	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
2016	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
2017	active_reclaims--;
2018	mtx_unlock(&pv_chunks_mutex);
2019	if (pmap != NULL && pmap != locked_pmap)
2020		PMAP_UNLOCK(pmap);
2021	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2022		m_pc = SLIST_FIRST(&free);
2023		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2024		/* Recycle a freed page table page. */
2025		m_pc->wire_count = 1;
2026		atomic_add_int(&vm_cnt.v_wire_count, 1);
2027	}
2028	pmap_free_zero_pages(&free);
2029	return (m_pc);
2030}
2031
2032/*
2033 * free the pv_entry back to the free list
2034 */
2035static void
2036free_pv_entry(pmap_t pmap, pv_entry_t pv)
2037{
2038	struct pv_chunk *pc;
2039	int idx, field, bit;
2040
2041	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2042	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2043	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2044	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2045	pc = pv_to_chunk(pv);
2046	idx = pv - &pc->pc_pventry[0];
2047	field = idx / 64;
2048	bit = idx % 64;
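	/* Setting the bit in pc_map[] marks this slot free again. */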
2049	pc->pc_map[field] |= 1ul << bit;
2050	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2051	    pc->pc_map[2] != PC_FREE2) {
2052		/* 98% of the time, pc is already at the head of the list. */
2053		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2054			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2055			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2056		}
2057		return;
2058	}
2059	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2060	free_pv_chunk(pc);
2061}
2062
2063static void
2064free_pv_chunk(struct pv_chunk *pc)
2065{
2066	vm_page_t m;
2067
2068	mtx_lock(&pv_chunks_mutex);
2069	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2070	mtx_unlock(&pv_chunks_mutex);
2071	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2072	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2073	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2074	/* Entire chunk is free; return it. */
2075	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2076	dump_drop_page(m->phys_addr);
2077	vm_page_unwire(m, PQ_NONE);
2078	vm_page_free(m);
2079}
2080
2081/*
2082 * Returns a new PV entry, allocating a new PV chunk from the system when
2083 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2084 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2085 * returned.
2086 *
2087 * The given PV list lock may be released.
2088 */
2089static pv_entry_t
2090get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2091{
2092	int bit, field;
2093	pv_entry_t pv;
2094	struct pv_chunk *pc;
2095	vm_page_t m;
2096
2097	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2098	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2099retry:
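	/*
	 * Chunks that still have free entries are kept toward the head of
	 * the pmap's pm_pvchunk list and full chunks at the tail, so only
	 * the first chunk needs to be examined here.
	 */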
2100	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2101	if (pc != NULL) {
2102		for (field = 0; field < _NPCM; field++) {
2103			if (pc->pc_map[field]) {
2104				bit = ffsl(pc->pc_map[field]) - 1;
2105				break;
2106			}
2107		}
2108		if (field < _NPCM) {
2109			pv = &pc->pc_pventry[field * 64 + bit];
2110			pc->pc_map[field] &= ~(1ul << bit);
2111			/* If this was the last item, move it to tail */
2112			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2113			    pc->pc_map[2] == 0) {
2114				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2115				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2116				    pc_list);
2117			}
2118			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2119			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2120			return (pv);
2121		}
2122	}
2123	/* No free items, allocate another chunk */
2124	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2125	    VM_ALLOC_WIRED);
2126	if (m == NULL) {
2127		if (lockp == NULL) {
2128			PV_STAT(pc_chunk_tryfail++);
2129			return (NULL);
2130		}
2131		m = reclaim_pv_chunk(pmap, lockp);
2132		if (m == NULL)
2133			goto retry;
2134	}
2135	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2136	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2137	dump_add_page(m->phys_addr);
2138	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2139	pc->pc_pmap = pmap;
2140	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2141	pc->pc_map[1] = PC_FREE1;
2142	pc->pc_map[2] = PC_FREE2;
2143	mtx_lock(&pv_chunks_mutex);
2144	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2145	mtx_unlock(&pv_chunks_mutex);
2146	pv = &pc->pc_pventry[0];
2147	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2148	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2149	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2150	return (pv);
2151}
2152
2153/*
2154 * Ensure that the number of spare PV entries in the specified pmap meets or
2155 * exceeds the given count, "needed".
2156 *
2157 * The given PV list lock may be released.
2158 */
2159static void
2160reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2161{
2162	struct pch new_tail;
2163	struct pv_chunk *pc;
2164	vm_page_t m;
2165	int avail, free;
2166	bool reclaimed;
2167
2168	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2169	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2170
2171	/*
2172	 * Newly allocated PV chunks must be stored in a private list until
2173	 * the required number of PV chunks has been allocated.  Otherwise,
2174	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2175	 * contrast, these chunks must be added to the pmap upon allocation.
2176	 */
2177	TAILQ_INIT(&new_tail);
2178retry:
2179	avail = 0;
2180	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2181		bit_count((bitstr_t *)pc->pc_map, 0,
2182		    sizeof(pc->pc_map) * NBBY, &free);
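		/*
		 * "free" now holds the number of set bits in pc_map[], i.e.,
		 * the number of free pv entries remaining in this chunk.
		 */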
2183		if (free == 0)
2184			break;
2185		avail += free;
2186		if (avail >= needed)
2187			break;
2188	}
2189	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2190		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2191		    VM_ALLOC_WIRED);
2192		if (m == NULL) {
2193			m = reclaim_pv_chunk(pmap, lockp);
2194			if (m == NULL)
2195				goto retry;
2196			reclaimed = true;
2197		}
2198		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2199		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2200		dump_add_page(m->phys_addr);
2201		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2202		pc->pc_pmap = pmap;
2203		pc->pc_map[0] = PC_FREE0;
2204		pc->pc_map[1] = PC_FREE1;
2205		pc->pc_map[2] = PC_FREE2;
2206		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2207		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2208		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2209
2210		/*
2211		 * The reclaim might have freed a chunk from the current pmap.
2212		 * If that chunk contained available entries, we need to
2213		 * re-count the number of available entries.
2214		 */
2215		if (reclaimed)
2216			goto retry;
2217	}
2218	if (!TAILQ_EMPTY(&new_tail)) {
2219		mtx_lock(&pv_chunks_mutex);
2220		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2221		mtx_unlock(&pv_chunks_mutex);
2222	}
2223}
2224
2225/*
2226 * First find and then remove the pv entry for the specified pmap and virtual
2227 * address from the specified pv list.  Returns the pv entry if found and NULL
2228 * otherwise.  This operation can be performed on pv lists for either 4KB or
2229 * 2MB page mappings.
2230 */
2231static __inline pv_entry_t
2232pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2233{
2234	pv_entry_t pv;
2235
2236	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2237		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2238			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2239			pvh->pv_gen++;
2240			break;
2241		}
2242	}
2243	return (pv);
2244}
2245
2246/*
2247 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2248 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2249 * entries for each of the 4KB page mappings.
2250 */
2251static void
2252pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2253    struct rwlock **lockp)
2254{
2255	struct md_page *pvh;
2256	struct pv_chunk *pc;
2257	pv_entry_t pv;
2258	vm_offset_t va_last;
2259	vm_page_t m;
2260	int bit, field;
2261
2262	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2263	KASSERT((pa & L2_OFFSET) == 0,
2264	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
2265	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2266
2267	/*
2268	 * Transfer the 2mpage's pv entry for this mapping to the first
2269	 * page's pv list.  Once this transfer begins, the pv list lock
2270	 * must not be released until the last pv entry is reinstantiated.
2271	 */
2272	pvh = pa_to_pvh(pa);
2273	va = va & ~L2_OFFSET;
2274	pv = pmap_pvh_remove(pvh, pmap, va);
2275	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2276	m = PHYS_TO_VM_PAGE(pa);
2277	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2278	m->md.pv_gen++;
2279	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
2280	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2281	va_last = va + L2_SIZE - PAGE_SIZE;
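	/*
	 * va_last is the address of the last 4KB page in the 2MB region.
	 * The loop below consumes spare pv entries, which the "missing
	 * spare" assertion expects to have been reserved beforehand, until
	 * every remaining 4KB page has its own entry.
	 */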
2282	for (;;) {
2283		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2284		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2285		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2286		for (field = 0; field < _NPCM; field++) {
2287			while (pc->pc_map[field]) {
2288				bit = ffsl(pc->pc_map[field]) - 1;
2289				pc->pc_map[field] &= ~(1ul << bit);
2290				pv = &pc->pc_pventry[field * 64 + bit];
2291				va += PAGE_SIZE;
2292				pv->pv_va = va;
2293				m++;
2294				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2295			    ("pmap_pv_demote_l2: page %p is not managed", m));
2296				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2297				m->md.pv_gen++;
2298				if (va == va_last)
2299					goto out;
2300			}
2301		}
2302		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2303		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2304	}
2305out:
2306	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2307		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2308		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2309	}
2310	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2311	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
2312}
2313
2314/*
2315 * First find and then destroy the pv entry for the specified pmap and virtual
2316 * address.  This operation can be performed on pv lists for either 4KB or 2MB
2317 * page mappings.
2318 */
2319static void
2320pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2321{
2322	pv_entry_t pv;
2323
2324	pv = pmap_pvh_remove(pvh, pmap, va);
2325	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2326	free_pv_entry(pmap, pv);
2327}
2328
2329/*
2330 * Conditionally create the PV entry for a 4KB page mapping if the required
2331 * memory can be allocated without resorting to reclamation.
2332 */
2333static boolean_t
2334pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2335    struct rwlock **lockp)
2336{
2337	pv_entry_t pv;
2338
2339	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2340	/* Pass NULL instead of the lock pointer to disable reclamation. */
2341	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2342		pv->pv_va = va;
2343		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2344		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2345		m->md.pv_gen++;
2346		return (TRUE);
2347	} else
2348		return (FALSE);
2349}
2350
2351/*
2352 * pmap_remove_l3: Unmap a single 4KB page within the given pmap.
2353 */
2354static int
2355pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2356    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2357{
2358	struct md_page *pvh;
2359	pt_entry_t old_l3;
2360	vm_page_t m;
2361
2362	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2363	if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3)))
2364		cpu_dcache_wb_range(va, L3_SIZE);
2365	old_l3 = pmap_load_clear(l3);
2366	PTE_SYNC(l3);
2367	pmap_invalidate_page(pmap, va);
2368	if (old_l3 & ATTR_SW_WIRED)
2369		pmap->pm_stats.wired_count -= 1;
2370	pmap_resident_count_dec(pmap, 1);
2371	if (old_l3 & ATTR_SW_MANAGED) {
2372		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2373		if (pmap_page_dirty(old_l3))
2374			vm_page_dirty(m);
2375		if (old_l3 & ATTR_AF)
2376			vm_page_aflag_set(m, PGA_REFERENCED);
2377		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2378		pmap_pvh_free(&m->md, pmap, va);
2379		if (TAILQ_EMPTY(&m->md.pv_list) &&
2380		    (m->flags & PG_FICTITIOUS) == 0) {
2381			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2382			if (TAILQ_EMPTY(&pvh->pv_list))
2383				vm_page_aflag_clear(m, PGA_WRITEABLE);
2384		}
2385	}
2386	return (pmap_unuse_l3(pmap, va, l2e, free));
2387}
2388
2389/*
2390 *	Remove the given range of addresses from the specified map.
2391 *
2392 *	It is assumed that the start and end are properly
2393 *	rounded to the page size.
2394 */
2395void
2396pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2397{
2398	struct rwlock *lock;
2399	vm_offset_t va, va_next;
2400	pd_entry_t *l0, *l1, *l2;
2401	pt_entry_t l3_paddr, *l3;
2402	struct spglist free;
2403
2404	/*
2405	 * Perform an unsynchronized read.  This is, however, safe.
2406	 */
2407	if (pmap->pm_stats.resident_count == 0)
2408		return;
2409
2410	SLIST_INIT(&free);
2411
2412	PMAP_LOCK(pmap);
2413
2414	lock = NULL;
2415	for (; sva < eva; sva = va_next) {
2416
2417		if (pmap->pm_stats.resident_count == 0)
2418			break;
2419
2420		l0 = pmap_l0(pmap, sva);
2421		if (pmap_load(l0) == 0) {
2422			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2423			if (va_next < sva)
2424				va_next = eva;
2425			continue;
2426		}
2427
2428		l1 = pmap_l0_to_l1(l0, sva);
2429		if (pmap_load(l1) == 0) {
2430			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2431			if (va_next < sva)
2432				va_next = eva;
2433			continue;
2434		}
2435
2436		/*
2437		 * Calculate the virtual address of the next 2MB boundary.
2438		 */
2439		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2440		if (va_next < sva)
2441			va_next = eva;
2442
2443		l2 = pmap_l1_to_l2(l1, sva);
2444		if (l2 == NULL)
2445			continue;
2446
2447		l3_paddr = pmap_load(l2);
2448
2449		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
2450			/* TODO: Add pmap_remove_l2 */
2451			if (pmap_demote_l2_locked(pmap, l2, sva & ~L2_OFFSET,
2452			    &lock) == NULL)
2453				continue;
2454			l3_paddr = pmap_load(l2);
2455		}
2456
2457		/*
2458		 * Weed out invalid mappings.
2459		 */
2460		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
2461			continue;
2462
2463		/*
2464		 * Limit our scan to either the end of the va represented
2465		 * by the current page table page, or to the end of the
2466		 * range being removed.
2467		 */
2468		if (va_next > eva)
2469			va_next = eva;
2470
2471		va = va_next;
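		/*
		 * va records the start of the current run of removed
		 * mappings; when the run ends, the whole range is flushed
		 * with a single call to pmap_invalidate_range().
		 */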
2472		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2473		    sva += L3_SIZE) {
2474			if (l3 == NULL)
2475				panic("l3 == NULL");
2476			if (pmap_load(l3) == 0) {
2477				if (va != va_next) {
2478					pmap_invalidate_range(pmap, va, sva);
2479					va = va_next;
2480				}
2481				continue;
2482			}
2483			if (va == va_next)
2484				va = sva;
2485			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
2486			    &lock)) {
2487				sva += L3_SIZE;
2488				break;
2489			}
2490		}
2491		if (va != va_next)
2492			pmap_invalidate_range(pmap, va, sva);
2493	}
2494	if (lock != NULL)
2495		rw_wunlock(lock);
2496	PMAP_UNLOCK(pmap);
2497	pmap_free_zero_pages(&free);
2498}
2499
2500/*
2501 *	Routine:	pmap_remove_all
2502 *	Function:
2503 *		Removes this physical page from
2504 *		all physical maps in which it resides.
2505 *		Reflects back modify bits to the pager.
2506 *
2507 *	Notes:
2508 *		Original versions of this routine were very
2509 *		inefficient because they iteratively called
2510 *		pmap_remove (slow...)
2511 */
2512
2513void
2514pmap_remove_all(vm_page_t m)
2515{
2516	struct md_page *pvh;
2517	pv_entry_t pv;
2518	pmap_t pmap;
2519	struct rwlock *lock;
2520	pd_entry_t *pde, tpde;
2521	pt_entry_t *pte, tpte;
2522	vm_offset_t va;
2523	struct spglist free;
2524	int lvl, pvh_gen, md_gen;
2525
2526	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2527	    ("pmap_remove_all: page %p is not managed", m));
2528	SLIST_INIT(&free);
2529	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2530	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2531	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2532retry:
2533	rw_wlock(lock);
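	/*
	 * First demote any 2MB mappings of the page so that the loop over
	 * the page's own pv list below only has to remove 4KB mappings.
	 */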
2534	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2535		pmap = PV_PMAP(pv);
2536		if (!PMAP_TRYLOCK(pmap)) {
2537			pvh_gen = pvh->pv_gen;
2538			rw_wunlock(lock);
2539			PMAP_LOCK(pmap);
2540			rw_wlock(lock);
2541			if (pvh_gen != pvh->pv_gen) {
2542				rw_wunlock(lock);
2543				PMAP_UNLOCK(pmap);
2544				goto retry;
2545			}
2546		}
2547		va = pv->pv_va;
2548		pte = pmap_pte(pmap, va, &lvl);
2549		KASSERT(pte != NULL,
2550		    ("pmap_remove_all: no page table entry found"));
2551		KASSERT(lvl == 2,
2552		    ("pmap_remove_all: invalid pte level %d", lvl));
2553
2554		pmap_demote_l2_locked(pmap, pte, va, &lock);
2555		PMAP_UNLOCK(pmap);
2556	}
2557	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2558		pmap = PV_PMAP(pv);
2559		if (!PMAP_TRYLOCK(pmap)) {
2560			pvh_gen = pvh->pv_gen;
2561			md_gen = m->md.pv_gen;
2562			rw_wunlock(lock);
2563			PMAP_LOCK(pmap);
2564			rw_wlock(lock);
2565			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2566				rw_wunlock(lock);
2567				PMAP_UNLOCK(pmap);
2568				goto retry;
2569			}
2570		}
2571		pmap_resident_count_dec(pmap, 1);
2572
2573		pde = pmap_pde(pmap, pv->pv_va, &lvl);
2574		KASSERT(pde != NULL,
2575		    ("pmap_remove_all: no page directory entry found"));
2576		KASSERT(lvl == 2,
2577		    ("pmap_remove_all: invalid pde level %d", lvl));
2578		tpde = pmap_load(pde);
2579
2580		pte = pmap_l2_to_l3(pde, pv->pv_va);
2581		tpte = pmap_load(pte);
2582		if (pmap_is_current(pmap) &&
2583		    pmap_l3_valid_cacheable(tpte))
2584			cpu_dcache_wb_range(pv->pv_va, L3_SIZE);
2585		pmap_load_clear(pte);
2586		PTE_SYNC(pte);
2587		pmap_invalidate_page(pmap, pv->pv_va);
2588		if (tpte & ATTR_SW_WIRED)
2589			pmap->pm_stats.wired_count--;
2590		if ((tpte & ATTR_AF) != 0)
2591			vm_page_aflag_set(m, PGA_REFERENCED);
2592
2593		/*
2594		 * Update the vm_page_t clean and reference bits.
2595		 */
2596		if (pmap_page_dirty(tpte))
2597			vm_page_dirty(m);
2598		pmap_unuse_l3(pmap, pv->pv_va, tpde, &free);
2599		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2600		m->md.pv_gen++;
2601		free_pv_entry(pmap, pv);
2602		PMAP_UNLOCK(pmap);
2603	}
2604	vm_page_aflag_clear(m, PGA_WRITEABLE);
2605	rw_wunlock(lock);
2606	pmap_free_zero_pages(&free);
2607}
2608
2609/*
2610 *	Set the physical protection on the
2611 *	specified range of this map as requested.
2612 */
2613void
2614pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2615{
2616	vm_offset_t va, va_next;
2617	pd_entry_t *l0, *l1, *l2;
2618	pt_entry_t *l3p, l3, nbits;
2619
2620	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
2621	if (prot == VM_PROT_NONE) {
2622		pmap_remove(pmap, sva, eva);
2623		return;
2624	}
2625
2626	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2627	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2628		return;
2629
2630	PMAP_LOCK(pmap);
2631	for (; sva < eva; sva = va_next) {
2632
2633		l0 = pmap_l0(pmap, sva);
2634		if (pmap_load(l0) == 0) {
2635			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2636			if (va_next < sva)
2637				va_next = eva;
2638			continue;
2639		}
2640
2641		l1 = pmap_l0_to_l1(l0, sva);
2642		if (pmap_load(l1) == 0) {
2643			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2644			if (va_next < sva)
2645				va_next = eva;
2646			continue;
2647		}
2648
2649		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2650		if (va_next < sva)
2651			va_next = eva;
2652
2653		l2 = pmap_l1_to_l2(l1, sva);
2654		if (pmap_load(l2) == 0)
2655			continue;
2656
2657		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
2658			l3p = pmap_demote_l2(pmap, l2, sva);
2659			if (l3p == NULL)
2660				continue;
2661		}
2662		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
2663		    ("pmap_protect: Invalid L2 entry after demotion"));
2664
2665		if (va_next > eva)
2666			va_next = eva;
2667
2668		va = va_next;
2669		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
2670		    sva += L3_SIZE) {
2671			l3 = pmap_load(l3p);
2672			if (!pmap_l3_valid(l3))
2673				continue;
2674
2675			nbits = 0;
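			/*
			 * nbits accumulates the restrictive attribute bits,
			 * read-only access and/or execute-never, which
			 * pmap_set() then sets in the existing L3 entry.
			 */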
2676			if ((prot & VM_PROT_WRITE) == 0) {
2677				if ((l3 & ATTR_SW_MANAGED) &&
2678				    pmap_page_dirty(l3)) {
2679					vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
2680					    ~ATTR_MASK));
2681				}
2682				nbits |= ATTR_AP(ATTR_AP_RO);
2683			}
2684			if ((prot & VM_PROT_EXECUTE) == 0)
2685				nbits |= ATTR_XN;
2686
2687			pmap_set(l3p, nbits);
2688			PTE_SYNC(l3p);
2689			/* XXX: Use pmap_invalidate_range */
2690			pmap_invalidate_page(pmap, sva);
2691		}
2692	}
2693	PMAP_UNLOCK(pmap);
2694}
2695
2696/*
2697 * Inserts the specified page table page into the specified pmap's collection
2698 * of idle page table pages.  Each of a pmap's page table pages is responsible
2699 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2700 * ordered by this virtual address range.
2701 */
2702static __inline int
2703pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2704{
2705
2706	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2707	return (vm_radix_insert(&pmap->pm_root, mpte));
2708}
2709
2710/*
2711 * Removes the page table page mapping the specified virtual address from the
2712 * specified pmap's collection of idle page table pages, and returns it.
2713 * Otherwise, returns NULL if there is no page table page corresponding to the
2714 * specified virtual address.
2715 */
2716static __inline vm_page_t
2717pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2718{
2719
2720	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2721	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
2722}
2723
2724/*
2725 * Performs a break-before-make update of a pmap entry. This is needed when
2726 * either promoting or demoting pages to ensure the TLB doesn't get into an
2727 * inconsistent state.
2728 */
2729static void
2730pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
2731    vm_offset_t va, vm_size_t size)
2732{
2733	register_t intr;
2734
2735	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2736
2737	/*
2738	 * Ensure we don't get switched out with the page table in an
2739	 * inconsistent state. We also need to ensure no interrupts fire
2740	 * as they may make use of an address we are about to invalidate.
2741	 */
2742	intr = intr_disable();
2743	critical_enter();
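	/*
	 * Break-before-make: the old entry is cleared and its TLB entries
	 * are invalidated before the replacement entry is written, so the
	 * TLB never holds two conflicting translations for the range.
	 */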
2744
2745	/* Clear the old mapping */
2746	pmap_load_clear(pte);
2747	PTE_SYNC(pte);
2748	pmap_invalidate_range(pmap, va, va + size);
2749
2750	/* Create the new mapping */
2751	pmap_load_store(pte, newpte);
2752	PTE_SYNC(pte);
2753
2754	critical_exit();
2755	intr_restore(intr);
2756}
2757
2758#if VM_NRESERVLEVEL > 0
2759/*
2760 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2761 * replace the many pv entries for the 4KB page mappings by a single pv entry
2762 * for the 2MB page mapping.
2763 */
2764static void
2765pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2766    struct rwlock **lockp)
2767{
2768	struct md_page *pvh;
2769	pv_entry_t pv;
2770	vm_offset_t va_last;
2771	vm_page_t m;
2772
2773	KASSERT((pa & L2_OFFSET) == 0,
2774	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
2775	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2776
2777	/*
2778	 * Transfer the first page's pv entry for this mapping to the 2mpage's
2779	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
2780	 * a transfer avoids the possibility that get_pv_entry() calls
2781	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
2782	 * mappings that is being promoted.
2783	 */
2784	m = PHYS_TO_VM_PAGE(pa);
2785	va = va & ~L2_OFFSET;
2786	pv = pmap_pvh_remove(&m->md, pmap, va);
2787	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
2788	pvh = pa_to_pvh(pa);
2789	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2790	pvh->pv_gen++;
2791	/* Free the remaining NPTEPG - 1 pv entries. */
2792	va_last = va + L2_SIZE - PAGE_SIZE;
2793	do {
2794		m++;
2795		va += PAGE_SIZE;
2796		pmap_pvh_free(&m->md, pmap, va);
2797	} while (va < va_last);
2798}
2799
2800/*
2801 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2802 * single level 2 table entry to a single 2MB page mapping.  For promotion
2803 * to occur, two conditions must be met: (1) the 4KB page mappings must map
2804 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2805 * identical characteristics.
2806 */
2807static void
2808pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2809    struct rwlock **lockp)
2810{
2811	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
2812	vm_page_t mpte;
2813	vm_offset_t sva;
2814
2815	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2816
2817	sva = va & ~L2_OFFSET;
2818	firstl3 = pmap_l2_to_l3(l2, sva);
2819	newl2 = pmap_load(firstl3);
2820
2821	/* Check that the alignment is valid */
2822	if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
2823		atomic_add_long(&pmap_l2_p_failures, 1);
2824		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2825		    " in pmap %p", va, pmap);
2826		return;
2827	}
2828
2829	pa = newl2 + L2_SIZE - PAGE_SIZE;
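	/*
	 * pa holds the expected PTE for the last 4KB page in the 2MB
	 * region: the attributes of the first PTE with the physical address
	 * advanced accordingly.  The backwards walk below requires every
	 * remaining PTE to match exactly, verifying both physical
	 * contiguity and identical attributes.
	 */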
2830	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
2831		oldl3 = pmap_load(l3);
2832		if (oldl3 != pa) {
2833			atomic_add_long(&pmap_l2_p_failures, 1);
2834			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2835			    " in pmap %p", va, pmap);
2836			return;
2837		}
2838		pa -= PAGE_SIZE;
2839	}
2840
2841	/*
2842	 * Save the page table page in its current state until the L2
2843	 * mapping the superpage is demoted by pmap_demote_l2() or
2844	 * destroyed by pmap_remove_l3().
2845	 */
2846	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
2847	KASSERT(mpte >= vm_page_array &&
2848	    mpte < &vm_page_array[vm_page_array_size],
2849	    ("pmap_promote_l2: page table page is out of range"));
2850	KASSERT(mpte->pindex == pmap_l2_pindex(va),
2851	    ("pmap_promote_l2: page table page's pindex is wrong"));
2852	if (pmap_insert_pt_page(pmap, mpte)) {
2853		atomic_add_long(&pmap_l2_p_failures, 1);
2854		CTR2(KTR_PMAP,
2855		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
2856		    pmap);
2857		return;
2858	}
2859
2860	if ((newl2 & ATTR_SW_MANAGED) != 0)
2861		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
2862
2863	newl2 &= ~ATTR_DESCR_MASK;
2864	newl2 |= L2_BLOCK;
2865
2866	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
2867
2868	atomic_add_long(&pmap_l2_promotions, 1);
2869	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
2870		    pmap);
2871}
2872#endif /* VM_NRESERVLEVEL > 0 */
2873
2874/*
2875 *	Insert the given physical page (p) at
2876 *	the specified virtual address (v) in the
2877 *	target physical map with the protection requested.
2878 *
2879 *	If specified, the page will be wired down, meaning
2880 *	that the related pte can not be reclaimed.
2881 *
2882 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2883 *	or lose information.  That is, this routine must actually
2884 *	insert this page into the given map NOW.
2885 */
2886int
2887pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2888    u_int flags, int8_t psind __unused)
2889{
2890	struct rwlock *lock;
2891	pd_entry_t *pde;
2892	pt_entry_t new_l3, orig_l3;
2893	pt_entry_t *l2, *l3;
2894	pv_entry_t pv;
2895	vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
2896	vm_page_t mpte, om, l1_m, l2_m, l3_m;
2897	boolean_t nosleep;
2898	int lvl;
2899
2900	va = trunc_page(va);
2901	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
2902		VM_OBJECT_ASSERT_LOCKED(m->object);
2903	pa = VM_PAGE_TO_PHYS(m);
2904	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
2905	    L3_PAGE);
2906	if ((prot & VM_PROT_WRITE) == 0)
2907		new_l3 |= ATTR_AP(ATTR_AP_RO);
2908	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
2909		new_l3 |= ATTR_XN;
2910	if ((flags & PMAP_ENTER_WIRED) != 0)
2911		new_l3 |= ATTR_SW_WIRED;
2912	if ((va >> 63) == 0)
2913		new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
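	/*
	 * A clear top bit identifies a user (TTBR0) address, so the mapping
	 * is made user-accessible and privileged-execute-never above.
	 */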
2914
2915	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2916
2917	mpte = NULL;
2918
2919	lock = NULL;
2920	PMAP_LOCK(pmap);
2921
2922	pde = pmap_pde(pmap, va, &lvl);
2923	if (pde != NULL && lvl == 1) {
2924		l2 = pmap_l1_to_l2(pde, va);
2925		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
2926		    (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
2927		    &lock)) != NULL) {
2928			l3 = &l3[pmap_l3_index(va)];
2929			if (va < VM_MAXUSER_ADDRESS) {
2930				mpte = PHYS_TO_VM_PAGE(
2931				    pmap_load(l2) & ~ATTR_MASK);
2932				mpte->wire_count++;
2933			}
2934			goto havel3;
2935		}
2936	}
2937
2938	if (va < VM_MAXUSER_ADDRESS) {
2939		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2940		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2941		if (mpte == NULL && nosleep) {
2942			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2943			if (lock != NULL)
2944				rw_wunlock(lock);
2945			PMAP_UNLOCK(pmap);
2946			return (KERN_RESOURCE_SHORTAGE);
2947		}
2948		pde = pmap_pde(pmap, va, &lvl);
2949		KASSERT(pde != NULL,
2950		    ("pmap_enter: Invalid page entry, va: 0x%lx", va));
2951		KASSERT(lvl == 2,
2952		    ("pmap_enter: Invalid level %d", lvl));
2953
2954		l3 = pmap_l2_to_l3(pde, va);
2955	} else {
2956		/*
2957		 * If we get a level 2 pde it must point to a level 3 table;
2958		 * otherwise we will need to create the intermediate tables.
2959		 */
2960		if (lvl < 2) {
2961			switch (lvl) {
2962			default:
2963			case -1:
2964				/* Get the l0 pde to update */
2965				pde = pmap_l0(pmap, va);
2966				KASSERT(pde != NULL, ("..."));
2967
2968				l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2969				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2970				    VM_ALLOC_ZERO);
2971				if (l1_m == NULL)
2972					panic("pmap_enter: l1 pte_m == NULL");
2973				if ((l1_m->flags & PG_ZERO) == 0)
2974					pmap_zero_page(l1_m);
2975
2976				l1_pa = VM_PAGE_TO_PHYS(l1_m);
2977				pmap_load_store(pde, l1_pa | L0_TABLE);
2978				PTE_SYNC(pde);
2979				/* FALLTHROUGH */
2980			case 0:
2981				/* Get the l1 pde to update */
2982				pde = pmap_l1_to_l2(pde, va);
2983				KASSERT(pde != NULL, ("..."));
2984
2985				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2986				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2987				    VM_ALLOC_ZERO);
2988				if (l2_m == NULL)
2989					panic("pmap_enter: l2 pte_m == NULL");
2990				if ((l2_m->flags & PG_ZERO) == 0)
2991					pmap_zero_page(l2_m);
2992
2993				l2_pa = VM_PAGE_TO_PHYS(l2_m);
2994				pmap_load_store(pde, l2_pa | L1_TABLE);
2995				PTE_SYNC(pde);
2996				/* FALLTHROUGH */
2997			case 1:
2998				/* Get the l2 pde to update */
2999				pde = pmap_l1_to_l2(pde, va);
3000
3001				l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
3002				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3003				    VM_ALLOC_ZERO);
3004				if (l3_m == NULL)
3005					panic("pmap_enter: l3 pte_m == NULL");
3006				if ((l3_m->flags & PG_ZERO) == 0)
3007					pmap_zero_page(l3_m);
3008
3009				l3_pa = VM_PAGE_TO_PHYS(l3_m);
3010				pmap_load_store(pde, l3_pa | L2_TABLE);
3011				PTE_SYNC(pde);
3012				break;
3013			}
3014		}
3015		l3 = pmap_l2_to_l3(pde, va);
3016		pmap_invalidate_page(pmap, va);
3017	}
3018havel3:
3019
3020	om = NULL;
3021	orig_l3 = pmap_load(l3);
3022	opa = orig_l3 & ~ATTR_MASK;
3023
3024	/*
3025	 * Is the specified virtual address already mapped?
3026	 */
3027	if (pmap_l3_valid(orig_l3)) {
3028		/*
3029		 * Wiring change, just update stats. We don't worry about
3030		 * wiring PT pages as they remain resident as long as there
3031		 * are valid mappings in them. Hence, if a user page is wired,
3032		 * the PT page will be also.
3033		 */
3034		if ((flags & PMAP_ENTER_WIRED) != 0 &&
3035		    (orig_l3 & ATTR_SW_WIRED) == 0)
3036			pmap->pm_stats.wired_count++;
3037		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3038		    (orig_l3 & ATTR_SW_WIRED) != 0)
3039			pmap->pm_stats.wired_count--;
3040
3041		/*
3042		 * Remove the extra PT page reference.
3043		 */
3044		if (mpte != NULL) {
3045			mpte->wire_count--;
3046			KASSERT(mpte->wire_count > 0,
3047			    ("pmap_enter: missing reference to page table page,"
3048			     " va: 0x%lx", va));
3049		}
3050
3051		/*
3052		 * Has the physical page changed?
3053		 */
3054		if (opa == pa) {
3055			/*
3056			 * No, might be a protection or wiring change.
3057			 */
3058			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3059				new_l3 |= ATTR_SW_MANAGED;
3060				if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
3061				    ATTR_AP(ATTR_AP_RW)) {
3062					vm_page_aflag_set(m, PGA_WRITEABLE);
3063				}
3064			}
3065			goto validate;
3066		}
3067
3068		/* Flush the cache; there might be uncommitted data in it. */
3069		if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3))
3070			cpu_dcache_wb_range(va, L3_SIZE);
3071	} else {
3072		/*
3073		 * Increment the counters.
3074		 */
3075		if ((new_l3 & ATTR_SW_WIRED) != 0)
3076			pmap->pm_stats.wired_count++;
3077		pmap_resident_count_inc(pmap, 1);
3078	}
3079	/*
3080	 * Enter on the PV list if part of our managed memory.
3081	 */
3082	if ((m->oflags & VPO_UNMANAGED) == 0) {
3083		new_l3 |= ATTR_SW_MANAGED;
3084		pv = get_pv_entry(pmap, &lock);
3085		pv->pv_va = va;
3086		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3087		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3088		m->md.pv_gen++;
3089		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
3090			vm_page_aflag_set(m, PGA_WRITEABLE);
3091	}
3092
3093	/*
3094	 * Update the L3 entry.
3095	 */
3096	if (orig_l3 != 0) {
3097validate:
3098		orig_l3 = pmap_load(l3);
3099		opa = orig_l3 & ~ATTR_MASK;
3100
3101		if (opa != pa) {
3102			pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE);
3103			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3104				om = PHYS_TO_VM_PAGE(opa);
3105				if (pmap_page_dirty(orig_l3))
3106					vm_page_dirty(om);
3107				if ((orig_l3 & ATTR_AF) != 0)
3108					vm_page_aflag_set(om, PGA_REFERENCED);
3109				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3110				pmap_pvh_free(&om->md, pmap, va);
3111				if ((om->aflags & PGA_WRITEABLE) != 0 &&
3112				    TAILQ_EMPTY(&om->md.pv_list) &&
3113				    ((om->flags & PG_FICTITIOUS) != 0 ||
3114				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3115					vm_page_aflag_clear(om, PGA_WRITEABLE);
3116			}
3117		} else {
3118			pmap_load_store(l3, new_l3);
3119			PTE_SYNC(l3);
3120			pmap_invalidate_page(pmap, va);
3121			if (pmap_page_dirty(orig_l3) &&
3122			    (orig_l3 & ATTR_SW_MANAGED) != 0)
3123				vm_page_dirty(m);
3124		}
3125	} else {
3126		pmap_load_store(l3, new_l3);
3127	}
3128
3129	PTE_SYNC(l3);
3130	pmap_invalidate_page(pmap, va);
3131
3132	if (pmap != pmap_kernel()) {
3133		if (pmap == &curproc->p_vmspace->vm_pmap &&
3134		    (prot & VM_PROT_EXECUTE) != 0)
3135			cpu_icache_sync_range(va, PAGE_SIZE);
3136
3137#if VM_NRESERVLEVEL > 0
3138		if ((mpte == NULL || mpte->wire_count == NL3PG) &&
3139		    pmap_superpages_enabled() &&
3140		    (m->flags & PG_FICTITIOUS) == 0 &&
3141		    vm_reserv_level_iffullpop(m) == 0) {
3142			pmap_promote_l2(pmap, pde, va, &lock);
3143		}
3144#endif
3145	}
3146
3147	if (lock != NULL)
3148		rw_wunlock(lock);
3149	PMAP_UNLOCK(pmap);
3150	return (KERN_SUCCESS);
3151}
3152
3153/*
3154 * Maps a sequence of resident pages belonging to the same object.
3155 * The sequence begins with the given page m_start.  This page is
3156 * mapped at the given virtual address start.  Each subsequent page is
3157 * mapped at a virtual address that is offset from start by the same
3158 * amount as the page is offset from m_start within the object.  The
3159 * last page in the sequence is the page with the largest offset from
3160 * m_start that can be mapped at a virtual address less than the given
3161 * virtual address end.  Not every virtual page between start and end
3162 * is mapped; only those for which a resident page exists with the
3163 * corresponding offset from m_start are mapped.
3164 */
3165void
3166pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3167    vm_page_t m_start, vm_prot_t prot)
3168{
3169	struct rwlock *lock;
3170	vm_offset_t va;
3171	vm_page_t m, mpte;
3172	vm_pindex_t diff, psize;
3173
3174	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3175
3176	psize = atop(end - start);
3177	mpte = NULL;
3178	m = m_start;
3179	lock = NULL;
3180	PMAP_LOCK(pmap);
3181	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3182		va = start + ptoa(diff);
3183		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
3184		m = TAILQ_NEXT(m, listq);
3185	}
3186	if (lock != NULL)
3187		rw_wunlock(lock);
3188	PMAP_UNLOCK(pmap);
3189}
3190
3191/*
3192 * this code makes some *MAJOR* assumptions:
3193 * 1. Current pmap & pmap exists.
3194 * 2. Not wired.
3195 * 3. Read access.
3196 * 4. No page table pages.
3197 * but is *MUCH* faster than pmap_enter...
3198 */
3199
3200void
3201pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3202{
3203	struct rwlock *lock;
3204
3205	lock = NULL;
3206	PMAP_LOCK(pmap);
3207	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3208	if (lock != NULL)
3209		rw_wunlock(lock);
3210	PMAP_UNLOCK(pmap);
3211}
3212
3213static vm_page_t
3214pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3215    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3216{
3217	struct spglist free;
3218	pd_entry_t *pde;
3219	pt_entry_t *l2, *l3;
3220	vm_paddr_t pa;
3221	int lvl;
3222
3223	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3224	    (m->oflags & VPO_UNMANAGED) != 0,
3225	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3226	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3227
3228	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3229	/*
3230	 * In the case that a page table page is not
3231	 * resident, we are creating it here.
3232	 */
3233	if (va < VM_MAXUSER_ADDRESS) {
3234		vm_pindex_t l2pindex;
3235
3236		/*
3237		 * Calculate pagetable page index
3238		 */
3239		l2pindex = pmap_l2_pindex(va);
3240		if (mpte && (mpte->pindex == l2pindex)) {
3241			mpte->wire_count++;
3242		} else {
3243			/*
3244			 * Get the l2 entry
3245			 */
3246			pde = pmap_pde(pmap, va, &lvl);
3247
3248			/*
3249			 * If the page table page is mapped, we just increment
3250			 * the hold count, and activate it.  Otherwise, we
3251			 * attempt to allocate a page table page.  If this
3252			 * attempt fails, we don't retry.  Instead, we give up.
3253			 */
3254			if (lvl == 1) {
3255				l2 = pmap_l1_to_l2(pde, va);
3256				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
3257				    L2_BLOCK)
3258					return (NULL);
3259			}
3260			if (lvl == 2 && pmap_load(pde) != 0) {
3261				mpte =
3262				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3263				mpte->wire_count++;
3264			} else {
3265				/*
3266				 * Pass NULL instead of the PV list lock
3267				 * pointer, because we don't intend to sleep.
3268				 */
3269				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3270				if (mpte == NULL)
3271					return (mpte);
3272			}
3273		}
3274		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3275		l3 = &l3[pmap_l3_index(va)];
3276	} else {
3277		mpte = NULL;
3278		pde = pmap_pde(kernel_pmap, va, &lvl);
3279		KASSERT(pde != NULL,
3280		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
3281		     va));
3282		KASSERT(lvl == 2,
3283		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
3284		l3 = pmap_l2_to_l3(pde, va);
3285	}
3286
3287	if (pmap_load(l3) != 0) {
3288		if (mpte != NULL) {
3289			mpte->wire_count--;
3290			mpte = NULL;
3291		}
3292		return (mpte);
3293	}
3294
3295	/*
3296	 * Enter on the PV list if part of our managed memory.
3297	 */
3298	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3299	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3300		if (mpte != NULL) {
3301			SLIST_INIT(&free);
3302			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
3303				pmap_invalidate_page(pmap, va);
3304				pmap_free_zero_pages(&free);
3305			}
3306			mpte = NULL;
3307		}
3308		return (mpte);
3309	}
3310
3311	/*
3312	 * Increment counters
3313	 */
3314	pmap_resident_count_inc(pmap, 1);
3315
3316	pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
3317	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
3318	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
3319		pa |= ATTR_XN;
3320	else if (va < VM_MAXUSER_ADDRESS)
3321		pa |= ATTR_PXN;
3322
3323	/*
3324	 * Now validate mapping with RO protection
3325	 */
3326	if ((m->oflags & VPO_UNMANAGED) == 0)
3327		pa |= ATTR_SW_MANAGED;
3328	pmap_load_store(l3, pa);
3329	PTE_SYNC(l3);
3330	pmap_invalidate_page(pmap, va);
3331	return (mpte);
3332}
3333
3334/*
3335 * This routine would map large physical mmap regions into the
3336 * processor address space.  On arm64 it is currently a no-op
3337 * beyond the assertions below.
3338 */
3339void
3340pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3341    vm_pindex_t pindex, vm_size_t size)
3342{
3343
3344	VM_OBJECT_ASSERT_WLOCKED(object);
3345	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3346	    ("pmap_object_init_pt: non-device object"));
3347}
3348
3349/*
3350 *	Clear the wired attribute from the mappings for the specified range of
3351 *	addresses in the given pmap.  Every valid mapping within that range
3352 *	must have the wired attribute set.  In contrast, invalid mappings
3353 *	cannot have the wired attribute set, so they are ignored.
3354 *
3355 *	The wired attribute of the page table entry is not a hardware feature,
3356 *	so there is no need to invalidate any TLB entries.
3357 */
3358void
3359pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3360{
3361	vm_offset_t va_next;
3362	pd_entry_t *l0, *l1, *l2;
3363	pt_entry_t *l3;
3364
3365	PMAP_LOCK(pmap);
3366	for (; sva < eva; sva = va_next) {
3367		l0 = pmap_l0(pmap, sva);
3368		if (pmap_load(l0) == 0) {
3369			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3370			if (va_next < sva)
3371				va_next = eva;
3372			continue;
3373		}
3374
3375		l1 = pmap_l0_to_l1(l0, sva);
3376		if (pmap_load(l1) == 0) {
3377			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3378			if (va_next < sva)
3379				va_next = eva;
3380			continue;
3381		}
3382
3383		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3384		if (va_next < sva)
3385			va_next = eva;
3386
3387		l2 = pmap_l1_to_l2(l1, sva);
3388		if (pmap_load(l2) == 0)
3389			continue;
3390
3391		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3392			l3 = pmap_demote_l2(pmap, l2, sva);
3393			if (l3 == NULL)
3394				continue;
3395		}
3396		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3397		    ("pmap_unwire: Invalid l2 entry after demotion"));
3398
3399		if (va_next > eva)
3400			va_next = eva;
3401		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3402		    sva += L3_SIZE) {
3403			if (pmap_load(l3) == 0)
3404				continue;
3405			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
3406				panic("pmap_unwire: l3 %#jx is missing "
3407				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
3408
3409			/*
3410			 * ATTR_SW_WIRED must be cleared atomically.  Although the
3411			 * pmap lock synchronizes access to it, another processor
3412			 * could be updating the access flag or dirty state concurrently.
3413			 */
3414			atomic_clear_long(l3, ATTR_SW_WIRED);
3415			pmap->pm_stats.wired_count--;
3416		}
3417	}
3418	PMAP_UNLOCK(pmap);
3419}
3420
3421/*
3422 *	Copy the range specified by src_addr/len
3423 *	from the source map to the range dst_addr/len
3424 *	in the destination map.
3425 *
3426 *	This routine is only advisory and need not do anything.
3427 */
3428
3429void
3430pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3431    vm_offset_t src_addr)
3432{
3433}
3434
3435/*
3436 *	pmap_zero_page zeros the specified hardware page by mapping
3437 *	the page into KVM and using bzero to clear its contents.
3438 */
3439void
3440pmap_zero_page(vm_page_t m)
3441{
3442	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3443
3444	pagezero((void *)va);
3445}
3446
3447/*
3448 *	pmap_zero_page_area zeros the specified hardware page by mapping
3449 *	the page into KVM and using bzero to clear its contents.
3450 *
3451 *	off and size may not cover an area beyond a single hardware page.
3452 */
3453void
3454pmap_zero_page_area(vm_page_t m, int off, int size)
3455{
3456	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3457
3458	if (off == 0 && size == PAGE_SIZE)
3459		pagezero((void *)va);
3460	else
3461		bzero((char *)va + off, size);
3462}
3463
3464/*
3465 *	pmap_zero_page_idle zeros the specified hardware page by mapping
3466 *	the page into KVM and using bzero to clear its contents.  This
3467 *	is intended to be called from the vm_pagezero process only and
3468 *	outside of Giant.
3469 */
3470void
3471pmap_zero_page_idle(vm_page_t m)
3472{
3473	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3474
3475	pagezero((void *)va);
3476}
3477
3478/*
3479 *	pmap_copy_page copies the specified (machine independent)
3480 *	page by mapping the page into virtual memory and using
3481 *	bcopy to copy the page, one machine dependent page at a
3482 *	time.
3483 */
3484void
3485pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3486{
3487	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3488	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3489
3490	pagecopy((void *)src, (void *)dst);
3491}
3492
3493int unmapped_buf_allowed = 1;
3494
3495void
3496pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3497    vm_offset_t b_offset, int xfersize)
3498{
3499	void *a_cp, *b_cp;
3500	vm_page_t m_a, m_b;
3501	vm_paddr_t p_a, p_b;
3502	vm_offset_t a_pg_offset, b_pg_offset;
3503	int cnt;
3504
3505	while (xfersize > 0) {
3506		a_pg_offset = a_offset & PAGE_MASK;
3507		m_a = ma[a_offset >> PAGE_SHIFT];
3508		p_a = m_a->phys_addr;
3509		b_pg_offset = b_offset & PAGE_MASK;
3510		m_b = mb[b_offset >> PAGE_SHIFT];
3511		p_b = m_b->phys_addr;
3512		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3513		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3514		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3515			panic("!DMAP a %lx", p_a);
3516		} else {
3517			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3518		}
3519		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3520			panic("!DMAP b %lx", p_b);
3521		} else {
3522			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
3523		}
3524		bcopy(a_cp, b_cp, cnt);
3525		a_offset += cnt;
3526		b_offset += cnt;
3527		xfersize -= cnt;
3528	}
3529}
3530
3531vm_offset_t
3532pmap_quick_enter_page(vm_page_t m)
3533{
3534
3535	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
3536}
3537
3538void
3539pmap_quick_remove_page(vm_offset_t addr)
3540{
3541}
3542
3543/*
3544 * Returns true if the pmap's pv is one of the first
3545 * 16 pvs linked to from this page.  This count may
3546 * be changed upwards or downwards in the future; it
3547 * is only necessary that true be returned for a small
3548 * subset of pmaps for proper page aging.
3549 */
3550boolean_t
3551pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3552{
3553	struct md_page *pvh;
3554	struct rwlock *lock;
3555	pv_entry_t pv;
3556	int loops = 0;
3557	boolean_t rv;
3558
3559	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3560	    ("pmap_page_exists_quick: page %p is not managed", m));
3561	rv = FALSE;
3562	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3563	rw_rlock(lock);
3564	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3565		if (PV_PMAP(pv) == pmap) {
3566			rv = TRUE;
3567			break;
3568		}
3569		loops++;
3570		if (loops >= 16)
3571			break;
3572	}
3573	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
3574		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3575		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3576			if (PV_PMAP(pv) == pmap) {
3577				rv = TRUE;
3578				break;
3579			}
3580			loops++;
3581			if (loops >= 16)
3582				break;
3583		}
3584	}
3585	rw_runlock(lock);
3586	return (rv);
3587}
3588
3589/*
3590 *	pmap_page_wired_mappings:
3591 *
3592 *	Return the number of managed mappings to the given physical page
3593 *	that are wired.
3594 */
3595int
3596pmap_page_wired_mappings(vm_page_t m)
3597{
3598	struct rwlock *lock;
3599	struct md_page *pvh;
3600	pmap_t pmap;
3601	pt_entry_t *pte;
3602	pv_entry_t pv;
3603	int count, lvl, md_gen, pvh_gen;
3604
3605	if ((m->oflags & VPO_UNMANAGED) != 0)
3606		return (0);
3607	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3608	rw_rlock(lock);
3609restart:
3610	count = 0;
3611	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3612		pmap = PV_PMAP(pv);
3613		if (!PMAP_TRYLOCK(pmap)) {
3614			md_gen = m->md.pv_gen;
3615			rw_runlock(lock);
3616			PMAP_LOCK(pmap);
3617			rw_rlock(lock);
3618			if (md_gen != m->md.pv_gen) {
3619				PMAP_UNLOCK(pmap);
3620				goto restart;
3621			}
3622		}
3623		pte = pmap_pte(pmap, pv->pv_va, &lvl);
3624		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3625			count++;
3626		PMAP_UNLOCK(pmap);
3627	}
3628	if ((m->flags & PG_FICTITIOUS) == 0) {
3629		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3630		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3631			pmap = PV_PMAP(pv);
3632			if (!PMAP_TRYLOCK(pmap)) {
3633				md_gen = m->md.pv_gen;
3634				pvh_gen = pvh->pv_gen;
3635				rw_runlock(lock);
3636				PMAP_LOCK(pmap);
3637				rw_rlock(lock);
3638				if (md_gen != m->md.pv_gen ||
3639				    pvh_gen != pvh->pv_gen) {
3640					PMAP_UNLOCK(pmap);
3641					goto restart;
3642				}
3643			}
3644			pte = pmap_pte(pmap, pv->pv_va, &lvl);
3645			if (pte != NULL &&
3646			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3647				count++;
3648			PMAP_UNLOCK(pmap);
3649		}
3650	}
3651	rw_runlock(lock);
3652	return (count);
3653}
3654
3655/*
3656 * Destroy all managed, non-wired mappings in the given user-space
3657 * pmap.  This pmap cannot be active on any processor besides the
3658 * caller.
3659 *
3660 * This function cannot be applied to the kernel pmap.  Moreover, it
3661 * is not intended for general use.  It is only to be used during
3662 * process termination.  Consequently, it can be implemented in ways
3663 * that make it faster than pmap_remove().  First, it can more quickly
3664 * destroy mappings by iterating over the pmap's collection of PV
3665 * entries, rather than searching the page table.  Second, it doesn't
3666 * have to test and clear the page table entries atomically, because
3667 * no processor is currently accessing the user address space.  In
3668 * particular, a page table entry's dirty bit won't change state once
3669 * this function starts.
3670 */
3671void
3672pmap_remove_pages(pmap_t pmap)
3673{
3674	pd_entry_t *pde;
3675	pt_entry_t *pte, tpte;
3676	struct spglist free;
3677	vm_page_t m, ml3, mt;
3678	pv_entry_t pv;
3679	struct md_page *pvh;
3680	struct pv_chunk *pc, *npc;
3681	struct rwlock *lock;
3682	int64_t bit;
3683	uint64_t inuse, bitmask;
3684	int allfree, field, freed, idx, lvl;
3685	vm_paddr_t pa;
3686
3687	lock = NULL;
3688
3689	SLIST_INIT(&free);
3690	PMAP_LOCK(pmap);
3691	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3692		allfree = 1;
3693		freed = 0;
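		/*
		 * Walk every in-use pv entry in this chunk.  allfree is
		 * cleared if any entry must be kept (a wired mapping), in
		 * which case the chunk itself cannot be freed afterwards.
		 */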
3694		for (field = 0; field < _NPCM; field++) {
3695			inuse = ~pc->pc_map[field] & pc_freemask[field];
3696			while (inuse != 0) {
3697				bit = ffsl(inuse) - 1;
3698				bitmask = 1UL << bit;
3699				idx = field * 64 + bit;
3700				pv = &pc->pc_pventry[idx];
3701				inuse &= ~bitmask;
3702
3703				pde = pmap_pde(pmap, pv->pv_va, &lvl);
3704				KASSERT(pde != NULL,
3705				    ("Attempting to remove an unmapped page"));
3706
3707				switch (lvl) {
3708				case 1:
3709					pte = pmap_l1_to_l2(pde, pv->pv_va);
3710					tpte = pmap_load(pte);
3711					KASSERT((tpte & ATTR_DESCR_MASK) ==
3712					    L2_BLOCK,
3713					    ("Attempting to remove an invalid "
3714					    "block: %lx", tpte));
3715					tpte = pmap_load(pte);
3716					break;
3717				case 2:
3718					pte = pmap_l2_to_l3(pde, pv->pv_va);
3719					tpte = pmap_load(pte);
3720					KASSERT((tpte & ATTR_DESCR_MASK) ==
3721					    L3_PAGE,
3722					    ("Attempting to remove an invalid "
3723					     "page: %lx", tpte));
3724					break;
3725				default:
3726					panic(
3727					    "Invalid page directory level: %d",
3728					    lvl);
3729				}
3730
3731/*
3732 * We cannot remove wired pages from a process' mapping at this time
3733 */
3734				if (tpte & ATTR_SW_WIRED) {
3735					allfree = 0;
3736					continue;
3737				}
3738
3739				pa = tpte & ~ATTR_MASK;
3740
3741				m = PHYS_TO_VM_PAGE(pa);
3742				KASSERT(m->phys_addr == pa,
3743				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3744				    m, (uintmax_t)m->phys_addr,
3745				    (uintmax_t)tpte));
3746
3747				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3748				    m < &vm_page_array[vm_page_array_size],
3749				    ("pmap_remove_pages: bad pte %#jx",
3750				    (uintmax_t)tpte));
3751
3752				if (pmap_is_current(pmap)) {
3753					if (lvl == 2 &&
3754					    pmap_l3_valid_cacheable(tpte)) {
3755						cpu_dcache_wb_range(pv->pv_va,
3756						    L3_SIZE);
3757					} else if (lvl == 1 &&
3758					    pmap_pte_valid_cacheable(tpte)) {
3759						cpu_dcache_wb_range(pv->pv_va,
3760						    L2_SIZE);
3761					}
3762				}
3763				pmap_load_clear(pte);
3764				PTE_SYNC(pte);
3765				pmap_invalidate_page(pmap, pv->pv_va);
3766
3767				/*
3768				 * Update the vm_page_t clean/reference bits.
3769				 */
3770				if ((tpte & ATTR_AP_RW_BIT) ==
3771				    ATTR_AP(ATTR_AP_RW)) {
3772					switch (lvl) {
3773					case 1:
3774						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3775							vm_page_dirty(mt);
3776						break;
3777					case 2:
3778						vm_page_dirty(m);
3779						break;
3780					}
3781				}
3782
3783				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
3784
3785				/* Mark free */
3786				pc->pc_map[field] |= bitmask;
3787				switch (lvl) {
3788				case 1:
3789					pmap_resident_count_dec(pmap,
3790					    L2_SIZE / PAGE_SIZE);
3791					pvh = pa_to_pvh(tpte & ~ATTR_MASK);
3792					TAILQ_REMOVE(&pvh->pv_list, pv,
					    pv_next);
3793					pvh->pv_gen++;
3794					if (TAILQ_EMPTY(&pvh->pv_list)) {
3795						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3796							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
3797							    TAILQ_EMPTY(&mt->md.pv_list))
3798								vm_page_aflag_clear(mt, PGA_WRITEABLE);
3799					}
3800					ml3 = pmap_remove_pt_page(pmap,
3801					    pv->pv_va);
3802					if (ml3 != NULL) {
3803						pmap_resident_count_dec(pmap,
						    1);
3804						KASSERT(ml3->wire_count == NL3PG,
3805						    ("pmap_remove_pages: l3 page wire count error"));
3806						ml3->wire_count = 0;
3807						pmap_add_delayed_free_list(ml3,
3808						    &free, FALSE);
3809						atomic_subtract_int(
3810						    &vm_cnt.v_wire_count, 1);
3811					}
3812					break;
3813				case 2:
3814					pmap_resident_count_dec(pmap, 1);
3815					TAILQ_REMOVE(&m->md.pv_list, pv,
3816					    pv_next);
3817					m->md.pv_gen++;
3818					if ((m->aflags & PGA_WRITEABLE) != 0 &&
3819					    TAILQ_EMPTY(&m->md.pv_list) &&
3820					    (m->flags & PG_FICTITIOUS) == 0) {
3821						pvh = pa_to_pvh(
3822						    VM_PAGE_TO_PHYS(m));
3823						if (TAILQ_EMPTY(&pvh->pv_list))
3824							vm_page_aflag_clear(m,
3825							    PGA_WRITEABLE);
3826					}
3827					break;
3828				}
3829				pmap_unuse_l3(pmap, pv->pv_va, pmap_load(pde),
3830				    &free);
3831				freed++;
3832			}
3833		}
3834		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3835		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3836		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3837		if (allfree) {
3838			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3839			free_pv_chunk(pc);
3840		}
3841	}
3842	pmap_invalidate_all(pmap);
3843	if (lock != NULL)
3844		rw_wunlock(lock);
3845	PMAP_UNLOCK(pmap);
3846	pmap_free_zero_pages(&free);
3847}
3848
3849/*
3850 * This is used to check if a page has been accessed or modified.  Since
3851 * there is no hardware-managed dirty bit, a page is assumed to have been
3852 * modified if it is mapped read/write.
3853 */
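/*
 * For example, the wrappers later in this file use
 * pmap_page_test_mappings(m, FALSE, TRUE) to test for "modified" and
 * pmap_page_test_mappings(m, TRUE, FALSE) to test for "referenced"; a
 * mapping counts as modified when its access permission is
 * ATTR_AP(ATTR_AP_RW), and as referenced when ATTR_AF is set on a valid
 * entry.
 */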
3854static boolean_t
3855pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3856{
3857	struct rwlock *lock;
3858	pv_entry_t pv;
3859	struct md_page *pvh;
3860	pt_entry_t *pte, mask, value;
3861	pmap_t pmap;
3862	int lvl, md_gen, pvh_gen;
3863	boolean_t rv;
3864
3865	rv = FALSE;
3866	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3867	rw_rlock(lock);
3868restart:
3869	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3870		pmap = PV_PMAP(pv);
3871		if (!PMAP_TRYLOCK(pmap)) {
3872			md_gen = m->md.pv_gen;
3873			rw_runlock(lock);
3874			PMAP_LOCK(pmap);
3875			rw_rlock(lock);
3876			if (md_gen != m->md.pv_gen) {
3877				PMAP_UNLOCK(pmap);
3878				goto restart;
3879			}
3880		}
3881		pte = pmap_pte(pmap, pv->pv_va, &lvl);
3882		KASSERT(lvl == 3,
3883		    ("pmap_page_test_mappings: Invalid level %d", lvl));
3884		mask = 0;
3885		value = 0;
3886		if (modified) {
3887			mask |= ATTR_AP_RW_BIT;
3888			value |= ATTR_AP(ATTR_AP_RW);
3889		}
3890		if (accessed) {
3891			mask |= ATTR_AF | ATTR_DESCR_MASK;
3892			value |= ATTR_AF | L3_PAGE;
3893		}
3894		rv = (pmap_load(pte) & mask) == value;
3895		PMAP_UNLOCK(pmap);
3896		if (rv)
3897			goto out;
3898	}
3899	if ((m->flags & PG_FICTITIOUS) == 0) {
3900		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3901		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3902			pmap = PV_PMAP(pv);
3903			if (!PMAP_TRYLOCK(pmap)) {
3904				md_gen = m->md.pv_gen;
3905				pvh_gen = pvh->pv_gen;
3906				rw_runlock(lock);
3907				PMAP_LOCK(pmap);
3908				rw_rlock(lock);
3909				if (md_gen != m->md.pv_gen ||
3910				    pvh_gen != pvh->pv_gen) {
3911					PMAP_UNLOCK(pmap);
3912					goto restart;
3913				}
3914			}
3915			pte = pmap_pte(pmap, pv->pv_va, &lvl);
3916			KASSERT(lvl == 2,
3917			    ("pmap_page_test_mappings: Invalid level %d", lvl));
3918			mask = 0;
3919			value = 0;
3920			if (modified) {
3921				mask |= ATTR_AP_RW_BIT;
3922				value |= ATTR_AP(ATTR_AP_RW);
3923			}
3924			if (accessed) {
3925				mask |= ATTR_AF | ATTR_DESCR_MASK;
3926				value |= ATTR_AF | L2_BLOCK;
3927			}
3928			rv = (pmap_load(pte) & mask) == value;
3929			PMAP_UNLOCK(pmap);
3930			if (rv)
3931				goto out;
3932		}
3933	}
3934out:
3935	rw_runlock(lock);
3936	return (rv);
3937}
3938
3939/*
3940 *	pmap_is_modified:
3941 *
3942 *	Return whether or not the specified physical page was modified
3943 *	in any physical maps.
3944 */
3945boolean_t
3946pmap_is_modified(vm_page_t m)
3947{
3948
3949	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3950	    ("pmap_is_modified: page %p is not managed", m));
3951
3952	/*
3953	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3954	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
3955	 * is clear, no PTEs can be dirty.
3956	 */
3957	VM_OBJECT_ASSERT_WLOCKED(m->object);
3958	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3959		return (FALSE);
3960	return (pmap_page_test_mappings(m, FALSE, TRUE));
3961}
3962
3963/*
3964 *	pmap_is_prefaultable:
3965 *
3966 *	Return whether or not the specified virtual address is eligible
3967 *	for prefault.
3968 */
3969boolean_t
3970pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3971{
3972	pt_entry_t *pte;
3973	boolean_t rv;
3974	int lvl;
3975
3976	rv = FALSE;
3977	PMAP_LOCK(pmap);
3978	pte = pmap_pte(pmap, addr, &lvl);
3979	if (pte != NULL && pmap_load(pte) != 0) {
3980		rv = TRUE;
3981	}
3982	PMAP_UNLOCK(pmap);
3983	return (rv);
3984}
3985
3986/*
3987 *	pmap_is_referenced:
3988 *
3989 *	Return whether or not the specified physical page was referenced
3990 *	in any physical maps.
3991 */
3992boolean_t
3993pmap_is_referenced(vm_page_t m)
3994{
3995
3996	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3997	    ("pmap_is_referenced: page %p is not managed", m));
3998	return (pmap_page_test_mappings(m, TRUE, FALSE));
3999}
4000
4001/*
4002 * Clear the write and modified bits in each of the given page's mappings.
4003 */
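/*
 * Condensed sketch of the per-mapping downgrade performed below (the
 * retry loop in the function body; dirty-bit propagation and TLB
 * invalidation omitted):
 *
 *	do {
 *		oldpte = pmap_load(pte);
 *	} while ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW) &&
 *	    !atomic_cmpset_long(pte, oldpte, oldpte | ATTR_AP(ATTR_AP_RO)));
 *
 * Any 2MB mapping of the page is demoted first so that only the 4KB page
 * in question loses write access.
 */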
4004void
4005pmap_remove_write(vm_page_t m)
4006{
4007	struct md_page *pvh;
4008	pmap_t pmap;
4009	struct rwlock *lock;
4010	pv_entry_t next_pv, pv;
4011	pt_entry_t oldpte, *pte;
4012	vm_offset_t va;
4013	int lvl, md_gen, pvh_gen;
4014
4015	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4016	    ("pmap_remove_write: page %p is not managed", m));
4017
4018	/*
4019	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4020	 * set by another thread while the object is locked.  Thus,
4021	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4022	 */
4023	VM_OBJECT_ASSERT_WLOCKED(m->object);
4024	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4025		return;
4026	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4027	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4028	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4029retry_pv_loop:
4030	rw_wlock(lock);
4031	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4032		pmap = PV_PMAP(pv);
4033		if (!PMAP_TRYLOCK(pmap)) {
4034			pvh_gen = pvh->pv_gen;
4035			rw_wunlock(lock);
4036			PMAP_LOCK(pmap);
4037			rw_wlock(lock);
4038			if (pvh_gen != pvh->pv_gen) {
4039				PMAP_UNLOCK(pmap);
4040				rw_wunlock(lock);
4041				goto retry_pv_loop;
4042			}
4043		}
4044		va = pv->pv_va;
4045		pte = pmap_pte(pmap, pv->pv_va, &lvl);
4046		if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
4047			pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
4048			    &lock);
4049		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4050		    ("inconsistent pv lock %p %p for page %p",
4051		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4052		PMAP_UNLOCK(pmap);
4053	}
4054	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4055		pmap = PV_PMAP(pv);
4056		if (!PMAP_TRYLOCK(pmap)) {
4057			pvh_gen = pvh->pv_gen;
4058			md_gen = m->md.pv_gen;
4059			rw_wunlock(lock);
4060			PMAP_LOCK(pmap);
4061			rw_wlock(lock);
4062			if (pvh_gen != pvh->pv_gen ||
4063			    md_gen != m->md.pv_gen) {
4064				PMAP_UNLOCK(pmap);
4065				rw_wunlock(lock);
4066				goto retry_pv_loop;
4067			}
4068		}
4069		pte = pmap_pte(pmap, pv->pv_va, &lvl);
4070retry:
4071		oldpte = pmap_load(pte);
4072		if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
4073			if (!atomic_cmpset_long(pte, oldpte,
4074			    oldpte | ATTR_AP(ATTR_AP_RO)))
4075				goto retry;
4076			if ((oldpte & ATTR_AF) != 0)
4077				vm_page_dirty(m);
4078			pmap_invalidate_page(pmap, pv->pv_va);
4079		}
4080		PMAP_UNLOCK(pmap);
4081	}
4082	rw_wunlock(lock);
4083	vm_page_aflag_clear(m, PGA_WRITEABLE);
4084}
4085
4086static __inline boolean_t
4087safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
4088{
4089
4090	return (FALSE);
4091}
4092
4093/*
4094 *	pmap_ts_referenced:
4095 *
4096 *	Return a count of reference bits for a page, clearing those bits.
4097 *	It is not necessary for every reference bit to be cleared, but it
4098 *	is necessary that 0 only be returned when there are truly no
4099 *	reference bits set.
4100 *
4101 *	As an optimization, update the page's dirty field if a modified bit is
4102 *	found while counting reference bits.  This opportunistic update can be
4103 *	performed at low cost and can eliminate the need for some future calls
4104 *	to pmap_is_modified().  However, since this function stops after
4105 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4106 *	dirty pages.  Those dirty pages will only be detected by a future call
4107 *	to pmap_is_modified().
4108 */
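/*
 * Example of the superpage sampling used below: the single reference bit
 * of a 2MB mapping is only cleared (by demotion) when
 *
 *	(((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ (uintptr_t)pmap) &
 *	    (Ln_ENTRIES - 1)) == 0
 *
 * i.e. for roughly one of the 512 4KB pages covered, so repeated calls do
 * not discard the reference information for the entire superpage.
 */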
4109int
4110pmap_ts_referenced(vm_page_t m)
4111{
4112	struct md_page *pvh;
4113	pv_entry_t pv, pvf;
4114	pmap_t pmap;
4115	struct rwlock *lock;
4116	pd_entry_t *pde, tpde;
4117	pt_entry_t *pte, tpte;
4118	pt_entry_t *l3;
4119	vm_offset_t va;
4120	vm_paddr_t pa;
4121	int cleared, md_gen, not_cleared, lvl, pvh_gen;
4122	struct spglist free;
4123	bool demoted;
4124
4125	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4126	    ("pmap_ts_referenced: page %p is not managed", m));
4127	SLIST_INIT(&free);
4128	cleared = 0;
4129	pa = VM_PAGE_TO_PHYS(m);
4130	lock = PHYS_TO_PV_LIST_LOCK(pa);
4131	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4132	rw_wlock(lock);
4133retry:
4134	not_cleared = 0;
4135	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4136		goto small_mappings;
4137	pv = pvf;
4138	do {
4139		if (pvf == NULL)
4140			pvf = pv;
4141		pmap = PV_PMAP(pv);
4142		if (!PMAP_TRYLOCK(pmap)) {
4143			pvh_gen = pvh->pv_gen;
4144			rw_wunlock(lock);
4145			PMAP_LOCK(pmap);
4146			rw_wlock(lock);
4147			if (pvh_gen != pvh->pv_gen) {
4148				PMAP_UNLOCK(pmap);
4149				goto retry;
4150			}
4151		}
4152		va = pv->pv_va;
4153		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4154		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
4155		KASSERT(lvl == 1,
4156		    ("pmap_ts_referenced: invalid pde level %d", lvl));
4157		tpde = pmap_load(pde);
4158		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
4159		    ("pmap_ts_referenced: found an invalid l1 table"));
4160		pte = pmap_l1_to_l2(pde, pv->pv_va);
4161		tpte = pmap_load(pte);
4162		if (pmap_page_dirty(tpte)) {
4163			/*
4164			 * Although "tpte" is mapping a 2MB page, because
4165			 * this function is called at a 4KB page granularity,
4166			 * we only update the 4KB page under test.
4167			 */
4168			vm_page_dirty(m);
4169		}
4170		if ((tpte & ATTR_AF) != 0) {
4171			/*
4172			 * Since this reference bit is shared by 512 4KB
4173			 * pages, it should not be cleared every time it is
4174			 * tested.  Apply a simple "hash" function on the
4175			 * physical page number, the virtual superpage number,
4176			 * and the pmap address to select one 4KB page out of
4177			 * the 512 on which testing the reference bit will
4178			 * result in clearing that reference bit.  This
4179			 * function is designed to avoid the selection of the
4180			 * same 4KB page for every 2MB page mapping.
4181			 *
4182			 * On demotion, a mapping that hasn't been referenced
4183			 * is simply destroyed.  To avoid the possibility of a
4184			 * subsequent page fault on a demoted wired mapping,
4185			 * always leave its reference bit set.  Moreover,
4186			 * since the superpage is wired, the current state of
4187			 * its reference bit won't affect page replacement.
4188			 */
4189			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4190			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4191			    (tpte & ATTR_SW_WIRED) == 0) {
4192				if (safe_to_clear_referenced(pmap, tpte)) {
4193					/*
4194					 * TODO: We don't handle the access
4195					 * flag at all. We need to be able
4196					 * to set it in the exception handler.
4197					 */
4198					panic("ARM64TODO: "
4199					    "safe_to_clear_referenced\n");
4200				} else if (pmap_demote_l2_locked(pmap, pte,
4201				    pv->pv_va, &lock) != NULL) {
4202					demoted = true;
4203					va += VM_PAGE_TO_PHYS(m) -
4204					    (tpte & ~ATTR_MASK);
4205					l3 = pmap_l2_to_l3(pte, va);
4206					pmap_remove_l3(pmap, l3, va,
4207					    pmap_load(pte), NULL, &lock);
4208				} else
4209					demoted = true;
4210
4211				if (demoted) {
4212					/*
4213					 * The superpage mapping was removed
4214					 * entirely and therefore 'pv' is no
4215					 * longer valid.
4216					 */
4217					if (pvf == pv)
4218						pvf = NULL;
4219					pv = NULL;
4220				}
4221				cleared++;
4222				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4223				    ("inconsistent pv lock %p %p for page %p",
4224				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4225			} else
4226				not_cleared++;
4227		}
4228		PMAP_UNLOCK(pmap);
4229		/* Rotate the PV list if it has more than one entry. */
4230		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4231			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4232			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4233			pvh->pv_gen++;
4234		}
4235		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4236			goto out;
4237	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4238small_mappings:
4239	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4240		goto out;
4241	pv = pvf;
4242	do {
4243		if (pvf == NULL)
4244			pvf = pv;
4245		pmap = PV_PMAP(pv);
4246		if (!PMAP_TRYLOCK(pmap)) {
4247			pvh_gen = pvh->pv_gen;
4248			md_gen = m->md.pv_gen;
4249			rw_wunlock(lock);
4250			PMAP_LOCK(pmap);
4251			rw_wlock(lock);
4252			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4253				PMAP_UNLOCK(pmap);
4254				goto retry;
4255			}
4256		}
4257		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4258		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
4259		KASSERT(lvl == 2,
4260		    ("pmap_ts_referenced: invalid pde level %d", lvl));
4261		tpde = pmap_load(pde);
4262		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
4263		    ("pmap_ts_referenced: found an invalid l2 table"));
4264		pte = pmap_l2_to_l3(pde, pv->pv_va);
4265		tpte = pmap_load(pte);
4266		if (pmap_page_dirty(tpte))
4267			vm_page_dirty(m);
4268		if ((tpte & ATTR_AF) != 0) {
4269			if (safe_to_clear_referenced(pmap, tpte)) {
4270				/*
4271				 * TODO: We don't handle the access flag
4272				 * at all. We need to be able to set it in
4273				 * the exception handler.
4274				 */
4275				panic("ARM64TODO: safe_to_clear_referenced\n");
4276			} else if ((tpte & ATTR_SW_WIRED) == 0) {
4277				/*
4278				 * Wired pages cannot be paged out so
4279				 * doing accessed bit emulation for
4280				 * them is wasted effort. We do the
4281				 * hard work for unwired pages only.
4282				 */
4283				pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
4284				    &free, &lock);
4285				pmap_invalidate_page(pmap, pv->pv_va);
4286				cleared++;
4287				if (pvf == pv)
4288					pvf = NULL;
4289				pv = NULL;
4290				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4291				    ("inconsistent pv lock %p %p for page %p",
4292				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4293			} else
4294				not_cleared++;
4295		}
4296		PMAP_UNLOCK(pmap);
4297		/* Rotate the PV list if it has more than one entry. */
4298		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4299			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4300			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4301			m->md.pv_gen++;
4302		}
4303	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4304	    not_cleared < PMAP_TS_REFERENCED_MAX);
4305out:
4306	rw_wunlock(lock);
4307	pmap_free_zero_pages(&free);
4308	return (cleared + not_cleared);
4309}
4310
4311/*
4312 *	Apply the given advice to the specified range of addresses within the
4313 *	given pmap.  Depending on the advice, clear the referenced and/or
4314 *	modified flags in each mapping and set the mapped page's dirty field.
4315 */
4316void
4317pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4318{
4319}
4320
4321/*
4322 *	Clear the modify bits on the specified physical page.
4323 */
4324void
4325pmap_clear_modify(vm_page_t m)
4326{
4327
4328	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4329	    ("pmap_clear_modify: page %p is not managed", m));
4330	VM_OBJECT_ASSERT_WLOCKED(m->object);
4331	KASSERT(!vm_page_xbusied(m),
4332	    ("pmap_clear_modify: page %p is exclusive busied", m));
4333
4334	/*
4335	 * If the page is not PGA_WRITEABLE, then no PTEs can be dirty.
4336	 * If the object containing the page is locked and the page is not
4337	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4338	 */
4339	if ((m->aflags & PGA_WRITEABLE) == 0)
4340		return;
4341
4342	/* ARM64TODO: We lack support for tracking if a page is modified */
4343}
4344
4345void *
4346pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4347{
4348
4349	return ((void *)PHYS_TO_DMAP(pa));
4350}
4351
4352void
4353pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
4354{
4355}
4356
4357/*
4358 * Sets the memory attribute for the specified page.
4359 */
4360void
4361pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4362{
4363
4364	m->md.pv_memattr = ma;
4365
4366	/*
4367	 * If "m" is a normal page, update its direct mapping.  This update
4368	 * can be relied upon to perform any cache operations that are
4369	 * required for data coherence.
4370	 */
4371	if ((m->flags & PG_FICTITIOUS) == 0 &&
4372	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4373	    m->md.pv_memattr) != 0)
4374		panic("memory attribute change on the direct map failed");
4375}
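
/*
 * Hypothetical usage of pmap_page_set_memattr() above, e.g. to make a
 * page's direct mapping uncacheable before handing it to a device:
 *
 *	pmap_page_set_memattr(m, VM_MEMATTR_UNCACHEABLE);
 *
 * The particular attribute value is an example only; any vm_memattr_t
 * accepted by pmap_change_attr() may be used.
 */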
4376
4377/*
4378 * Changes the specified virtual address range's memory type to that given by
4379 * the parameter "mode".  The specified virtual address range must be
4380 * completely contained within either the direct map or the kernel map.  If
4381 * the virtual address range is contained within the kernel map, then the
4382 * memory type for each of the corresponding ranges of the direct map is also
4383 * changed.  (The corresponding ranges of the direct map are those ranges that
4384 * map the same physical pages as the specified virtual address range.)  These
4385 * changes to the direct map are necessary because the architecture leaves
4386 * the behavior undefined if two or more mappings to the same physical page
4387 * have different memory types.
4388 *
4389 * Returns zero if the change completed successfully, and either EINVAL or
4390 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4391 * of the virtual address range was not mapped, and ENOMEM is returned if
4392 * there was insufficient memory available to complete the change.  In the
4393 * latter case, the memory type may have been changed on some part of the
4394 * virtual address range or the direct map.
4395 */
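/*
 * A minimal usage sketch (hypothetical caller).  Note that, as implemented
 * here, only direct map addresses are accepted; see the VIRT_IN_DMAP()
 * check in pmap_change_attr_locked() below.
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(pa), size,
 *	    VM_MEMATTR_UNCACHEABLE);
 *	if (error != 0)
 *		return (error);
 *
 * On failure, error is EINVAL or ENOMEM, as described above.
 */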
4396static int
4397pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4398{
4399	int error;
4400
4401	PMAP_LOCK(kernel_pmap);
4402	error = pmap_change_attr_locked(va, size, mode);
4403	PMAP_UNLOCK(kernel_pmap);
4404	return (error);
4405}
4406
4407static int
4408pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4409{
4410	vm_offset_t base, offset, tmpva;
4411	pt_entry_t l3, *pte, *newpte;
4412	int lvl;
4413
4414	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4415	base = trunc_page(va);
4416	offset = va & PAGE_MASK;
4417	size = round_page(offset + size);
4418
4419	if (!VIRT_IN_DMAP(base))
4420		return (EINVAL);
4421
4422	for (tmpva = base; tmpva < base + size; ) {
4423		pte = pmap_pte(kernel_pmap, tmpva, &lvl);
4424		if (pte == NULL)
4425			return (EINVAL);
4426
4427		if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
4428			/*
4429			 * We already have the correct attribute,
4430			 * ignore this entry.
4431			 */
4432			switch (lvl) {
4433			default:
4434				panic("Invalid DMAP table level: %d\n", lvl);
4435			case 1:
4436				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4437				break;
4438			case 2:
4439				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4440				break;
4441			case 3:
4442				tmpva += PAGE_SIZE;
4443				break;
4444			}
4445		} else {
4446			/*
4447			 * Split the entry to a level 3 table, then
4448			 * set the new attribute.
4449			 */
4450			switch (lvl) {
4451			default:
4452				panic("Invalid DMAP table level: %d\n", lvl);
4453			case 1:
4454				newpte = pmap_demote_l1(kernel_pmap, pte,
4455				    tmpva & ~L1_OFFSET);
4456				if (newpte == NULL)
4457					return (EINVAL);
4458				pte = pmap_l1_to_l2(pte, tmpva);
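				/* FALLTHROUGH */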
4459			case 2:
4460				newpte = pmap_demote_l2(kernel_pmap, pte,
4461				    tmpva & ~L2_OFFSET);
4462				if (newpte == NULL)
4463					return (EINVAL);
4464				pte = pmap_l2_to_l3(pte, tmpva);
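				/* FALLTHROUGH */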
4465			case 3:
4466				/* Update the entry */
4467				l3 = pmap_load(pte);
4468				l3 &= ~ATTR_IDX_MASK;
4469				l3 |= ATTR_IDX(mode);
4470				if (mode == DEVICE_MEMORY)
4471					l3 |= ATTR_XN;
4472
4473				pmap_update_entry(kernel_pmap, pte, l3, tmpva,
4474				    PAGE_SIZE);
4475
4476				/*
4477				 * If moving to a non-cacheable entry flush
4478				 * the cache.
4479				 */
4480				if (mode == VM_MEMATTR_UNCACHEABLE)
4481					cpu_dcache_wbinv_range(tmpva, L3_SIZE);
4482
4483				break;
4484			}
4485			tmpva += PAGE_SIZE;
4486		}
4487	}
4488
4489	return (0);
4490}
4491
4492/*
4493 * Create an L2 table to map all addresses within an L1 mapping.
4494 */
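/*
 * Worked example of the demotion below: a 1GB L1 block that maps physical
 * address P is replaced by an L2 table whose i-th entry maps
 * P + i * L2_SIZE with the original attributes, for i = 0 .. Ln_ENTRIES - 1,
 * so the same range ends up covered by 512 2MB block entries.
 */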
4495static pt_entry_t *
4496pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
4497{
4498	pt_entry_t *l2, newl2, oldl1;
4499	vm_offset_t tmpl1;
4500	vm_paddr_t l2phys, phys;
4501	vm_page_t ml2;
4502	int i;
4503
4504	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4505	oldl1 = pmap_load(l1);
4506	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
4507	    ("pmap_demote_l1: Demoting a non-block entry"));
4508	KASSERT((va & L1_OFFSET) == 0,
4509	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
4510	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
4511	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
4512
4513	tmpl1 = 0;
4514	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
4515		tmpl1 = kva_alloc(PAGE_SIZE);
4516		if (tmpl1 == 0)
4517			return (NULL);
4518	}
4519
4520	if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
4521	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4522		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
4523		    " in pmap %p", va, pmap);
		/* Release the temporary mapping KVA on failure. */
		if (tmpl1 != 0)
			kva_free(tmpl1, PAGE_SIZE);
4524		return (NULL);
4525	}
4526
4527	l2phys = VM_PAGE_TO_PHYS(ml2);
4528	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
4529
4530	/* The address the range points at */
4531	phys = oldl1 & ~ATTR_MASK;
4532	/* The attributes from the old l1 table to be copied */
4533	newl2 = oldl1 & ATTR_MASK;
4534
4535	/* Create the new entries */
4536	for (i = 0; i < Ln_ENTRIES; i++) {
4537		l2[i] = newl2 | phys;
4538		phys += L2_SIZE;
4539	}
4540	cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE);
4541	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
4542	    ("Invalid l2 page (%lx != %lx)", l2[0],
4543	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
4544
4545	if (tmpl1 != 0) {
4546		pmap_kenter(tmpl1, PAGE_SIZE,
4547		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
4548		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
4549	}
4550
4551	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
4552
4553	if (tmpl1 != 0) {
4554		pmap_kremove(tmpl1);
4555		kva_free(tmpl1, PAGE_SIZE);
4556	}
4557
4558	return (l2);
4559}
4560
4561/*
4562 * Create an L3 table to map all addresses within an L2 mapping.
4563 */
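/*
 * Analogous to pmap_demote_l1() above: a 2MB L2 block becomes an L3 table
 * of Ln_ENTRIES 4KB page entries carrying the same attributes.  For
 * managed mappings the spare PV entries are reserved before the L2 entry
 * is replaced; see the comment ahead of reserve_pv_entries() below.
 */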
4564static pt_entry_t *
4565pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
4566    struct rwlock **lockp)
4567{
4568	pt_entry_t *l3, newl3, oldl2;
4569	vm_offset_t tmpl2;
4570	vm_paddr_t l3phys, phys;
4571	vm_page_t ml3;
4572	int i;
4573
4574	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4575	l3 = NULL;
4576	oldl2 = pmap_load(l2);
4577	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
4578	    ("pmap_demote_l2: Demoting a non-block entry"));
4579	KASSERT((va & L2_OFFSET) == 0,
4580	    ("pmap_demote_l2: Invalid virtual address %#lx", va));
4581
4582	tmpl2 = 0;
4583	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
4584		tmpl2 = kva_alloc(PAGE_SIZE);
4585		if (tmpl2 == 0)
4586			return (NULL);
4587	}
4588
4589	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
4590		ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
4591		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4592		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
4593		if (ml3 == NULL) {
4594			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
4595			    " in pmap %p", va, pmap);
4596			goto fail;
4597		}
4598		if (va < VM_MAXUSER_ADDRESS)
4599			pmap_resident_count_inc(pmap, 1);
4600	}
4601
4602	l3phys = VM_PAGE_TO_PHYS(ml3);
4603	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
4604
4605	/* The address the range points at */
4606	phys = oldl2 & ~ATTR_MASK;
4607	/* The attributes from the old l2 table to be copied */
4608	newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE;
4609
4610	/*
4611	 * If the page table page is new, initialize it.
4612	 */
4613	if (ml3->wire_count == 1) {
4614		for (i = 0; i < Ln_ENTRIES; i++) {
4615			l3[i] = newl3 | phys;
4616			phys += L3_SIZE;
4617		}
4618		cpu_dcache_wb_range((vm_offset_t)l3, PAGE_SIZE);
4619	}
4620	KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE),
4621	    ("Invalid l3 page (%lx != %lx)", l3[0],
4622	    (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE));
4623
4624	/*
4625	 * Map the temporary page so we don't lose access to the l2 table.
4626	 */
4627	if (tmpl2 != 0) {
4628		pmap_kenter(tmpl2, PAGE_SIZE,
4629		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
4630		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
4631	}
4632
4633	/*
4634	 * The spare PV entries must be reserved prior to demoting the
4635	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
4636	 * of the L2 and the PV lists will be inconsistent, which can result
4637	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
4638	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
4639	 * PV entry for the 2MB page mapping that is being demoted.
4640	 */
4641	if ((oldl2 & ATTR_SW_MANAGED) != 0)
4642		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
4643
4644	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
4645
4646	/*
4647	 * Demote the PV entry.
4648	 */
4649	if ((oldl2 & ATTR_SW_MANAGED) != 0)
4650		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
4651
4652	atomic_add_long(&pmap_l2_demotions, 1);
4653	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
4654	    " in pmap %p %lx", va, pmap, l3[0]);
4655
4656fail:
4657	if (tmpl2 != 0) {
4658		pmap_kremove(tmpl2);
4659		kva_free(tmpl2, PAGE_SIZE);
4660	}
4661
4662	return (l3);
4664}
4665
4666static pt_entry_t *
4667pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
4668{
4669	struct rwlock *lock;
4670	pt_entry_t *l3;
4671
4672	lock = NULL;
4673	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
4674	if (lock != NULL)
4675		rw_wunlock(lock);
4676	return (l3);
4677}
4678
4679/*
4680 * perform the pmap work for mincore
4681 */
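/*
 * For example (see the block and page cases below), a resident 2MB block
 * mapping with the dirty and access attributes yields
 * MINCORE_INCORE | MINCORE_SUPER | MINCORE_MODIFIED |
 * MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER,
 * while a plain resident 4KB page yields at least MINCORE_INCORE.
 */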
4682int
4683pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
4684{
4685	pd_entry_t *l1p, l1;
4686	pd_entry_t *l2p, l2;
4687	pt_entry_t *l3p, l3;
4688	vm_paddr_t pa;
4689	bool managed;
4690	int val;
4691
4692	PMAP_LOCK(pmap);
4693retry:
4694	pa = 0;
4695	val = 0;
4696	managed = false;
4697
4698	l1p = pmap_l1(pmap, addr);
4699	if (l1p == NULL) /* No l1 */
4700		goto done;
4701
4702	l1 = pmap_load(l1p);
4703	if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
4704		goto done;
4705
4706	if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
4707		pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
4708		managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4709		val = MINCORE_SUPER | MINCORE_INCORE;
4710		if (pmap_page_dirty(l1))
4711			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4712		if ((l1 & ATTR_AF) == ATTR_AF)
4713			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4714		goto done;
4715	}
4716
4717	l2p = pmap_l1_to_l2(l1p, addr);
4718	if (l2p == NULL) /* No l2 */
4719		goto done;
4720
4721	l2 = pmap_load(l2p);
4722	if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
4723		goto done;
4724
4725	if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4726		pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
4727		managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4728		val = MINCORE_SUPER | MINCORE_INCORE;
4729		if (pmap_page_dirty(l2))
4730			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4731		if ((l2 & ATTR_AF) == ATTR_AF)
4732			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4733		goto done;
4734	}
4735
4736	l3p = pmap_l2_to_l3(l2p, addr);
4737	if (l3p == NULL) /* No l3 */
4738		goto done;
4739
4740	l3 = pmap_load(l3p);
4741	if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
4742		goto done;
4743
4744	if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
4745		pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
4746		managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4747		val = MINCORE_INCORE;
4748		if (pmap_page_dirty(l3))
4749			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4750		if ((l3 & ATTR_AF) == ATTR_AF)
4751			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4752	}
4753
4754done:
4755	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4756	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
4757		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
4758		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
4759			goto retry;
4760	} else
4761		PA_UNLOCK_COND(*locked_pa);
4762	PMAP_UNLOCK(pmap);
4763
4764	return (val);
4765}
4766
4767void
4768pmap_activate(struct thread *td)
4769{
4770	pmap_t	pmap;
4771
4772	critical_enter();
4773	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4774	td->td_pcb->pcb_l0addr = vtophys(pmap->pm_l0);
4775	__asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l0addr));
4776	pmap_invalidate_all(pmap);
4777	critical_exit();
4778}
4779
4780void
4781pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
4782{
4783
4784	if (va >= VM_MIN_KERNEL_ADDRESS) {
4785		cpu_icache_sync_range(va, sz);
4786	} else {
4787		u_int len, offset;
4788		vm_paddr_t pa;
4789
4790		/* Find the length of data in this page to flush */
4791		offset = va & PAGE_MASK;
4792		len = imin(PAGE_SIZE - offset, sz);
4793
4794		while (sz != 0) {
4795			/* Extract the physical address & find it in the DMAP */
4796			pa = pmap_extract(pmap, va);
4797			if (pa != 0)
4798				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
4799
4800			/* Move to the next page */
4801			sz -= len;
4802			va += len;
4803			/* Set the length for the next iteration */
4804			len = imin(PAGE_SIZE, sz);
4805		}
4806	}
4807}
4808
4809int
4810pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
4811{
4812#ifdef SMP
4813	uint64_t par;
4814#endif
4815
4816	switch (ESR_ELx_EXCEPTION(esr)) {
4817	case EXCP_DATA_ABORT_L:
4818	case EXCP_DATA_ABORT:
4819		break;
4820	default:
4821		return (KERN_FAILURE);
4822	}
4823
4824#ifdef SMP
4825	PMAP_LOCK(pmap);
4826	switch (esr & ISS_DATA_DFSC_MASK) {
4827	case ISS_DATA_DFSC_TF_L0:
4828	case ISS_DATA_DFSC_TF_L1:
4829	case ISS_DATA_DFSC_TF_L2:
4830	case ISS_DATA_DFSC_TF_L3:
4831		/* Ask the MMU to check the address */
4832		if (pmap == kernel_pmap)
4833			par = arm64_address_translate_s1e1r(far);
4834		else
4835			par = arm64_address_translate_s1e0r(far);
4836
4837		/*
4838		 * If the translation was successful the address was invalid
4839		 * due to a break-before-make sequence. We can unlock and
4840		 * return success to the trap handler.
4841		 */
4842		if (PAR_SUCCESS(par)) {
4843			PMAP_UNLOCK(pmap);
4844			return (KERN_SUCCESS);
4845		}
4846		break;
4847	default:
4848		break;
4849	}
4850	PMAP_UNLOCK(pmap);
4851#endif
4852
4853	return (KERN_FAILURE);
4854}
4855
4856/*
4857 *	Increase the starting virtual address of the given mapping if a
4858 *	different alignment might result in more superpage mappings.
4859 */
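/*
 * Worked example, assuming 2MB superpages (L2_SIZE): if the object offset
 * within a superpage is 0x34000 and *addr currently has a different value
 * in its low L2_OFFSET bits, *addr is moved forward to the next address
 * whose offset within a 2MB region is also 0x34000, so that object pages
 * and virtual addresses line up on L2 block boundaries.
 */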
4860void
4861pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4862    vm_offset_t *addr, vm_size_t size)
4863{
4864	vm_offset_t superpage_offset;
4865
4866	if (size < L2_SIZE)
4867		return;
4868	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4869		offset += ptoa(object->pg_color);
4870	superpage_offset = offset & L2_OFFSET;
4871	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
4872	    (*addr & L2_OFFSET) == superpage_offset)
4873		return;
4874	if ((*addr & L2_OFFSET) < superpage_offset)
4875		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
4876	else
4877		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
4878}
4879
4880/**
4881 * Get the kernel virtual address of a set of physical pages. If there are
4882 * physical addresses not covered by the DMAP perform a transient mapping
4883 * that will be removed when calling pmap_unmap_io_transient.
4884 *
4885 * \param page        The pages the caller wishes to obtain the virtual
4886 *                    address on the kernel memory map.
4887 * \param vaddr       On return contains the kernel virtual memory address
4888 *                    of the pages passed in the page parameter.
4889 * \param count       Number of pages passed in.
4890 * \param can_fault   TRUE if the thread using the mapped pages can take
4891 *                    page faults, FALSE otherwise.
4892 *
4893 * \returns TRUE if the caller must call pmap_unmap_io_transient when
4894 *          finished or FALSE otherwise.
4895 *
4896 */
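/*
 * Hedged caller-side sketch (the surrounding buffer handling is an
 * assumption; only the two pmap calls come from this file):
 *
 *	vm_offset_t vaddr[count];
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(pages, vaddr, count, TRUE);
 *	...  access the pages through vaddr[0 .. count - 1]  ...
 *	if (mapped)
 *		pmap_unmap_io_transient(pages, vaddr, count, TRUE);
 */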
4897boolean_t
4898pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4899    boolean_t can_fault)
4900{
4901	vm_paddr_t paddr;
4902	boolean_t needs_mapping;
4903	int error, i;
4904
4905	/*
4906	 * Allocate any KVA space that we need, this is done in a separate
4907	 * loop to prevent calling vmem_alloc while pinned.
4908	 */
4909	needs_mapping = FALSE;
4910	for (i = 0; i < count; i++) {
4911		paddr = VM_PAGE_TO_PHYS(page[i]);
4912		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
4913			error = vmem_alloc(kernel_arena, PAGE_SIZE,
4914			    M_BESTFIT | M_WAITOK, &vaddr[i]);
4915			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
4916			needs_mapping = TRUE;
4917		} else {
4918			vaddr[i] = PHYS_TO_DMAP(paddr);
4919		}
4920	}
4921
4922	/* Exit early if everything is covered by the DMAP */
4923	if (!needs_mapping)
4924		return (FALSE);
4925
4926	if (!can_fault)
4927		sched_pin();
4928	for (i = 0; i < count; i++) {
4929		paddr = VM_PAGE_TO_PHYS(page[i]);
4930		if (!PHYS_IN_DMAP(paddr)) {
4931			panic(
4932			   "pmap_map_io_transient: TODO: Map out of DMAP data");
4933		}
4934	}
4935
4936	return (needs_mapping);
4937}
4938
4939void
4940pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4941    boolean_t can_fault)
4942{
4943	vm_paddr_t paddr;
4944	int i;
4945
4946	if (!can_fault)
4947		sched_unpin();
4948	for (i = 0; i < count; i++) {
4949		paddr = VM_PAGE_TO_PHYS(page[i]);
4950		if (!PHYS_IN_DMAP(paddr)) {
4951			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
4952		}
4953	}
4954}
4955