1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 *    notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 *    notice, this list of conditions and the following disclaimer in the
31 *    documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 *    must display the following acknowledgement:
34 *	This product includes software developed by the University of
35 *	California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 *    may be used to endorse or promote products derived from this software
38 *    without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52/*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 *    notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 *    notice, this list of conditions and the following disclaimer in the
69 *    documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84#include <sys/cdefs.h>
85/*
86 *	Manages physical address maps.
87 *
88 *	Since the information managed by this module is
89 *	also stored by the logical address mapping module,
90 *	this module may throw away valid virtual-to-physical
91 *	mappings at almost any time.  However, invalidations
92 *	of virtual-to-physical mappings must be done as
93 *	requested.
94 *
95 *	In order to cope with hardware architectures which
96 *	make virtual-to-physical map invalidates expensive,
97 *	this module may delay invalidate or reduced-protection
98 *	operations until such time as they are actually
99 *	necessary.  This module is given full information as
100 *	to which processors are currently using which maps,
101 *	and to when physical maps must be made correct.
102 */
103
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/asan.h>
108#include <sys/bitstring.h>
109#include <sys/bus.h>
110#include <sys/systm.h>
111#include <sys/kernel.h>
112#include <sys/ktr.h>
113#include <sys/limits.h>
114#include <sys/lock.h>
115#include <sys/malloc.h>
116#include <sys/mman.h>
117#include <sys/msan.h>
118#include <sys/msgbuf.h>
119#include <sys/mutex.h>
120#include <sys/physmem.h>
121#include <sys/proc.h>
122#include <sys/rangeset.h>
123#include <sys/rwlock.h>
124#include <sys/sbuf.h>
125#include <sys/sx.h>
126#include <sys/vmem.h>
127#include <sys/vmmeter.h>
128#include <sys/sched.h>
129#include <sys/sysctl.h>
130#include <sys/_unrhdr.h>
131#include <sys/smp.h>
132
133#include <vm/vm.h>
134#include <vm/vm_param.h>
135#include <vm/vm_kern.h>
136#include <vm/vm_page.h>
137#include <vm/vm_map.h>
138#include <vm/vm_object.h>
139#include <vm/vm_extern.h>
140#include <vm/vm_pageout.h>
141#include <vm/vm_pager.h>
142#include <vm/vm_phys.h>
143#include <vm/vm_radix.h>
144#include <vm/vm_reserv.h>
145#include <vm/vm_dumpset.h>
146#include <vm/uma.h>
147
148#include <machine/asan.h>
149#include <machine/machdep.h>
150#include <machine/md_var.h>
151#include <machine/pcb.h>
152
153#ifdef NUMA
154#define	PMAP_MEMDOM	MAXMEMDOM
155#else
156#define	PMAP_MEMDOM	1
157#endif
158
159#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
160#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
161
162#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
163#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
164#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
165#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
166
167#define	NUL0E		L0_ENTRIES
168#define	NUL1E		(NUL0E * NL1PG)
169#define	NUL2E		(NUL1E * NL2PG)
170
171#ifdef PV_STATS
172#define PV_STAT(x)	do { x ; } while (0)
173#define __pvused
174#else
175#define PV_STAT(x)	do { } while (0)
176#define __pvused	__unused
177#endif
178
179#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
180#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
181#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
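/*
 * For illustration: the pindex values above partition the page-table-page
 * index space by level.  L3 page-table pages use indices [0, NUL2E) (from
 * pmap_l2_pindex()), L2 pages use [NUL2E, NUL2E + NUL1E) (from
 * pmap_l1_pindex()), and L1 pages start at NUL2E + NUL1E (from
 * pmap_l0_pindex()), so page-table pages for different levels of the same
 * virtual address never share a pindex.
 */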
182
183#ifdef __ARM_FEATURE_BTI_DEFAULT
184#define	ATTR_KERN_GP		ATTR_S1_GP
185#else
186#define	ATTR_KERN_GP		0
187#endif
188#define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
189	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
190
191struct pmap_large_md_page {
192	struct rwlock   pv_lock;
193	struct md_page  pv_page;
194	/* Pad to a power of 2, see pmap_init_pv_table(). */
195	int		pv_pad[2];
196};
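/*
 * With the padding above each entry is 64 bytes (see the CTASSERT in
 * pmap_init_pv_table()), and there is roughly one entry per L2-sized
 * (superpage) range of physical memory, indexed by pmap_l2_pindex().
 */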
197
198__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
199#define pv_dummy pv_dummy_large.pv_page
200__read_mostly static struct pmap_large_md_page *pv_table;
201
202static struct pmap_large_md_page *
203_pa_to_pmdp(vm_paddr_t pa)
204{
205	struct vm_phys_seg *seg;
206
207	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
208		return ((struct pmap_large_md_page *)seg->md_first +
209		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
210	return (NULL);
211}
212
213static struct pmap_large_md_page *
214pa_to_pmdp(vm_paddr_t pa)
215{
216	struct pmap_large_md_page *pvd;
217
218	pvd = _pa_to_pmdp(pa);
219	if (pvd == NULL)
220		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
221	return (pvd);
222}
223
224static struct pmap_large_md_page *
225page_to_pmdp(vm_page_t m)
226{
227	struct vm_phys_seg *seg;
228
229	seg = &vm_phys_segs[m->segind];
230	return ((struct pmap_large_md_page *)seg->md_first +
231	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
232}
233
234#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
235#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
236
237#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
238	struct pmap_large_md_page *_pvd;			\
239	struct rwlock *_lock;					\
240	_pvd = _pa_to_pmdp(pa);					\
241	if (__predict_false(_pvd == NULL))			\
242		_lock = &pv_dummy_large.pv_lock;		\
243	else							\
244		_lock = &(_pvd->pv_lock);			\
245	_lock;							\
246})
247
248static struct rwlock *
249VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
250{
251	if ((m->flags & PG_FICTITIOUS) == 0)
252		return (&page_to_pmdp(m)->pv_lock);
253	else
254		return (&pv_dummy_large.pv_lock);
255}
256
257#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
258	struct rwlock **_lockp = (lockp);		\
259	struct rwlock *_new_lock = (new_lock);		\
260							\
261	if (_new_lock != *_lockp) {			\
262		if (*_lockp != NULL)			\
263			rw_wunlock(*_lockp);		\
264		*_lockp = _new_lock;			\
265		rw_wlock(*_lockp);			\
266	}						\
267} while (0)
268
269#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
270			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
271
272#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
273			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
274
275#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
276	struct rwlock **_lockp = (lockp);		\
277							\
278	if (*_lockp != NULL) {				\
279		rw_wunlock(*_lockp);			\
280		*_lockp = NULL;				\
281	}						\
282} while (0)
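/*
 * A sketch of the usual locking pattern for the macros above:
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... update m's PV list ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * Keeping the cursor across iterations lets a loop retain the write lock
 * while it stays within the region covered by a single pv_table entry,
 * instead of dropping and reacquiring it for every page.
 */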
283
284#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
285#define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
286
287/*
288 * The presence of this flag indicates that the mapping is writeable.
289 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
290 * it is dirty.  This flag may only be set on managed mappings.
291 *
292 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
293 * as a software managed bit.
294 */
295#define	ATTR_SW_DBM	ATTR_DBM
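/*
 * For example, a managed stage 1 mapping with ATTR_SW_DBM set is dirty when
 * its permissions are ATTR_S1_AP(ATTR_S1_AP_RW) and clean (but still
 * logically writeable) when ATTR_S1_AP(ATTR_S1_AP_RO) is set; this is the
 * test performed by pmap_pte_dirty() for stage 1 pmaps.
 */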
296
297struct pmap kernel_pmap_store;
298
299/* Used for mapping ACPI memory before VM is initialized */
300#define	PMAP_PREINIT_MAPPING_COUNT	32
301#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
302static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
303static int vm_initialized = 0;		/* No need to use pre-init maps when set */
304
305/*
306 * Reserve a few L2 blocks starting from the 'preinit_map_va' address.
307 * Always map an entire L2 block for simplicity.
308 * VA of L2 block = preinit_map_va + i * L2_SIZE
309 */
310static struct pmap_preinit_mapping {
311	vm_paddr_t	pa;
312	vm_offset_t	va;
313	vm_size_t	size;
314} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
315
316vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318vm_offset_t kernel_vm_end = 0;
319
320/*
321 * Data for the pv entry allocation mechanism.
322 */
323#ifdef NUMA
324static __inline int
325pc_to_domain(struct pv_chunk *pc)
326{
327	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
328}
329#else
330static __inline int
331pc_to_domain(struct pv_chunk *pc __unused)
332{
333	return (0);
334}
335#endif
336
337struct pv_chunks_list {
338	struct mtx pvc_lock;
339	TAILQ_HEAD(pch, pv_chunk) pvc_list;
340	int active_reclaims;
341} __aligned(CACHE_LINE_SIZE);
342
343struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
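/*
 * pv chunk pages are kept on per-domain lists; pc_to_domain() above maps a
 * chunk back to the domain of its backing memory via the direct map, so a
 * chunk can be placed on (and reclaimed from) the list for that domain.
 */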
344
345vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
346vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
347vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
348
349extern pt_entry_t pagetable_l0_ttbr1[];
350
351#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
352static vm_paddr_t physmap[PHYSMAP_SIZE];
353static u_int physmap_idx;
354
355static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
356    "VM/pmap parameters");
357
358#if PAGE_SIZE == PAGE_SIZE_4K
359#define	L1_BLOCKS_SUPPORTED	1
360#else
361/* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
362#define	L1_BLOCKS_SUPPORTED	0
363#endif
364
365#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
366
367/*
368 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
369 * it has currently allocated to a pmap, a cursor ("asid_next") to
370 * optimize its search for a free ASID in the bit vector, and an epoch number
371 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
372 * ASIDs that are not currently active on a processor.
373 *
374 * The current epoch number is always in the range [0, INT_MAX).  Negative
375 * numbers and INT_MAX are reserved for special cases that are described
376 * below.
377 */
378struct asid_set {
379	int asid_bits;
380	bitstr_t *asid_set;
381	int asid_set_size;
382	int asid_next;
383	int asid_epoch;
384	struct mtx asid_set_mutex;
385};
386
387static struct asid_set asids;
388static struct asid_set vmids;
389
390static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
391    "ASID allocator");
392SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
393    "The number of bits in an ASID");
394SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
395    "The last allocated ASID plus one");
396SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
397    "The current epoch number");
398
399static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
400SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
401    "The number of bits in a VMID");
402SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
403    "The last allocated VMID plus one");
404SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
405    "The current epoch number");
406
407void (*pmap_clean_stage2_tlbi)(void);
408void (*pmap_invalidate_vpipt_icache)(void);
409void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
410void (*pmap_stage2_invalidate_all)(uint64_t);
411
412/*
413 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
414 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
415 * dynamically allocated ASIDs have a non-negative epoch number.
416 *
417 * An invalid ASID is represented by -1.
418 *
419 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
420 * which indicates that an ASID should never be allocated to the pmap, and
421 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
422 * allocated when the pmap is next activated.
423 */
424#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
425					    ((u_long)(epoch) << 32)))
426#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
427#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
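/*
 * For example, COOKIE_FROM(42, 3) stores ASID 42 in the low 32 bits and
 * epoch 3 in the high 32 bits, so COOKIE_TO_ASID() and COOKIE_TO_EPOCH()
 * recover 42 and 3.  The special value COOKIE_FROM(-1, INT_MIN), given to
 * the kernel pmap in pmap_bootstrap(), decodes to ASID -1 with a negative
 * epoch and so is never confused with a dynamically allocated ASID.
 */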
428
429#define	TLBI_VA_SHIFT			12
430#define	TLBI_VA_MASK			((1ul << 44) - 1)
431#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
432#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
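/*
 * For example, the operand of the "tlbi va*" instructions carries VA[55:12]
 * in bits [43:0], which is what TLBI_VA() constructs; stepping a range by
 * TLBI_VA_L3_INCR advances one base page per operation.  For user pmaps the
 * ASID is ORed into the upper operand bits via ASID_TO_OPERAND() before the
 * instruction is issued (see pmap_s1_invalidate_page() and
 * pmap_s1_invalidate_range() below).
 */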
433
434static int __read_frequently superpages_enabled = 1;
435SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
436    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
437    "Are large page mappings enabled?");
438
439/*
440 * True when Branch Target Identification should be used by userspace. This
441 * allows pmap to mark pages as guarded with ATTR_S1_GP.
442 */
443__read_mostly static bool pmap_bti_support = false;
444
445/*
446 * Internal flags for pmap_enter()'s helper functions.
447 */
448#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
449#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
450
451TAILQ_HEAD(pv_chunklist, pv_chunk);
452
453static void	free_pv_chunk(struct pv_chunk *pc);
454static void	free_pv_chunk_batch(struct pv_chunklist *batch);
455static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
456static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
457static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
458static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
459static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
460		    vm_offset_t va);
461
462static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
463static bool pmap_activate_int(pmap_t pmap);
464static void pmap_alloc_asid(pmap_t pmap);
465static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
466    vm_prot_t prot, int mode, bool skip_unmapped);
467static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
468    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
469static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
470static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
471    vm_offset_t va, struct rwlock **lockp);
472static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
473static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
474static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
475    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
476static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
477    u_int flags, vm_page_t m, struct rwlock **lockp);
478static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
479    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
480static bool pmap_every_pte_zero(vm_paddr_t pa);
481static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
482    bool all_l3e_AF_set);
483static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
484static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
485    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
486static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
487    struct rwlock **lockp);
488static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
489static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
490    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
491static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
492    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
493static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
494    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
495    struct rwlock **lockp);
496static void pmap_reset_asid_set(pmap_t pmap);
497static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
498    vm_page_t m, struct rwlock **lockp);
499
500static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
501		struct rwlock **lockp);
502
503static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
504    struct spglist *free);
505static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
506static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
507    vm_offset_t va, vm_size_t size);
508static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
509
510static uma_zone_t pmap_bti_ranges_zone;
511static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
512static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
513static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
514static void *bti_dup_range(void *ctx, void *data);
515static void bti_free_range(void *ctx, void *node);
516static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
517static void pmap_bti_deassign_all(pmap_t pmap);
518
519/*
520 * These load the old table data and store the new value.
521 * They need to be atomic as the System MMU may write to the table at
522 * the same time as the CPU.
523 */
524#define	pmap_clear(table)		atomic_store_64(table, 0)
525#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
526#define	pmap_load(table)		(*table)
527#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
528#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
529#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
530#define	pmap_store(table, entry)	atomic_store_64(table, entry)
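/*
 * A sketch of a typical update of a live entry: read the old descriptor
 * with pmap_load(), then atomically replace it with pmap_load_store() or
 * clear it with pmap_load_clear(), and only afterwards invalidate the TLB.
 * Break-before-make sequences (see pmap_update_entry()) instead clear the
 * entry, invalidate, and then pmap_store() the new value.
 */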
531
532/********************/
533/* Inline functions */
534/********************/
535
536static __inline void
537pagecopy(void *s, void *d)
538{
539
540	memcpy(d, s, PAGE_SIZE);
541}
542
543static __inline pd_entry_t *
544pmap_l0(pmap_t pmap, vm_offset_t va)
545{
546
547	return (&pmap->pm_l0[pmap_l0_index(va)]);
548}
549
550static __inline pd_entry_t *
551pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
552{
553	pd_entry_t *l1;
554
555	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
556	return (&l1[pmap_l1_index(va)]);
557}
558
559static __inline pd_entry_t *
560pmap_l1(pmap_t pmap, vm_offset_t va)
561{
562	pd_entry_t *l0;
563
564	l0 = pmap_l0(pmap, va);
565	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
566		return (NULL);
567
568	return (pmap_l0_to_l1(l0, va));
569}
570
571static __inline pd_entry_t *
572pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
573{
574	pd_entry_t l1, *l2p;
575
576	l1 = pmap_load(l1p);
577
578	KASSERT(ADDR_IS_CANONICAL(va),
579	    ("%s: Address not in canonical form: %lx", __func__, va));
580	/*
581	 * The valid bit may be clear if pmap_update_entry() is concurrently
582	 * modifying the entry, so for KVA only the entry type may be checked.
583	 */
584	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
585	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
586	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
587	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
588	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
589	return (&l2p[pmap_l2_index(va)]);
590}
591
592static __inline pd_entry_t *
593pmap_l2(pmap_t pmap, vm_offset_t va)
594{
595	pd_entry_t *l1;
596
597	l1 = pmap_l1(pmap, va);
598	if (l1 == NULL || (pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
599		return (NULL);
600
601	return (pmap_l1_to_l2(l1, va));
602}
603
604static __inline pt_entry_t *
605pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
606{
607	pd_entry_t l2;
608	pt_entry_t *l3p;
609
610	l2 = pmap_load(l2p);
611
612	KASSERT(ADDR_IS_CANONICAL(va),
613	    ("%s: Address not in canonical form: %lx", __func__, va));
614	/*
615	 * The valid bit may be clear if pmap_update_entry() is concurrently
616	 * modifying the entry, so for KVA only the entry type may be checked.
617	 */
618	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
619	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
620	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
621	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
622	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
623	return (&l3p[pmap_l3_index(va)]);
624}
625
626/*
627 * Returns the lowest valid pde for a given virtual address.
628 * The next level may or may not point to a valid page or block.
629 */
630static __inline pd_entry_t *
631pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
632{
633	pd_entry_t *l0, *l1, *l2, desc;
634
635	l0 = pmap_l0(pmap, va);
636	desc = pmap_load(l0) & ATTR_DESCR_MASK;
637	if (desc != L0_TABLE) {
638		*level = -1;
639		return (NULL);
640	}
641
642	l1 = pmap_l0_to_l1(l0, va);
643	desc = pmap_load(l1) & ATTR_DESCR_MASK;
644	if (desc != L1_TABLE) {
645		*level = 0;
646		return (l0);
647	}
648
649	l2 = pmap_l1_to_l2(l1, va);
650	desc = pmap_load(l2) & ATTR_DESCR_MASK;
651	if (desc != L2_TABLE) {
652		*level = 1;
653		return (l1);
654	}
655
656	*level = 2;
657	return (l2);
658}
659
660/*
661 * Returns the lowest valid pte block or table entry for a given virtual
662 * address. If there are no valid entries, return NULL and set the level to
663 * the first invalid level.
664 */
665static __inline pt_entry_t *
666pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
667{
668	pd_entry_t *l1, *l2, desc;
669	pt_entry_t *l3;
670
671	l1 = pmap_l1(pmap, va);
672	if (l1 == NULL) {
673		*level = 0;
674		return (NULL);
675	}
676	desc = pmap_load(l1) & ATTR_DESCR_MASK;
677	if (desc == L1_BLOCK) {
678		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
679		*level = 1;
680		return (l1);
681	}
682
683	if (desc != L1_TABLE) {
684		*level = 1;
685		return (NULL);
686	}
687
688	l2 = pmap_l1_to_l2(l1, va);
689	desc = pmap_load(l2) & ATTR_DESCR_MASK;
690	if (desc == L2_BLOCK) {
691		*level = 2;
692		return (l2);
693	}
694
695	if (desc != L2_TABLE) {
696		*level = 2;
697		return (NULL);
698	}
699
700	*level = 3;
701	l3 = pmap_l2_to_l3(l2, va);
702	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
703		return (NULL);
704
705	return (l3);
706}
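/*
 * For example, after "pte = pmap_pte(pmap, va, &lvl)" a non-NULL pte with
 * lvl == 1 refers to an L1 block (1GB with 4KB granules), lvl == 2 to an
 * L2 block (2MB), and lvl == 3 to an L3 page; a NULL return leaves lvl set
 * to the level at which the walk found no valid entry.
 */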
707
708/*
709 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
710 * level that maps the specified virtual address, then a pointer to that entry
711 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
712 * and a diagnostic message is provided, in which case this function panics.
713 */
714static __always_inline pt_entry_t *
715pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
716{
717	pd_entry_t *l0p, *l1p, *l2p;
718	pt_entry_t desc, *l3p;
719	int walk_level __diagused;
720
721	KASSERT(level >= 0 && level < 4,
722	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
723	    level));
724	l0p = pmap_l0(pmap, va);
725	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
726	if (desc == L0_TABLE && level > 0) {
727		l1p = pmap_l0_to_l1(l0p, va);
728		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
729		if (desc == L1_BLOCK && level == 1) {
730			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
731			return (l1p);
732		}
733		if (desc == L1_TABLE && level > 1) {
734			l2p = pmap_l1_to_l2(l1p, va);
735			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
736			if (desc == L2_BLOCK && level == 2)
737				return (l2p);
738			else if (desc == L2_TABLE && level > 2) {
739				l3p = pmap_l2_to_l3(l2p, va);
740				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
741				if (desc == L3_PAGE && level == 3)
742					return (l3p);
743				else
744					walk_level = 3;
745			} else
746				walk_level = 2;
747		} else
748			walk_level = 1;
749	} else
750		walk_level = 0;
751	KASSERT(diag == NULL,
752	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
753	    diag, va, level, desc, walk_level));
754	return (NULL);
755}
756
757bool
758pmap_ps_enabled(pmap_t pmap)
759{
760	/*
761	 * Promotion requires a hypervisor call when the kernel is running
762	 * in EL1. To avoid this, disable superpage support on non-stage 1
763	 * pmaps for now.
764	 */
765	if (pmap->pm_stage != PM_STAGE1)
766		return (false);
767
768#ifdef KMSAN
769	/*
770	 * The break-before-make in pmap_update_entry() results in a situation
771	 * where a CPU may call into the KMSAN runtime while the entry is
772	 * invalid.  If the entry is used to map the current thread structure,
773	 * then the runtime will attempt to access unmapped memory.  Avoid this
774	 * by simply disabling superpage promotion for the kernel map.
775	 */
776	if (pmap == kernel_pmap)
777		return (false);
778#endif
779
780	return (superpages_enabled != 0);
781}
782
783bool
784pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
785    pd_entry_t **l2, pt_entry_t **l3)
786{
787	pd_entry_t *l0p, *l1p, *l2p;
788
789	if (pmap->pm_l0 == NULL)
790		return (false);
791
792	l0p = pmap_l0(pmap, va);
793	*l0 = l0p;
794
795	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
796		return (false);
797
798	l1p = pmap_l0_to_l1(l0p, va);
799	*l1 = l1p;
800
801	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
802		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
803		*l2 = NULL;
804		*l3 = NULL;
805		return (true);
806	}
807
808	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
809		return (false);
810
811	l2p = pmap_l1_to_l2(l1p, va);
812	*l2 = l2p;
813
814	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
815		*l3 = NULL;
816		return (true);
817	}
818
819	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
820		return (false);
821
822	*l3 = pmap_l2_to_l3(l2p, va);
823
824	return (true);
825}
826
827static __inline int
828pmap_l3_valid(pt_entry_t l3)
829{
830
831	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
832}
833
834CTASSERT(L1_BLOCK == L2_BLOCK);
835
836static pt_entry_t
837pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
838{
839	pt_entry_t val;
840
841	if (pmap->pm_stage == PM_STAGE1) {
842		val = ATTR_S1_IDX(memattr);
843		if (memattr == VM_MEMATTR_DEVICE)
844			val |= ATTR_S1_XN;
845		return (val);
846	}
847
848	val = 0;
849
850	switch (memattr) {
851	case VM_MEMATTR_DEVICE:
852		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
853		    ATTR_S2_XN(ATTR_S2_XN_ALL));
854	case VM_MEMATTR_UNCACHEABLE:
855		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
856	case VM_MEMATTR_WRITE_BACK:
857		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
858	case VM_MEMATTR_WRITE_THROUGH:
859		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
860	default:
861		panic("%s: invalid memory attribute %x", __func__, memattr);
862	}
863}
864
865static pt_entry_t
866pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
867{
868	pt_entry_t val;
869
870	val = 0;
871	if (pmap->pm_stage == PM_STAGE1) {
872		if ((prot & VM_PROT_EXECUTE) == 0)
873			val |= ATTR_S1_XN;
874		if ((prot & VM_PROT_WRITE) == 0)
875			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
876	} else {
877		if ((prot & VM_PROT_WRITE) != 0)
878			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
879		if ((prot & VM_PROT_READ) != 0)
880			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
881		if ((prot & VM_PROT_EXECUTE) == 0)
882			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
883	}
884
885	return (val);
886}
887
888/*
889 * Checks if the PTE is dirty.
890 */
891static inline int
892pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
893{
894
895	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
896
897	if (pmap->pm_stage == PM_STAGE1) {
898		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
899		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
900
901		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
902		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
903	}
904
905	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
906	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
907}
908
909static __inline void
910pmap_resident_count_inc(pmap_t pmap, int count)
911{
912
913	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
914	pmap->pm_stats.resident_count += count;
915}
916
917static __inline void
918pmap_resident_count_dec(pmap_t pmap, int count)
919{
920
921	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
922	KASSERT(pmap->pm_stats.resident_count >= count,
923	    ("pmap %p resident count underflow %ld %d", pmap,
924	    pmap->pm_stats.resident_count, count));
925	pmap->pm_stats.resident_count -= count;
926}
927
928static vm_paddr_t
929pmap_early_vtophys(vm_offset_t va)
930{
931	vm_paddr_t pa_page;
932
933	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
934	return (pa_page | (va & PAR_LOW_MASK));
935}
936
937/* State of the bootstrapped DMAP page tables */
938struct pmap_bootstrap_state {
939	pt_entry_t	*l1;
940	pt_entry_t	*l2;
941	pt_entry_t	*l3;
942	vm_offset_t	freemempos;
943	vm_offset_t	va;
944	vm_paddr_t	pa;
945	pt_entry_t	table_attrs;
946	u_int		l0_slot;
947	u_int		l1_slot;
948	u_int		l2_slot;
949	bool		dmap_valid;
950};
951
952/* The bootstrap state */
953static struct pmap_bootstrap_state bs_state = {
954	.l1 = NULL,
955	.l2 = NULL,
956	.l3 = NULL,
957	.table_attrs = TATTR_PXN_TABLE,
958	.l0_slot = L0_ENTRIES,
959	.l1_slot = Ln_ENTRIES,
960	.l2_slot = Ln_ENTRIES,
961	.dmap_valid = false,
962};
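/*
 * The l0_slot/l1_slot/l2_slot fields act as a cursor: they start out set to
 * one past the last valid index (L0_ENTRIES or Ln_ENTRIES), so the first
 * call to each pmap_bootstrap_l*_table() routine below sees a slot change
 * and allocates or looks up the corresponding next-level table.
 */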
963
964static void
965pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
966{
967	vm_paddr_t l1_pa;
968	pd_entry_t l0e;
969	u_int l0_slot;
970
971	/* Link the level 0 table to a level 1 table */
972	l0_slot = pmap_l0_index(state->va);
973	if (l0_slot != state->l0_slot) {
974		/*
975		 * Make sure we move from a low address to a high address
976		 * before the DMAP region is ready. This ensures we never
977		 * modify an existing mapping until we can map from a
978		 * physical address to a virtual address.
979		 */
980		MPASS(state->l0_slot < l0_slot ||
981		    state->l0_slot == L0_ENTRIES ||
982		    state->dmap_valid);
983
984		/* Reset lower levels */
985		state->l2 = NULL;
986		state->l3 = NULL;
987		state->l1_slot = Ln_ENTRIES;
988		state->l2_slot = Ln_ENTRIES;
989
990		/* Check the existing L0 entry */
991		state->l0_slot = l0_slot;
992		if (state->dmap_valid) {
993			l0e = pagetable_l0_ttbr1[l0_slot];
994			if ((l0e & ATTR_DESCR_VALID) != 0) {
995				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
996				l1_pa = PTE_TO_PHYS(l0e);
997				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
998				return;
999			}
1000		}
1001
1002		/* Create a new L0 table entry */
1003		state->l1 = (pt_entry_t *)state->freemempos;
1004		memset(state->l1, 0, PAGE_SIZE);
1005		state->freemempos += PAGE_SIZE;
1006
1007		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1008		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1009		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1010		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1011		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1012	}
1013	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1014}
1015
1016static void
1017pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1018{
1019	vm_paddr_t l2_pa;
1020	pd_entry_t l1e;
1021	u_int l1_slot;
1022
1023	/* Make sure there is a valid L0 -> L1 table */
1024	pmap_bootstrap_l0_table(state);
1025
1026	/* Link the level 1 table to a level 2 table */
1027	l1_slot = pmap_l1_index(state->va);
1028	if (l1_slot != state->l1_slot) {
1029		/* See pmap_bootstrap_l0_table for a description */
1030		MPASS(state->l1_slot < l1_slot ||
1031		    state->l1_slot == Ln_ENTRIES ||
1032		    state->dmap_valid);
1033
1034		/* Reset lower levels */
1035		state->l3 = NULL;
1036		state->l2_slot = Ln_ENTRIES;
1037
1038		/* Check the existing L1 entry */
1039		state->l1_slot = l1_slot;
1040		if (state->dmap_valid) {
1041			l1e = state->l1[l1_slot];
1042			if ((l1e & ATTR_DESCR_VALID) != 0) {
1043				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1044				l2_pa = PTE_TO_PHYS(l1e);
1045				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1046				return;
1047			}
1048		}
1049
1050		/* Create a new L1 table entry */
1051		state->l2 = (pt_entry_t *)state->freemempos;
1052		memset(state->l2, 0, PAGE_SIZE);
1053		state->freemempos += PAGE_SIZE;
1054
1055		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1056		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1057		MPASS(state->l1[l1_slot] == 0);
1058		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1059		    state->table_attrs | L1_TABLE);
1060	}
1061	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1062}
1063
1064static void
1065pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1066{
1067	vm_paddr_t l3_pa;
1068	pd_entry_t l2e;
1069	u_int l2_slot;
1070
1071	/* Make sure there is a valid L1 -> L2 table */
1072	pmap_bootstrap_l1_table(state);
1073
1074	/* Link the level 2 table to a level 3 table */
1075	l2_slot = pmap_l2_index(state->va);
1076	if (l2_slot != state->l2_slot) {
1077		/* See pmap_bootstrap_l0_table for a description */
1078		MPASS(state->l2_slot < l2_slot ||
1079		    state->l2_slot == Ln_ENTRIES ||
1080		    state->dmap_valid);
1081
1082		/* Check the existing L2 entry */
1083		state->l2_slot = l2_slot;
1084		if (state->dmap_valid) {
1085			l2e = state->l2[l2_slot];
1086			if ((l2e & ATTR_DESCR_VALID) != 0) {
1087				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1088				l3_pa = PTE_TO_PHYS(l2e);
1089				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1090				return;
1091			}
1092		}
1093
1094		/* Create a new L2 table entry */
1095		state->l3 = (pt_entry_t *)state->freemempos;
1096		memset(state->l3, 0, PAGE_SIZE);
1097		state->freemempos += PAGE_SIZE;
1098
1099		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1100		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1101		MPASS(state->l2[l2_slot] == 0);
1102		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1103		    state->table_attrs | L2_TABLE);
1104	}
1105	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1106}
1107
1108static void
1109pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1110{
1111	u_int l2_slot;
1112	bool first;
1113
1114	if ((physmap[i + 1] - state->pa) < L2_SIZE)
1115		return;
1116
1117	/* Make sure there is a valid L1 table */
1118	pmap_bootstrap_l1_table(state);
1119
1120	MPASS((state->va & L2_OFFSET) == 0);
1121	for (first = true;
1122	    state->va < DMAP_MAX_ADDRESS &&
1123	    (physmap[i + 1] - state->pa) >= L2_SIZE;
1124	    state->va += L2_SIZE, state->pa += L2_SIZE) {
1125		/*
1126		 * Stop if we are about to walk off the end of what the
1127		 * current L1 slot can address.
1128		 */
1129		if (!first && (state->pa & L1_OFFSET) == 0)
1130			break;
1131
1132		first = false;
1133		l2_slot = pmap_l2_index(state->va);
1134		MPASS((state->pa & L2_OFFSET) == 0);
1135		MPASS(state->l2[l2_slot] == 0);
1136		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1137		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1138		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
1139	}
1140	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1141}
1142
1143static void
1144pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1145{
1146	pt_entry_t contig;
1147	u_int l3_slot;
1148	bool first;
1149
1150	if (physmap[i + 1] - state->pa < L3_SIZE)
1151		return;
1152
1153	/* Make sure there is a valid L2 table */
1154	pmap_bootstrap_l2_table(state);
1155
1156	MPASS((state->va & L3_OFFSET) == 0);
1157	for (first = true, contig = 0;
1158	    state->va < DMAP_MAX_ADDRESS &&
1159	    physmap[i + 1] - state->pa >= L3_SIZE;
1160	    state->va += L3_SIZE, state->pa += L3_SIZE) {
1161		/*
1162		 * Stop if we are about to walk off the end of what the
1163		 * current L2 slot can address.
1164		 */
1165		if (!first && (state->pa & L2_OFFSET) == 0)
1166			break;
1167
1168		/*
1169		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1170		 * L3 pages, set the contiguous bit within each PTE so that
1171		 * the chunk can be cached using only one TLB entry.
1172		 */
1173		if ((state->pa & L3C_OFFSET) == 0) {
1174			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1175			    physmap[i + 1] - state->pa >= L3C_SIZE) {
1176				contig = ATTR_CONTIGUOUS;
1177			} else {
1178				contig = 0;
1179			}
1180		}
1181
1182		first = false;
1183		l3_slot = pmap_l3_index(state->va);
1184		MPASS((state->pa & L3_OFFSET) == 0);
1185		MPASS(state->l3[l3_slot] == 0);
1186		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1187		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1188		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1189	}
1190	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1191}
1192
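/*
 * Build the DMAP for each physmap region with the largest mappings that the
 * region's alignment allows.  For example, a region whose start is page-
 * but not L2-aligned is mapped with L3 pages up to the next L2 boundary,
 * then with L2 blocks (and, where supported, L1 blocks) through the aligned
 * middle, and finally with L2 blocks and L3 pages again for the unaligned
 * tail.
 */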
1193static void
1194pmap_bootstrap_dmap(vm_paddr_t min_pa)
1195{
1196	int i;
1197
1198	dmap_phys_base = min_pa & ~L1_OFFSET;
1199	dmap_phys_max = 0;
1200	dmap_max_addr = 0;
1201
1202	for (i = 0; i < (physmap_idx * 2); i += 2) {
1203		bs_state.pa = physmap[i] & ~L3_OFFSET;
1204		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1205
1206		/* Create L3 mappings at the start of the region */
1207		if ((bs_state.pa & L2_OFFSET) != 0)
1208			pmap_bootstrap_l3_page(&bs_state, i);
1209		MPASS(bs_state.pa <= physmap[i + 1]);
1210
1211		if (L1_BLOCKS_SUPPORTED) {
1212			/* Create L2 mappings at the start of the region */
1213			if ((bs_state.pa & L1_OFFSET) != 0)
1214				pmap_bootstrap_l2_block(&bs_state, i);
1215			MPASS(bs_state.pa <= physmap[i + 1]);
1216
1217			/* Create the main L1 block mappings */
1218			for (; bs_state.va < DMAP_MAX_ADDRESS &&
1219			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1220			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1221				/* Make sure there is a valid L1 table */
1222				pmap_bootstrap_l0_table(&bs_state);
1223				MPASS((bs_state.pa & L1_OFFSET) == 0);
1224				pmap_store(
1225				    &bs_state.l1[pmap_l1_index(bs_state.va)],
1226				    PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
1227				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1228				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1229			}
1230			MPASS(bs_state.pa <= physmap[i + 1]);
1231
1232			/* Create L2 mappings at the end of the region */
1233			pmap_bootstrap_l2_block(&bs_state, i);
1234		} else {
1235			while (bs_state.va < DMAP_MAX_ADDRESS &&
1236			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1237				pmap_bootstrap_l2_block(&bs_state, i);
1238			}
1239		}
1240		MPASS(bs_state.pa <= physmap[i + 1]);
1241
1242		/* Create L3 mappings at the end of the region */
1243		pmap_bootstrap_l3_page(&bs_state, i);
1244		MPASS(bs_state.pa == physmap[i + 1]);
1245
1246		if (bs_state.pa > dmap_phys_max) {
1247			dmap_phys_max = bs_state.pa;
1248			dmap_max_addr = bs_state.va;
1249		}
1250	}
1251
1252	cpu_tlb_flushID();
1253}
1254
1255static void
1256pmap_bootstrap_l2(vm_offset_t va)
1257{
1258	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1259
1260	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1261	bs_state.va = va;
1262
1263	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1264		pmap_bootstrap_l1_table(&bs_state);
1265}
1266
1267static void
1268pmap_bootstrap_l3(vm_offset_t va)
1269{
1270	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1271
1272	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
1273	bs_state.va = va;
1274
1275	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1276		pmap_bootstrap_l2_table(&bs_state);
1277}
1278
1279/*
1280 *	Bootstrap the system enough to run with virtual memory.
1281 */
1282void
1283pmap_bootstrap(vm_size_t kernlen)
1284{
1285	vm_offset_t dpcpu, msgbufpv;
1286	vm_paddr_t start_pa, pa, min_pa;
1287	int i;
1288
1289	/* Verify that the ASID is set through TTBR0. */
1290	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1291	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1292
1293	/* Set this early so we can use the pagetable walking functions */
1294	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1295	PMAP_LOCK_INIT(kernel_pmap);
1296	kernel_pmap->pm_l0_paddr =
1297	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1298	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1299	vm_radix_init(&kernel_pmap->pm_root);
1300	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1301	kernel_pmap->pm_stage = PM_STAGE1;
1302	kernel_pmap->pm_levels = 4;
1303	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1304	kernel_pmap->pm_asid_set = &asids;
1305
1306	/* Assume the address we were loaded to is a valid physical address */
1307	min_pa = pmap_early_vtophys(KERNBASE);
1308
1309	physmap_idx = physmem_avail(physmap, nitems(physmap));
1310	physmap_idx /= 2;
1311
1312	/*
1313	 * Find the minimum physical address. physmap is sorted,
1314	 * but may contain empty ranges.
1315	 */
1316	for (i = 0; i < physmap_idx * 2; i += 2) {
1317		if (physmap[i] == physmap[i + 1])
1318			continue;
1319		if (physmap[i] <= min_pa)
1320			min_pa = physmap[i];
1321	}
1322
1323	bs_state.freemempos = KERNBASE + kernlen;
1324	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1325
1326	/* Create a direct map region early so we can use it for pa -> va */
1327	pmap_bootstrap_dmap(min_pa);
1328	bs_state.dmap_valid = true;
1329	/*
1330	 * We only use PXN when we know nothing will be executed from it, e.g.
1331	 * the DMAP region.
1332	 */
1333	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1334
1335	start_pa = pa = pmap_early_vtophys(KERNBASE);
1336
1337	/*
1338	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1339	 * loader allocated the first and only l2 page table page used to map
1340	 * the kernel, preloaded files and module metadata.
1341	 */
1342	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1343	/* And the l3 tables for the early devmap */
1344	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1345
1346	cpu_tlb_flushID();
1347
1348#define alloc_pages(var, np)						\
1349	(var) = bs_state.freemempos;					\
1350	bs_state.freemempos += (np * PAGE_SIZE);			\
1351	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1352
1353	/* Allocate dynamic per-cpu area. */
1354	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1355	dpcpu_init((void *)dpcpu, 0);
1356
1357	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1358	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1359	msgbufp = (void *)msgbufpv;
1360
1361	/* Reserve some VA space for early BIOS/ACPI mapping */
1362	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1363
1364	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1365	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1366	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1367	kernel_vm_end = virtual_avail;
1368
1369	pa = pmap_early_vtophys(bs_state.freemempos);
1370
1371	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1372
1373	cpu_tlb_flushID();
1374}
1375
1376#if defined(KASAN) || defined(KMSAN)
1377static void
1378pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1379    vm_offset_t *vap, vm_offset_t eva)
1380{
1381	vm_paddr_t pa;
1382	vm_offset_t va;
1383	pd_entry_t *l2;
1384
1385	va = *vap;
1386	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1387	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1388		l2 = pmap_l2(kernel_pmap, va);
1389
1390		/*
1391		 * KASAN stack checking results in us having already allocated
1392		 * part of our shadow map, so we can just skip those segments.
1393		 */
1394		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1395			pa += L2_SIZE;
1396			continue;
1397		}
1398
1399		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1400		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1401		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1402	}
1403	*vap = va;
1404}
1405
1406/*
1407 * Finish constructing the initial shadow map:
1408 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1409 *   shadow map)
1410 * - Map that entire range using L2 superpages.
1411 */
1412static void
1413pmap_bootstrap_san1(vm_offset_t va, int scale)
1414{
1415	vm_offset_t eva;
1416	vm_paddr_t kernstart;
1417	int i;
1418
1419	kernstart = pmap_early_vtophys(KERNBASE);
1420
1421	/*
1422	 * Rebuild physmap one more time; we may have excluded more regions from
1423	 * allocation since pmap_bootstrap().
1424	 */
1425	bzero(physmap, sizeof(physmap));
1426	physmap_idx = physmem_avail(physmap, nitems(physmap));
1427	physmap_idx /= 2;
1428
1429	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1430
1431	/*
1432	 * Find a slot in the physmap large enough for what we need.  We try to put
1433	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1434	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1435	 */
1436	for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
1437		vm_paddr_t plow, phigh;
1438
1439		/* L2 mappings must be backed by memory that is L2-aligned */
1440		plow = roundup2(physmap[i], L2_SIZE);
1441		phigh = physmap[i + 1];
1442		if (plow >= phigh)
1443			continue;
1444		if (kernstart >= plow && kernstart < phigh)
1445			phigh = kernstart;
1446		if (phigh - plow >= L2_SIZE) {
1447			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1448			if (va >= eva)
1449				break;
1450		}
1451	}
1452	if (i < 0)
1453		panic("Could not find phys region for shadow map");
1454
1455	/*
1456	 * Done. We should now have a valid shadow address mapped for all KVA
1457	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1458	 * shadow accesses by the sanitizer runtime will succeed for this range.
1459	 * When the kernel virtual address range is later expanded, as will
1460	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1461	 * is handled by pmap_san_enter().
1462	 */
1463}
1464
1465void
1466pmap_bootstrap_san(void)
1467{
1468#ifdef KASAN
1469	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1470#else
1471	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1472	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1473	pd_entry_t *l0, *l1;
1474
1475	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1476		panic("initial kernel map is too large");
1477
1478	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1479	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1480	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1481	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1482	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1483	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1484	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1485
1486	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1487	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1488	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1489	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1490	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1491	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1492	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1493#endif
1494}
1495#endif
1496
1497/*
1498 *	Initialize a vm_page's machine-dependent fields.
1499 */
1500void
1501pmap_page_init(vm_page_t m)
1502{
1503
1504	TAILQ_INIT(&m->md.pv_list);
1505	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1506}
1507
1508static void
1509pmap_init_asids(struct asid_set *set, int bits)
1510{
1511	int i;
1512
1513	set->asid_bits = bits;
1514
1515	/*
1516	 * We may be too early in the overall initialization process to use
1517	 * bit_alloc().
1518	 */
1519	set->asid_set_size = 1 << set->asid_bits;
1520	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1521	    M_WAITOK | M_ZERO);
1522	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1523		bit_set(set->asid_set, i);
1524	set->asid_next = ASID_FIRST_AVAILABLE;
1525	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1526}
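/*
 * For example, when the CPU implements 16-bit ASIDs the set covers 65536
 * entries; the first ASID_FIRST_AVAILABLE ASIDs are marked allocated up
 * front so that the allocator never hands out the reserved values.
 */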
1527
1528static void
1529pmap_init_pv_table(void)
1530{
1531	struct vm_phys_seg *seg, *next_seg;
1532	struct pmap_large_md_page *pvd;
1533	vm_size_t s;
1534	int domain, i, j, pages;
1535
1536	/*
1537	 * We strongly depend on the size being a power of two, so the assert
1538	 * is overzealous. However, should the struct be resized to a
1539	 * different power of two, the code below needs to be revisited.
1540	 */
1541	CTASSERT((sizeof(*pvd) == 64));
1542
1543	/*
1544	 * Calculate the size of the array.
1545	 */
1546	s = 0;
1547	for (i = 0; i < vm_phys_nsegs; i++) {
1548		seg = &vm_phys_segs[i];
1549		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1550		    pmap_l2_pindex(seg->start);
1551		s += round_page(pages * sizeof(*pvd));
1552	}
1553	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1554	if (pv_table == NULL)
1555		panic("%s: kva_alloc failed\n", __func__);
1556
1557	/*
1558	 * Iterate physical segments to allocate domain-local memory for PV
1559	 * list headers.
1560	 */
1561	pvd = pv_table;
1562	for (i = 0; i < vm_phys_nsegs; i++) {
1563		seg = &vm_phys_segs[i];
1564		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565		    pmap_l2_pindex(seg->start);
1566		domain = seg->domain;
1567
1568		s = round_page(pages * sizeof(*pvd));
1569
1570		for (j = 0; j < s; j += PAGE_SIZE) {
1571			vm_page_t m = vm_page_alloc_noobj_domain(domain,
1572			    VM_ALLOC_ZERO);
1573			if (m == NULL)
1574				panic("failed to allocate PV table page");
1575			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1576		}
1577
1578		for (j = 0; j < s / sizeof(*pvd); j++) {
1579			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1580			TAILQ_INIT(&pvd->pv_page.pv_list);
1581			pvd++;
1582		}
1583	}
1584	pvd = &pv_dummy_large;
1585	memset(pvd, 0, sizeof(*pvd));
1586	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1587	TAILQ_INIT(&pvd->pv_page.pv_list);
1588
1589	/*
1590	 * Set pointers from vm_phys_segs to pv_table.
1591	 */
1592	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1593		seg = &vm_phys_segs[i];
1594		seg->md_first = pvd;
1595		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1596		    pmap_l2_pindex(seg->start);
1597
1598		/*
1599		 * If there is a following segment, and the final
1600		 * superpage of this segment and the initial superpage
1601		 * of the next segment are the same, then adjust the
1602		 * pv_table entry for that next segment down by one so
1603		 * that the pv_table entries will be shared.
1604		 */
1605		if (i + 1 < vm_phys_nsegs) {
1606			next_seg = &vm_phys_segs[i + 1];
1607			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1608			    pmap_l2_pindex(next_seg->start)) {
1609				pvd--;
1610			}
1611		}
1612	}
1613}
1614
1615/*
1616 *	Initialize the pmap module.
1617 *	Called by vm_init, to initialize any structures that the pmap
1618 *	system needs to map virtual memory.
1619 */
1620void
1621pmap_init(void)
1622{
1623	uint64_t mmfr1;
1624	int i, vmid_bits;
1625
1626	/*
1627	 * Are large page mappings enabled?
1628	 */
1629	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1630	if (superpages_enabled) {
1631		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1632		    ("pmap_init: can't assign to pagesizes[1]"));
1633		pagesizes[1] = L2_SIZE;
1634		if (L1_BLOCKS_SUPPORTED) {
1635			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1636			    ("pmap_init: can't assign to pagesizes[2]"));
1637			pagesizes[2] = L1_SIZE;
1638		}
1639	}
1640
1641	/*
1642	 * Initialize the ASID allocator.
1643	 */
1644	pmap_init_asids(&asids,
1645	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1646
1647	if (has_hyp()) {
1648		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1649		vmid_bits = 8;
1650
1651		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1652		    ID_AA64MMFR1_VMIDBits_16)
1653			vmid_bits = 16;
1654		pmap_init_asids(&vmids, vmid_bits);
1655	}
1656
1657	/*
1658	 * Initialize pv chunk lists.
1659	 */
1660	for (i = 0; i < PMAP_MEMDOM; i++) {
1661		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1662		    MTX_DEF);
1663		TAILQ_INIT(&pv_chunks[i].pvc_list);
1664	}
1665	pmap_init_pv_table();
1666
1667	vm_initialized = 1;
1668}
1669
1670static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1671    "2MB page mapping counters");
1672
1673static u_long pmap_l2_demotions;
1674SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1675    &pmap_l2_demotions, 0, "2MB page demotions");
1676
1677static u_long pmap_l2_mappings;
1678SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1679    &pmap_l2_mappings, 0, "2MB page mappings");
1680
1681static u_long pmap_l2_p_failures;
1682SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1683    &pmap_l2_p_failures, 0, "2MB page promotion failures");
1684
1685static u_long pmap_l2_promotions;
1686SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1687    &pmap_l2_promotions, 0, "2MB page promotions");
1688
1689static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1690    "L3C (64KB/2MB) page mapping counters");
1691
1692static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1693SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1694    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1695
1696static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1697SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1698    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1699
1700static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1701SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1702    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1703
1704static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1705SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1706    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1707
1708/*
1709 * If the given value for "final_only" is false, then any cached intermediate-
1710 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1711 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1712 * Otherwise, just the cached final-level entry is invalidated.
1713 */
1714static __inline void
1715pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1716{
1717	if (final_only)
1718		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1719	else
1720		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1721}
1722
1723static __inline void
1724pmap_s1_invalidate_user(uint64_t r, bool final_only)
1725{
1726	if (final_only)
1727		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1728	else
1729		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1730}
1731
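/*
 * The stage 1 invalidation routines below build the TLBI operand register
 * by hand: TLBI_VA() packs VA[55:12] into the low bits and, for user pmaps,
 * ASID_TO_OPERAND() places the ASID in the upper bits.  The "is" suffix on
 * the instructions used above broadcasts each invalidation to the inner
 * shareable domain, i.e., to all CPUs.
 */
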
1732/*
1733 * Invalidates any cached final- and optionally intermediate-level TLB entries
1734 * for the specified virtual address in the given virtual address space.
1735 */
1736static __inline void
1737pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1738{
1739	uint64_t r;
1740
1741	PMAP_ASSERT_STAGE1(pmap);
1742
1743	dsb(ishst);
1744	r = TLBI_VA(va);
1745	if (pmap == kernel_pmap) {
1746		pmap_s1_invalidate_kernel(r, final_only);
1747	} else {
1748		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1749		pmap_s1_invalidate_user(r, final_only);
1750	}
1751	dsb(ish);
1752	isb();
1753}
1754
1755static __inline void
1756pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1757{
1758	PMAP_ASSERT_STAGE2(pmap);
1759	MPASS(pmap_stage2_invalidate_range != NULL);
1760	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1761	    final_only);
1762}
1763
1764static __inline void
1765pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1766{
1767	if (pmap->pm_stage == PM_STAGE1)
1768		pmap_s1_invalidate_page(pmap, va, final_only);
1769	else
1770		pmap_s2_invalidate_page(pmap, va, final_only);
1771}
1772
1773/*
1774 * Invalidates any cached final- and optionally intermediate-level TLB entries
1775 * for the specified virtual address range in the given virtual address space.
1776 */
1777static __inline void
1778pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1779    bool final_only)
1780{
1781	uint64_t end, r, start;
1782
1783	PMAP_ASSERT_STAGE1(pmap);
1784
1785	dsb(ishst);
1786	if (pmap == kernel_pmap) {
1787		start = TLBI_VA(sva);
1788		end = TLBI_VA(eva);
1789		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1790			pmap_s1_invalidate_kernel(r, final_only);
1791	} else {
1792		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1793		start |= TLBI_VA(sva);
1794		end |= TLBI_VA(eva);
1795		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1796			pmap_s1_invalidate_user(r, final_only);
1797	}
1798	dsb(ish);
1799	isb();
1800}
1801
1802static __inline void
1803pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1804    bool final_only)
1805{
1806	PMAP_ASSERT_STAGE2(pmap);
1807	MPASS(pmap_stage2_invalidate_range != NULL);
1808	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1809}
1810
1811static __inline void
1812pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1813    bool final_only)
1814{
1815	if (pmap->pm_stage == PM_STAGE1)
1816		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1817	else
1818		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1819}
1820
1821/*
1822 * Invalidates all cached intermediate- and final-level TLB entries for the
1823 * given virtual address space.
1824 */
1825static __inline void
1826pmap_s1_invalidate_all(pmap_t pmap)
1827{
1828	uint64_t r;
1829
1830	PMAP_ASSERT_STAGE1(pmap);
1831
1832	dsb(ishst);
1833	if (pmap == kernel_pmap) {
1834		__asm __volatile("tlbi vmalle1is");
1835	} else {
1836		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1837		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1838	}
1839	dsb(ish);
1840	isb();
1841}
1842
1843static __inline void
1844pmap_s2_invalidate_all(pmap_t pmap)
1845{
1846	PMAP_ASSERT_STAGE2(pmap);
1847	MPASS(pmap_stage2_invalidate_all != NULL);
1848	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1849}
1850
1851static __inline void
1852pmap_invalidate_all(pmap_t pmap)
1853{
1854	if (pmap->pm_stage == PM_STAGE1)
1855		pmap_s1_invalidate_all(pmap);
1856	else
1857		pmap_s2_invalidate_all(pmap);
1858}
1859
1860/*
1861 *	Routine:	pmap_extract
1862 *	Function:
1863 *		Extract the physical page address associated
1864 *		with the given map/virtual_address pair.
1865 */
1866vm_paddr_t
1867pmap_extract(pmap_t pmap, vm_offset_t va)
1868{
1869	pt_entry_t *pte, tpte;
1870	vm_paddr_t pa;
1871	int lvl;
1872
1873	pa = 0;
1874	PMAP_LOCK(pmap);
1875	/*
1876	 * Find the block or page map for this virtual address. pmap_pte
1877	 * will return either a valid block/page entry, or NULL.
1878	 */
1879	pte = pmap_pte(pmap, va, &lvl);
1880	if (pte != NULL) {
1881		tpte = pmap_load(pte);
1882		pa = PTE_TO_PHYS(tpte);
1883		switch(lvl) {
1884		case 1:
1885			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1886			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1887			    ("pmap_extract: Invalid L1 pte found: %lx",
1888			    tpte & ATTR_DESCR_MASK));
1889			pa |= (va & L1_OFFSET);
1890			break;
1891		case 2:
1892			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1893			    ("pmap_extract: Invalid L2 pte found: %lx",
1894			    tpte & ATTR_DESCR_MASK));
1895			pa |= (va & L2_OFFSET);
1896			break;
1897		case 3:
1898			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1899			    ("pmap_extract: Invalid L3 pte found: %lx",
1900			    tpte & ATTR_DESCR_MASK));
1901			pa |= (va & L3_OFFSET);
1902			break;
1903		}
1904	}
1905	PMAP_UNLOCK(pmap);
1906	return (pa);
1907}
1908
1909/*
1910 *	Routine:	pmap_extract_and_hold
1911 *	Function:
1912 *		Atomically extract and hold the physical page
1913 *		with the given pmap and virtual address pair
1914 *		if that mapping permits the given protection.
1915 */
1916vm_page_t
1917pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1918{
1919	pt_entry_t *pte, tpte;
1920	vm_offset_t off;
1921	vm_page_t m;
1922	int lvl;
1923	bool use;
1924
1925	m = NULL;
1926	PMAP_LOCK(pmap);
1927	pte = pmap_pte(pmap, va, &lvl);
1928	if (pte != NULL) {
1929		tpte = pmap_load(pte);
1930
1931		KASSERT(lvl > 0 && lvl <= 3,
1932		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1933		/*
1934		 * Check that the pte is either a L3 page, or a L1 or L2 block
1935		 * entry. We can assume L1_BLOCK == L2_BLOCK.
1936		 */
1937		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1938		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1939		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1940		     tpte & ATTR_DESCR_MASK));
1941
1942		use = false;
1943		if ((prot & VM_PROT_WRITE) == 0)
1944			use = true;
1945		else if (pmap->pm_stage == PM_STAGE1 &&
1946		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1947			use = true;
1948		else if (pmap->pm_stage == PM_STAGE2 &&
1949		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1950		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1951			use = true;
1952
1953		if (use) {
1954			switch (lvl) {
1955			case 1:
1956				off = va & L1_OFFSET;
1957				break;
1958			case 2:
1959				off = va & L2_OFFSET;
1960				break;
1961			case 3:
1962			default:
1963				off = 0;
1964			}
1965			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
1966			if (m != NULL && !vm_page_wire_mapped(m))
1967				m = NULL;
1968		}
1969	}
1970	PMAP_UNLOCK(pmap);
1971	return (m);
1972}
1973
1974/*
1975 * Walks the page tables to translate a kernel virtual address to a
1976 * physical address. Returns true if the kva is valid and stores the
1977 * physical address in pa if it is not NULL.
1978 *
1979 * See the comment above data_abort() for the rationale for specifying
1980 * NO_PERTHREAD_SSP here.
1981 */
1982bool NO_PERTHREAD_SSP
1983pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1984{
1985	pt_entry_t *pte, tpte;
1986	register_t intr;
1987	uint64_t par;
1988
1989	/*
1990	 * Disable interrupts so we don't get interrupted between asking
1991	 * for address translation, and getting the result back.
1992	 * for the address translation and getting the result back.
1993	intr = intr_disable();
1994	par = arm64_address_translate_s1e1r(va);
1995	intr_restore(intr);
1996
1997	if (PAR_SUCCESS(par)) {
1998		if (pa != NULL)
1999			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2000		return (true);
2001	}
2002
2003	/*
2004	 * Fall back to walking the page table. The address translation
2005	 * instruction may fail when the page is in a break-before-make
2006	 * sequence.  Because such a sequence only clears the valid bit, we
2007	 * can still walk the page table to find the physical address.
2008	 */
2009
2010	pte = pmap_l1(kernel_pmap, va);
2011	if (pte == NULL)
2012		return (false);
2013
2014	/*
2015	 * A concurrent pmap_update_entry() will clear the entry's valid bit
2016	 * but leave the rest of the entry unchanged.  Therefore, we treat a
2017	 * non-zero entry as being valid, and we ignore the valid bit when
2018	 * determining whether the entry maps a block, page, or table.
2019	 */
2020	tpte = pmap_load(pte);
2021	if (tpte == 0)
2022		return (false);
2023	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2024		if (pa != NULL)
2025			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2026		return (true);
2027	}
2028	pte = pmap_l1_to_l2(&tpte, va);
2029	tpte = pmap_load(pte);
2030	if (tpte == 0)
2031		return (false);
2032	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2033		if (pa != NULL)
2034			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2035		return (true);
2036	}
2037	pte = pmap_l2_to_l3(&tpte, va);
2038	tpte = pmap_load(pte);
2039	if (tpte == 0)
2040		return (false);
2041	if (pa != NULL)
2042		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2043	return (true);
2044}
2045
2046/*
2047 *	Routine:	pmap_kextract
2048 *	Function:
2049 *		Extract the physical page address associated with the given kernel
2050 *		virtual address.
2051 */
2052vm_paddr_t
2053pmap_kextract(vm_offset_t va)
2054{
2055	vm_paddr_t pa;
2056
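	/*
	 * Addresses in the direct map are translated arithmetically;
	 * anything else goes through pmap_klookup(), which first tries the
	 * AT instruction and then falls back to a software walk.
	 */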
2057	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2058		return (DMAP_TO_PHYS(va));
2059
2060	if (pmap_klookup(va, &pa) == false)
2061		return (0);
2062	return (pa);
2063}
2064
2065/***************************************************
2066 * Low level mapping routines.....
2067 ***************************************************/
2068
2069void
2070pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2071{
2072	pd_entry_t *pde;
2073	pt_entry_t attr, old_l3e, *pte;
2074	vm_offset_t va;
2075	vm_page_t mpte;
2076	int error, lvl;
2077
2078	KASSERT((pa & L3_OFFSET) == 0,
2079	    ("pmap_kenter: Invalid physical address"));
2080	KASSERT((sva & L3_OFFSET) == 0,
2081	    ("pmap_kenter: Invalid virtual address"));
2082	KASSERT((size & PAGE_MASK) == 0,
2083	    ("pmap_kenter: Mapping is not page-sized"));
2084
2085	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2086	    ATTR_KERN_GP | ATTR_S1_IDX(mode);
2087	old_l3e = 0;
2088	va = sva;
2089	while (size != 0) {
2090		pde = pmap_pde(kernel_pmap, va, &lvl);
2091		KASSERT(pde != NULL,
2092		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2093		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2094
2095		/*
2096		 * If we have an aligned, contiguous chunk of L2_SIZE, try
2097		 * to create an L2_BLOCK mapping.
2098		 */
2099		if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2100		    (pa & L2_OFFSET) == 0 && vm_initialized) {
2101			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2102			KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2103			    ("pmap_kenter: Unexpected mapping"));
2104			PMAP_LOCK(kernel_pmap);
2105			error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2106			    false);
2107			if (error == 0) {
2108				attr &= ~ATTR_CONTIGUOUS;
2109
2110				/*
2111				 * Although the page table page "mpte" should
2112				 * be devoid of mappings, the TLB might hold
2113				 * intermediate entries that reference it, so
2114				 * we perform a single-page invalidation.
2115				 */
2116				pmap_update_entry(kernel_pmap, pde,
2117				    PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2118				    PAGE_SIZE);
2119			}
2120			PMAP_UNLOCK(kernel_pmap);
2121			if (error == 0) {
2122				va += L2_SIZE;
2123				pa += L2_SIZE;
2124				size -= L2_SIZE;
2125				continue;
2126			}
2127		}
2128
2129		/*
2130		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2131		 * L3 pages, set the contiguous bit within each PTE so that
2132		 * the chunk can be cached using only one TLB entry.
2133		 */
2134		if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2135			if (size >= L3C_SIZE)
2136				attr |= ATTR_CONTIGUOUS;
2137			else
2138				attr &= ~ATTR_CONTIGUOUS;
2139		}
2140
2141		pte = pmap_l2_to_l3(pde, va);
2142		old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2143		    L3_PAGE);
2144
2145		va += PAGE_SIZE;
2146		pa += PAGE_SIZE;
2147		size -= PAGE_SIZE;
2148	}
2149	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2150		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2151	else {
2152		/*
2153		 * Because the old entries were invalid and the new mappings
2154		 * are not executable, an isb is not required.
2155		 */
2156		dsb(ishst);
2157	}
2158}
2159
2160void
2161pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2162{
2163
2164	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2165}
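
/*
 * Usage sketch (hypothetical caller, not taken from this file): map "size"
 * bytes of device registers at the page-aligned physical address "pa" and
 * later tear the mapping down again, with the KVA assumed to come from
 * kva_alloc():
 *
 *	va = kva_alloc(size);
 *	pmap_kenter_device(va, size, pa);
 *	...
 *	pmap_kremove_device(va, size);
 *	kva_free(va, size);
 */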
2166
2167/*
2168 * Remove a page from the kernel pagetables.
2169 */
2170void
2171pmap_kremove(vm_offset_t va)
2172{
2173	pt_entry_t *pte;
2174
2175	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2176	KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2177	    ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2178	pmap_clear(pte);
2179	pmap_s1_invalidate_page(kernel_pmap, va, true);
2180}
2181
2182/*
2183 * Remove the specified range of mappings from the kernel address space.
2184 *
2185 * Should only be applied to mappings that were created by pmap_kenter() or
2186 * pmap_kenter_device().  Nothing about this function is actually specific
2187 * to device mappings.
2188 */
2189void
2190pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2191{
2192	pt_entry_t *ptep, *ptep_end;
2193	vm_offset_t va;
2194	int lvl;
2195
2196	KASSERT((sva & L3_OFFSET) == 0,
2197	    ("pmap_kremove_device: Invalid virtual address"));
2198	KASSERT((size & PAGE_MASK) == 0,
2199	    ("pmap_kremove_device: Mapping is not page-sized"));
2200
2201	va = sva;
2202	while (size != 0) {
2203		ptep = pmap_pte(kernel_pmap, va, &lvl);
2204		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2205		switch (lvl) {
2206		case 2:
2207			KASSERT((va & L2_OFFSET) == 0,
2208			    ("Unaligned virtual address"));
2209			KASSERT(size >= L2_SIZE, ("Insufficient size"));
2210
2211			if (va != sva) {
2212				pmap_s1_invalidate_range(kernel_pmap, sva, va,
2213				    true);
2214			}
2215			pmap_clear(ptep);
2216			pmap_s1_invalidate_page(kernel_pmap, va, true);
2217			PMAP_LOCK(kernel_pmap);
2218			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2219			PMAP_UNLOCK(kernel_pmap);
2220
2221			va += L2_SIZE;
2222			sva = va;
2223			size -= L2_SIZE;
2224			break;
2225		case 3:
2226			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2227				KASSERT((va & L3C_OFFSET) == 0,
2228				    ("Unaligned L3C virtual address"));
2229				KASSERT(size >= L3C_SIZE,
2230				    ("Insufficient L3C size"));
2231
2232				ptep_end = ptep + L3C_ENTRIES;
2233				for (; ptep < ptep_end; ptep++)
2234					pmap_clear(ptep);
2235
2236				va += L3C_SIZE;
2237				size -= L3C_SIZE;
2238				break;
2239			}
2240			pmap_clear(ptep);
2241
2242			va += PAGE_SIZE;
2243			size -= PAGE_SIZE;
2244			break;
2245		default:
2246			__assert_unreachable();
2247			break;
2248		}
2249	}
2250	if (va != sva)
2251		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2252}
2253
2254/*
2255 *	Used to map a range of physical addresses into kernel
2256 *	virtual address space.
2257 *
2258 *	The value passed in '*virt' is a suggested virtual address for
2259 *	the mapping. Architectures which can support a direct-mapped
2260 *	physical to virtual region can return the appropriate address
2261 *	within that region, leaving '*virt' unchanged. Other
2262 *	architectures should map the pages starting at '*virt' and
2263 *	update '*virt' with the first usable address after the mapped
2264 *	region.
2265 */
2266vm_offset_t
2267pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2268{
2269	return PHYS_TO_DMAP(start);
2270}
2271
2272/*
2273 * Add a list of wired pages to the kva.  This routine is
2274 * only used for temporary kernel mappings that do not need
2275 * to have page modification or references recorded.
2276 * Note that old mappings are simply written over.  The
2277 * page *must* be wired.
2278 *
2279 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2280 */
2281void
2282pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2283{
2284	pd_entry_t *pde;
2285	pt_entry_t attr, old_l3e, *pte;
2286	vm_offset_t va;
2287	vm_page_t m;
2288	int i, lvl;
2289
2290	old_l3e = 0;
2291	va = sva;
2292	for (i = 0; i < count; i++) {
2293		pde = pmap_pde(kernel_pmap, va, &lvl);
2294		KASSERT(pde != NULL,
2295		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2296		KASSERT(lvl == 2,
2297		    ("pmap_qenter: Invalid level %d", lvl));
2298
2299		m = ma[i];
2300		attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2301		    ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2302		pte = pmap_l2_to_l3(pde, va);
2303		old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2304
2305		va += L3_SIZE;
2306	}
2307	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2308		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2309	else {
2310		/*
2311		 * Because the old entries were invalid and the new mappings
2312		 * are not executable, an isb is not required.
2313		 */
2314		dsb(ishst);
2315	}
2316}
2317
2318/*
2319 * This routine tears out page mappings from the
2320 * kernel -- it is meant only for temporary mappings.
2321 */
2322void
2323pmap_qremove(vm_offset_t sva, int count)
2324{
2325	pt_entry_t *pte;
2326	vm_offset_t va;
2327
2328	KASSERT(ADDR_IS_CANONICAL(sva),
2329	    ("%s: Address not in canonical form: %lx", __func__, sva));
2330	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2331
2332	va = sva;
2333	while (count-- > 0) {
2334		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2335		if (pte != NULL) {
2336			pmap_clear(pte);
2337		}
2338
2339		va += PAGE_SIZE;
2340	}
2341	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2342}
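
/*
 * Usage sketch (hypothetical caller): temporarily map an array "ma" of
 * "count" wired pages at a KVA range "va" and later unmap it again:
 *
 *	pmap_qenter(va, ma, count);
 *	... access the pages through (void *)va ...
 *	pmap_qremove(va, count);
 */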
2343
2344/***************************************************
2345 * Page table page management routines.....
2346 ***************************************************/
2347/*
2348 * Schedule the specified unused page table page to be freed.  Specifically,
2349 * add the page to the specified list of pages that will be released to the
2350 * physical memory manager after the TLB has been updated.
2351 */
2352static __inline void
2353pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2354{
2355
2356	if (set_PG_ZERO)
2357		m->flags |= PG_ZERO;
2358	else
2359		m->flags &= ~PG_ZERO;
2360	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2361}
2362
2363/*
2364 * Decrements a page table page's reference count, which is used to record the
2365 * number of valid page table entries within the page.  If the reference count
2366 * drops to zero, then the page table page is unmapped.  Returns true if the
2367 * page table page was unmapped and false otherwise.
2368 */
2369static inline bool
2370pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2371{
2372
2373	--m->ref_count;
2374	if (m->ref_count == 0) {
2375		_pmap_unwire_l3(pmap, va, m, free);
2376		return (true);
2377	} else
2378		return (false);
2379}
2380
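/*
 * Page table pages are identified by a single flat pindex space, which the
 * code below and _pmap_alloc_l3() interpret as follows:
 *
 *	[0, NUL2E)		L3 tables, one per L2_SIZE of va
 *				(pmap_l2_pindex(va)),
 *	[NUL2E, NUL2E + NUL1E)	L2 tables, and
 *	[NUL2E + NUL1E, ...)	L1 tables.
 */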
2381static void
2382_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2383{
2384
2385	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2386	/*
2387	 * unmap the page table page
2388	 */
2389	if (m->pindex >= (NUL2E + NUL1E)) {
2390		/* l1 page */
2391		pd_entry_t *l0;
2392
2393		l0 = pmap_l0(pmap, va);
2394		pmap_clear(l0);
2395	} else if (m->pindex >= NUL2E) {
2396		/* l2 page */
2397		pd_entry_t *l1;
2398
2399		l1 = pmap_l1(pmap, va);
2400		pmap_clear(l1);
2401	} else {
2402		/* l3 page */
2403		pd_entry_t *l2;
2404
2405		l2 = pmap_l2(pmap, va);
2406		pmap_clear(l2);
2407	}
2408	pmap_resident_count_dec(pmap, 1);
2409	if (m->pindex < NUL2E) {
2410		/* We just released an l3, unhold the matching l2 */
2411		pd_entry_t *l1, tl1;
2412		vm_page_t l2pg;
2413
2414		l1 = pmap_l1(pmap, va);
2415		tl1 = pmap_load(l1);
2416		l2pg = PTE_TO_VM_PAGE(tl1);
2417		pmap_unwire_l3(pmap, va, l2pg, free);
2418	} else if (m->pindex < (NUL2E + NUL1E)) {
2419		/* We just released an l2, unhold the matching l1 */
2420		pd_entry_t *l0, tl0;
2421		vm_page_t l1pg;
2422
2423		l0 = pmap_l0(pmap, va);
2424		tl0 = pmap_load(l0);
2425		l1pg = PTE_TO_VM_PAGE(tl0);
2426		pmap_unwire_l3(pmap, va, l1pg, free);
2427	}
2428	pmap_invalidate_page(pmap, va, false);
2429
2430	/*
2431	 * Put the page on a list so that it is released only after
2432	 * *ALL* TLB shootdowns are done.
2433	 */
2434	pmap_add_delayed_free_list(m, free, true);
2435}
2436
2437/*
2438 * After removing a page table entry, this routine is used to
2439 * conditionally free the page, and manage the reference count.
2440 */
2441static int
2442pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2443    struct spglist *free)
2444{
2445	vm_page_t mpte;
2446
2447	KASSERT(ADDR_IS_CANONICAL(va),
2448	    ("%s: Address not in canonical form: %lx", __func__, va));
2449	if (ADDR_IS_KERNEL(va))
2450		return (0);
2451	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2452	mpte = PTE_TO_VM_PAGE(ptepde);
2453	return (pmap_unwire_l3(pmap, va, mpte, free));
2454}
2455
2456/*
2457 * Release a page table page reference after a failed attempt to create a
2458 * mapping.
2459 */
2460static void
2461pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2462{
2463	struct spglist free;
2464
2465	SLIST_INIT(&free);
2466	if (pmap_unwire_l3(pmap, va, mpte, &free))
2467		vm_page_free_pages_toq(&free, true);
2468}
2469
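/*
 * Initialize the pmap used by the initial vmspace (proc0).  Rather than
 * allocating a new L0 table, reuse the one currently installed in
 * TTBR0_EL1 by the early boot code, and claim the ASID reserved for PID 0.
 */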
2470void
2471pmap_pinit0(pmap_t pmap)
2472{
2473
2474	PMAP_LOCK_INIT(pmap);
2475	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2476	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2477	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2478	TAILQ_INIT(&pmap->pm_pvchunk);
2479	vm_radix_init(&pmap->pm_root);
2480	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2481	pmap->pm_stage = PM_STAGE1;
2482	pmap->pm_levels = 4;
2483	pmap->pm_ttbr = pmap->pm_l0_paddr;
2484	pmap->pm_asid_set = &asids;
2485	pmap->pm_bti = NULL;
2486
2487	PCPU_SET(curpmap, pmap);
2488}
2489
2490int
2491pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2492{
2493	vm_page_t m;
2494
2495	/*
2496	 * allocate the l0 page
2497	 */
2498	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2499	    VM_ALLOC_ZERO);
2500	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2501	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2502
2503	TAILQ_INIT(&pmap->pm_pvchunk);
2504	vm_radix_init(&pmap->pm_root);
2505	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2506	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2507
2508	MPASS(levels == 3 || levels == 4);
2509	pmap->pm_levels = levels;
2510	pmap->pm_stage = stage;
2511	pmap->pm_bti = NULL;
2512	switch (stage) {
2513	case PM_STAGE1:
2514		pmap->pm_asid_set = &asids;
2515		if (pmap_bti_support) {
2516			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2517			    M_ZERO | M_WAITOK);
2518			rangeset_init(pmap->pm_bti, bti_dup_range,
2519			    bti_free_range, pmap, M_NOWAIT);
2520		}
2521		break;
2522	case PM_STAGE2:
2523		pmap->pm_asid_set = &vmids;
2524		break;
2525	default:
2526		panic("%s: Invalid pmap type %d", __func__, stage);
2527		break;
2528	}
2529
2530	/* XXX Temporarily disable deferred ASID allocation. */
2531	pmap_alloc_asid(pmap);
2532
2533	/*
2534	 * Allocate the level 1 entry to use as the root. This will increase
2535	 * the refcount on the level 1 page so it won't be removed until
2536	 * pmap_release() is called.
2537	 */
2538	if (pmap->pm_levels == 3) {
2539		PMAP_LOCK(pmap);
2540		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2541		PMAP_UNLOCK(pmap);
2542	}
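	/*
	 * With three levels the root of the page table is the L1 table just
	 * allocated, so point the TTBR at it; with four levels "m" is still
	 * the L0 page allocated earlier and the TTBR points at that instead.
	 */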
2543	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2544
2545	return (1);
2546}
2547
2548int
2549pmap_pinit(pmap_t pmap)
2550{
2551
2552	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2553}
2554
2555/*
2556 * This routine is called if the desired page table page does not exist.
2557 *
2558 * If page table page allocation fails, this routine may sleep before
2559 * returning NULL.  It sleeps only if a lock pointer was given.
2560 *
2561 * Note: If a page allocation fails at page table level two or three,
2562 * one or two pages may be held during the wait, only to be released
2563 * afterwards.  This conservative approach is easily argued to avoid
2564 * race conditions.
2565 */
2566static vm_page_t
2567_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2568{
2569	vm_page_t m, l1pg, l2pg;
2570
2571	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2572
2573	/*
2574	 * Allocate a page table page.
2575	 */
2576	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2577		if (lockp != NULL) {
2578			RELEASE_PV_LIST_LOCK(lockp);
2579			PMAP_UNLOCK(pmap);
2580			vm_wait(NULL);
2581			PMAP_LOCK(pmap);
2582		}
2583
2584		/*
2585		 * Indicate the need to retry.  While waiting, the page table
2586		 * page may have been allocated.
2587		 */
2588		return (NULL);
2589	}
2590	m->pindex = ptepindex;
2591
2592	/*
2593	 * Because of AArch64's weak memory consistency model, we must have a
2594	 * barrier here to ensure that the stores for zeroing "m", whether by
2595	 * pmap_zero_page() or an earlier function, are visible before adding
2596	 * "m" to the page table.  Otherwise, a page table walk by another
2597	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2598	 * PTE within "m".
2599	 */
2600	dmb(ishst);
2601
2602	/*
2603	 * Map the pagetable page into the process address space, if
2604	 * it isn't already there.
2605	 */
2606
2607	if (ptepindex >= (NUL2E + NUL1E)) {
2608		pd_entry_t *l0p, l0e;
2609		vm_pindex_t l0index;
2610
2611		l0index = ptepindex - (NUL2E + NUL1E);
2612		l0p = &pmap->pm_l0[l0index];
2613		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2614		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2615		l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2616
2617		/*
2618		 * Mark all kernel memory as not accessible from userspace
2619		 * and userspace memory as not executable from the kernel.
2620		 * This has been done for the bootstrap L0 entries in
2621		 * locore.S.
2622		 */
2623		if (pmap == kernel_pmap)
2624			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2625		else
2626			l0e |= TATTR_PXN_TABLE;
2627		pmap_store(l0p, l0e);
2628	} else if (ptepindex >= NUL2E) {
2629		vm_pindex_t l0index, l1index;
2630		pd_entry_t *l0, *l1;
2631		pd_entry_t tl0;
2632
2633		l1index = ptepindex - NUL2E;
2634		l0index = l1index >> Ln_ENTRIES_SHIFT;
2635
2636		l0 = &pmap->pm_l0[l0index];
2637		tl0 = pmap_load(l0);
2638		if (tl0 == 0) {
2639			/* recurse for allocating page dir */
2640			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2641			    lockp) == NULL) {
2642				vm_page_unwire_noq(m);
2643				vm_page_free_zero(m);
2644				return (NULL);
2645			}
2646		} else {
2647			l1pg = PTE_TO_VM_PAGE(tl0);
2648			l1pg->ref_count++;
2649		}
2650
2651		l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2652		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2653		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2654		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2655		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2656	} else {
2657		vm_pindex_t l0index, l1index;
2658		pd_entry_t *l0, *l1, *l2;
2659		pd_entry_t tl0, tl1;
2660
2661		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2662		l0index = l1index >> Ln_ENTRIES_SHIFT;
2663
2664		l0 = &pmap->pm_l0[l0index];
2665		tl0 = pmap_load(l0);
2666		if (tl0 == 0) {
2667			/* recurse for allocating page dir */
2668			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2669			    lockp) == NULL) {
2670				vm_page_unwire_noq(m);
2671				vm_page_free_zero(m);
2672				return (NULL);
2673			}
2674			tl0 = pmap_load(l0);
2675			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2676			l1 = &l1[l1index & Ln_ADDR_MASK];
2677		} else {
2678			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2679			l1 = &l1[l1index & Ln_ADDR_MASK];
2680			tl1 = pmap_load(l1);
2681			if (tl1 == 0) {
2682				/* recurse for allocating page dir */
2683				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2684				    lockp) == NULL) {
2685					vm_page_unwire_noq(m);
2686					vm_page_free_zero(m);
2687					return (NULL);
2688				}
2689			} else {
2690				l2pg = PTE_TO_VM_PAGE(tl1);
2691				l2pg->ref_count++;
2692			}
2693		}
2694
2695		l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2696		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2697		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2698		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2699		pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2700	}
2701
2702	pmap_resident_count_inc(pmap, 1);
2703
2704	return (m);
2705}
2706
2707static pd_entry_t *
2708pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2709    struct rwlock **lockp)
2710{
2711	pd_entry_t *l1, *l2;
2712	vm_page_t l2pg;
2713	vm_pindex_t l2pindex;
2714
2715	KASSERT(ADDR_IS_CANONICAL(va),
2716	    ("%s: Address not in canonical form: %lx", __func__, va));
2717
2718retry:
2719	l1 = pmap_l1(pmap, va);
2720	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2721		l2 = pmap_l1_to_l2(l1, va);
2722		if (!ADDR_IS_KERNEL(va)) {
2723			/* Add a reference to the L2 page. */
2724			l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2725			l2pg->ref_count++;
2726		} else
2727			l2pg = NULL;
2728	} else if (!ADDR_IS_KERNEL(va)) {
2729		/* Allocate a L2 page. */
2730		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2731		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2732		if (l2pg == NULL) {
2733			if (lockp != NULL)
2734				goto retry;
2735			else
2736				return (NULL);
2737		}
2738		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2739		l2 = &l2[pmap_l2_index(va)];
2740	} else
2741		panic("pmap_alloc_l2: missing page table page for va %#lx",
2742		    va);
2743	*l2pgp = l2pg;
2744	return (l2);
2745}
2746
2747static vm_page_t
2748pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2749{
2750	vm_pindex_t ptepindex;
2751	pd_entry_t *pde, tpde;
2752#ifdef INVARIANTS
2753	pt_entry_t *pte;
2754#endif
2755	vm_page_t m;
2756	int lvl;
2757
2758	/*
2759	 * Calculate pagetable page index
2760	 */
2761	ptepindex = pmap_l2_pindex(va);
2762retry:
2763	/*
2764	 * Get the page directory entry
2765	 */
2766	pde = pmap_pde(pmap, va, &lvl);
2767
2768	/*
2769	 * If the page table page is mapped, we just increment the hold count,
2770	 * and activate it. If we get a level 2 pde it will point to a level 3
2771	 * table.
2772	 */
2773	switch (lvl) {
2774	case -1:
2775		break;
2776	case 0:
2777#ifdef INVARIANTS
2778		pte = pmap_l0_to_l1(pde, va);
2779		KASSERT(pmap_load(pte) == 0,
2780		    ("pmap_alloc_l3: TODO: l0 superpages"));
2781#endif
2782		break;
2783	case 1:
2784#ifdef INVARIANTS
2785		pte = pmap_l1_to_l2(pde, va);
2786		KASSERT(pmap_load(pte) == 0,
2787		    ("pmap_alloc_l3: TODO: l1 superpages"));
2788#endif
2789		break;
2790	case 2:
2791		tpde = pmap_load(pde);
2792		if (tpde != 0) {
2793			m = PTE_TO_VM_PAGE(tpde);
2794			m->ref_count++;
2795			return (m);
2796		}
2797		break;
2798	default:
2799		panic("pmap_alloc_l3: Invalid level %d", lvl);
2800	}
2801
2802	/*
2803	 * If we get here, the pte page isn't mapped or has been deallocated.
2804	 */
2805	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2806	if (m == NULL && lockp != NULL)
2807		goto retry;
2808
2809	return (m);
2810}
2811
2812/***************************************************
2813 * Pmap allocation/deallocation routines.
2814 ***************************************************/
2815
2816/*
2817 * Release any resources held by the given physical map.
2818 * Called when a pmap initialized by pmap_pinit is being released.
2819 * Should only be called if the map contains no valid mappings.
2820 */
2821void
2822pmap_release(pmap_t pmap)
2823{
2824	bool rv __diagused;
2825	struct spglist freelist;
2826	struct asid_set *set;
2827	vm_page_t m;
2828	int asid;
2829
2830	if (pmap->pm_levels != 4) {
2831		PMAP_ASSERT_STAGE2(pmap);
2832		KASSERT(pmap->pm_stats.resident_count == 1,
2833		    ("pmap_release: pmap resident count %ld != 1",
2834		    pmap->pm_stats.resident_count));
2835		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2836		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2837
2838		SLIST_INIT(&freelist);
2839		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2840		PMAP_LOCK(pmap);
2841		rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2842		PMAP_UNLOCK(pmap);
2843		MPASS(rv == true);
2844		vm_page_free_pages_toq(&freelist, true);
2845	}
2846
2847	KASSERT(pmap->pm_stats.resident_count == 0,
2848	    ("pmap_release: pmap resident count %ld != 0",
2849	    pmap->pm_stats.resident_count));
2850	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2851	    ("pmap_release: pmap has reserved page table page(s)"));
2852
2853	set = pmap->pm_asid_set;
2854	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2855
2856	/*
2857	 * Allow the ASID to be reused.  For stage 2 pmaps the TLB entries are
2858	 * not invalidated when a VMID is freed, so we rely on a later TLB
2859	 * invalidation that happens when the VMID generation is updated.
2860	 * Because of this we don't reuse VMIDs within a generation.
2861	 */
2862	if (pmap->pm_stage == PM_STAGE1) {
2863		mtx_lock_spin(&set->asid_set_mutex);
2864		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2865			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2866			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2867			    asid < set->asid_set_size,
2868			    ("pmap_release: pmap cookie has out-of-range asid"));
2869			bit_clear(set->asid_set, asid);
2870		}
2871		mtx_unlock_spin(&set->asid_set_mutex);
2872
2873		if (pmap->pm_bti != NULL) {
2874			rangeset_fini(pmap->pm_bti);
2875			free(pmap->pm_bti, M_DEVBUF);
2876		}
2877	}
2878
2879	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2880	vm_page_unwire_noq(m);
2881	vm_page_free_zero(m);
2882}
2883
2884static int
2885kvm_size(SYSCTL_HANDLER_ARGS)
2886{
2887	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2888
2889	return sysctl_handle_long(oidp, &ksize, 0, req);
2890}
2891SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2892    0, 0, kvm_size, "LU",
2893    "Size of KVM");
2894
2895static int
2896kvm_free(SYSCTL_HANDLER_ARGS)
2897{
2898	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2899
2900	return sysctl_handle_long(oidp, &kfree, 0, req);
2901}
2902SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2903    0, 0, kvm_free, "LU",
2904    "Amount of KVM free");
2905
2906/*
2907 * grow the number of kernel page table entries, if needed
2908 */
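/*
 * The kernel map only grows forward, in L2_SIZE steps: kernel_vm_end is
 * advanced to an L2 boundary, and new L2 and L3 page table pages are
 * allocated as needed along the way (they are never freed).  The L0
 * entries covering the kernel map are expected to exist already.
 */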
2909void
2910pmap_growkernel(vm_offset_t addr)
2911{
2912	vm_page_t nkpg;
2913	pd_entry_t *l0, *l1, *l2;
2914
2915	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2916
2917	addr = roundup2(addr, L2_SIZE);
2918	if (addr - 1 >= vm_map_max(kernel_map))
2919		addr = vm_map_max(kernel_map);
2920	if (kernel_vm_end < addr) {
2921		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2922		kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2923	}
2924	while (kernel_vm_end < addr) {
2925		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2926		KASSERT(pmap_load(l0) != 0,
2927		    ("pmap_growkernel: No level 0 kernel entry"));
2928
2929		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2930		if (pmap_load(l1) == 0) {
2931			/* The L1 entry is empty; allocate a new L2 table. */
2932			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2933			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2934			if (nkpg == NULL)
2935				panic("pmap_growkernel: no memory to grow kernel");
2936			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2937			/* See the dmb() in _pmap_alloc_l3(). */
2938			dmb(ishst);
2939			pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2940			continue; /* try again */
2941		}
2942		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2943		if (pmap_load(l2) != 0) {
2944			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2945			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2946				kernel_vm_end = vm_map_max(kernel_map);
2947				break;
2948			}
2949			continue;
2950		}
2951
2952		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2953		    VM_ALLOC_ZERO);
2954		if (nkpg == NULL)
2955			panic("pmap_growkernel: no memory to grow kernel");
2956		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2957		/* See the dmb() in _pmap_alloc_l3(). */
2958		dmb(ishst);
2959		pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
2960
2961		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2962		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2963			kernel_vm_end = vm_map_max(kernel_map);
2964			break;
2965		}
2966	}
2967}
2968
2969/***************************************************
2970 * page management routines.
2971 ***************************************************/
2972
2973static const uint64_t pc_freemask[_NPCM] = {
2974	[0 ... _NPCM - 2] = PC_FREEN,
2975	[_NPCM - 1] = PC_FREEL
2976};
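
/*
 * Each pv_chunk tracks the state of its pv entries with the pc_map[]
 * bitmap: a set bit means that the corresponding entry is free.  The mask
 * above describes a completely free chunk; the last word (PC_FREEL) only
 * has bits set for entries that actually exist, since _NPCPV need not be
 * a multiple of 64.
 */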
2977
2978#ifdef PV_STATS
2979static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2980
2981SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2982	"Current number of pv entry chunks");
2983SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2984	"Current number of pv entry chunks allocated");
2985SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2986	"Current number of pv entry chunk frees");
2987SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2988	"Number of times tried to get a chunk page but failed.");
2989
2990static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2991static int pv_entry_spare;
2992
2993SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2994	"Current number of pv entry frees");
2995SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2996	"Current number of pv entry allocs");
2997SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2998	"Current number of pv entries");
2999SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3000	"Current number of spare pv entries");
3001#endif
3002
3003/*
3004 * We are in a serious low memory condition.  Resort to
3005 * drastic measures to free some pages so we can allocate
3006 * another pv entry chunk.
3007 *
3008 * Returns NULL if PV entries were reclaimed from the specified pmap.
3009 *
3010 * We do not, however, unmap 2mpages because subsequent accesses will
3011 * allocate per-page pv entries until repromotion occurs, thereby
3012 * exacerbating the shortage of free pv entries.
3013 */
3014static vm_page_t
3015reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3016{
3017	struct pv_chunks_list *pvc;
3018	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3019	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3020	struct md_page *pvh;
3021	pd_entry_t *pde;
3022	pmap_t next_pmap, pmap;
3023	pt_entry_t *pte, tpte;
3024	pv_entry_t pv;
3025	vm_offset_t va;
3026	vm_page_t m, m_pc;
3027	struct spglist free;
3028	uint64_t inuse;
3029	int bit, field, freed, lvl;
3030
3031	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3032	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3033
3034	pmap = NULL;
3035	m_pc = NULL;
3036	SLIST_INIT(&free);
3037	bzero(&pc_marker_b, sizeof(pc_marker_b));
3038	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3039	pc_marker = (struct pv_chunk *)&pc_marker_b;
3040	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3041
3042	pvc = &pv_chunks[domain];
3043	mtx_lock(&pvc->pvc_lock);
3044	pvc->active_reclaims++;
3045	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3046	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3047	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3048	    SLIST_EMPTY(&free)) {
3049		next_pmap = pc->pc_pmap;
3050		if (next_pmap == NULL) {
3051			/*
3052			 * The next chunk is a marker.  However, it is
3053			 * not our marker, so active_reclaims must be
3054			 * > 1.  Consequently, the next_chunk code
3055			 * will not rotate the pv_chunks list.
3056			 */
3057			goto next_chunk;
3058		}
3059		mtx_unlock(&pvc->pvc_lock);
3060
3061		/*
3062		 * A pv_chunk can only be removed from the pc_lru list
3063		 * when both pvc->pvc_lock is owned and the
3064		 * corresponding pmap is locked.
3065		 */
3066		if (pmap != next_pmap) {
3067			if (pmap != NULL && pmap != locked_pmap)
3068				PMAP_UNLOCK(pmap);
3069			pmap = next_pmap;
3070			/* Avoid deadlock and lock recursion. */
3071			if (pmap > locked_pmap) {
3072				RELEASE_PV_LIST_LOCK(lockp);
3073				PMAP_LOCK(pmap);
3074				mtx_lock(&pvc->pvc_lock);
3075				continue;
3076			} else if (pmap != locked_pmap) {
3077				if (PMAP_TRYLOCK(pmap)) {
3078					mtx_lock(&pvc->pvc_lock);
3079					continue;
3080				} else {
3081					pmap = NULL; /* pmap is not locked */
3082					mtx_lock(&pvc->pvc_lock);
3083					pc = TAILQ_NEXT(pc_marker, pc_lru);
3084					if (pc == NULL ||
3085					    pc->pc_pmap != next_pmap)
3086						continue;
3087					goto next_chunk;
3088				}
3089			}
3090		}
3091
3092		/*
3093		 * Destroy every non-wired, 4 KB page mapping in the chunk.
3094		 */
3095		freed = 0;
3096		for (field = 0; field < _NPCM; field++) {
3097			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3098			    inuse != 0; inuse &= ~(1UL << bit)) {
3099				bit = ffsl(inuse) - 1;
3100				pv = &pc->pc_pventry[field * 64 + bit];
3101				va = pv->pv_va;
3102				pde = pmap_pde(pmap, va, &lvl);
3103				if (lvl != 2)
3104					continue;
3105				pte = pmap_l2_to_l3(pde, va);
3106				tpte = pmap_load(pte);
3107				if ((tpte & ATTR_SW_WIRED) != 0)
3108					continue;
3109				if ((tpte & ATTR_CONTIGUOUS) != 0)
3110					(void)pmap_demote_l3c(pmap, pte, va);
3111				tpte = pmap_load_clear(pte);
3112				m = PTE_TO_VM_PAGE(tpte);
3113				if (pmap_pte_dirty(pmap, tpte))
3114					vm_page_dirty(m);
3115				if ((tpte & ATTR_AF) != 0) {
3116					pmap_s1_invalidate_page(pmap, va, true);
3117					vm_page_aflag_set(m, PGA_REFERENCED);
3118				}
3119				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3120				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3121				m->md.pv_gen++;
3122				if (TAILQ_EMPTY(&m->md.pv_list) &&
3123				    (m->flags & PG_FICTITIOUS) == 0) {
3124					pvh = page_to_pvh(m);
3125					if (TAILQ_EMPTY(&pvh->pv_list)) {
3126						vm_page_aflag_clear(m,
3127						    PGA_WRITEABLE);
3128					}
3129				}
3130				pc->pc_map[field] |= 1UL << bit;
3131				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3132				freed++;
3133			}
3134		}
3135		if (freed == 0) {
3136			mtx_lock(&pvc->pvc_lock);
3137			goto next_chunk;
3138		}
3139		/* Every freed mapping is for a 4 KB page. */
3140		pmap_resident_count_dec(pmap, freed);
3141		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3142		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3143		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3144		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3145		if (pc_is_free(pc)) {
3146			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3147			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3148			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3149			/* Entire chunk is free; return it. */
3150			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3151			dump_drop_page(m_pc->phys_addr);
3152			mtx_lock(&pvc->pvc_lock);
3153			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3154			break;
3155		}
3156		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3157		mtx_lock(&pvc->pvc_lock);
3158		/* One freed pv entry in locked_pmap is sufficient. */
3159		if (pmap == locked_pmap)
3160			break;
3161
3162next_chunk:
3163		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3164		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3165		if (pvc->active_reclaims == 1 && pmap != NULL) {
3166			/*
3167			 * Rotate the pv chunks list so that we do not
3168			 * scan the same pv chunks that could not be
3169			 * freed (because they contained a wired
3170			 * and/or superpage mapping) on every
3171			 * invocation of reclaim_pv_chunk().
3172			 */
3173			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3174				MPASS(pc->pc_pmap != NULL);
3175				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3176				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3177			}
3178		}
3179	}
3180	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3181	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3182	pvc->active_reclaims--;
3183	mtx_unlock(&pvc->pvc_lock);
3184	if (pmap != NULL && pmap != locked_pmap)
3185		PMAP_UNLOCK(pmap);
3186	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3187		m_pc = SLIST_FIRST(&free);
3188		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3189		/* Recycle a freed page table page. */
3190		m_pc->ref_count = 1;
3191	}
3192	vm_page_free_pages_toq(&free, true);
3193	return (m_pc);
3194}
3195
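/*
 * Try to reclaim a pv chunk, preferring the current CPU's NUMA domain and
 * then trying the remaining domains in round-robin order.
 */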
3196static vm_page_t
3197reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3198{
3199	vm_page_t m;
3200	int i, domain;
3201
3202	domain = PCPU_GET(domain);
3203	for (i = 0; i < vm_ndomains; i++) {
3204		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3205		if (m != NULL)
3206			break;
3207		domain = (domain + 1) % vm_ndomains;
3208	}
3209
3210	return (m);
3211}
3212
3213/*
3214 * free the pv_entry back to the free list
3215 */
3216static void
3217free_pv_entry(pmap_t pmap, pv_entry_t pv)
3218{
3219	struct pv_chunk *pc;
3220	int idx, field, bit;
3221
3222	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3223	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3224	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3225	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3226	pc = pv_to_chunk(pv);
3227	idx = pv - &pc->pc_pventry[0];
3228	field = idx / 64;
3229	bit = idx % 64;
3230	pc->pc_map[field] |= 1ul << bit;
3231	if (!pc_is_free(pc)) {
3232		/* 98% of the time, pc is already at the head of the list. */
3233		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3234			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3235			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3236		}
3237		return;
3238	}
3239	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3240	free_pv_chunk(pc);
3241}
3242
3243static void
3244free_pv_chunk_dequeued(struct pv_chunk *pc)
3245{
3246	vm_page_t m;
3247
3248	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3249	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3250	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3251	/* entire chunk is free, return it */
3252	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3253	dump_drop_page(m->phys_addr);
3254	vm_page_unwire_noq(m);
3255	vm_page_free(m);
3256}
3257
3258static void
3259free_pv_chunk(struct pv_chunk *pc)
3260{
3261	struct pv_chunks_list *pvc;
3262
3263	pvc = &pv_chunks[pc_to_domain(pc)];
3264	mtx_lock(&pvc->pvc_lock);
3265	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3266	mtx_unlock(&pvc->pvc_lock);
3267	free_pv_chunk_dequeued(pc);
3268}
3269
3270static void
3271free_pv_chunk_batch(struct pv_chunklist *batch)
3272{
3273	struct pv_chunks_list *pvc;
3274	struct pv_chunk *pc, *npc;
3275	int i;
3276
3277	for (i = 0; i < vm_ndomains; i++) {
3278		if (TAILQ_EMPTY(&batch[i]))
3279			continue;
3280		pvc = &pv_chunks[i];
3281		mtx_lock(&pvc->pvc_lock);
3282		TAILQ_FOREACH(pc, &batch[i], pc_list) {
3283			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3284		}
3285		mtx_unlock(&pvc->pvc_lock);
3286	}
3287
3288	for (i = 0; i < vm_ndomains; i++) {
3289		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3290			free_pv_chunk_dequeued(pc);
3291		}
3292	}
3293}
3294
3295/*
3296 * Returns a new PV entry, allocating a new PV chunk from the system when
3297 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3298 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3299 * returned.
3300 *
3301 * The given PV list lock may be released.
3302 */
3303static pv_entry_t
3304get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3305{
3306	struct pv_chunks_list *pvc;
3307	int bit, field;
3308	pv_entry_t pv;
3309	struct pv_chunk *pc;
3310	vm_page_t m;
3311
3312	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3313	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3314retry:
3315	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3316	if (pc != NULL) {
3317		for (field = 0; field < _NPCM; field++) {
3318			if (pc->pc_map[field]) {
3319				bit = ffsl(pc->pc_map[field]) - 1;
3320				break;
3321			}
3322		}
3323		if (field < _NPCM) {
3324			pv = &pc->pc_pventry[field * 64 + bit];
3325			pc->pc_map[field] &= ~(1ul << bit);
3326			/* Chunk is now full; move it to the tail. */
3327			if (pc_is_full(pc)) {
3328				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3329				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3330				    pc_list);
3331			}
3332			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3333			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3334			return (pv);
3335		}
3336	}
3337	/* No free items, allocate another chunk */
3338	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3339	if (m == NULL) {
3340		if (lockp == NULL) {
3341			PV_STAT(pc_chunk_tryfail++);
3342			return (NULL);
3343		}
3344		m = reclaim_pv_chunk(pmap, lockp);
3345		if (m == NULL)
3346			goto retry;
3347	}
3348	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3349	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3350	dump_add_page(m->phys_addr);
3351	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3352	pc->pc_pmap = pmap;
3353	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3354	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
3355	pvc = &pv_chunks[vm_page_domain(m)];
3356	mtx_lock(&pvc->pvc_lock);
3357	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3358	mtx_unlock(&pvc->pvc_lock);
3359	pv = &pc->pc_pventry[0];
3360	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3361	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3362	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3363	return (pv);
3364}
3365
3366/*
3367 * Ensure that the number of spare PV entries in the specified pmap meets or
3368 * exceeds the given count, "needed".
3369 *
3370 * The given PV list lock may be released.
3371 */
3372static void
3373reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3374{
3375	struct pv_chunks_list *pvc;
3376	struct pch new_tail[PMAP_MEMDOM];
3377	struct pv_chunk *pc;
3378	vm_page_t m;
3379	int avail, free, i;
3380	bool reclaimed;
3381
3382	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3383	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3384
3385	/*
3386	 * Newly allocated PV chunks must be kept off the global pv_chunks
3387	 * lists until the required number of PV chunks have been allocated;
3388	 * otherwise, reclaim_pv_chunk() could recycle one of these chunks.
3389	 * They are, however, added to the pmap's chunk list immediately.
3390	 */
3391	for (i = 0; i < PMAP_MEMDOM; i++)
3392		TAILQ_INIT(&new_tail[i]);
3393retry:
3394	avail = 0;
3395	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3396		bit_count((bitstr_t *)pc->pc_map, 0,
3397		    sizeof(pc->pc_map) * NBBY, &free);
3398		if (free == 0)
3399			break;
3400		avail += free;
3401		if (avail >= needed)
3402			break;
3403	}
3404	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3405		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3406		if (m == NULL) {
3407			m = reclaim_pv_chunk(pmap, lockp);
3408			if (m == NULL)
3409				goto retry;
3410			reclaimed = true;
3411		}
3412		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3413		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3414		dump_add_page(m->phys_addr);
3415		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3416		pc->pc_pmap = pmap;
3417		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3418		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3419		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3420		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3421
3422		/*
3423		 * The reclaim might have freed a chunk from the current pmap.
3424		 * If that chunk contained available entries, we need to
3425		 * re-count the number of available entries.
3426		 */
3427		if (reclaimed)
3428			goto retry;
3429	}
3430	for (i = 0; i < vm_ndomains; i++) {
3431		if (TAILQ_EMPTY(&new_tail[i]))
3432			continue;
3433		pvc = &pv_chunks[i];
3434		mtx_lock(&pvc->pvc_lock);
3435		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3436		mtx_unlock(&pvc->pvc_lock);
3437	}
3438}
3439
3440/*
3441 * First find and then remove the pv entry for the specified pmap and virtual
3442 * address from the specified pv list.  Returns the pv entry if found and NULL
3443 * otherwise.  This operation can be performed on pv lists for either 4KB or
3444 * 2MB page mappings.
3445 */
3446static __inline pv_entry_t
3447pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3448{
3449	pv_entry_t pv;
3450
3451	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3452		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3453			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3454			pvh->pv_gen++;
3455			break;
3456		}
3457	}
3458	return (pv);
3459}
3460
3461/*
3462 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3463 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3464 * entries for each of the 4KB page mappings.
3465 */
3466static void
3467pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3468    struct rwlock **lockp)
3469{
3470	struct md_page *pvh;
3471	struct pv_chunk *pc;
3472	pv_entry_t pv;
3473	vm_offset_t va_last;
3474	vm_page_t m;
3475	int bit, field;
3476
3477	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3478	KASSERT((va & L2_OFFSET) == 0,
3479	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3480	KASSERT((pa & L2_OFFSET) == 0,
3481	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3482	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3483
3484	/*
3485	 * Transfer the 2mpage's pv entry for this mapping to the first
3486	 * page's pv list.  Once this transfer begins, the pv list lock
3487	 * must not be released until the last pv entry is reinstantiated.
3488	 */
3489	pvh = pa_to_pvh(pa);
3490	pv = pmap_pvh_remove(pvh, pmap, va);
3491	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3492	m = PHYS_TO_VM_PAGE(pa);
3493	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3494	m->md.pv_gen++;
3495	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3496	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3497	va_last = va + L2_SIZE - PAGE_SIZE;
3498	for (;;) {
3499		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3500		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3501		for (field = 0; field < _NPCM; field++) {
3502			while (pc->pc_map[field]) {
3503				bit = ffsl(pc->pc_map[field]) - 1;
3504				pc->pc_map[field] &= ~(1ul << bit);
3505				pv = &pc->pc_pventry[field * 64 + bit];
3506				va += PAGE_SIZE;
3507				pv->pv_va = va;
3508				m++;
3509				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3510			    ("pmap_pv_demote_l2: page %p is not managed", m));
3511				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3512				m->md.pv_gen++;
3513				if (va == va_last)
3514					goto out;
3515			}
3516		}
3517		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3518		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3519	}
3520out:
3521	if (pc_is_full(pc)) {
3522		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3523		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3524	}
3525	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3526	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3527}
3528
3529/*
3530 * First find and then destroy the pv entry for the specified pmap and virtual
3531 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3532 * page mappings.
3533 */
3534static void
3535pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3536{
3537	pv_entry_t pv;
3538
3539	pv = pmap_pvh_remove(pvh, pmap, va);
3540	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3541	free_pv_entry(pmap, pv);
3542}
3543
3544/*
3545 * Conditionally create the PV entry for a 4KB page mapping if the required
3546 * memory can be allocated without resorting to reclamation.
3547 */
3548static bool
3549pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3550    struct rwlock **lockp)
3551{
3552	pv_entry_t pv;
3553
3554	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3555	/* Pass NULL instead of the lock pointer to disable reclamation. */
3556	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3557		pv->pv_va = va;
3558		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3559		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3560		m->md.pv_gen++;
3561		return (true);
3562	} else
3563		return (false);
3564}
3565
3566/*
3567 * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3568 * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3569 * false if the PV entry cannot be allocated without resorting to reclamation.
3570 */
3571static bool
3572pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3573    struct rwlock **lockp)
3574{
3575	struct md_page *pvh;
3576	pv_entry_t pv;
3577	vm_paddr_t pa;
3578
3579	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3580	/* Pass NULL instead of the lock pointer to disable reclamation. */
3581	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3582	    NULL : lockp)) == NULL)
3583		return (false);
3584	pv->pv_va = va;
3585	pa = PTE_TO_PHYS(l2e);
3586	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3587	pvh = pa_to_pvh(pa);
3588	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3589	pvh->pv_gen++;
3590	return (true);
3591}
3592
3593/*
3594 * Conditionally creates the PV entries for an L3C superpage mapping if
3595 * the required memory can be allocated without resorting to reclamation.
3596 */
3597static bool
3598pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3599    struct rwlock **lockp)
3600{
3601	pv_entry_t pv;
3602	vm_offset_t tva;
3603	vm_paddr_t pa __diagused;
3604	vm_page_t mt;
3605
3606	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3607	KASSERT((va & L3C_OFFSET) == 0,
3608	    ("pmap_pv_insert_l3c: va is not aligned"));
3609	pa = VM_PAGE_TO_PHYS(m);
3610	KASSERT((pa & L3C_OFFSET) == 0,
3611	    ("pmap_pv_insert_l3c: pa is not aligned"));
3612	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3613	for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3614		/* Pass NULL instead of lockp to disable reclamation. */
3615		pv = get_pv_entry(pmap, NULL);
3616		if (__predict_false(pv == NULL)) {
3617			while (tva > va) {
3618				mt--;
3619				tva -= L3_SIZE;
3620				pmap_pvh_free(&mt->md, pmap, tva);
3621			}
3622			return (false);
3623		}
3624		pv->pv_va = tva;
3625		TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3626		mt->md.pv_gen++;
3627	}
3628	return (true);
3629}
3630
3631static void
3632pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3633{
3634	pt_entry_t newl2, oldl2 __diagused;
3635	vm_page_t ml3;
3636	vm_paddr_t ml3pa;
3637
3638	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3639	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3640	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3641
3642	ml3 = pmap_remove_pt_page(pmap, va);
3643	if (ml3 == NULL)
3644		panic("pmap_remove_kernel_l2: Missing pt page");
3645
3646	ml3pa = VM_PAGE_TO_PHYS(ml3);
3647	newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3648
3649	/*
3650	 * If this page table page was unmapped by a promotion, then it
3651	 * contains valid mappings.  Zero it to invalidate those mappings.
3652	 */
3653	if (vm_page_any_valid(ml3))
3654		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3655
3656	/*
3657	 * Demote the mapping.  The caller must have already invalidated the
3658	 * mapping (i.e., the "break" in break-before-make).
3659	 */
3660	oldl2 = pmap_load_store(l2, newl2);
3661	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3662	    __func__, l2, oldl2));
3663}
3664
3665/*
3666 * pmap_remove_l2: Remove a 2MB (level 2) block mapping from the pmap.
3667 */
3668static int
3669pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3670    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3671{
3672	struct md_page *pvh;
3673	pt_entry_t old_l2;
3674	vm_page_t m, ml3, mt;
3675
3676	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3677	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3678	old_l2 = pmap_load_clear(l2);
3679	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3680	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3681
3682	/*
3683	 * Since a promotion must break the 4KB page mappings before making
3684	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3685	 */
3686	pmap_s1_invalidate_page(pmap, sva, true);
3687
3688	if (old_l2 & ATTR_SW_WIRED)
3689		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3690	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3691	if (old_l2 & ATTR_SW_MANAGED) {
3692		m = PTE_TO_VM_PAGE(old_l2);
3693		pvh = page_to_pvh(m);
3694		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3695		pmap_pvh_free(pvh, pmap, sva);
3696		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3697			if (pmap_pte_dirty(pmap, old_l2))
3698				vm_page_dirty(mt);
3699			if (old_l2 & ATTR_AF)
3700				vm_page_aflag_set(mt, PGA_REFERENCED);
3701			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3702			    TAILQ_EMPTY(&pvh->pv_list))
3703				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3704		}
3705	}
3706	if (pmap == kernel_pmap) {
3707		pmap_remove_kernel_l2(pmap, l2, sva);
3708	} else {
3709		ml3 = pmap_remove_pt_page(pmap, sva);
3710		if (ml3 != NULL) {
3711			KASSERT(vm_page_any_valid(ml3),
3712			    ("pmap_remove_l2: l3 page not promoted"));
3713			pmap_resident_count_dec(pmap, 1);
3714			KASSERT(ml3->ref_count == NL3PG,
3715			    ("pmap_remove_l2: l3 page ref count error"));
3716			ml3->ref_count = 0;
3717			pmap_add_delayed_free_list(ml3, free, false);
3718		}
3719	}
3720	return (pmap_unuse_pt(pmap, sva, l1e, free));
3721}
3722
3723/*
3724 * pmap_remove_l3: Remove a single 4KB page mapping from the pmap.
3725 */
3726static int
3727pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3728    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3729{
3730	struct md_page *pvh;
3731	pt_entry_t old_l3;
3732	vm_page_t m;
3733
3734	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3735	old_l3 = pmap_load(l3);
3736	if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3737		(void)pmap_demote_l3c(pmap, l3, va);
3738	old_l3 = pmap_load_clear(l3);
3739	pmap_s1_invalidate_page(pmap, va, true);
3740	if (old_l3 & ATTR_SW_WIRED)
3741		pmap->pm_stats.wired_count -= 1;
3742	pmap_resident_count_dec(pmap, 1);
3743	if (old_l3 & ATTR_SW_MANAGED) {
3744		m = PTE_TO_VM_PAGE(old_l3);
3745		if (pmap_pte_dirty(pmap, old_l3))
3746			vm_page_dirty(m);
3747		if (old_l3 & ATTR_AF)
3748			vm_page_aflag_set(m, PGA_REFERENCED);
3749		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3750		pmap_pvh_free(&m->md, pmap, va);
3751		if (TAILQ_EMPTY(&m->md.pv_list) &&
3752		    (m->flags & PG_FICTITIOUS) == 0) {
3753			pvh = page_to_pvh(m);
3754			if (TAILQ_EMPTY(&pvh->pv_list))
3755				vm_page_aflag_clear(m, PGA_WRITEABLE);
3756		}
3757	}
3758	return (pmap_unuse_pt(pmap, va, l2e, free));
3759}
3760
3761/*
3762 * Removes the specified L3C superpage mapping.  Requests TLB invalidations
3763 * to be performed by the caller through the returned "*vap". Returns true
3764 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3765 * Otherwise, returns false.
3766 */
3767static bool
3768pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3769    vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3770    struct rwlock **lockp)
3771{
3772	struct md_page *pvh;
3773	struct rwlock *new_lock;
3774	pt_entry_t first_l3e, l3e, *tl3p;
3775	vm_offset_t tva;
3776	vm_page_t m, mt;
3777
3778	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3779	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3780	    0, ("pmap_remove_l3c: l3p is not aligned"));
3781	KASSERT((va & L3C_OFFSET) == 0,
3782	    ("pmap_remove_l3c: va is not aligned"));
3783
3784	/*
3785	 * Hardware accessed and dirty bit maintenance might only update a
3786	 * single L3 entry, so we must combine the accessed and dirty bits
3787	 * from this entire set of contiguous L3 entries.
3788	 */
3789	first_l3e = pmap_load_clear(l3p);
3790	for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3791		l3e = pmap_load_clear(tl3p);
3792		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3793		    ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3794		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3795		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3796			first_l3e &= ~ATTR_S1_AP_RW_BIT;
3797		first_l3e |= l3e & ATTR_AF;
3798	}
3799	if ((first_l3e & ATTR_SW_WIRED) != 0)
3800		pmap->pm_stats.wired_count -= L3C_ENTRIES;
3801	pmap_resident_count_dec(pmap, L3C_ENTRIES);
3802	if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3803		m = PTE_TO_VM_PAGE(first_l3e);
3804		new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3805		if (new_lock != *lockp) {
3806			if (*lockp != NULL) {
3807				/*
3808				 * Pending TLB invalidations must be
3809				 * performed before the PV list lock is
3810				 * released.  Otherwise, a concurrent
3811				 * pmap_remove_all() on a physical page
3812				 * could return while a stale TLB entry
3813				 * still provides access to that page.
3814				 */
3815				if (*vap != va_next) {
3816					pmap_invalidate_range(pmap, *vap, va,
3817					    true);
3818					*vap = va_next;
3819				}
3820				rw_wunlock(*lockp);
3821			}
3822			*lockp = new_lock;
3823			rw_wlock(*lockp);
3824		}
3825		pvh = page_to_pvh(m);
3826		for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3827		    L3_SIZE) {
3828			if (pmap_pte_dirty(pmap, first_l3e))
3829				vm_page_dirty(mt);
3830			if ((first_l3e & ATTR_AF) != 0)
3831				vm_page_aflag_set(mt, PGA_REFERENCED);
3832			pmap_pvh_free(&mt->md, pmap, tva);
3833			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3834			    TAILQ_EMPTY(&pvh->pv_list))
3835				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3836		}
3837	}
3838	if (*vap == va_next)
3839		*vap = va;
3840	if (ml3 != NULL) {
3841		ml3->ref_count -= L3C_ENTRIES;
3842		if (ml3->ref_count == 0) {
3843			_pmap_unwire_l3(pmap, va, ml3, free);
3844			return (true);
3845		}
3846	}
3847	return (false);
3848}
3849
3850/*
3851 * Remove the specified range of addresses from the L3 page table that is
3852 * identified by the given L2 entry.
3853 */
3854static void
3855pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3856    vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3857{
3858	struct md_page *pvh;
3859	struct rwlock *new_lock;
3860	pt_entry_t *l3, old_l3;
3861	vm_offset_t va;
3862	vm_page_t l3pg, m;
3863
3864	KASSERT(ADDR_IS_CANONICAL(sva),
3865	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3866	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3867	    ("%s: End address not in canonical form: %lx", __func__, eva));
3868
3869	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3870	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3871	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
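	/*
	 * Kernel page table pages are never freed here, so "l3pg" is only
	 * tracked for user mappings.  "va" records the start of the pending
	 * TLB invalidation range; "va" == "eva" means that none is pending.
	 */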
3872	l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
3873	va = eva;
3874	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3875		old_l3 = pmap_load(l3);
3876		if (!pmap_l3_valid(old_l3)) {
3877			if (va != eva) {
3878				pmap_invalidate_range(pmap, va, sva, true);
3879				va = eva;
3880			}
3881			continue;
3882		}
3883		if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3884			/*
3885			 * Is this entire set of contiguous L3 entries being
3886			 * removed?  Handle the possibility that "eva" is zero
3887			 * because of address wraparound.
3888			 */
3889			if ((sva & L3C_OFFSET) == 0 &&
3890			    sva + L3C_OFFSET <= eva - 1) {
3891				if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3892				    l3pg, free, lockp)) {
3893					/* The L3 table was unmapped. */
3894					sva += L3C_SIZE;
3895					break;
3896				}
3897				l3 += L3C_ENTRIES - 1;
3898				sva += L3C_SIZE - L3_SIZE;
3899				continue;
3900			}
3901
3902			(void)pmap_demote_l3c(pmap, l3, sva);
3903		}
3904		old_l3 = pmap_load_clear(l3);
3905		if ((old_l3 & ATTR_SW_WIRED) != 0)
3906			pmap->pm_stats.wired_count--;
3907		pmap_resident_count_dec(pmap, 1);
3908		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3909			m = PTE_TO_VM_PAGE(old_l3);
3910			if (pmap_pte_dirty(pmap, old_l3))
3911				vm_page_dirty(m);
3912			if ((old_l3 & ATTR_AF) != 0)
3913				vm_page_aflag_set(m, PGA_REFERENCED);
3914			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3915			if (new_lock != *lockp) {
3916				if (*lockp != NULL) {
3917					/*
3918					 * Pending TLB invalidations must be
3919					 * performed before the PV list lock is
3920					 * released.  Otherwise, a concurrent
3921					 * pmap_remove_all() on a physical page
3922					 * could return while a stale TLB entry
3923					 * still provides access to that page.
3924					 */
3925					if (va != eva) {
3926						pmap_invalidate_range(pmap, va,
3927						    sva, true);
3928						va = eva;
3929					}
3930					rw_wunlock(*lockp);
3931				}
3932				*lockp = new_lock;
3933				rw_wlock(*lockp);
3934			}
3935			pmap_pvh_free(&m->md, pmap, sva);
3936			if (TAILQ_EMPTY(&m->md.pv_list) &&
3937			    (m->flags & PG_FICTITIOUS) == 0) {
3938				pvh = page_to_pvh(m);
3939				if (TAILQ_EMPTY(&pvh->pv_list))
3940					vm_page_aflag_clear(m, PGA_WRITEABLE);
3941			}
3942		}
3943		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3944			/*
3945			 * _pmap_unwire_l3() has already invalidated the TLB
3946			 * entries at all levels for "sva".  So, we need not
3947			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3948			 * not perform "va = sva;" if "sva" is at the start
3949			 * of a new valid range consisting of a single page.
3950			 */
3951			break;
3952		}
3953		if (va == eva)
3954			va = sva;
3955	}
3956	if (va != eva)
3957		pmap_invalidate_range(pmap, va, sva, true);
3958}
3959
3960static void
3961pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
3962{
3963	struct rwlock *lock;
3964	vm_offset_t va_next;
3965	pd_entry_t *l0, *l1, *l2;
3966	pt_entry_t l3_paddr;
3967	struct spglist free;
3968
3969	/*
3970	 * Perform an unsynchronized read of the resident count.  This is safe.
3971	 */
3972	if (pmap->pm_stats.resident_count == 0)
3973		return;
3974
3975	SLIST_INIT(&free);
3976
3977	PMAP_LOCK(pmap);
3978	if (map_delete)
3979		pmap_bti_on_remove(pmap, sva, eva);
3980
3981	lock = NULL;
3982	for (; sva < eva; sva = va_next) {
3983		if (pmap->pm_stats.resident_count == 0)
3984			break;
3985
3986		l0 = pmap_l0(pmap, sva);
3987		if (pmap_load(l0) == 0) {
3988			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3989			if (va_next < sva)
3990				va_next = eva;
3991			continue;
3992		}
3993
3994		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3995		if (va_next < sva)
3996			va_next = eva;
3997		l1 = pmap_l0_to_l1(l0, sva);
3998		if (pmap_load(l1) == 0)
3999			continue;
4000		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
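			/*
			 * Handle a non-transparent 1GB block mapping.  Such
			 * a mapping is created only by pmap_enter_largepage(),
			 * is never managed, and is removed in its entirety.
			 */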
4001			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4002			KASSERT(va_next <= eva,
4003			    ("partial update of non-transparent 1G page "
4004			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4005			    pmap_load(l1), sva, eva, va_next));
4006			MPASS(pmap != kernel_pmap);
4007			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4008			pmap_clear(l1);
4009			pmap_s1_invalidate_page(pmap, sva, true);
4010			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4011			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4012			continue;
4013		}
4014
4015		/*
4016		 * Calculate index for next page table.
4017		 */
4018		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4019		if (va_next < sva)
4020			va_next = eva;
4021
4022		l2 = pmap_l1_to_l2(l1, sva);
4023		if (l2 == NULL)
4024			continue;
4025
4026		l3_paddr = pmap_load(l2);
4027
4028		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
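			/*
			 * If the entire 2MB block lies within the range,
			 * remove it as a whole; otherwise, demote it to 4KB
			 * page mappings and remove only the requested
			 * portion below.
			 */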
4029			if (sva + L2_SIZE == va_next && eva >= va_next) {
4030				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4031				    &free, &lock);
4032				continue;
4033			} else if (pmap_demote_l2_locked(pmap, l2, sva,
4034			    &lock) == NULL)
4035				continue;
4036			l3_paddr = pmap_load(l2);
4037		}
4038
4039		/*
4040		 * Weed out invalid mappings.
4041		 */
4042		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4043			continue;
4044
4045		/*
4046		 * Limit our scan to either the end of the virtual address
4047		 * range represented by the current page table page or to
4048		 * the end of the range being removed.
4049		 */
4050		if (va_next > eva)
4051			va_next = eva;
4052
4053		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4054		    &lock);
4055	}
4056	if (lock != NULL)
4057		rw_wunlock(lock);
4058	PMAP_UNLOCK(pmap);
4059	vm_page_free_pages_toq(&free, true);
4060}
4061
4062/*
4063 *	Remove the given range of addresses from the specified map.
4064 *
4065 *	It is assumed that the start and end are properly
4066 *	rounded to the page size.
4067 */
4068void
4069pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4070{
4071	pmap_remove1(pmap, sva, eva, false);
4072}
4073
4074/*
4075 *	Remove the given range of addresses as part of a logical unmap
4076 *	operation. This has the effect of calling pmap_remove(), but
4077 *	also clears any metadata that should persist for the lifetime
4078 *	of a logical mapping.
4079 */
4080void
4081pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4082{
4083	pmap_remove1(pmap, sva, eva, true);
4084}
4085
4086/*
4087 *	Routine:	pmap_remove_all
4088 *	Function:
4089 *		Removes this physical page from
4090 *		all physical maps in which it resides.
4091 *		Reflects back modify bits to the pager.
4092 *
4093 *	Notes:
4094 *		Original versions of this routine were very
4095 *		inefficient because they iteratively called
4096 *		pmap_remove (slow...)
4097 */
4098
4099void
4100pmap_remove_all(vm_page_t m)
4101{
4102	struct md_page *pvh;
4103	pv_entry_t pv;
4104	pmap_t pmap;
4105	struct rwlock *lock;
4106	pd_entry_t *pde, tpde;
4107	pt_entry_t *pte, tpte;
4108	vm_offset_t va;
4109	struct spglist free;
4110	int lvl, pvh_gen, md_gen;
4111
4112	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4113	    ("pmap_remove_all: page %p is not managed", m));
4114	SLIST_INIT(&free);
4115	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4116	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4117	rw_wlock(lock);
4118retry:
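	/*
	 * First, demote any 2MB page mappings of the page so that the
	 * loop below only has to deal with 4KB page mappings.
	 */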
4119	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4120		pmap = PV_PMAP(pv);
4121		if (!PMAP_TRYLOCK(pmap)) {
4122			pvh_gen = pvh->pv_gen;
4123			rw_wunlock(lock);
4124			PMAP_LOCK(pmap);
4125			rw_wlock(lock);
4126			if (pvh_gen != pvh->pv_gen) {
4127				PMAP_UNLOCK(pmap);
4128				goto retry;
4129			}
4130		}
4131		va = pv->pv_va;
4132		pte = pmap_pte_exists(pmap, va, 2, __func__);
4133		pmap_demote_l2_locked(pmap, pte, va, &lock);
4134		PMAP_UNLOCK(pmap);
4135	}
4136	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4137		pmap = PV_PMAP(pv);
4138		if (!PMAP_TRYLOCK(pmap)) {
4139			pvh_gen = pvh->pv_gen;
4140			md_gen = m->md.pv_gen;
4141			rw_wunlock(lock);
4142			PMAP_LOCK(pmap);
4143			rw_wlock(lock);
4144			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4145				PMAP_UNLOCK(pmap);
4146				goto retry;
4147			}
4148		}
4149		pmap_resident_count_dec(pmap, 1);
4150
4151		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4152		KASSERT(pde != NULL,
4153		    ("pmap_remove_all: no page directory entry found"));
4154		KASSERT(lvl == 2,
4155		    ("pmap_remove_all: invalid pde level %d", lvl));
4156		tpde = pmap_load(pde);
4157
4158		pte = pmap_l2_to_l3(pde, pv->pv_va);
4159		tpte = pmap_load(pte);
4160		if ((tpte & ATTR_CONTIGUOUS) != 0)
4161			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4162		tpte = pmap_load_clear(pte);
4163		if (tpte & ATTR_SW_WIRED)
4164			pmap->pm_stats.wired_count--;
4165		if ((tpte & ATTR_AF) != 0) {
4166			pmap_invalidate_page(pmap, pv->pv_va, true);
4167			vm_page_aflag_set(m, PGA_REFERENCED);
4168		}
4169
4170		/*
4171		 * Update the vm_page_t clean and reference bits.
4172		 */
4173		if (pmap_pte_dirty(pmap, tpte))
4174			vm_page_dirty(m);
4175		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4176		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4177		m->md.pv_gen++;
4178		free_pv_entry(pmap, pv);
4179		PMAP_UNLOCK(pmap);
4180	}
4181	vm_page_aflag_clear(m, PGA_WRITEABLE);
4182	rw_wunlock(lock);
4183	vm_page_free_pages_toq(&free, true);
4184}
4185
4186/*
4187 * Masks and sets bits in a level 2 page table entry in the specified pmap.
4188 */
4189static void
4190pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4191    pt_entry_t nbits)
4192{
4193	pd_entry_t old_l2;
4194	vm_page_t m, mt;
4195
4196	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4197	PMAP_ASSERT_STAGE1(pmap);
4198	KASSERT((sva & L2_OFFSET) == 0,
4199	    ("pmap_protect_l2: sva is not 2mpage aligned"));
4200	old_l2 = pmap_load(l2);
4201	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4202	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4203
4204	/*
4205	 * Return if the L2 entry already has the desired access restrictions
4206	 * in place.
4207	 */
4208	if ((old_l2 & mask) == nbits)
4209		return;
4210
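	/*
	 * Atomically apply the new access restrictions, retrying if the
	 * entry is concurrently modified, e.g., by a hardware update of
	 * the access or dirty state.
	 */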
4211	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4212		cpu_spinwait();
4213
4214	/*
4215	 * When a dirty read/write superpage mapping is write protected,
4216	 * update the dirty field of each of the superpage's constituent 4KB
4217	 * pages.
4218	 */
4219	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4220	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4221	    pmap_pte_dirty(pmap, old_l2)) {
4222		m = PTE_TO_VM_PAGE(old_l2);
4223		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4224			vm_page_dirty(mt);
4225	}
4226
4227	/*
4228	 * Since a promotion must break the 4KB page mappings before making
4229	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4230	 */
4231	pmap_s1_invalidate_page(pmap, sva, true);
4232}
4233
4234/*
4235 * Masks and sets bits in the specified L3C superpage mapping.
4236 *
4237 * Requests TLB invalidations to be performed by the caller through the
4238 * returned "*vap".
4239 */
4240static void
4241pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4242    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4243{
4244	pt_entry_t l3e, *tl3p;
4245	vm_page_t m, mt;
4246	bool dirty;
4247
4248	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4249	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4250	    0, ("pmap_mask_set_l3c: l3p is not aligned"));
4251	KASSERT((va & L3C_OFFSET) == 0,
4252	    ("pmap_mask_set_l3c: va is not aligned"));
4253	dirty = false;
4254	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4255		l3e = pmap_load(tl3p);
4256		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4257		    ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4258		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4259			cpu_spinwait();
4260		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4261		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4262			dirty = true;
4263	}
4264
4265	/*
4266	 * When a dirty read/write superpage mapping is write protected,
4267	 * update the dirty field of each of the superpage's constituent 4KB
4268	 * pages.
4269	 */
4270	if ((l3e & ATTR_SW_MANAGED) != 0 &&
4271	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4272	    dirty) {
4273		m = PTE_TO_VM_PAGE(pmap_load(l3p));
4274		for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4275			vm_page_dirty(mt);
4276	}
4277
4278	if (*vap == va_next)
4279		*vap = va;
4280}
4281
4282/*
4283 * Masks and sets bits in the last-level page table entries in the
4284 * specified pmap and range.
4285 */
4286static void
4287pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4288    pt_entry_t nbits, bool invalidate)
4289{
4290	vm_offset_t va, va_next;
4291	pd_entry_t *l0, *l1, *l2;
4292	pt_entry_t *l3p, l3;
4293
4294	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4295	for (; sva < eva; sva = va_next) {
4296		l0 = pmap_l0(pmap, sva);
4297		if (pmap_load(l0) == 0) {
4298			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4299			if (va_next < sva)
4300				va_next = eva;
4301			continue;
4302		}
4303
4304		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4305		if (va_next < sva)
4306			va_next = eva;
4307		l1 = pmap_l0_to_l1(l0, sva);
4308		if (pmap_load(l1) == 0)
4309			continue;
4310		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4311			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4312			KASSERT(va_next <= eva,
4313			    ("partial update of non-transparent 1G page "
4314			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4315			    pmap_load(l1), sva, eva, va_next));
4316			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4317			if ((pmap_load(l1) & mask) != nbits) {
4318				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4319				if (invalidate)
4320					pmap_s1_invalidate_page(pmap, sva, true);
4321			}
4322			continue;
4323		}
4324
4325		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4326		if (va_next < sva)
4327			va_next = eva;
4328
4329		l2 = pmap_l1_to_l2(l1, sva);
4330		if (pmap_load(l2) == 0)
4331			continue;
4332
4333		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4334			if (sva + L2_SIZE == va_next && eva >= va_next) {
4335				pmap_protect_l2(pmap, l2, sva, mask, nbits);
4336				continue;
4337			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4338				continue;
4339		}
4340		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4341		    ("pmap_protect: Invalid L2 entry after demotion"));
4342
4343		if (va_next > eva)
4344			va_next = eva;
4345
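		/*
		 * "va" tracks the start of the pending TLB invalidation
		 * range; "va" == "va_next" means that none is pending.
		 */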
4346		va = va_next;
4347		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4348		    sva += L3_SIZE) {
4349			l3 = pmap_load(l3p);
4350
4351			/*
4352			 * Go to the next L3 entry if the current one is
4353			 * invalid or already has the desired access
4354			 * restrictions in place.  (The latter case occurs
4355			 * frequently.  For example, in a "buildworld"
4356			 * workload, almost 1 out of 4 L3 entries already
4357			 * have the desired restrictions.)
4358			 */
4359			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4360				if (va != va_next) {
4361					if (invalidate)
4362						pmap_s1_invalidate_range(pmap,
4363						    va, sva, true);
4364					va = va_next;
4365				}
4366				if ((l3 & ATTR_CONTIGUOUS) != 0) {
4367					l3p += L3C_ENTRIES - 1;
4368					sva += L3C_SIZE - L3_SIZE;
4369				}
4370				continue;
4371			}
4372
4373			if ((l3 & ATTR_CONTIGUOUS) != 0) {
4374				/*
4375				 * Is this entire set of contiguous L3 entries
4376				 * being protected?  Handle the possibility
4377				 * that "va_next" is zero because of address
4378				 * wraparound.
4379				 */
4380				if ((sva & L3C_OFFSET) == 0 &&
4381				    sva + L3C_OFFSET <= va_next - 1) {
4382					pmap_mask_set_l3c(pmap, l3p, sva, &va,
4383					    va_next, mask, nbits);
4384					l3p += L3C_ENTRIES - 1;
4385					sva += L3C_SIZE - L3_SIZE;
4386					continue;
4387				}
4388
4389				(void)pmap_demote_l3c(pmap, l3p, sva);
4390
4391				/*
4392				 * The L3 entry's accessed bit may have changed.
4393				 */
4394				l3 = pmap_load(l3p);
4395			}
4396			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4397			    nbits))
4398				cpu_spinwait();
4399
4400			/*
4401			 * When a dirty read/write mapping is write protected,
4402			 * update the page's dirty field.
4403			 */
4404			if ((l3 & ATTR_SW_MANAGED) != 0 &&
4405			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4406			    pmap_pte_dirty(pmap, l3))
4407				vm_page_dirty(PTE_TO_VM_PAGE(l3));
4408
4409			if (va == va_next)
4410				va = sva;
4411		}
4412		if (va != va_next && invalidate)
4413			pmap_s1_invalidate_range(pmap, va, sva, true);
4414	}
4415}
4416
4417static void
4418pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4419    pt_entry_t nbits, bool invalidate)
4420{
4421	PMAP_LOCK(pmap);
4422	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4423	PMAP_UNLOCK(pmap);
4424}
4425
4426/*
4427 *	Set the physical protection on the
4428 *	specified range of this map as requested.
4429 */
4430void
4431pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4432{
4433	pt_entry_t mask, nbits;
4434
4435	PMAP_ASSERT_STAGE1(pmap);
4436	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4437	if (prot == VM_PROT_NONE) {
4438		pmap_remove(pmap, sva, eva);
4439		return;
4440	}
4441
4442	mask = nbits = 0;
4443	if ((prot & VM_PROT_WRITE) == 0) {
4444		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4445		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4446	}
4447	if ((prot & VM_PROT_EXECUTE) == 0) {
4448		mask |= ATTR_S1_XN;
4449		nbits |= ATTR_S1_XN;
4450	}
4451	if (pmap == kernel_pmap) {
4452		mask |= ATTR_KERN_GP;
4453		nbits |= ATTR_KERN_GP;
4454	}
4455	if (mask == 0)
4456		return;
4457
4458	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4459}
4460
4461void
4462pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4463{
4464
4465	MPASS((sva & L3_OFFSET) == 0);
4466	MPASS(((sva + size) & L3_OFFSET) == 0);
4467
4468	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4469	    ATTR_SW_NO_PROMOTE, false);
4470}
4471
4472/*
4473 * Inserts the specified page table page into the specified pmap's collection
4474 * of idle page table pages.  Each of a pmap's page table pages is responsible
4475 * for mapping a distinct range of virtual addresses.  The pmap's collection is
4476 * ordered by this virtual address range.
4477 *
4478 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4479 * "mpte"'s valid field will be set to 0.
4480 *
4481 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4482 * contain valid mappings with identical attributes except for ATTR_AF;
4483 * "mpte"'s valid field will be set to 1.
4484 *
4485 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4486 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4487 * field will be set to VM_PAGE_BITS_ALL.
4488 */
4489static __inline int
4490pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4491    bool all_l3e_AF_set)
4492{
4493
4494	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4495	KASSERT(promoted || !all_l3e_AF_set,
4496	    ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4497	mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4498	return (vm_radix_insert(&pmap->pm_root, mpte));
4499}
4500
4501/*
4502 * Removes the page table page mapping the specified virtual address from the
4503 * specified pmap's collection of idle page table pages, and returns it.
4504 * Otherwise, returns NULL if there is no page table page corresponding to the
4505 * specified virtual address.
4506 */
4507static __inline vm_page_t
4508pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4509{
4510
4511	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4512	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4513}
4514
4515/*
4516 * Performs a break-before-make update of a pmap entry. This is needed when
4517 * either promoting or demoting pages to ensure the TLB doesn't get into an
4518 * inconsistent state.
4519 */
4520static void
4521pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4522    vm_offset_t va, vm_size_t size)
4523{
4524	pd_entry_t *lip, *ptep_end;
4525	register_t intr;
4526
4527	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4528
4529	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
4530		panic("%s: Updating non-promote pte", __func__);
4531
4532	if (size == L3C_SIZE)
4533		ptep_end = ptep + L3C_ENTRIES;
4534	else
4535		ptep_end = ptep + 1;
4536
4537	/*
4538	 * Ensure we don't get switched out with the page table in an
4539	 * inconsistent state. We also need to ensure no interrupts fire
4540	 * as they may make use of an address we are about to invalidate.
4541	 */
4542	intr = intr_disable();
4543
4544	/*
4545	 * Clear the old mapping's valid bit, but leave the rest of the entry
4546	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4547	 * lookup the physical address.
4548	 */
4549	for (lip = ptep; lip < ptep_end; lip++)
4550		pmap_clear_bits(lip, ATTR_DESCR_VALID);
4551
4552	/*
4553	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4554	 * be cached, so we invalidate intermediate entries as well as final
4555	 * entries.
4556	 */
4557	pmap_s1_invalidate_range(pmap, va, va + size, size == L3C_SIZE);
4558
4559	/* Create the new mapping */
4560	for (lip = ptep; lip < ptep_end; lip++) {
4561		pmap_store(lip, newpte);
4562		newpte += PAGE_SIZE;
4563	}
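	/* Make the new entries visible before any use of the mapping. */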
4564	dsb(ishst);
4565
4566	intr_restore(intr);
4567}
4568
4569#if VM_NRESERVLEVEL > 0
4570/*
4571 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4572 * replace the many pv entries for the 4KB page mappings by a single pv entry
4573 * for the 2MB page mapping.
4574 */
4575static void
4576pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4577    struct rwlock **lockp)
4578{
4579	struct md_page *pvh;
4580	pv_entry_t pv;
4581	vm_offset_t va_last;
4582	vm_page_t m;
4583
4584	KASSERT((pa & L2_OFFSET) == 0,
4585	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4586	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4587
4588	/*
4589	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4590	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4591	 * a transfer avoids the possibility that get_pv_entry() calls
4592	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4593	 * mappings that is being promoted.
4594	 */
4595	m = PHYS_TO_VM_PAGE(pa);
4596	va = va & ~L2_OFFSET;
4597	pv = pmap_pvh_remove(&m->md, pmap, va);
4598	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4599	pvh = page_to_pvh(m);
4600	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4601	pvh->pv_gen++;
4602	/* Free the remaining Ln_ENTRIES - 1 pv entries. */
4603	va_last = va + L2_SIZE - PAGE_SIZE;
4604	do {
4605		m++;
4606		va += PAGE_SIZE;
4607		pmap_pvh_free(&m->md, pmap, va);
4608	} while (va < va_last);
4609}
4610
4611/*
4612 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4613 * single level 2 table entry to a single 2MB page mapping.  For promotion
4614 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4615 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4616 * identical characteristics.
4617 */
4618static bool
4619pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4620    struct rwlock **lockp)
4621{
4622	pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4623
4624	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4625
4626	/*
4627	 * Currently, this function only supports promotion on stage 1 pmaps
4628	 * because it tests stage 1 specific fields and performs a break-
4629	 * before-make sequence that is incorrect for stage 2 pmaps.
4630	 */
4631	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4632		return (false);
4633
4634	/*
4635	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4636	 * ineligible for promotion...
4637	 */
4638	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4639	newl2 = pmap_load(firstl3);
4640	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4641		return (false);
4642	/* ... is not the first physical page within an L2 block */
4643	if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4644	    ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4645		atomic_add_long(&pmap_l2_p_failures, 1);
4646		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4647		    " in pmap %p", va, pmap);
4648		return (false);
4649	}
4650
4651	/*
4652	 * Both here and in the below "for" loop, to allow for repromotion
4653	 * after MADV_FREE, conditionally write protect a clean L3E before
4654	 * possibly aborting the promotion due to other L3E attributes.  Why?
4655	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4656	 * address range [S, E).  pmap_advise() will demote the superpage
4657	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4658	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4659	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4660	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4661	 * In other words, there is still a 4KB page in [S, E), call it P,
4662	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4663	 * Unless we write protect P before aborting the promotion, if and
4664	 * when P is finally rewritten, there won't be a page fault to trigger
4665	 * repromotion.
4666	 */
4667setl2:
4668	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4669	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4670		/*
4671		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4672		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4673		 */
4674		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4675			goto setl2;
4676		newl2 &= ~ATTR_SW_DBM;
4677		CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4678		    " in pmap %p", va & ~L2_OFFSET, pmap);
4679	}
4680
4681	/*
4682	 * Examine each of the other L3Es in the specified PTP.  Abort if this
4683	 * L3E maps an unexpected 4KB physical page or does not have identical
4684	 * characteristics to the first L3E.  If ATTR_AF is not set in every
4685	 * PTE, then request that the PTP be refilled on demotion.
4686	 */
4687	all_l3e_AF = newl2 & ATTR_AF;
4688	pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4689	    + L2_SIZE - PAGE_SIZE;
4690	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4691		oldl3 = pmap_load(l3);
4692		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4693			atomic_add_long(&pmap_l2_p_failures, 1);
4694			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4695			    " in pmap %p", va, pmap);
4696			return (false);
4697		}
4698setl3:
4699		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4700		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4701			/*
4702			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4703			 * set, ATTR_SW_DBM can be cleared without a TLB
4704			 * invalidation.
4705			 */
4706			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4707			    ~ATTR_SW_DBM))
4708				goto setl3;
4709			oldl3 &= ~ATTR_SW_DBM;
4710		}
4711		if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4712			atomic_add_long(&pmap_l2_p_failures, 1);
4713			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4714			    " in pmap %p", va, pmap);
4715			return (false);
4716		}
4717		all_l3e_AF &= oldl3;
4718		pa -= PAGE_SIZE;
4719	}
4720
4721	/*
4722	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4723	 * mapping, so that promotions triggered by speculative mappings,
4724	 * such as pmap_enter_quick(), don't automatically mark the
4725	 * underlying pages as referenced.
4726	 */
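	/*
	 * The ATTR_CONTIGUOUS and descriptor bits are cleared as well; the
	 * L2_BLOCK descriptor is supplied when the promoted entry is
	 * installed below.
	 */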
4727	newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4728
4729	/*
4730	 * Save the page table page in its current state until the L2
4731	 * mapping the superpage is demoted by pmap_demote_l2() or
4732	 * destroyed by pmap_remove_l3().
4733	 */
4734	if (mpte == NULL)
4735		mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4736	KASSERT(mpte >= vm_page_array &&
4737	    mpte < &vm_page_array[vm_page_array_size],
4738	    ("pmap_promote_l2: page table page is out of range"));
4739	KASSERT(mpte->pindex == pmap_l2_pindex(va),
4740	    ("pmap_promote_l2: page table page's pindex is wrong"));
4741	if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4742		atomic_add_long(&pmap_l2_p_failures, 1);
4743		CTR2(KTR_PMAP,
4744		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4745		    pmap);
4746		return (false);
4747	}
4748
4749	if ((newl2 & ATTR_SW_MANAGED) != 0)
4750		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4751
4752	pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4753
4754	atomic_add_long(&pmap_l2_promotions, 1);
4755	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4756	    pmap);
4757	return (true);
4758}
4759
4760/*
4761 * Tries to promote an aligned, contiguous set of base page mappings to a
4762 * single L3C page mapping.  For promotion to occur, two conditions must be
4763 * met: (1) the base page mappings must map aligned, contiguous physical
4764 * memory and (2) the base page mappings must have identical characteristics
4765 * except for the accessed flag.
4766 */
4767static bool
4768pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4769{
4770	pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4771
4772	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4773
4774	/*
4775	 * Currently, this function only supports promotion on stage 1 pmaps
4776	 * because it tests stage 1 specific fields and performs a break-
4777	 * before-make sequence that is incorrect for stage 2 pmaps.
4778	 */
4779	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4780		return (false);
4781
4782	/*
4783	 * Compute the address of the first L3 entry in the superpage
4784	 * candidate.
4785	 */
4786	l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4787	    sizeof(pt_entry_t)) - 1));
4788
4789	firstl3c = pmap_load(l3p);
4790
4791	/*
4792	 * Examine the first L3 entry. Abort if this L3E is ineligible for
4793	 * promotion...
4794	 */
4795	if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4796		return (false);
4797	/* ...is not properly aligned... */
4798	if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4799	    (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4800		counter_u64_add(pmap_l3c_p_failures, 1);
4801		CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4802		    " in pmap %p", va, pmap);
4803		return (false);
4804	}
4805
4806	/*
4807	 * If the first L3 entry is a clean read-write mapping, convert it
4808	 * to a read-only mapping.  See pmap_promote_l2() for the rationale.
4809	 */
4810set_first:
4811	if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4812	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4813		/*
4814		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4815		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4816		 */
4817		if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4818			goto set_first;
4819		firstl3c &= ~ATTR_SW_DBM;
4820		CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4821		    " in pmap %p", va & ~L3C_OFFSET, pmap);
4822	}
4823
4824	/*
4825	 * Check that the rest of the L3 entries are compatible with the first,
4826	 * and convert clean read-write mappings to read-only mappings.
4827	 */
4828	all_l3e_AF = firstl3c & ATTR_AF;
4829	pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4830	    L3C_SIZE - PAGE_SIZE;
4831	for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4832		oldl3 = pmap_load(l3);
4833		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4834			counter_u64_add(pmap_l3c_p_failures, 1);
4835			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4836			    " in pmap %p", va, pmap);
4837			return (false);
4838		}
4839set_l3:
4840		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4841		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4842			/*
4843			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4844			 * set, ATTR_SW_DBM can be cleared without a TLB
4845			 * invalidation.
4846			 */
4847			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4848			    ~ATTR_SW_DBM))
4849				goto set_l3;
4850			oldl3 &= ~ATTR_SW_DBM;
4851			CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4852			    " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4853			    (va & ~L3C_OFFSET), pmap);
4854		}
4855		if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4856			counter_u64_add(pmap_l3c_p_failures, 1);
4857			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4858			    " in pmap %p", va, pmap);
4859			return (false);
4860		}
4861		all_l3e_AF &= oldl3;
4862		pa -= PAGE_SIZE;
4863	}
4864
4865	/*
4866	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4867	 * mapping, so that promotions triggered by speculative mappings,
4868	 * such as pmap_enter_quick(), don't automatically mark the
4869	 * underlying pages as referenced.
4870	 */
4871	firstl3c &= ~ATTR_AF | all_l3e_AF;
4872
4873	/*
4874	 * Remake the mappings with the contiguous bit set.
4875	 */
4876	pmap_update_entry(pmap, l3p, firstl3c | ATTR_CONTIGUOUS, va &
4877	    ~L3C_OFFSET, L3C_SIZE);
4878
4879	counter_u64_add(pmap_l3c_promotions, 1);
4880	CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4881	    pmap);
4882	return (true);
4883}
4884#endif /* VM_NRESERVLEVEL > 0 */
4885
4886static int
4887pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
4888    int psind)
4889{
4890	pd_entry_t *l0p, *l1p, *l2p, origpte;
4891	vm_page_t mp;
4892
4893	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4894	KASSERT(psind > 0 && psind < MAXPAGESIZES,
4895	    ("psind %d unexpected", psind));
4896	KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0,
4897	    ("unaligned phys address %#lx newpte %#lx psind %d",
4898	    PTE_TO_PHYS(newpte), newpte, psind));
4899
4900restart:
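	/*
	 * The BTI attribute must be uniform across the entire large page;
	 * otherwise, a single block entry cannot represent the mapping.
	 */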
4901	if (!pmap_bti_same(pmap, va, va + pagesizes[psind]))
4902		return (KERN_PROTECTION_FAILURE);
4903	if (psind == 2) {
4904		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4905
4906		l0p = pmap_l0(pmap, va);
4907		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4908			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4909			if (mp == NULL) {
4910				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4911					return (KERN_RESOURCE_SHORTAGE);
4912				PMAP_UNLOCK(pmap);
4913				vm_wait(NULL);
4914				PMAP_LOCK(pmap);
4915				goto restart;
4916			}
4917			l1p = pmap_l0_to_l1(l0p, va);
4918			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4919			origpte = pmap_load(l1p);
4920		} else {
4921			l1p = pmap_l0_to_l1(l0p, va);
4922			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4923			origpte = pmap_load(l1p);
4924			if ((origpte & ATTR_DESCR_VALID) == 0) {
4925				mp = PTE_TO_VM_PAGE(pmap_load(l0p));
4926				mp->ref_count++;
4927			}
4928		}
4929		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
4930		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
4931		    (origpte & ATTR_DESCR_VALID) == 0,
4932		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
4933		    va, origpte, newpte));
4934		pmap_store(l1p, newpte);
4935	} else /* (psind == 1) */ {
4936		l2p = pmap_l2(pmap, va);
4937		if (l2p == NULL) {
4938			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
4939			if (mp == NULL) {
4940				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4941					return (KERN_RESOURCE_SHORTAGE);
4942				PMAP_UNLOCK(pmap);
4943				vm_wait(NULL);
4944				PMAP_LOCK(pmap);
4945				goto restart;
4946			}
4947			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
4948			l2p = &l2p[pmap_l2_index(va)];
4949			origpte = pmap_load(l2p);
4950		} else {
4951			l1p = pmap_l1(pmap, va);
4952			origpte = pmap_load(l2p);
4953			if ((origpte & ATTR_DESCR_VALID) == 0) {
4954				mp = PTE_TO_VM_PAGE(pmap_load(l1p));
4955				mp->ref_count++;
4956			}
4957		}
4958		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
4959		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
4960		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
4961		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
4962		    va, origpte, newpte));
4963		pmap_store(l2p, newpte);
4964	}
4965	dsb(ishst);
4966
4967	if ((origpte & ATTR_DESCR_VALID) == 0)
4968		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
4969	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
4970		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
4971	else if ((newpte & ATTR_SW_WIRED) == 0 &&
4972	    (origpte & ATTR_SW_WIRED) != 0)
4973		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
4974
4975	return (KERN_SUCCESS);
4976}
4977
4978/*
4979 *	Insert the given physical page (p) at
4980 *	the specified virtual address (v) in the
4981 *	target physical map with the protection requested.
4982 *
4983 *	If specified, the page will be wired down, meaning
4984 *	that the related pte can not be reclaimed.
4985 *
4986 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4987 *	or lose information.  That is, this routine must actually
4988 *	insert this page into the given map NOW.
4989 */
4990int
4991pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4992    u_int flags, int8_t psind)
4993{
4994	struct rwlock *lock;
4995	pd_entry_t *pde;
4996	pt_entry_t new_l3, orig_l3;
4997	pt_entry_t *l2, *l3;
4998	pv_entry_t pv;
4999	vm_paddr_t opa, pa;
5000	vm_page_t mpte, om;
5001	bool nosleep;
5002	int lvl, rv;
5003
5004	KASSERT(ADDR_IS_CANONICAL(va),
5005	    ("%s: Address not in canonical form: %lx", __func__, va));
5006
5007	va = trunc_page(va);
5008	if ((m->oflags & VPO_UNMANAGED) == 0)
5009		VM_PAGE_OBJECT_BUSY_ASSERT(m);
5010	pa = VM_PAGE_TO_PHYS(m);
5011	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5012	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5013	new_l3 |= pmap_pte_prot(pmap, prot);
5014	if ((flags & PMAP_ENTER_WIRED) != 0)
5015		new_l3 |= ATTR_SW_WIRED;
5016	if (pmap->pm_stage == PM_STAGE1) {
5017		if (!ADDR_IS_KERNEL(va))
5018			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5019		else
5020			new_l3 |= ATTR_S1_UXN;
5021		if (pmap != kernel_pmap)
5022			new_l3 |= ATTR_S1_nG;
5023	} else {
5024		/*
5025		 * Clear the access flag on executable mappings, this will be
5026		 * set later when the page is accessed. The fault handler is
5027		 * required to invalidate the I-cache.
5028		 *
5029		 * TODO: Switch to the valid flag to allow hardware management
5030		 * of the access flag. Much of the pmap code assumes the
5031		 * valid flag is set and fails to destroy the old page tables
5032		 * correctly if it is clear.
5033		 */
5034		if (prot & VM_PROT_EXECUTE)
5035			new_l3 &= ~ATTR_AF;
5036	}
5037	if ((m->oflags & VPO_UNMANAGED) == 0) {
5038		new_l3 |= ATTR_SW_MANAGED;
5039		if ((prot & VM_PROT_WRITE) != 0) {
5040			new_l3 |= ATTR_SW_DBM;
5041			if ((flags & VM_PROT_WRITE) == 0) {
5042				if (pmap->pm_stage == PM_STAGE1)
5043					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5044				else
5045					new_l3 &=
5046					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5047			}
5048		}
5049	}
5050
5051	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5052
5053	lock = NULL;
5054	PMAP_LOCK(pmap);
5055	/* Wait until we lock the pmap to protect the bti rangeset */
5056	new_l3 |= pmap_pte_bti(pmap, va);
5057
5058	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5059		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5060		    ("managed largepage va %#lx flags %#x", va, flags));
5061		new_l3 &= ~L3_PAGE;
5062		if (psind == 2) {
5063			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5064			new_l3 |= L1_BLOCK;
5065		} else /* (psind == 1) */
5066			new_l3 |= L2_BLOCK;
5067		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5068		goto out;
5069	}
5070	if (psind == 1) {
5071		/* Assert the required virtual and physical alignment. */
5072		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5073		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5074		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5075		    flags, m, &lock);
5076		goto out;
5077	}
5078	mpte = NULL;
5079
5080	/*
5081	 * In the case that a page table page is not
5082	 * resident, we are creating it here.
5083	 */
5084retry:
5085	pde = pmap_pde(pmap, va, &lvl);
5086	if (pde != NULL && lvl == 2) {
5087		l3 = pmap_l2_to_l3(pde, va);
5088		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5089			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5090			mpte->ref_count++;
5091		}
5092		goto havel3;
5093	} else if (pde != NULL && lvl == 1) {
5094		l2 = pmap_l1_to_l2(pde, va);
5095		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5096		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5097			l3 = &l3[pmap_l3_index(va)];
5098			if (!ADDR_IS_KERNEL(va)) {
5099				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5100				mpte->ref_count++;
5101			}
5102			goto havel3;
5103		}
5104		/* We need to allocate an L3 table. */
5105	}
5106	if (!ADDR_IS_KERNEL(va)) {
5107		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5108
5109		/*
5110		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5111		 * to handle the possibility that a superpage mapping for "va"
5112		 * was created while we slept.
5113		 */
5114		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5115		    nosleep ? NULL : &lock);
5116		if (mpte == NULL && nosleep) {
5117			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5118			rv = KERN_RESOURCE_SHORTAGE;
5119			goto out;
5120		}
5121		goto retry;
5122	} else
5123		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5124
5125havel3:
5126	orig_l3 = pmap_load(l3);
5127	opa = PTE_TO_PHYS(orig_l3);
5128	pv = NULL;
5129
5130	/*
5131	 * Is the specified virtual address already mapped?
5132	 */
5133	if (pmap_l3_valid(orig_l3)) {
5134		/*
5135		 * Wiring change, just update stats. We don't worry about
5136		 * wiring PT pages as they remain resident as long as there
5137		 * are valid mappings in them. Hence, if a user page is wired,
5138		 * the PT page will be also.
5139		 */
5140		if ((flags & PMAP_ENTER_WIRED) != 0 &&
5141		    (orig_l3 & ATTR_SW_WIRED) == 0)
5142			pmap->pm_stats.wired_count++;
5143		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5144		    (orig_l3 & ATTR_SW_WIRED) != 0)
5145			pmap->pm_stats.wired_count--;
5146
5147		/*
5148		 * Remove the extra PT page reference.
5149		 */
5150		if (mpte != NULL) {
5151			mpte->ref_count--;
5152			KASSERT(mpte->ref_count > 0,
5153			    ("pmap_enter: missing reference to page table page,"
5154			     " va: 0x%lx", va));
5155		}
5156
5157		/*
5158		 * Has the physical page changed?
5159		 */
5160		if (opa == pa) {
5161			/*
5162			 * No, might be a protection or wiring change.
5163			 */
5164			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5165			    (new_l3 & ATTR_SW_DBM) != 0)
5166				vm_page_aflag_set(m, PGA_WRITEABLE);
5167			goto validate;
5168		}
5169
5170		/*
5171		 * The physical page has changed.  Temporarily invalidate
5172		 * the mapping.
5173		 */
5174		if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5175			(void)pmap_demote_l3c(pmap, l3, va);
5176		orig_l3 = pmap_load_clear(l3);
5177		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5178		    ("pmap_enter: unexpected pa update for %#lx", va));
5179		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5180			om = PHYS_TO_VM_PAGE(opa);
5181
5182			/*
5183			 * The pmap lock is sufficient to synchronize with
5184			 * concurrent calls to pmap_page_test_mappings() and
5185			 * pmap_ts_referenced().
5186			 */
5187			if (pmap_pte_dirty(pmap, orig_l3))
5188				vm_page_dirty(om);
5189			if ((orig_l3 & ATTR_AF) != 0) {
5190				pmap_invalidate_page(pmap, va, true);
5191				vm_page_aflag_set(om, PGA_REFERENCED);
5192			}
5193			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5194			pv = pmap_pvh_remove(&om->md, pmap, va);
5195			if ((m->oflags & VPO_UNMANAGED) != 0)
5196				free_pv_entry(pmap, pv);
5197			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5198			    TAILQ_EMPTY(&om->md.pv_list) &&
5199			    ((om->flags & PG_FICTITIOUS) != 0 ||
5200			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5201				vm_page_aflag_clear(om, PGA_WRITEABLE);
5202		} else {
5203			KASSERT((orig_l3 & ATTR_AF) != 0,
5204			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5205			pmap_invalidate_page(pmap, va, true);
5206		}
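		/*
		 * With the old mapping gone, clear "orig_l3" so that the
		 * code below installs the new PTE as a brand new mapping.
		 */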
5207		orig_l3 = 0;
5208	} else {
5209		/*
5210		 * Increment the counters.
5211		 */
5212		if ((new_l3 & ATTR_SW_WIRED) != 0)
5213			pmap->pm_stats.wired_count++;
5214		pmap_resident_count_inc(pmap, 1);
5215	}
5216	/*
5217	 * Enter on the PV list if part of our managed memory.
5218	 */
5219	if ((m->oflags & VPO_UNMANAGED) == 0) {
5220		if (pv == NULL) {
5221			pv = get_pv_entry(pmap, &lock);
5222			pv->pv_va = va;
5223		}
5224		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5225		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5226		m->md.pv_gen++;
5227		if ((new_l3 & ATTR_SW_DBM) != 0)
5228			vm_page_aflag_set(m, PGA_WRITEABLE);
5229	}
5230
5231validate:
5232	if (pmap->pm_stage == PM_STAGE1) {
5233		/*
5234		 * Sync the icache if the mapping has execute permission and
5235		 * attribute VM_MEMATTR_WRITE_BACK is set.  Do it now, before
5236		 * the mapping is stored and made valid for hardware table
5237		 * walks; if done later, other CPUs could access this page
5238		 * before the caches are properly synced.  Don't do it for
5239		 * kernel memory, which is mapped with exec permission even
5240		 * if it won't hold executable code; the only time an icache
5241		 * sync is needed is after a kernel module is loaded and its
5242		 * relocation info is processed, in elf_cpu_load_file().
5243		 */
5244		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
5245		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5246		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5247			PMAP_ASSERT_STAGE1(pmap);
5248			cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5249			    PAGE_SIZE);
5250		}
5251	} else {
5252		cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5253	}
5254
5255	/*
5256	 * Update the L3 entry
5257	 */
5258	if (pmap_l3_valid(orig_l3)) {
5259		KASSERT(opa == pa, ("pmap_enter: invalid update"));
5260		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5261			/* same PA, different attributes */
5262			if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5263				(void)pmap_demote_l3c(pmap, l3, va);
5264			orig_l3 = pmap_load_store(l3, new_l3);
5265			pmap_invalidate_page(pmap, va, true);
5266			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5267			    pmap_pte_dirty(pmap, orig_l3))
5268				vm_page_dirty(m);
5269		} else {
			/*
			 * orig_l3 == new_l3
			 * This can happen if multiple threads simultaneously
			 * access a not-yet-mapped page.  It is bad for
			 * performance since it can cause a full
			 * demotion-NOP-promotion cycle.
			 * Other possible reasons are:
			 * - the VM and pmap memory layouts have diverged
			 * - a TLB flush is missing somewhere and the CPU
			 *   doesn't see the actual mapping.
			 */
			CTR4(KTR_PMAP, "%s: already mapped page - "
			    "pmap %p va %#lx pte %#lx",
			    __func__, pmap, va, new_l3);
5284		}
5285	} else {
5286		/* New mapping */
5287		pmap_store(l3, new_l3);
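		/* Ensure that the PTE store completes before the mapping is used. */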
5288		dsb(ishst);
5289	}
5290
5291#if VM_NRESERVLEVEL > 0
5292	/*
5293	 * First, attempt L3C promotion, if the virtual and physical addresses
5294	 * are aligned with each other and an underlying reservation has the
5295	 * neighboring L3 pages allocated.  The first condition is simply an
5296	 * optimization that recognizes some eventual promotion failures early
5297	 * at a lower run-time cost.  Then, if both the page table page and
5298	 * the reservation are fully populated, attempt L2 promotion.
5299	 */
5300	if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5301	    (m->flags & PG_FICTITIOUS) == 0 &&
5302	    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5303	    pmap_promote_l3c(pmap, l3, va) &&
5304	    (mpte == NULL || mpte->ref_count == NL3PG) &&
5305	    vm_reserv_level_iffullpop(m) == 0)
5306		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5307#endif
5308
5309	rv = KERN_SUCCESS;
5310out:
5311	if (lock != NULL)
5312		rw_wunlock(lock);
5313	PMAP_UNLOCK(pmap);
5314	return (rv);
5315}
5316
5317/*
5318 * Tries to create a read- and/or execute-only L2 page mapping.  Returns
5319 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5320 * value.  See pmap_enter_l2() for the possible error values when "no sleep",
5321 * "no replace", and "no reclaim" are specified.
5322 */
5323static int
5324pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5325    struct rwlock **lockp)
5326{
5327	pd_entry_t new_l2;
5328
5329	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5330	PMAP_ASSERT_STAGE1(pmap);
5331	KASSERT(ADDR_IS_CANONICAL(va),
5332	    ("%s: Address not in canonical form: %lx", __func__, va));
5333
5334	new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5335	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5336	    L2_BLOCK);
5337	new_l2 |= pmap_pte_bti(pmap, va);
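	/*
	 * Managed mappings are created with the accessed flag clear so that
	 * the mapping is not counted as referenced until it is actually
	 * accessed.
	 */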
5338	if ((m->oflags & VPO_UNMANAGED) == 0) {
5339		new_l2 |= ATTR_SW_MANAGED;
5340		new_l2 &= ~ATTR_AF;
5341	}
5342	if ((prot & VM_PROT_EXECUTE) == 0 ||
5343	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5344		new_l2 |= ATTR_S1_XN;
5345	if (!ADDR_IS_KERNEL(va))
5346		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5347	else
5348		new_l2 |= ATTR_S1_UXN;
5349	if (pmap != kernel_pmap)
5350		new_l2 |= ATTR_S1_nG;
5351	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5352	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5353}
5354
5355/*
5356 * Returns true if every page table entry in the specified page table is
5357 * zero.
5358 */
5359static bool
5360pmap_every_pte_zero(vm_paddr_t pa)
5361{
5362	pt_entry_t *pt_end, *pte;
5363
5364	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5365	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5366	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5367		if (*pte != 0)
5368			return (false);
5369	}
5370	return (true);
5371}
5372
5373/*
 * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE,
 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise.  Returns
 * KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and a base page mapping
 * already exists within the L2 virtual address range starting at the
 * specified virtual address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE
 * was specified and an L2 page mapping already exists at the specified
 * virtual address.  Returns KERN_PROTECTION_FAILURE if the BTI state is not
 * the same across the entire L2 virtual address range.  Returns
 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
 * and a PV entry allocation failed.
 */
5385static int
5386pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5387    vm_page_t m, struct rwlock **lockp)
5388{
5389	struct spglist free;
5390	pd_entry_t *l2, old_l2;
5391	vm_page_t l2pg, mt;
5392	vm_page_t uwptpg;
5393
5394	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5395	KASSERT(ADDR_IS_CANONICAL(va),
5396	    ("%s: Address not in canonical form: %lx", __func__, va));
5397
5398	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5399	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5400		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5401		    va, pmap);
5402		return (KERN_RESOURCE_SHORTAGE);
5403	}
5404
5405	/*
5406	 * If bti is not the same for the whole l2 range, return failure
5407	 * and let vm_fault() cope.  Check after l2 allocation, since
5408	 * it could sleep.
5409	 */
5410	if (!pmap_bti_same(pmap, va, va + L2_SIZE)) {
5411		KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5412		pmap_abort_ptp(pmap, va, l2pg);
5413		return (KERN_PROTECTION_FAILURE);
5414	}
5415
5416	/*
5417	 * If there are existing mappings, either abort or remove them.
5418	 */
5419	if ((old_l2 = pmap_load(l2)) != 0) {
5420		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5421		    ("pmap_enter_l2: l2pg's ref count is too low"));
5422		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5423			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5424				if (l2pg != NULL)
5425					l2pg->ref_count--;
5426				CTR2(KTR_PMAP,
5427				    "pmap_enter_l2: no space for va %#lx"
5428				    " in pmap %p", va, pmap);
5429				return (KERN_NO_SPACE);
5430			} else if (!ADDR_IS_KERNEL(va) ||
5431			    !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5432				if (l2pg != NULL)
5433					l2pg->ref_count--;
5434				CTR2(KTR_PMAP,
5435				    "pmap_enter_l2: failure for va %#lx"
5436				    " in pmap %p", va, pmap);
5437				return (KERN_FAILURE);
5438			}
5439		}
5440		SLIST_INIT(&free);
5441		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5442			(void)pmap_remove_l2(pmap, l2, va,
5443			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
5444		else
5445			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5446			    &free, lockp);
5447		if (!ADDR_IS_KERNEL(va)) {
5448			vm_page_free_pages_toq(&free, true);
5449			KASSERT(pmap_load(l2) == 0,
5450			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
5451		} else {
5452			KASSERT(SLIST_EMPTY(&free),
5453			    ("pmap_enter_l2: freed kernel page table page"));
5454
5455			/*
5456			 * Both pmap_remove_l2() and pmap_remove_l3_range()
5457			 * will leave the kernel page table page zero filled.
5458			 * Nonetheless, the TLB could have an intermediate
5459			 * entry for the kernel page table page, so request
5460			 * an invalidation at all levels after clearing
5461			 * the L2_TABLE entry.
5462			 */
5463			mt = PTE_TO_VM_PAGE(pmap_load(l2));
5464			if (pmap_insert_pt_page(pmap, mt, false, false))
5465				panic("pmap_enter_l2: trie insert failed");
5466			pmap_clear(l2);
5467			pmap_s1_invalidate_page(pmap, va, false);
5468		}
5469	}
5470
5471	/*
	 * Allocate a leaf page table page in advance for wired userspace
	 * mappings so that a later demotion of this wired L2 mapping cannot
	 * fail due to a page table page allocation failure.
5473	 */
5474	uwptpg = NULL;
5475	if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5476		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5477		if (uwptpg == NULL) {
5478			return (KERN_RESOURCE_SHORTAGE);
5479		}
5480		uwptpg->pindex = pmap_l2_pindex(va);
5481		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5482			vm_page_unwire_noq(uwptpg);
5483			vm_page_free(uwptpg);
5484			return (KERN_RESOURCE_SHORTAGE);
5485		}
5486		pmap_resident_count_inc(pmap, 1);
5487		uwptpg->ref_count = NL3PG;
5488	}
5489	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5490		/*
5491		 * Abort this mapping if its PV entry could not be created.
5492		 */
5493		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5494			if (l2pg != NULL)
5495				pmap_abort_ptp(pmap, va, l2pg);
5496			if (uwptpg != NULL) {
5497				mt = pmap_remove_pt_page(pmap, va);
5498				KASSERT(mt == uwptpg,
5499				    ("removed pt page %p, expected %p", mt,
5500				    uwptpg));
5501				pmap_resident_count_dec(pmap, 1);
5502				uwptpg->ref_count = 1;
5503				vm_page_unwire_noq(uwptpg);
5504				vm_page_free(uwptpg);
5505			}
5506			CTR2(KTR_PMAP,
5507			    "pmap_enter_l2: failure for va %#lx in pmap %p",
5508			    va, pmap);
5509			return (KERN_RESOURCE_SHORTAGE);
5510		}
5511		if ((new_l2 & ATTR_SW_DBM) != 0)
5512			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5513				vm_page_aflag_set(mt, PGA_WRITEABLE);
5514	}
5515
5516	/*
5517	 * Increment counters.
5518	 */
5519	if ((new_l2 & ATTR_SW_WIRED) != 0)
5520		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5521	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5522
5523	/*
5524	 * Conditionally sync the icache.  See pmap_enter() for details.
5525	 */
5526	if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5527	    PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5528	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5529		cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5530		    L2_SIZE);
5531	}
5532
5533	/*
5534	 * Map the superpage.
5535	 */
5536	pmap_store(l2, new_l2);
5537	dsb(ishst);
5538
5539	atomic_add_long(&pmap_l2_mappings, 1);
5540	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5541	    va, pmap);
5542
5543	return (KERN_SUCCESS);
5544}
5545
5546/*
5547 * Tries to create a read- and/or execute-only L3C page mapping.  Returns
5548 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5549 * value.
5550 */
5551static int
5552pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5553    vm_prot_t prot, struct rwlock **lockp)
5554{
5555	pt_entry_t l3e;
5556
5557	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5558	PMAP_ASSERT_STAGE1(pmap);
5559	KASSERT(ADDR_IS_CANONICAL(va),
5560	    ("%s: Address not in canonical form: %lx", __func__, va));
5561
5562	l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5563	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5564	    ATTR_CONTIGUOUS | L3_PAGE;
5565	l3e |= pmap_pte_bti(pmap, va);
5566	if ((m->oflags & VPO_UNMANAGED) == 0) {
5567		l3e |= ATTR_SW_MANAGED;
5568		l3e &= ~ATTR_AF;
5569	}
5570	if ((prot & VM_PROT_EXECUTE) == 0 ||
5571	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5572		l3e |= ATTR_S1_XN;
5573	if (!ADDR_IS_KERNEL(va))
5574		l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5575	else
5576		l3e |= ATTR_S1_UXN;
5577	if (pmap != kernel_pmap)
5578		l3e |= ATTR_S1_nG;
5579	return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5580	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5581}
5582
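/*
 * Tries to create the specified L3C (ATTR_CONTIGUOUS) page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns KERN_FAILURE
 * if PMAP_ENTER_NOREPLACE was specified and a mapping already exists or a
 * page table page could not be allocated under PMAP_ENTER_NOSLEEP,
 * KERN_PROTECTION_FAILURE if the BTI state is not the same across the L3C
 * range, or KERN_RESOURCE_SHORTAGE if a PV entry allocation failed.
 */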
5583static int
5584pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5585    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5586{
5587	pd_entry_t *l2p, *pde;
5588	pt_entry_t *l3p, *tl3p;
5589	vm_page_t mt;
5590	vm_paddr_t pa;
5591	vm_pindex_t l2pindex;
5592	int lvl;
5593
5594	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5595	KASSERT((va & L3C_OFFSET) == 0,
5596	    ("pmap_enter_l3c: va is not aligned"));
5597	KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5598	    ("pmap_enter_l3c: managed mapping within the clean submap"));
5599
5600	/*
5601	 * If the L3 PTP is not resident, we attempt to create it here.
5602	 */
5603	if (!ADDR_IS_KERNEL(va)) {
5604		/*
5605		 * Were we given the correct L3 PTP?  If so, we can simply
5606		 * increment its ref count.
5607		 */
5608		l2pindex = pmap_l2_pindex(va);
5609		if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5610			(*ml3p)->ref_count += L3C_ENTRIES;
5611		} else {
5612retry:
5613			/*
5614			 * Get the L2 entry.
5615			 */
5616			pde = pmap_pde(pmap, va, &lvl);
5617
5618			/*
5619			 * If the L2 entry is a superpage, we either abort or
5620			 * demote depending on the given flags.
5621			 */
5622			if (lvl == 1) {
5623				l2p = pmap_l1_to_l2(pde, va);
5624				if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5625				    L2_BLOCK) {
5626					if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5627						return (KERN_FAILURE);
5628					l3p = pmap_demote_l2_locked(pmap, l2p,
5629					    va, lockp);
5630					if (l3p != NULL) {
5631						*ml3p = PTE_TO_VM_PAGE(
5632						    pmap_load(l2p));
5633						(*ml3p)->ref_count +=
5634						    L3C_ENTRIES;
5635						goto have_l3p;
5636					}
5637				}
5638				/* We need to allocate an L3 PTP. */
5639			}
5640
5641			/*
5642			 * If the L3 PTP is mapped, we just increment its ref
5643			 * count.  Otherwise, we attempt to allocate it.
5644			 */
5645			if (lvl == 2 && pmap_load(pde) != 0) {
5646				*ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5647				(*ml3p)->ref_count += L3C_ENTRIES;
5648			} else {
5649				*ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5650				    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5651				if (*ml3p == NULL) {
5652					if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5653						return (KERN_FAILURE);
5654
5655					/*
5656					 * The page table may have changed
5657					 * while we slept.
5658					 */
5659					goto retry;
5660				}
5661				(*ml3p)->ref_count += L3C_ENTRIES - 1;
5662			}
5663		}
5664		l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5665
5666have_l3p:
5667		/*
5668		 * If bti is not the same for the whole L3C range, return
5669		 * failure and let vm_fault() cope.  Check after L3 allocation,
5670		 * since it could sleep.
5671		 */
5672		if (!pmap_bti_same(pmap, va, va + L3C_SIZE)) {
5673			(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5674			pmap_abort_ptp(pmap, va, *ml3p);
5675			*ml3p = NULL;
5676			return (KERN_PROTECTION_FAILURE);
5677		}
5678	} else {
5679		*ml3p = NULL;
5680
5681		/*
5682		 * If the L2 entry is a superpage, we either abort or demote
5683		 * depending on the given flags.
5684		 */
5685		pde = pmap_pde(kernel_pmap, va, &lvl);
5686		if (lvl == 1) {
5687			l2p = pmap_l1_to_l2(pde, va);
5688			KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5689			    ("pmap_enter_l3c: missing L2 block"));
5690			if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5691				return (KERN_FAILURE);
5692			l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5693		} else {
5694			KASSERT(lvl == 2,
5695			    ("pmap_enter_l3c: Invalid level %d", lvl));
5696			l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5697			    pmap_load(pde)));
5698		}
5699	}
5700	l3p = &l3p[pmap_l3_index(va)];
5701
5702	/*
5703	 * If there are existing mappings, either abort or remove them.
5704	 */
5705	if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5706		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5707			if (pmap_load(tl3p) != 0) {
5708				if (*ml3p != NULL)
5709					(*ml3p)->ref_count -= L3C_ENTRIES;
5710				return (KERN_FAILURE);
5711			}
5712		}
5713	} else {
5714		/*
5715		 * Because we increment the L3 page's reference count above,
5716		 * it is guaranteed not to be freed here and we can pass NULL
5717		 * instead of a valid free list.
5718		 */
5719		pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5720		    va + L3C_SIZE, NULL, lockp);
5721	}
5722
5723	/*
5724	 * Enter on the PV list if part of our managed memory.
5725	 */
5726	if ((l3e & ATTR_SW_MANAGED) != 0) {
5727		if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5728			if (*ml3p != NULL) {
5729				(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5730				pmap_abort_ptp(pmap, va, *ml3p);
5731				*ml3p = NULL;
5732			}
5733			return (KERN_RESOURCE_SHORTAGE);
5734		}
5735		if ((l3e & ATTR_SW_DBM) != 0)
5736			for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5737				vm_page_aflag_set(mt, PGA_WRITEABLE);
5738	}
5739
5740	/*
5741	 * Increment counters.
5742	 */
5743	if ((l3e & ATTR_SW_WIRED) != 0)
5744		pmap->pm_stats.wired_count += L3C_ENTRIES;
5745	pmap_resident_count_inc(pmap, L3C_ENTRIES);
5746
5747	pa = VM_PAGE_TO_PHYS(m);
5748	KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5749
5750	/*
5751	 * Sync the icache before the mapping is stored.
5752	 */
5753	if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5754	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5755		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5756
5757	/*
5758	 * Map the superpage.
5759	 */
5760	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5761		pmap_store(tl3p, l3e);
5762		l3e += L3_SIZE;
5763	}
5764	dsb(ishst);
5765
5766	counter_u64_add(pmap_l3c_mappings, 1);
5767	CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5768	    va, pmap);
5769	return (KERN_SUCCESS);
5770}
5771
5772/*
5773 * Maps a sequence of resident pages belonging to the same object.
5774 * The sequence begins with the given page m_start.  This page is
5775 * mapped at the given virtual address start.  Each subsequent page is
5776 * mapped at a virtual address that is offset from start by the same
5777 * amount as the page is offset from m_start within the object.  The
5778 * last page in the sequence is the page with the largest offset from
5779 * m_start that can be mapped at a virtual address less than the given
5780 * virtual address end.  Not every virtual page between start and end
5781 * is mapped; only those for which a resident page exists with the
5782 * corresponding offset from m_start are mapped.
5783 */
5784void
5785pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5786    vm_page_t m_start, vm_prot_t prot)
5787{
5788	struct rwlock *lock;
5789	vm_offset_t va;
5790	vm_page_t m, mpte;
5791	vm_pindex_t diff, psize;
5792	int rv;
5793
5794	VM_OBJECT_ASSERT_LOCKED(m_start->object);
5795
5796	psize = atop(end - start);
5797	mpte = NULL;
5798	m = m_start;
5799	lock = NULL;
5800	PMAP_LOCK(pmap);
5801	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5802		va = start + ptoa(diff);
5803		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
5804		    m->psind == 1 && pmap_ps_enabled(pmap) &&
5805		    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
5806		    KERN_SUCCESS || rv == KERN_NO_SPACE))
5807			m = &m[L2_SIZE / PAGE_SIZE - 1];
5808		else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
5809		    (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
5810		    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5811		    pmap_ps_enabled(pmap) &&
5812		    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
5813		    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
5814			m = &m[L3C_ENTRIES - 1];
5815		else
5816			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
5817			    &lock);
5818		m = TAILQ_NEXT(m, listq);
5819	}
5820	if (lock != NULL)
5821		rw_wunlock(lock);
5822	PMAP_UNLOCK(pmap);
5823}
5824
5825/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the given pmap exist.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but it is *MUCH* faster than pmap_enter...
5832 */
5833
5834void
5835pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5836{
5837	struct rwlock *lock;
5838
5839	lock = NULL;
5840	PMAP_LOCK(pmap);
5841	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5842	if (lock != NULL)
5843		rw_wunlock(lock);
5844	PMAP_UNLOCK(pmap);
5845}
5846
5847static vm_page_t
5848pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5849    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5850{
5851	pd_entry_t *pde;
5852	pt_entry_t *l1, *l2, *l3, l3_val;
5853	vm_paddr_t pa;
5854	int lvl;
5855
5856	KASSERT(!VA_IS_CLEANMAP(va) ||
5857	    (m->oflags & VPO_UNMANAGED) != 0,
5858	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5859	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5860	PMAP_ASSERT_STAGE1(pmap);
5861	KASSERT(ADDR_IS_CANONICAL(va),
5862	    ("%s: Address not in canonical form: %lx", __func__, va));
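	/* "l2" caches the L2 entry pointer for the promotion attempt below. */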
5863	l2 = NULL;
5864
5865	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
5866	/*
	 * If the page table page is not resident, we create it here.
5869	 */
5870	if (!ADDR_IS_KERNEL(va)) {
5871		vm_pindex_t l2pindex;
5872
5873		/*
5874		 * Calculate pagetable page index
5875		 */
5876		l2pindex = pmap_l2_pindex(va);
5877		if (mpte && (mpte->pindex == l2pindex)) {
5878			mpte->ref_count++;
5879		} else {
5880			/*
5881			 * If the page table page is mapped, we just increment
5882			 * the hold count, and activate it.  Otherwise, we
5883			 * attempt to allocate a page table page, passing NULL
5884			 * instead of the PV list lock pointer because we don't
5885			 * intend to sleep.  If this attempt fails, we don't
5886			 * retry.  Instead, we give up.
5887			 */
5888			l1 = pmap_l1(pmap, va);
5889			if (l1 != NULL && pmap_load(l1) != 0) {
5890				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
5891				    L1_BLOCK)
5892					return (NULL);
5893				l2 = pmap_l1_to_l2(l1, va);
5894				if (pmap_load(l2) != 0) {
5895					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
5896					    L2_BLOCK)
5897						return (NULL);
5898					mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5899					mpte->ref_count++;
5900				} else {
5901					mpte = _pmap_alloc_l3(pmap, l2pindex,
5902					    NULL);
5903					if (mpte == NULL)
5904						return (mpte);
5905				}
5906			} else {
5907				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
5908				if (mpte == NULL)
5909					return (mpte);
5910			}
5911		}
5912		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5913		l3 = &l3[pmap_l3_index(va)];
5914	} else {
5915		mpte = NULL;
5916		pde = pmap_pde(kernel_pmap, va, &lvl);
5917		KASSERT(pde != NULL,
5918		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
5919		     va));
5920		KASSERT(lvl == 2,
5921		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
5922		l3 = pmap_l2_to_l3(pde, va);
5923	}
5924
5925	/*
5926	 * Abort if a mapping already exists.
5927	 */
5928	if (pmap_load(l3) != 0) {
5929		if (mpte != NULL)
5930			mpte->ref_count--;
5931		return (NULL);
5932	}
5933
5934	/*
5935	 * Enter on the PV list if part of our managed memory.
5936	 */
5937	if ((m->oflags & VPO_UNMANAGED) == 0 &&
5938	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5939		if (mpte != NULL)
5940			pmap_abort_ptp(pmap, va, mpte);
5941		return (NULL);
5942	}
5943
5944	/*
5945	 * Increment counters
5946	 */
5947	pmap_resident_count_inc(pmap, 1);
5948
5949	pa = VM_PAGE_TO_PHYS(m);
5950	l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5951	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5952	l3_val |= pmap_pte_bti(pmap, va);
5953	if ((prot & VM_PROT_EXECUTE) == 0 ||
5954	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5955		l3_val |= ATTR_S1_XN;
5956	if (!ADDR_IS_KERNEL(va))
5957		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5958	else
5959		l3_val |= ATTR_S1_UXN;
5960	if (pmap != kernel_pmap)
5961		l3_val |= ATTR_S1_nG;
5962
5963	/*
5964	 * Now validate mapping with RO protection
5965	 */
5966	if ((m->oflags & VPO_UNMANAGED) == 0) {
5967		l3_val |= ATTR_SW_MANAGED;
5968		l3_val &= ~ATTR_AF;
5969	}
5970
5971	/* Sync icache before the mapping is stored to PTE */
5972	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5973	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5974		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5975
5976	pmap_store(l3, l3_val);
5977	dsb(ishst);
5978
5979#if VM_NRESERVLEVEL > 0
5980	/*
5981	 * If both the PTP and the reservation are fully populated, then
5982	 * attempt promotion.
5983	 */
5984	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
5985	    (m->flags & PG_FICTITIOUS) == 0 &&
5986	    vm_reserv_level_iffullpop(m) == 0) {
5987		if (l2 == NULL)
5988			l2 = pmap_pde(pmap, va, &lvl);
5989
5990		/*
5991		 * If promotion succeeds, then the next call to this function
5992		 * should not be given the unmapped PTP as a hint.
5993		 */
5994		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
5995			mpte = NULL;
5996	}
5997#endif
5998
5999	return (mpte);
6000}
6001
6002/*
6003 * This code maps large physical mmap regions into the
6004 * processor address space.  Note that some shortcuts
6005 * are taken, but the code works.
6006 */
6007void
6008pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6009    vm_pindex_t pindex, vm_size_t size)
6010{
6011
6012	VM_OBJECT_ASSERT_WLOCKED(object);
6013	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6014	    ("pmap_object_init_pt: non-device object"));
6015}
6016
6017/*
6018 *	Clear the wired attribute from the mappings for the specified range of
6019 *	addresses in the given pmap.  Every valid mapping within that range
6020 *	must have the wired attribute set.  In contrast, invalid mappings
6021 *	cannot have the wired attribute set, so they are ignored.
6022 *
6023 *	The wired attribute of the page table entry is not a hardware feature,
6024 *	so there is no need to invalidate any TLB entries.
6025 */
6026void
6027pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6028{
6029	vm_offset_t va_next;
6030	pd_entry_t *l0, *l1, *l2;
6031	pt_entry_t *l3;
6032	bool partial_l3c;
6033
6034	PMAP_LOCK(pmap);
6035	for (; sva < eva; sva = va_next) {
6036		l0 = pmap_l0(pmap, sva);
6037		if (pmap_load(l0) == 0) {
6038			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6039			if (va_next < sva)
6040				va_next = eva;
6041			continue;
6042		}
6043
6044		l1 = pmap_l0_to_l1(l0, sva);
6045		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6046		if (va_next < sva)
6047			va_next = eva;
6048		if (pmap_load(l1) == 0)
6049			continue;
6050
6051		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6052			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6053			KASSERT(va_next <= eva,
6054			    ("partial update of non-transparent 1G page "
6055			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6056			    pmap_load(l1), sva, eva, va_next));
6057			MPASS(pmap != kernel_pmap);
6058			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6059			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6060			pmap_clear_bits(l1, ATTR_SW_WIRED);
6061			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6062			continue;
6063		}
6064
6065		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6066		if (va_next < sva)
6067			va_next = eva;
6068
6069		l2 = pmap_l1_to_l2(l1, sva);
6070		if (pmap_load(l2) == 0)
6071			continue;
6072
6073		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6074			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6075				panic("pmap_unwire: l2 %#jx is missing "
6076				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6077
6078			/*
6079			 * Are we unwiring the entire large page?  If not,
6080			 * demote the mapping and fall through.
6081			 */
6082			if (sva + L2_SIZE == va_next && eva >= va_next) {
6083				pmap_clear_bits(l2, ATTR_SW_WIRED);
6084				pmap->pm_stats.wired_count -= L2_SIZE /
6085				    PAGE_SIZE;
6086				continue;
6087			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6088				panic("pmap_unwire: demotion failed");
6089		}
6090		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6091		    ("pmap_unwire: Invalid l2 entry after demotion"));
6092
6093		if (va_next > eva)
6094			va_next = eva;
6095		for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6096		    sva != va_next; l3++, sva += L3_SIZE) {
6097			if (pmap_load(l3) == 0)
6098				continue;
6099			if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6100				/*
6101				 * Avoid demotion for whole-page unwiring.
6102				 */
6103				if ((sva & L3C_OFFSET) == 0) {
6104					/*
6105					 * Handle the possibility that
6106					 * "va_next" is zero because of
6107					 * address wraparound.
6108					 */
6109					partial_l3c = sva + L3C_OFFSET >
6110					    va_next - 1;
6111				}
6112				if (partial_l3c)
6113					(void)pmap_demote_l3c(pmap, l3, sva);
6114			}
6115			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6116				panic("pmap_unwire: l3 %#jx is missing "
6117				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6118
6119			/*
6120			 * ATTR_SW_WIRED must be cleared atomically.  Although
6121			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6122			 * the System MMU may write to the entry concurrently.
6123			 */
6124			pmap_clear_bits(l3, ATTR_SW_WIRED);
6125			pmap->pm_stats.wired_count--;
6126		}
6127	}
6128	PMAP_UNLOCK(pmap);
6129}
6130
6131/*
6132 * This function requires that the caller has already added one to ml3's
6133 * ref_count in anticipation of creating a 4KB page mapping.
6134 */
6135static bool
6136pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6137    vm_page_t ml3, struct rwlock **lockp)
6138{
6139	pt_entry_t *tl3p;
6140
6141	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6142	KASSERT((va & L3C_OFFSET) == 0,
6143	    ("pmap_copy_l3c: va is not aligned"));
6144	KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6145	    ("pmap_copy_l3c: l3e is not managed"));
6146
6147	/*
6148	 * Abort if a mapping already exists.
6149	 */
6150	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6151		if (pmap_load(tl3p) != 0) {
6152			if (ml3 != NULL)
6153				ml3->ref_count--;
6154			return (false);
6155		}
6156
6157	if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6158		if (ml3 != NULL)
6159			pmap_abort_ptp(pmap, va, ml3);
6160		return (false);
6161	}
6162	ml3->ref_count += L3C_ENTRIES - 1;
6163
6164	/*
6165	 * Clear the wired and accessed bits.  However, leave the dirty bit
6166	 * unchanged because read/write superpage mappings are required to be
6167	 * dirty.
6168	 */
6169	l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6170
6171	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6172		pmap_store(tl3p, l3e);
6173		l3e += L3_SIZE;
6174	}
6175	pmap_resident_count_inc(pmap, L3C_ENTRIES);
6176	counter_u64_add(pmap_l3c_mappings, 1);
6177	CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6178	    va, pmap);
6179	return (true);
6180}
6181
6182/*
6183 *	Copy the range specified by src_addr/len
6184 *	from the source map to the range dst_addr/len
6185 *	in the destination map.
6186 *
6187 *	This routine is only advisory and need not do anything.
6188 *
6189 *	Because the executable mappings created by this routine are copied,
6190 *	it should not have to flush the instruction cache.
6191 */
6192void
6193pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6194    vm_offset_t src_addr)
6195{
6196	struct rwlock *lock;
6197	pd_entry_t *l0, *l1, *l2, srcptepaddr;
6198	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6199	vm_offset_t addr, end_addr, va_next;
6200	vm_page_t dst_m, dstmpte, srcmpte;
6201
6202	PMAP_ASSERT_STAGE1(dst_pmap);
6203	PMAP_ASSERT_STAGE1(src_pmap);
6204
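	/*
	 * Only the fork() case, in which the source and destination ranges
	 * are identical, is handled.
	 */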
6205	if (dst_addr != src_addr)
6206		return;
6207	end_addr = src_addr + len;
6208	lock = NULL;
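	/* Lock the pmaps in a consistent order to avoid deadlock. */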
6209	if (dst_pmap < src_pmap) {
6210		PMAP_LOCK(dst_pmap);
6211		PMAP_LOCK(src_pmap);
6212	} else {
6213		PMAP_LOCK(src_pmap);
6214		PMAP_LOCK(dst_pmap);
6215	}
6216	for (addr = src_addr; addr < end_addr; addr = va_next) {
6217		l0 = pmap_l0(src_pmap, addr);
6218		if (pmap_load(l0) == 0) {
6219			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6220			if (va_next < addr)
6221				va_next = end_addr;
6222			continue;
6223		}
6224
6225		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6226		if (va_next < addr)
6227			va_next = end_addr;
6228		l1 = pmap_l0_to_l1(l0, addr);
6229		if (pmap_load(l1) == 0)
6230			continue;
6231		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6232			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6233			KASSERT(va_next <= end_addr,
6234			    ("partial update of non-transparent 1G page "
6235			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6236			    pmap_load(l1), addr, end_addr, va_next));
6237			srcptepaddr = pmap_load(l1);
6238			l1 = pmap_l1(dst_pmap, addr);
6239			if (l1 == NULL) {
6240				if (_pmap_alloc_l3(dst_pmap,
6241				    pmap_l0_pindex(addr), NULL) == NULL)
6242					break;
6243				l1 = pmap_l1(dst_pmap, addr);
6244			} else {
6245				l0 = pmap_l0(dst_pmap, addr);
6246				dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6247				dst_m->ref_count++;
6248			}
6249			KASSERT(pmap_load(l1) == 0,
6250			    ("1G mapping present in dst pmap "
6251			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6252			    pmap_load(l1), addr, end_addr, va_next));
6253			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6254			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6255			continue;
6256		}
6257
6258		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6259		if (va_next < addr)
6260			va_next = end_addr;
6261		l2 = pmap_l1_to_l2(l1, addr);
6262		srcptepaddr = pmap_load(l2);
6263		if (srcptepaddr == 0)
6264			continue;
6265		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6266			/*
6267			 * We can only virtual copy whole superpages.
6268			 */
6269			if ((addr & L2_OFFSET) != 0 ||
6270			    addr + L2_SIZE > end_addr)
6271				continue;
6272			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6273			if (l2 == NULL)
6274				break;
6275			if (pmap_load(l2) == 0 &&
6276			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6277			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6278			    PMAP_ENTER_NORECLAIM, &lock))) {
6279				/*
6280				 * We leave the dirty bit unchanged because
6281				 * managed read/write superpage mappings are
6282				 * required to be dirty.  However, managed
6283				 * superpage mappings are not required to
6284				 * have their accessed bit set, so we clear
6285				 * it because we don't know if this mapping
6286				 * will be used.
6287				 */
6288				srcptepaddr &= ~ATTR_SW_WIRED;
6289				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6290					srcptepaddr &= ~ATTR_AF;
6291				pmap_store(l2, srcptepaddr);
6292				pmap_resident_count_inc(dst_pmap, L2_SIZE /
6293				    PAGE_SIZE);
6294				atomic_add_long(&pmap_l2_mappings, 1);
6295			} else
6296				pmap_abort_ptp(dst_pmap, addr, dst_m);
6297			continue;
6298		}
6299		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6300		    ("pmap_copy: invalid L2 entry"));
6301		srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6302		KASSERT(srcmpte->ref_count > 0,
6303		    ("pmap_copy: source page table page is unused"));
6304		if (va_next > end_addr)
6305			va_next = end_addr;
6306		src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6307		src_pte = &src_pte[pmap_l3_index(addr)];
6308		dstmpte = NULL;
6309		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6310			ptetemp = pmap_load(src_pte);
6311
6312			/*
6313			 * We only virtual copy managed pages.
6314			 */
6315			if ((ptetemp & ATTR_SW_MANAGED) == 0)
6316				continue;
6317
6318			if (dstmpte != NULL) {
6319				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6320				    ("dstmpte pindex/addr mismatch"));
6321				dstmpte->ref_count++;
6322			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6323			    NULL)) == NULL)
6324				goto out;
6325			dst_pte = (pt_entry_t *)
6326			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6327			dst_pte = &dst_pte[pmap_l3_index(addr)];
6328			if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6329			    L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6330			    va_next - 1) {
6331				if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6332				    ptetemp, dstmpte, &lock))
6333					goto out;
6334				addr += L3C_SIZE - PAGE_SIZE;
6335				src_pte += L3C_ENTRIES - 1;
6336			} else if (pmap_load(dst_pte) == 0 &&
6337			    pmap_try_insert_pv_entry(dst_pmap, addr,
6338			    PTE_TO_VM_PAGE(ptetemp), &lock)) {
6339				/*
6340				 * Clear the wired, contiguous, modified, and
6341				 * accessed bits from the destination PTE.
6342				 * The contiguous bit is cleared because we
6343				 * are not copying the entire L3C superpage.
6344				 */
6345				mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6346				    ATTR_AF;
6347				nbits = 0;
6348				if ((ptetemp & ATTR_SW_DBM) != 0)
6349					nbits |= ATTR_S1_AP_RW_BIT;
6350				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6351				pmap_resident_count_inc(dst_pmap, 1);
6352			} else {
6353				pmap_abort_ptp(dst_pmap, addr, dstmpte);
6354				goto out;
6355			}
6356			/* Have we copied all of the valid mappings? */
6357			if (dstmpte->ref_count >= srcmpte->ref_count)
6358				break;
6359		}
6360	}
6361out:
6362	/*
6363	 * XXX This barrier may not be needed because the destination pmap is
6364	 * not active.
6365	 */
6366	dsb(ishst);
6367
6368	if (lock != NULL)
6369		rw_wunlock(lock);
6370	PMAP_UNLOCK(src_pmap);
6371	PMAP_UNLOCK(dst_pmap);
6372}
6373
6374int
6375pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6376{
6377	int error;
6378
6379	if (dst_pmap->pm_stage != src_pmap->pm_stage)
6380		return (EINVAL);
6381
6382	if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6383		return (0);
6384
6385	for (;;) {
6386		if (dst_pmap < src_pmap) {
6387			PMAP_LOCK(dst_pmap);
6388			PMAP_LOCK(src_pmap);
6389		} else {
6390			PMAP_LOCK(src_pmap);
6391			PMAP_LOCK(dst_pmap);
6392		}
6393		error = pmap_bti_copy(dst_pmap, src_pmap);
6394		/* Clean up partial copy on failure due to no memory. */
6395		if (error == ENOMEM)
6396			pmap_bti_deassign_all(dst_pmap);
6397		PMAP_UNLOCK(src_pmap);
6398		PMAP_UNLOCK(dst_pmap);
6399		if (error != ENOMEM)
6400			break;
6401		vm_wait(NULL);
6402	}
6403	return (error);
6404}
6405
6406/*
 *	pmap_zero_page zeroes the specified hardware page through the
 *	direct map.
6409 */
6410void
6411pmap_zero_page(vm_page_t m)
6412{
6413	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6414
6415	pagezero((void *)va);
6416}
6417
6418/*
 *	pmap_zero_page_area zeroes the specified portion of a hardware page
 *	through the direct map.
6421 *
6422 *	off and size may not cover an area beyond a single hardware page.
6423 */
6424void
6425pmap_zero_page_area(vm_page_t m, int off, int size)
6426{
6427	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6428
6429	if (off == 0 && size == PAGE_SIZE)
6430		pagezero((void *)va);
6431	else
6432		bzero((char *)va + off, size);
6433}
6434
6435/*
 *	pmap_copy_page copies the specified (machine independent) page
 *	through the direct map, one machine dependent page at a time.
6440 */
6441void
6442pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6443{
6444	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6445	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6446
6447	pagecopy((void *)src, (void *)dst);
6448}
6449
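/*
 * Unmapped I/O buffers are allowed because pmap_copy_pages() can access the
 * pages through the direct map.
 */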
6450int unmapped_buf_allowed = 1;
6451
6452void
6453pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6454    vm_offset_t b_offset, int xfersize)
6455{
6456	void *a_cp, *b_cp;
6457	vm_page_t m_a, m_b;
6458	vm_paddr_t p_a, p_b;
6459	vm_offset_t a_pg_offset, b_pg_offset;
6460	int cnt;
6461
6462	while (xfersize > 0) {
6463		a_pg_offset = a_offset & PAGE_MASK;
6464		m_a = ma[a_offset >> PAGE_SHIFT];
6465		p_a = m_a->phys_addr;
6466		b_pg_offset = b_offset & PAGE_MASK;
6467		m_b = mb[b_offset >> PAGE_SHIFT];
6468		p_b = m_b->phys_addr;
6469		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6470		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6471		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6472			panic("!DMAP a %lx", p_a);
6473		} else {
6474			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6475		}
6476		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6477			panic("!DMAP b %lx", p_b);
6478		} else {
6479			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6480		}
6481		bcopy(a_cp, b_cp, cnt);
6482		a_offset += cnt;
6483		b_offset += cnt;
6484		xfersize -= cnt;
6485	}
6486}
6487
6488vm_offset_t
6489pmap_quick_enter_page(vm_page_t m)
6490{
6491
6492	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6493}
6494
6495void
6496pmap_quick_remove_page(vm_offset_t addr)
6497{
6498}
6499
6500/*
6501 * Returns true if the pmap's pv is one of the first
6502 * 16 pvs linked to from this page.  This count may
6503 * be changed upwards or downwards in the future; it
6504 * is only necessary that true be returned for a small
6505 * subset of pmaps for proper page aging.
6506 */
6507bool
6508pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6509{
6510	struct md_page *pvh;
6511	struct rwlock *lock;
6512	pv_entry_t pv;
6513	int loops = 0;
6514	bool rv;
6515
6516	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6517	    ("pmap_page_exists_quick: page %p is not managed", m));
6518	rv = false;
6519	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6520	rw_rlock(lock);
6521	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6522		if (PV_PMAP(pv) == pmap) {
6523			rv = true;
6524			break;
6525		}
6526		loops++;
6527		if (loops >= 16)
6528			break;
6529	}
6530	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6531		pvh = page_to_pvh(m);
6532		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6533			if (PV_PMAP(pv) == pmap) {
6534				rv = true;
6535				break;
6536			}
6537			loops++;
6538			if (loops >= 16)
6539				break;
6540		}
6541	}
6542	rw_runlock(lock);
6543	return (rv);
6544}
6545
6546/*
6547 *	pmap_page_wired_mappings:
6548 *
6549 *	Return the number of managed mappings to the given physical page
6550 *	that are wired.
6551 */
6552int
6553pmap_page_wired_mappings(vm_page_t m)
6554{
6555	struct rwlock *lock;
6556	struct md_page *pvh;
6557	pmap_t pmap;
6558	pt_entry_t *pte;
6559	pv_entry_t pv;
6560	int count, md_gen, pvh_gen;
6561
6562	if ((m->oflags & VPO_UNMANAGED) != 0)
6563		return (0);
6564	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6565	rw_rlock(lock);
6566restart:
6567	count = 0;
6568	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6569		pmap = PV_PMAP(pv);
6570		if (!PMAP_TRYLOCK(pmap)) {
6571			md_gen = m->md.pv_gen;
6572			rw_runlock(lock);
6573			PMAP_LOCK(pmap);
6574			rw_rlock(lock);
6575			if (md_gen != m->md.pv_gen) {
6576				PMAP_UNLOCK(pmap);
6577				goto restart;
6578			}
6579		}
6580		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6581		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6582			count++;
6583		PMAP_UNLOCK(pmap);
6584	}
6585	if ((m->flags & PG_FICTITIOUS) == 0) {
6586		pvh = page_to_pvh(m);
6587		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6588			pmap = PV_PMAP(pv);
6589			if (!PMAP_TRYLOCK(pmap)) {
6590				md_gen = m->md.pv_gen;
6591				pvh_gen = pvh->pv_gen;
6592				rw_runlock(lock);
6593				PMAP_LOCK(pmap);
6594				rw_rlock(lock);
6595				if (md_gen != m->md.pv_gen ||
6596				    pvh_gen != pvh->pv_gen) {
6597					PMAP_UNLOCK(pmap);
6598					goto restart;
6599				}
6600			}
6601			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6602			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6603				count++;
6604			PMAP_UNLOCK(pmap);
6605		}
6606	}
6607	rw_runlock(lock);
6608	return (count);
6609}
6610
6611/*
 * Returns true if the given page is mapped individually or as part of
 * an L2 (2MB) page mapping.  Otherwise, returns false.
6614 */
6615bool
6616pmap_page_is_mapped(vm_page_t m)
6617{
6618	struct rwlock *lock;
6619	bool rv;
6620
6621	if ((m->oflags & VPO_UNMANAGED) != 0)
6622		return (false);
6623	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6624	rw_rlock(lock);
6625	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6626	    ((m->flags & PG_FICTITIOUS) == 0 &&
6627	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6628	rw_runlock(lock);
6629	return (rv);
6630}
6631
6632/*
6633 * Destroy all managed, non-wired mappings in the given user-space
6634 * pmap.  This pmap cannot be active on any processor besides the
6635 * caller.
6636 *
6637 * This function cannot be applied to the kernel pmap.  Moreover, it
6638 * is not intended for general use.  It is only to be used during
6639 * process termination.  Consequently, it can be implemented in ways
6640 * that make it faster than pmap_remove().  First, it can more quickly
6641 * destroy mappings by iterating over the pmap's collection of PV
6642 * entries, rather than searching the page table.  Second, it doesn't
6643 * have to test and clear the page table entries atomically, because
6644 * no processor is currently accessing the user address space.  In
6645 * particular, a page table entry's dirty bit won't change state once
6646 * this function starts.
6647 */
6648void
6649pmap_remove_pages(pmap_t pmap)
6650{
6651	pd_entry_t *pde;
6652	pt_entry_t *pte, tpte;
6653	struct spglist free;
6654	struct pv_chunklist free_chunks[PMAP_MEMDOM];
6655	vm_page_t m, ml3, mt;
6656	pv_entry_t pv;
6657	struct md_page *pvh;
6658	struct pv_chunk *pc, *npc;
6659	struct rwlock *lock;
6660	int64_t bit;
6661	uint64_t inuse, bitmask;
6662	int allfree, field, i, idx, lvl;
6663	int freed __pvused;
6664	vm_paddr_t pa;
6665
6666	lock = NULL;
6667
6668	for (i = 0; i < PMAP_MEMDOM; i++)
6669		TAILQ_INIT(&free_chunks[i]);
6670	SLIST_INIT(&free);
6671	PMAP_LOCK(pmap);
6672	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6673		allfree = 1;
6674		freed = 0;
6675		for (field = 0; field < _NPCM; field++) {
6676			inuse = ~pc->pc_map[field] & pc_freemask[field];
6677			while (inuse != 0) {
6678				bit = ffsl(inuse) - 1;
6679				bitmask = 1UL << bit;
6680				idx = field * 64 + bit;
6681				pv = &pc->pc_pventry[idx];
6682				inuse &= ~bitmask;
6683
6684				pde = pmap_pde(pmap, pv->pv_va, &lvl);
6685				KASSERT(pde != NULL,
6686				    ("Attempting to remove an unmapped page"));
6687
				switch (lvl) {
6689				case 1:
6690					pte = pmap_l1_to_l2(pde, pv->pv_va);
6691					tpte = pmap_load(pte);
6692					KASSERT((tpte & ATTR_DESCR_MASK) ==
6693					    L2_BLOCK,
6694					    ("Attempting to remove an invalid "
6695					    "block: %lx", tpte));
6696					break;
6697				case 2:
6698					pte = pmap_l2_to_l3(pde, pv->pv_va);
6699					tpte = pmap_load(pte);
6700					KASSERT((tpte & ATTR_DESCR_MASK) ==
6701					    L3_PAGE,
6702					    ("Attempting to remove an invalid "
6703					     "page: %lx", tpte));
6704					break;
6705				default:
6706					panic(
6707					    "Invalid page directory level: %d",
6708					    lvl);
6709				}
6710
6711				/*
6712				 * We cannot remove wired mappings at this time.
6713				 *
6714				 * For L3C superpages, all of the constituent PTEs
6715				 * should have the wired bit set, so we don't
6716				 * check for ATTR_CONTIGUOUS here.
6717				 */
6718				if (tpte & ATTR_SW_WIRED) {
6719					allfree = 0;
6720					continue;
6721				}
6722
6723				/* Mark free */
6724				pc->pc_map[field] |= bitmask;
6725
6726				/*
6727				 * Because this pmap is not active on other
6728				 * processors, the dirty bit cannot have
6729				 * changed state since we last loaded pte.
6730				 */
6731				pmap_clear(pte);
6732
6733				pa = PTE_TO_PHYS(tpte);
6734
6735				m = PHYS_TO_VM_PAGE(pa);
6736				KASSERT(m->phys_addr == pa,
6737				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6738				    m, (uintmax_t)m->phys_addr,
6739				    (uintmax_t)tpte));
6740
6741				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6742				    m < &vm_page_array[vm_page_array_size],
6743				    ("pmap_remove_pages: bad pte %#jx",
6744				    (uintmax_t)tpte));
6745
6746				/*
6747				 * Update the vm_page_t clean/reference bits.
6748				 *
6749				 * We don't check for ATTR_CONTIGUOUS here
6750				 * because writeable L3C superpages are expected
6751				 * to be dirty, i.e., every constituent PTE
6752				 * should be dirty.
6753				 */
6754				if (pmap_pte_dirty(pmap, tpte)) {
6755					switch (lvl) {
6756					case 1:
6757						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6758							vm_page_dirty(mt);
6759						break;
6760					case 2:
6761						vm_page_dirty(m);
6762						break;
6763					}
6764				}
6765
6766				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6767
6768				switch (lvl) {
6769				case 1:
6770					pmap_resident_count_dec(pmap,
6771					    L2_SIZE / PAGE_SIZE);
6772					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,
					    pv_next);
6774					pvh->pv_gen++;
6775					if (TAILQ_EMPTY(&pvh->pv_list)) {
6776						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6777							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
6778							    TAILQ_EMPTY(&mt->md.pv_list))
6779								vm_page_aflag_clear(mt, PGA_WRITEABLE);
6780					}
6781					ml3 = pmap_remove_pt_page(pmap,
6782					    pv->pv_va);
6783					if (ml3 != NULL) {
6784						KASSERT(vm_page_any_valid(ml3),
6785						    ("pmap_remove_pages: l3 page not promoted"));
6786						pmap_resident_count_dec(pmap,1);
6787						KASSERT(ml3->ref_count == NL3PG,
6788						    ("pmap_remove_pages: l3 page ref count error"));
6789						ml3->ref_count = 0;
6790						pmap_add_delayed_free_list(ml3,
6791						    &free, false);
6792					}
6793					break;
6794				case 2:
6795					pmap_resident_count_dec(pmap, 1);
6796					TAILQ_REMOVE(&m->md.pv_list, pv,
6797					    pv_next);
6798					m->md.pv_gen++;
6799					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
6800					    TAILQ_EMPTY(&m->md.pv_list) &&
6801					    (m->flags & PG_FICTITIOUS) == 0) {
6802						pvh = page_to_pvh(m);
6803						if (TAILQ_EMPTY(&pvh->pv_list))
6804							vm_page_aflag_clear(m,
6805							    PGA_WRITEABLE);
6806					}
6807					break;
6808				}
6809				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
6810				    &free);
6811				freed++;
6812			}
6813		}
6814		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6815		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6816		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6817		if (allfree) {
6818			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6819			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
6820			    pc_list);
6821		}
6822	}
6823	if (lock != NULL)
6824		rw_wunlock(lock);
6825	pmap_invalidate_all(pmap);
6826	pmap_bti_deassign_all(pmap);
6827	free_pv_chunk_batch(free_chunks);
6828	PMAP_UNLOCK(pmap);
6829	vm_page_free_pages_toq(&free, true);
6830}
6831
6832/*
6833 * This is used to check if a page has been accessed or modified.
6834 */
6835static bool
6836pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
6837{
6838	struct rwlock *lock;
6839	pv_entry_t pv;
6840	struct md_page *pvh;
6841	pt_entry_t l3e, mask, *pte, value;
6842	pmap_t pmap;
6843	int md_gen, pvh_gen;
6844	bool rv;
6845
6846	rv = false;
6847	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6848	rw_rlock(lock);
6849restart:
6850	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6851		pmap = PV_PMAP(pv);
6852		PMAP_ASSERT_STAGE1(pmap);
6853		if (!PMAP_TRYLOCK(pmap)) {
6854			md_gen = m->md.pv_gen;
6855			rw_runlock(lock);
6856			PMAP_LOCK(pmap);
6857			rw_rlock(lock);
6858			if (md_gen != m->md.pv_gen) {
6859				PMAP_UNLOCK(pmap);
6860				goto restart;
6861			}
6862		}
6863		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6864		mask = 0;
6865		value = 0;
6866		if (modified) {
6867			mask |= ATTR_S1_AP_RW_BIT;
6868			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6869		}
6870		if (accessed) {
6871			mask |= ATTR_AF | ATTR_DESCR_MASK;
6872			value |= ATTR_AF | L3_PAGE;
6873		}
6874		l3e = pmap_load(pte);
6875		if ((l3e & ATTR_CONTIGUOUS) != 0)
6876			l3e = pmap_load_l3c(pte);
6877		PMAP_UNLOCK(pmap);
6878		rv = (l3e & mask) == value;
6879		if (rv)
6880			goto out;
6881	}
6882	if ((m->flags & PG_FICTITIOUS) == 0) {
6883		pvh = page_to_pvh(m);
6884		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6885			pmap = PV_PMAP(pv);
6886			PMAP_ASSERT_STAGE1(pmap);
6887			if (!PMAP_TRYLOCK(pmap)) {
6888				md_gen = m->md.pv_gen;
6889				pvh_gen = pvh->pv_gen;
6890				rw_runlock(lock);
6891				PMAP_LOCK(pmap);
6892				rw_rlock(lock);
6893				if (md_gen != m->md.pv_gen ||
6894				    pvh_gen != pvh->pv_gen) {
6895					PMAP_UNLOCK(pmap);
6896					goto restart;
6897				}
6898			}
6899			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6900			mask = 0;
6901			value = 0;
6902			if (modified) {
6903				mask |= ATTR_S1_AP_RW_BIT;
6904				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6905			}
6906			if (accessed) {
6907				mask |= ATTR_AF | ATTR_DESCR_MASK;
6908				value |= ATTR_AF | L2_BLOCK;
6909			}
6910			rv = (pmap_load(pte) & mask) == value;
6911			PMAP_UNLOCK(pmap);
6912			if (rv)
6913				goto out;
6914		}
6915	}
6916out:
6917	rw_runlock(lock);
6918	return (rv);
6919}
6920
6921/*
6922 *	pmap_is_modified:
6923 *
6924 *	Return whether or not the specified physical page was modified
6925 *	in any physical maps.
6926 */
6927bool
6928pmap_is_modified(vm_page_t m)
6929{
6930
6931	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6932	    ("pmap_is_modified: page %p is not managed", m));
6933
6934	/*
6935	 * If the page is not busied then this check is racy.
6936	 */
6937	if (!pmap_page_is_write_mapped(m))
6938		return (false);
6939	return (pmap_page_test_mappings(m, false, true));
6940}
6941
6942/*
6943 *	pmap_is_prefaultable:
6944 *
6945 *	Return whether or not the specified virtual address is eligible
6946 *	for prefault.
6947 */
6948bool
6949pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6950{
6951	pd_entry_t *pde;
6952	pt_entry_t *pte;
6953	bool rv;
6954	int lvl;
6955
6956	/*
6957	 * Return true if and only if the L3 entry for the specified virtual
6958	 * address is allocated but invalid.
6959	 */
6960	rv = false;
6961	PMAP_LOCK(pmap);
6962	pde = pmap_pde(pmap, addr, &lvl);
6963	if (pde != NULL && lvl == 2) {
6964		pte = pmap_l2_to_l3(pde, addr);
6965		rv = pmap_load(pte) == 0;
6966	}
6967	PMAP_UNLOCK(pmap);
6968	return (rv);
6969}
6970
6971/*
6972 *	pmap_is_referenced:
6973 *
6974 *	Return whether or not the specified physical page was referenced
6975 *	in any physical maps.
6976 */
6977bool
6978pmap_is_referenced(vm_page_t m)
6979{
6980
6981	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6982	    ("pmap_is_referenced: page %p is not managed", m));
6983	return (pmap_page_test_mappings(m, true, false));
6984}
6985
6986/*
6987 * Clear the write and modified bits in each of the given page's mappings.
6988 */
6989void
6990pmap_remove_write(vm_page_t m)
6991{
6992	struct md_page *pvh;
6993	pmap_t pmap;
6994	struct rwlock *lock;
6995	pv_entry_t next_pv, pv;
6996	pt_entry_t oldpte, *pte, set, clear, mask, val;
6997	vm_offset_t va;
6998	int md_gen, pvh_gen;
6999
7000	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7001	    ("pmap_remove_write: page %p is not managed", m));
7002	vm_page_assert_busied(m);
7003
7004	if (!pmap_page_is_write_mapped(m))
7005		return;
7006	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7007	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7008	rw_wlock(lock);
7009retry:
7010	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7011		pmap = PV_PMAP(pv);
7012		PMAP_ASSERT_STAGE1(pmap);
7013		if (!PMAP_TRYLOCK(pmap)) {
7014			pvh_gen = pvh->pv_gen;
7015			rw_wunlock(lock);
7016			PMAP_LOCK(pmap);
7017			rw_wlock(lock);
7018			if (pvh_gen != pvh->pv_gen) {
7019				PMAP_UNLOCK(pmap);
7020				goto retry;
7021			}
7022		}
7023		va = pv->pv_va;
7024		pte = pmap_pte_exists(pmap, va, 2, __func__);
7025		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7026			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7027		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7028		    ("inconsistent pv lock %p %p for page %p",
7029		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7030		PMAP_UNLOCK(pmap);
7031	}
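	/*
	 * Clear write access from the remaining 4KB page mappings, including
	 * any mappings created by the demotions above.
	 */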
7032	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7033		pmap = PV_PMAP(pv);
7034		if (!PMAP_TRYLOCK(pmap)) {
7035			pvh_gen = pvh->pv_gen;
7036			md_gen = m->md.pv_gen;
7037			rw_wunlock(lock);
7038			PMAP_LOCK(pmap);
7039			rw_wlock(lock);
7040			if (pvh_gen != pvh->pv_gen ||
7041			    md_gen != m->md.pv_gen) {
7042				PMAP_UNLOCK(pmap);
7043				goto retry;
7044			}
7045		}
7046		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7047		oldpte = pmap_load(pte);
7048		if ((oldpte & ATTR_SW_DBM) != 0) {
7049			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7050				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7051
7052				/*
7053				 * The L3 entry's accessed bit may have
7054				 * changed.
7055				 */
7056				oldpte = pmap_load(pte);
7057			}
7058			if (pmap->pm_stage == PM_STAGE1) {
7059				set = ATTR_S1_AP_RW_BIT;
7060				clear = 0;
7061				mask = ATTR_S1_AP_RW_BIT;
7062				val = ATTR_S1_AP(ATTR_S1_AP_RW);
7063			} else {
7064				set = 0;
7065				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7066				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7067				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7068			}
7069			clear |= ATTR_SW_DBM;
7070			while (!atomic_fcmpset_64(pte, &oldpte,
7071			    (oldpte | set) & ~clear))
7072				cpu_spinwait();
7073
7074			if ((oldpte & mask) == val)
7075				vm_page_dirty(m);
7076			pmap_invalidate_page(pmap, pv->pv_va, true);
7077		}
7078		PMAP_UNLOCK(pmap);
7079	}
7080	rw_wunlock(lock);
7081	vm_page_aflag_clear(m, PGA_WRITEABLE);
7082}
7083
7084/*
7085 *	pmap_ts_referenced:
7086 *
7087 *	Return a count of reference bits for a page, clearing those bits.
7088 *	It is not necessary for every reference bit to be cleared, but it
7089 *	is necessary that 0 only be returned when there are truly no
7090 *	reference bits set.
7091 *
7092 *	As an optimization, update the page's dirty field if a modified bit is
7093 *	found while counting reference bits.  This opportunistic update can be
7094 *	performed at low cost and can eliminate the need for some future calls
7095 *	to pmap_is_modified().  However, since this function stops after
7096 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7097 *	dirty pages.  Those dirty pages will only be detected by a future call
7098 *	to pmap_is_modified().
7099 */
7100int
7101pmap_ts_referenced(vm_page_t m)
7102{
7103	struct md_page *pvh;
7104	pv_entry_t pv, pvf;
7105	pmap_t pmap;
7106	struct rwlock *lock;
7107	pt_entry_t *pte, tpte;
7108	vm_offset_t va;
7109	vm_paddr_t pa;
7110	int cleared, md_gen, not_cleared, pvh_gen;
7111	struct spglist free;
7112
7113	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7114	    ("pmap_ts_referenced: page %p is not managed", m));
7115	SLIST_INIT(&free);
7116	cleared = 0;
7117	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7118	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7119	rw_wlock(lock);
7120retry:
7121	not_cleared = 0;
7122	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7123		goto small_mappings;
7124	pv = pvf;
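	/*
	 * Scan the 2MB mappings, rotating the PV list as we go; "pvf" marks
	 * where the scan started so that the loop stops after one full pass.
	 */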
7125	do {
7126		if (pvf == NULL)
7127			pvf = pv;
7128		pmap = PV_PMAP(pv);
7129		if (!PMAP_TRYLOCK(pmap)) {
7130			pvh_gen = pvh->pv_gen;
7131			rw_wunlock(lock);
7132			PMAP_LOCK(pmap);
7133			rw_wlock(lock);
7134			if (pvh_gen != pvh->pv_gen) {
7135				PMAP_UNLOCK(pmap);
7136				goto retry;
7137			}
7138		}
7139		va = pv->pv_va;
7140		pte = pmap_pte_exists(pmap, va, 2, __func__);
7141		tpte = pmap_load(pte);
7142		if (pmap_pte_dirty(pmap, tpte)) {
7143			/*
7144			 * Although "tpte" is mapping a 2MB page, because
7145			 * this function is called at a 4KB page granularity,
7146			 * we only update the 4KB page under test.
7147			 */
7148			vm_page_dirty(m);
7149		}
7150		if ((tpte & ATTR_AF) != 0) {
7151			pa = VM_PAGE_TO_PHYS(m);
7152
7153			/*
7154			 * Since this reference bit is shared by 512 4KB pages,
7155			 * it should not be cleared every time it is tested.
7156			 * Apply a simple "hash" function on the physical page
7157			 * number, the virtual superpage number, and the pmap
7158			 * address to select one 4KB page out of the 512 on
7159			 * which testing the reference bit will result in
7160			 * clearing that reference bit.  This function is
7161			 * designed to avoid the selection of the same 4KB page
7162			 * for every 2MB page mapping.
7163			 *
7164			 * On demotion, a mapping that hasn't been referenced
7165			 * is simply destroyed.  To avoid the possibility of a
7166			 * subsequent page fault on a demoted wired mapping,
7167			 * always leave its reference bit set.  Moreover,
7168			 * since the superpage is wired, the current state of
7169			 * its reference bit won't affect page replacement.
7170			 */
7171			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7172			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7173			    (tpte & ATTR_SW_WIRED) == 0) {
7174				pmap_clear_bits(pte, ATTR_AF);
7175				pmap_invalidate_page(pmap, va, true);
7176				cleared++;
7177			} else
7178				not_cleared++;
7179		}
7180		PMAP_UNLOCK(pmap);
7181		/* Rotate the PV list if it has more than one entry. */
7182		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7183			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7184			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7185			pvh->pv_gen++;
7186		}
7187		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7188			goto out;
7189	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7190small_mappings:
7191	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7192		goto out;
7193	pv = pvf;
7194	do {
7195		if (pvf == NULL)
7196			pvf = pv;
7197		pmap = PV_PMAP(pv);
7198		if (!PMAP_TRYLOCK(pmap)) {
7199			pvh_gen = pvh->pv_gen;
7200			md_gen = m->md.pv_gen;
7201			rw_wunlock(lock);
7202			PMAP_LOCK(pmap);
7203			rw_wlock(lock);
7204			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7205				PMAP_UNLOCK(pmap);
7206				goto retry;
7207			}
7208		}
7209		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7210		tpte = pmap_load(pte);
7211		if (pmap_pte_dirty(pmap, tpte))
7212			vm_page_dirty(m);
7213		if ((tpte & ATTR_AF) != 0) {
7214			if ((tpte & ATTR_SW_WIRED) == 0) {
7215				/*
7216				 * Clear the accessed bit in this L3 entry
7217				 * regardless of the contiguous bit.
7218				 */
7219				pmap_clear_bits(pte, ATTR_AF);
7220				pmap_invalidate_page(pmap, pv->pv_va, true);
7221				cleared++;
7222			} else
7223				not_cleared++;
7224		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7225		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7226			/*
7227			 * An L3C superpage mapping is regarded as accessed
7228			 * until the accessed bit has been cleared in all
7229			 * of its constituent entries.
7230			 */
7231			not_cleared++;
7232		}
7233		PMAP_UNLOCK(pmap);
7234		/* Rotate the PV list if it has more than one entry. */
7235		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7236			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7237			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7238			m->md.pv_gen++;
7239		}
7240	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7241	    not_cleared < PMAP_TS_REFERENCED_MAX);
7242out:
7243	rw_wunlock(lock);
7244	vm_page_free_pages_toq(&free, true);
7245	return (cleared + not_cleared);
7246}
7247
7248/*
7249 *	Apply the given advice to the specified range of addresses within the
7250 *	given pmap.  Depending on the advice, clear the referenced and/or
7251 *	modified flags in each mapping and set the mapped page's dirty field.
7252 */
7253void
7254pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7255{
7256	struct rwlock *lock;
7257	vm_offset_t va, va_next, dva;
7258	vm_page_t m;
7259	pd_entry_t *l0, *l1, *l2, oldl2;
7260	pt_entry_t *l3, *dl3, oldl3;
7261
7262	PMAP_ASSERT_STAGE1(pmap);
7263
7264	if (advice != MADV_DONTNEED && advice != MADV_FREE)
7265		return;
7266
7267	PMAP_LOCK(pmap);
7268	for (; sva < eva; sva = va_next) {
7269		l0 = pmap_l0(pmap, sva);
7270		if (pmap_load(l0) == 0) {
7271			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7272			if (va_next < sva)
7273				va_next = eva;
7274			continue;
7275		}
7276
7277		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7278		if (va_next < sva)
7279			va_next = eva;
7280		l1 = pmap_l0_to_l1(l0, sva);
7281		if (pmap_load(l1) == 0)
7282			continue;
7283		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7284			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7285			continue;
7286		}
7287
7288		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7289		if (va_next < sva)
7290			va_next = eva;
7291		l2 = pmap_l1_to_l2(l1, sva);
7292		oldl2 = pmap_load(l2);
7293		if (oldl2 == 0)
7294			continue;
7295		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7296			if ((oldl2 & ATTR_SW_MANAGED) == 0)
7297				continue;
7298			lock = NULL;
7299			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7300				if (lock != NULL)
7301					rw_wunlock(lock);
7302
7303				/*
7304				 * The 2MB page mapping was destroyed.
7305				 */
7306				continue;
7307			}
7308
7309			/*
7310			 * Unless the page mappings are wired, remove the
7311			 * mapping to a single page so that a subsequent
7312			 * access may repromote.  Choosing the last page
7313			 * within the address range [sva, min(va_next, eva))
7314			 * generally results in more repromotions.  Since the
7315			 * underlying page table page is fully populated, this
7316			 * removal never frees a page table page.
7317			 */
7318			if ((oldl2 & ATTR_SW_WIRED) == 0) {
7319				va = eva;
7320				if (va > va_next)
7321					va = va_next;
7322				va -= PAGE_SIZE;
7323				KASSERT(va >= sva,
7324				    ("pmap_advise: no address gap"));
7325				l3 = pmap_l2_to_l3(l2, va);
7326				KASSERT(pmap_load(l3) != 0,
7327				    ("pmap_advise: invalid PTE"));
7328				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7329				    NULL, &lock);
7330			}
7331			if (lock != NULL)
7332				rw_wunlock(lock);
7333		}
7334		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7335		    ("pmap_advise: invalid L2 entry after demotion"));
7336		if (va_next > eva)
7337			va_next = eva;
7338		va = va_next;
7339		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7340		    sva += L3_SIZE) {
7341			oldl3 = pmap_load(l3);
7342			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7343			    (ATTR_SW_MANAGED | L3_PAGE))
7344				goto maybe_invlrng;
7345			else if (pmap_pte_dirty(pmap, oldl3)) {
7346				if (advice == MADV_DONTNEED) {
7347					/*
7348					 * Future calls to pmap_is_modified()
7349					 * can be avoided by making the page
7350					 * dirty now.
7351					 */
7352					m = PTE_TO_VM_PAGE(oldl3);
7353					vm_page_dirty(m);
7354				}
7355				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7356					/*
7357					 * Unconditionally demote the L3C
7358					 * superpage because we do not allow
7359					 * writeable, clean superpages.
7360					 */
7361					(void)pmap_demote_l3c(pmap, l3, sva);
7362
7363					/*
					 * Destroy the final mapping before the
					 * next L3C boundary or va_next,
					 * whichever comes first, so that a
					 * subsequent access may act as a
					 * repromotion trigger.
					 */
					if ((oldl3 & ATTR_SW_WIRED) == 0) {
7371						dva = MIN((sva & ~L3C_OFFSET) +
7372						    L3C_SIZE - PAGE_SIZE,
7373						    va_next - PAGE_SIZE);
7374						dl3 = pmap_l2_to_l3(l2, dva);
7375						KASSERT(pmap_load(dl3) != 0,
7376						    ("pmap_advise: invalid PTE"));
7377						lock = NULL;
7378						pmap_remove_l3(pmap, dl3, dva,
7379						    pmap_load(l2), NULL, &lock);
7380						if (lock != NULL)
7381							rw_wunlock(lock);
7382					}
7383
7384					/*
7385					 * The L3 entry's accessed bit may have
7386					 * changed.
7387					 */
7388					oldl3 = pmap_load(l3);
7389				}
7390
7391				/*
				 * Check that we did not just destroy this
				 * entry, so that we avoid corrupting the page
				 * table.
7394				 */
7395				if (oldl3 != 0) {
7396					while (!atomic_fcmpset_long(l3, &oldl3,
7397					    (oldl3 & ~ATTR_AF) |
7398					    ATTR_S1_AP(ATTR_S1_AP_RO)))
7399						cpu_spinwait();
7400				}
7401			} else if ((oldl3 & ATTR_AF) != 0) {
7402				/*
7403				 * Clear the accessed bit in this L3 entry
7404				 * regardless of the contiguous bit.
7405				 */
7406				pmap_clear_bits(l3, ATTR_AF);
7407			} else
7408				goto maybe_invlrng;
7409			if (va == va_next)
7410				va = sva;
7411			continue;
7412maybe_invlrng:
7413			if (va != va_next) {
7414				pmap_s1_invalidate_range(pmap, va, sva, true);
7415				va = va_next;
7416			}
7417		}
7418		if (va != va_next)
7419			pmap_s1_invalidate_range(pmap, va, sva, true);
7420	}
7421	PMAP_UNLOCK(pmap);
7422}
7423
7424/*
7425 *	Clear the modify bits on the specified physical page.
7426 */
7427void
7428pmap_clear_modify(vm_page_t m)
7429{
7430	struct md_page *pvh;
7431	struct rwlock *lock;
7432	pmap_t pmap;
7433	pv_entry_t next_pv, pv;
7434	pd_entry_t *l2, oldl2;
7435	pt_entry_t *l3, oldl3;
7436	vm_offset_t va;
7437	int md_gen, pvh_gen;
7438
7439	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7440	    ("pmap_clear_modify: page %p is not managed", m));
7441	vm_page_assert_busied(m);
7442
7443	if (!pmap_page_is_write_mapped(m))
7444		return;
7445	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7446	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7447	rw_wlock(lock);
7448restart:
7449	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7450		pmap = PV_PMAP(pv);
7451		PMAP_ASSERT_STAGE1(pmap);
7452		if (!PMAP_TRYLOCK(pmap)) {
7453			pvh_gen = pvh->pv_gen;
7454			rw_wunlock(lock);
7455			PMAP_LOCK(pmap);
7456			rw_wlock(lock);
7457			if (pvh_gen != pvh->pv_gen) {
7458				PMAP_UNLOCK(pmap);
7459				goto restart;
7460			}
7461		}
7462		va = pv->pv_va;
7463		l2 = pmap_l2(pmap, va);
7464		oldl2 = pmap_load(l2);
7465		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7466		if ((oldl2 & ATTR_SW_DBM) != 0 &&
7467		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7468		    (oldl2 & ATTR_SW_WIRED) == 0) {
7469			/*
7470			 * Write protect the mapping to a single page so that
7471			 * a subsequent write access may repromote.
7472			 */
7473			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7474			l3 = pmap_l2_to_l3(l2, va);
7475			oldl3 = pmap_load(l3);
7476			while (!atomic_fcmpset_long(l3, &oldl3,
7477			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7478				cpu_spinwait();
7479			vm_page_dirty(m);
7480			pmap_s1_invalidate_page(pmap, va, true);
7481		}
7482		PMAP_UNLOCK(pmap);
7483	}
7484	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7485		pmap = PV_PMAP(pv);
7486		PMAP_ASSERT_STAGE1(pmap);
7487		if (!PMAP_TRYLOCK(pmap)) {
7488			md_gen = m->md.pv_gen;
7489			pvh_gen = pvh->pv_gen;
7490			rw_wunlock(lock);
7491			PMAP_LOCK(pmap);
7492			rw_wlock(lock);
7493			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7494				PMAP_UNLOCK(pmap);
7495				goto restart;
7496			}
7497		}
7498		l2 = pmap_l2(pmap, pv->pv_va);
7499		l3 = pmap_l2_to_l3(l2, pv->pv_va);
7500		oldl3 = pmap_load(l3);
7501		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7502		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7503		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7504		    ("writeable L3C superpage not dirty"));
7505		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7506			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7507				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7508			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7509			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7510		}
7511		PMAP_UNLOCK(pmap);
7512	}
7513	rw_wunlock(lock);
7514}
7515
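/*
 * Map a range of physical addresses for firmware/BIOS access.  Before the VM
 * system is initialized, the range is mapped using 2MB L2 blocks carved out
 * of the preinit mapping area; afterwards, KVA is allocated and the range is
 * mapped with 4KB pages.
 */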
7516void *
7517pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7518{
7519	struct pmap_preinit_mapping *ppim;
7520	vm_offset_t va, offset;
7521	pd_entry_t old_l2e, *pde;
7522	pt_entry_t *l2;
7523	int i, lvl, l2_blocks, free_l2_count, start_idx;
7524
7525	if (!vm_initialized) {
7526		/*
7527		 * No L3 ptables so map entire L2 blocks where start VA is:
7528		 * 	preinit_map_va + start_idx * L2_SIZE
7529		 * There may be duplicate mappings (multiple VA -> same PA) but
7530		 * ARM64 dcache is always PIPT so that's acceptable.
7531		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
7536		l2_blocks = (roundup2(pa + size, L2_SIZE) -
7537		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
7538
7539		offset = pa & L2_OFFSET;
7540
7541		if (preinit_map_va == 0)
7542			return (NULL);
7543
7544		/* Map 2MiB L2 blocks from reserved VA space */
7545
7546		free_l2_count = 0;
7547		start_idx = -1;
7548		/* Find enough free contiguous VA space */
7549		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7550			ppim = pmap_preinit_mapping + i;
7551			if (free_l2_count > 0 && ppim->pa != 0) {
7552				/* Not enough space here */
7553				free_l2_count = 0;
7554				start_idx = -1;
7555				continue;
7556			}
7557
7558			if (ppim->pa == 0) {
7559				/* Free L2 block */
7560				if (start_idx == -1)
7561					start_idx = i;
7562				free_l2_count++;
7563				if (free_l2_count == l2_blocks)
7564					break;
7565			}
7566		}
7567		if (free_l2_count != l2_blocks)
7568			panic("%s: too many preinit mappings", __func__);
7569
7570		va = preinit_map_va + (start_idx * L2_SIZE);
7571		for (i = start_idx; i < start_idx + l2_blocks; i++) {
7572			/* Mark entries as allocated */
7573			ppim = pmap_preinit_mapping + i;
7574			ppim->pa = pa;
7575			ppim->va = va + offset;
7576			ppim->size = size;
7577		}
7578
7579		/* Map L2 blocks */
7580		pa = rounddown2(pa, L2_SIZE);
7581		old_l2e = 0;
7582		for (i = 0; i < l2_blocks; i++) {
7583			pde = pmap_pde(kernel_pmap, va, &lvl);
7584			KASSERT(pde != NULL,
7585			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7586			    va));
7587			KASSERT(lvl == 1,
7588			    ("pmap_mapbios: Invalid level %d", lvl));
7589
7590			/* Insert L2_BLOCK */
7591			l2 = pmap_l1_to_l2(pde, va);
7592			old_l2e |= pmap_load_store(l2,
7593			    PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7594			    ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7595			    L2_BLOCK);
7596
7597			va += L2_SIZE;
7598			pa += L2_SIZE;
7599		}
7600		if ((old_l2e & ATTR_DESCR_VALID) != 0)
7601			pmap_s1_invalidate_all(kernel_pmap);
7602		else {
7603			/*
7604			 * Because the old entries were invalid and the new
7605			 * mappings are not executable, an isb is not required.
7606			 */
7607			dsb(ishst);
7608		}
7609
7610		va = preinit_map_va + (start_idx * L2_SIZE);
7611
7612	} else {
7613		/* kva_alloc may be used to map the pages */
7614		offset = pa & PAGE_MASK;
7615		size = round_page(offset + size);
7616
7617		va = kva_alloc(size);
7618		if (va == 0)
7619			panic("%s: Couldn't allocate KVA", __func__);
7620
7621		pde = pmap_pde(kernel_pmap, va, &lvl);
7622		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7623
7624		/* L3 table is linked */
7625		va = trunc_page(va);
7626		pa = trunc_page(pa);
7627		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7628	}
7629
7630	return ((void *)(va + offset));
7631}
7632
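/*
 * Remove a mapping created by pmap_mapbios(), either by clearing the preinit
 * L2 block entries or by unmapping and freeing the KVA that was allocated
 * once the VM system was initialized.
 */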
7633void
7634pmap_unmapbios(void *p, vm_size_t size)
7635{
7636	struct pmap_preinit_mapping *ppim;
7637	vm_offset_t offset, va, va_trunc;
7638	pd_entry_t *pde;
7639	pt_entry_t *l2;
7640	int i, lvl, l2_blocks, block;
7641	bool preinit_map;
7642
7643	va = (vm_offset_t)p;
7644	l2_blocks =
7645	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7646	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7647
7648	/* Remove preinit mapping */
7649	preinit_map = false;
7650	block = 0;
7651	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7652		ppim = pmap_preinit_mapping + i;
7653		if (ppim->va == va) {
7654			KASSERT(ppim->size == size,
7655			    ("pmap_unmapbios: size mismatch"));
7656			ppim->va = 0;
7657			ppim->pa = 0;
7658			ppim->size = 0;
7659			preinit_map = true;
7660			offset = block * L2_SIZE;
7661			va_trunc = rounddown2(va, L2_SIZE) + offset;
7662
7663			/* Remove L2_BLOCK */
7664			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7665			KASSERT(pde != NULL,
7666			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7667			    va_trunc));
7668			l2 = pmap_l1_to_l2(pde, va_trunc);
7669			pmap_clear(l2);
7670
7671			if (block == (l2_blocks - 1))
7672				break;
7673			block++;
7674		}
7675	}
7676	if (preinit_map) {
7677		pmap_s1_invalidate_all(kernel_pmap);
7678		return;
7679	}
7680
7681	/* Unmap the pages reserved with kva_alloc. */
7682	if (vm_initialized) {
7683		offset = va & PAGE_MASK;
7684		size = round_page(offset + size);
7685		va = trunc_page(va);
7686
7687		/* Unmap and invalidate the pages */
7688		pmap_kremove_device(va, size);
7689
7690		kva_free(va, size);
7691	}
7692}
7693
7694/*
7695 * Sets the memory attribute for the specified page.
7696 */
7697void
7698pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7699{
7700
7701	m->md.pv_memattr = ma;
7702
7703	/*
7704	 * If "m" is a normal page, update its direct mapping.  This update
7705	 * can be relied upon to perform any cache operations that are
7706	 * required for data coherence.
7707	 */
7708	if ((m->flags & PG_FICTITIOUS) == 0 &&
7709	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7710	    m->md.pv_memattr) != 0)
7711		panic("memory attribute change on the direct map failed");
7712}
7713
7714/*
7715 * Changes the specified virtual address range's memory type to that given by
7716 * the parameter "mode".  The specified virtual address range must be
7717 * completely contained within either the direct map or the kernel map.  If
7718 * the virtual address range is contained within the kernel map, then the
7719 * memory type for each of the corresponding ranges of the direct map is also
7720 * changed.  (The corresponding ranges of the direct map are those ranges that
7721 * map the same physical pages as the specified virtual address range.)  These
7722 * changes to the direct map are necessary because Intel describes the
7723 * behavior of their processors as "undefined" if two or more mappings to the
7724 * same physical page have different memory types.
7725 *
7726 * Returns zero if the change completed successfully, and either EINVAL or
7727 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7728 * of the virtual address range was not mapped, and ENOMEM is returned if
7729 * there was insufficient memory available to complete the change.  In the
7730 * latter case, the memory type may have been changed on some part of the
7731 * virtual address range or the direct map.
7732 */
7733int
7734pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7735{
7736	int error;
7737
7738	PMAP_LOCK(kernel_pmap);
7739	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7740	PMAP_UNLOCK(kernel_pmap);
7741	return (error);
7742}
7743
7744/*
7745 * Changes the specified virtual address range's protections to those
7746 * specified by "prot".  Like pmap_change_attr(), protections for aliases
7747 * in the direct map are updated as well.  Protections on aliasing mappings may
7748 * be a subset of the requested protections; for example, mappings in the direct
7749 * map are never executable.
7750 */
7751int
7752pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7753{
7754	int error;
7755
7756	/* Only supported within the kernel map. */
7757	if (va < VM_MIN_KERNEL_ADDRESS)
7758		return (EINVAL);
7759
7760	PMAP_LOCK(kernel_pmap);
7761	error = pmap_change_props_locked(va, size, prot, -1, false);
7762	PMAP_UNLOCK(kernel_pmap);
7763	return (error);
7764}
7765
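/*
 * Worker for pmap_change_attr() and pmap_change_prot().  Updates the memory
 * attributes and/or protections of the kernel mappings covering [va, va +
 * size), demoting larger mappings as needed, and keeps any corresponding
 * direct map mappings in sync.  If "skip_unmapped" is true, unmapped parts of
 * the range are skipped rather than treated as errors.
 */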
7766static int
7767pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7768    int mode, bool skip_unmapped)
7769{
7770	vm_offset_t base, offset, tmpva;
7771	vm_size_t pte_size;
7772	vm_paddr_t pa;
7773	pt_entry_t pte, *ptep, *newpte;
7774	pt_entry_t bits, mask;
7775	int lvl, rv;
7776
7777	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7778	base = trunc_page(va);
7779	offset = va & PAGE_MASK;
7780	size = round_page(offset + size);
7781
7782	if (!VIRT_IN_DMAP(base) &&
7783	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7784		return (EINVAL);
7785
7786	bits = 0;
7787	mask = 0;
7788	if (mode != -1) {
7789		bits = ATTR_S1_IDX(mode);
7790		mask = ATTR_S1_IDX_MASK;
7791		if (mode == VM_MEMATTR_DEVICE) {
7792			mask |= ATTR_S1_XN;
7793			bits |= ATTR_S1_XN;
7794		}
7795	}
7796	if (prot != VM_PROT_NONE) {
7797		/* Don't mark the DMAP as executable. It never is on arm64. */
7798		if (VIRT_IN_DMAP(base)) {
7799			prot &= ~VM_PROT_EXECUTE;
7800			/*
7801			 * XXX Mark the DMAP as writable for now. We rely
7802			 * on this in ddb & dtrace to insert breakpoint
7803			 * instructions.
7804			 */
7805			prot |= VM_PROT_WRITE;
7806		}
7807
7808		if ((prot & VM_PROT_WRITE) == 0) {
7809			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7810		}
7811		if ((prot & VM_PROT_EXECUTE) == 0) {
7812			bits |= ATTR_S1_PXN;
7813		}
7814		bits |= ATTR_S1_UXN;
7815		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7816	}
7817
7818	for (tmpva = base; tmpva < base + size; ) {
7819		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7820		if (ptep == NULL && !skip_unmapped) {
7821			return (EINVAL);
7822		} else if ((ptep == NULL && skip_unmapped) ||
7823		    (pmap_load(ptep) & mask) == bits) {
7824			/*
7825			 * We already have the correct attribute or there
7826			 * is no memory mapped at this address and we are
7827			 * skipping unmapped memory.
7828			 */
7829			switch (lvl) {
7830			default:
7831				panic("Invalid DMAP table level: %d\n", lvl);
7832			case 1:
7833				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7834				break;
7835			case 2:
7836				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7837				break;
7838			case 3:
7839				tmpva += PAGE_SIZE;
7840				break;
7841			}
7842		} else {
7843			/* We can't demote/promote this entry */
7844			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7845
7846			/*
			 * Split the entry to a level 3 table, then
			 * set the new attribute.
7849			 */
7850			switch (lvl) {
7851			default:
7852				panic("Invalid DMAP table level: %d\n", lvl);
7853			case 1:
7854				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7855				if ((tmpva & L1_OFFSET) == 0 &&
7856				    (base + size - tmpva) >= L1_SIZE) {
7857					pte_size = L1_SIZE;
7858					break;
7859				}
7860				newpte = pmap_demote_l1(kernel_pmap, ptep,
7861				    tmpva & ~L1_OFFSET);
7862				if (newpte == NULL)
7863					return (EINVAL);
7864				ptep = pmap_l1_to_l2(ptep, tmpva);
7865				/* FALLTHROUGH */
7866			case 2:
7867				if ((tmpva & L2_OFFSET) == 0 &&
7868				    (base + size - tmpva) >= L2_SIZE) {
7869					pte_size = L2_SIZE;
7870					break;
7871				}
7872				newpte = pmap_demote_l2(kernel_pmap, ptep,
7873				    tmpva);
7874				if (newpte == NULL)
7875					return (EINVAL);
7876				ptep = pmap_l2_to_l3(ptep, tmpva);
7877				/* FALLTHROUGH */
7878			case 3:
7879				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7880					if ((tmpva & L3C_OFFSET) == 0 &&
7881					    (base + size - tmpva) >= L3C_SIZE) {
7882						pte_size = L3C_SIZE;
7883						break;
7884					}
7885					if (!pmap_demote_l3c(kernel_pmap, ptep,
7886					    tmpva))
7887						return (EINVAL);
7888				}
7889				pte_size = PAGE_SIZE;
7890				break;
7891			}
7892
7893			/* Update the entry */
7894			pte = pmap_load(ptep);
7895			pte &= ~mask;
7896			pte |= bits;
7897
7898			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
7899			    pte_size);
7900
7901			pa = PTE_TO_PHYS(pte);
7902			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
7903				/*
7904				 * Keep the DMAP memory in sync.
7905				 */
7906				rv = pmap_change_props_locked(
7907				    PHYS_TO_DMAP(pa), pte_size,
7908				    prot, mode, true);
7909				if (rv != 0)
7910					return (rv);
7911			}
7912
7913			/*
7914			 * If moving to a non-cacheable entry flush
7915			 * the cache.
7916			 */
7917			if (mode == VM_MEMATTR_UNCACHEABLE)
7918				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
7919			tmpva += pte_size;
7920		}
7921	}
7922
7923	return (0);
7924}
7925
7926/*
7927 * Create an L2 table to map all addresses within an L1 mapping.
7928 */
7929static pt_entry_t *
7930pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
7931{
7932	pt_entry_t *l2, newl2, oldl1;
7933	vm_offset_t tmpl1;
7934	vm_paddr_t l2phys, phys;
7935	vm_page_t ml2;
7936	int i;
7937
7938	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7939	oldl1 = pmap_load(l1);
7940	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7941	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
7942	    ("pmap_demote_l1: Demoting a non-block entry"));
7943	KASSERT((va & L1_OFFSET) == 0,
7944	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
7945	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
7946	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
7947	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
7948	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
7949
7950	tmpl1 = 0;
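	/*
	 * If the L1 block being demoted maps the page that contains "l1"
	 * itself, access that page through a temporary kernel mapping so that
	 * we do not lose access to it while the entry is being replaced.
	 */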
7951	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
7952		tmpl1 = kva_alloc(PAGE_SIZE);
7953		if (tmpl1 == 0)
7954			return (NULL);
7955	}
7956
7957	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
7958	    NULL) {
7959		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
7960		    " in pmap %p", va, pmap);
7961		l2 = NULL;
7962		goto fail;
7963	}
7964
7965	l2phys = VM_PAGE_TO_PHYS(ml2);
7966	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
7967
	/* The physical address that the range points at */
	phys = PTE_TO_PHYS(oldl1);
	/* The attributes from the old l1 entry to be copied */
	newl2 = oldl1 & ATTR_MASK;
7972
7973	/* Create the new entries */
7974	for (i = 0; i < Ln_ENTRIES; i++) {
7975		l2[i] = newl2 | phys;
7976		phys += L2_SIZE;
7977	}
7978	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
7979	    ("Invalid l2 page (%lx != %lx)", l2[0],
7980	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
7981
7982	if (tmpl1 != 0) {
7983		pmap_kenter(tmpl1, PAGE_SIZE,
7984		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
7985		    VM_MEMATTR_WRITE_BACK);
7986		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
7987	}
7988
7989	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
7990
7991fail:
7992	if (tmpl1 != 0) {
7993		pmap_kremove(tmpl1);
7994		kva_free(tmpl1, PAGE_SIZE);
7995	}
7996
7997	return (l2);
7998}
7999
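/*
 * Fill an L3 page table page with Ln_ENTRIES 4KB mappings, starting from
 * "newl3" and advancing the physical address by L3_SIZE for each successive
 * entry.
 */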
8000static void
8001pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8002{
8003	pt_entry_t *l3;
8004
8005	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8006		*l3 = newl3;
8007		newl3 += L3_SIZE;
8008	}
8009}
8010
8011static void
8012pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8013{
8014#ifdef INVARIANTS
8015#ifdef DIAGNOSTIC
8016	pt_entry_t *xl3p, *yl3p;
8017
8018	for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8019	    xl3p++, newl3e += PAGE_SIZE) {
8020		if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8021			printf("pmap_demote_l2: xl3e %zd and newl3e map "
8022			    "different pages: found %#lx, expected %#lx\n",
8023			    xl3p - firstl3p, pmap_load(xl3p), newl3e);
8024			printf("page table dump\n");
8025			for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8026			    yl3p++) {
8027				printf("%zd %#lx\n", yl3p - firstl3p,
8028				    pmap_load(yl3p));
8029			}
8030			panic("firstpte");
8031		}
8032	}
8033#else
8034	KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8035	    ("pmap_demote_l2: firstl3 and newl3e map different physical"
8036	    " addresses"));
8037#endif
8038#endif
8039}
8040
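/*
 * Destroy the L2 block mapping at "va" and free any page table pages that
 * become unused as a result.  Used when a demotion cannot be completed.
 */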
8041static void
8042pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8043    struct rwlock **lockp)
8044{
8045	struct spglist free;
8046
8047	SLIST_INIT(&free);
8048	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8049	    lockp);
8050	vm_page_free_pages_toq(&free, true);
8051}
8052
8053/*
8054 * Create an L3 table to map all addresses within an L2 mapping.
8055 */
8056static pt_entry_t *
8057pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8058    struct rwlock **lockp)
8059{
8060	pt_entry_t *l3, newl3, oldl2;
8061	vm_offset_t tmpl2;
8062	vm_paddr_t l3phys;
8063	vm_page_t ml3;
8064
8065	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8066	PMAP_ASSERT_STAGE1(pmap);
8067	KASSERT(ADDR_IS_CANONICAL(va),
8068	    ("%s: Address not in canonical form: %lx", __func__, va));
8069
8070	l3 = NULL;
8071	oldl2 = pmap_load(l2);
8072	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8073	    ("pmap_demote_l2: Demoting a non-block entry"));
8074	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8075	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8076	va &= ~L2_OFFSET;
8077
8078	tmpl2 = 0;
8079	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8080		tmpl2 = kva_alloc(PAGE_SIZE);
8081		if (tmpl2 == 0)
8082			return (NULL);
8083	}
8084
8085	/*
8086	 * Invalidate the 2MB page mapping and return "failure" if the
8087	 * mapping was never accessed.
8088	 */
8089	if ((oldl2 & ATTR_AF) == 0) {
8090		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8091		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8092		pmap_demote_l2_abort(pmap, va, l2, lockp);
8093		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8094		    va, pmap);
8095		goto fail;
8096	}
8097
8098	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8099		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8100		    ("pmap_demote_l2: page table page for a wired mapping"
8101		    " is missing"));
8102
8103		/*
8104		 * If the page table page is missing and the mapping
8105		 * is for a kernel address, the mapping must belong to
8106		 * either the direct map or the early kernel memory.
8107		 * Page table pages are preallocated for every other
8108		 * part of the kernel address space, so the direct map
8109		 * region and early kernel memory are the only parts of the
8110		 * kernel address space that must be handled here.
8111		 */
8112		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8113		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8114		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
8115
8116		/*
8117		 * If the 2MB page mapping belongs to the direct map
8118		 * region of the kernel's address space, then the page
8119		 * allocation request specifies the highest possible
8120		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
8121		 * priority is normal.
8122		 */
8123		ml3 = vm_page_alloc_noobj(
8124		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8125		    VM_ALLOC_WIRED);
8126
8127		/*
8128		 * If the allocation of the new page table page fails,
8129		 * invalidate the 2MB page mapping and return "failure".
8130		 */
8131		if (ml3 == NULL) {
8132			pmap_demote_l2_abort(pmap, va, l2, lockp);
8133			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8134			    " in pmap %p", va, pmap);
8135			goto fail;
8136		}
8137		ml3->pindex = pmap_l2_pindex(va);
8138
8139		if (!ADDR_IS_KERNEL(va)) {
8140			ml3->ref_count = NL3PG;
8141			pmap_resident_count_inc(pmap, 1);
8142		}
8143	}
8144	l3phys = VM_PAGE_TO_PHYS(ml3);
8145	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8146	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8147	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8148	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8149	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8150
8151	/*
8152	 * If the PTP is not leftover from an earlier promotion or it does not
8153	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
8154	 * have ATTR_AF set.
8155	 *
8156	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8157	 * performs a dsb().  That dsb() ensures that the stores for filling
8158	 * "l3" are visible before "l3" is added to the page table.
8159	 */
8160	if (!vm_page_all_valid(ml3))
8161		pmap_fill_l3(l3, newl3);
8162
8163	pmap_demote_l2_check(l3, newl3);
8164
8165	/*
8166	 * If the mapping has changed attributes, update the L3Es.
8167	 */
8168	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8169		pmap_fill_l3(l3, newl3);
8170
8171	/*
8172	 * Map the temporary page so we don't lose access to the l2 table.
8173	 */
8174	if (tmpl2 != 0) {
8175		pmap_kenter(tmpl2, PAGE_SIZE,
8176		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8177		    VM_MEMATTR_WRITE_BACK);
8178		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8179	}
8180
8181	/*
8182	 * The spare PV entries must be reserved prior to demoting the
8183	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
8184	 * of the L2 and the PV lists will be inconsistent, which can result
8185	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8186	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8187	 * PV entry for the 2MB page mapping that is being demoted.
8188	 */
8189	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8190		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8191
8192	/*
8193	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8194	 * the 2MB page mapping.
8195	 */
8196	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8197
8198	/*
8199	 * Demote the PV entry.
8200	 */
8201	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8202		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8203
8204	atomic_add_long(&pmap_l2_demotions, 1);
8205	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8206	    " in pmap %p %lx", va, pmap, l3[0]);
8207
8208fail:
8209	if (tmpl2 != 0) {
8210		pmap_kremove(tmpl2);
8211		kva_free(tmpl2, PAGE_SIZE);
8212	}
8213
	return (l3);
}
8217
8218static pt_entry_t *
8219pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8220{
8221	struct rwlock *lock;
8222	pt_entry_t *l3;
8223
8224	lock = NULL;
8225	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8226	if (lock != NULL)
8227		rw_wunlock(lock);
8228	return (l3);
8229}
8230
8231/*
8232 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8233 */
8234static bool
8235pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8236{
8237	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8238	vm_offset_t tmpl3;
8239	register_t intr;
8240
8241	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8242	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8243	    sizeof(pt_entry_t)) - 1));
8244	l3c_end = l3c_start + L3C_ENTRIES;
8245	tmpl3 = 0;
8246	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8247	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8248		tmpl3 = kva_alloc(PAGE_SIZE);
8249		if (tmpl3 == 0)
8250			return (false);
8251		pmap_kenter(tmpl3, PAGE_SIZE,
8252		    DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8253		    VM_MEMATTR_WRITE_BACK);
8254		l3c_start = (pt_entry_t *)(tmpl3 +
8255		    ((vm_offset_t)l3c_start & PAGE_MASK));
8256		l3c_end = (pt_entry_t *)(tmpl3 +
8257		    ((vm_offset_t)l3c_end & PAGE_MASK));
8258	}
8259	mask = 0;
8260	nbits = ATTR_DESCR_VALID;
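	/*
	 * Disable interrupts so that this break-before-make sequence is not
	 * interrupted on the current CPU, bounding the time during which the
	 * constituent mappings are invalid.
	 */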
8261	intr = intr_disable();
8262
8263	/*
8264	 * Break the mappings.
8265	 */
8266	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8267		/*
8268		 * Clear the mapping's contiguous and valid bits, but leave
8269		 * the rest of the entry unchanged, so that a lockless,
8270		 * concurrent pmap_kextract() can still lookup the physical
8271		 * address.
8272		 */
8273		l3e = pmap_load(tl3p);
8274		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8275		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8276		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8277		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8278		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8279		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8280		    ATTR_DESCR_VALID)))
8281			cpu_spinwait();
8282
8283		/*
8284		 * Hardware accessed and dirty bit maintenance might only
8285		 * update a single L3 entry, so we must combine the accessed
8286		 * and dirty bits from this entire set of contiguous L3
8287		 * entries.
8288		 */
8289		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8290		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8291			mask = ATTR_S1_AP_RW_BIT;
8292		nbits |= l3e & ATTR_AF;
8293	}
8294	if ((nbits & ATTR_AF) != 0) {
8295		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8296		    ~L3C_OFFSET, true);
8297	}
8298
8299	/*
8300	 * Remake the mappings, updating the accessed and dirty bits.
8301	 */
8302	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8303		l3e = pmap_load(tl3p);
8304		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8305			cpu_spinwait();
8306	}
8307	dsb(ishst);
8308
8309	intr_restore(intr);
8310	if (tmpl3 != 0) {
8311		pmap_kremove(tmpl3);
8312		kva_free(tmpl3, PAGE_SIZE);
8313	}
8314	counter_u64_add(pmap_l3c_demotions, 1);
8315	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8316	    va, pmap);
8317	return (true);
8318}
8319
8320/*
8321 * Accumulate the accessed and dirty bits within a L3C superpage and
8322 * return the specified PTE with them applied correctly.
8323 */
8324static pt_entry_t
8325pmap_load_l3c(pt_entry_t *l3p)
8326{
8327	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8328
8329	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8330	    sizeof(pt_entry_t)) - 1));
8331	l3c_end = l3c_start + L3C_ENTRIES;
8332	mask = 0;
8333	nbits = 0;
8334	/* Iterate over each mapping in the superpage. */
8335	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8336		l3e = pmap_load(tl3p);
8337		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8338		    ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8339		/* Update mask if the current page has its dirty bit set. */
8340		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8341		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8342			mask = ATTR_S1_AP_RW_BIT;
8343		/* Update nbits if the accessed bit is set. */
8344		nbits |= l3e & ATTR_AF;
8345	}
8346	return ((pmap_load(l3p) & ~mask) | nbits);
8347}
8348
8349/*
8350 * Perform the pmap work for mincore(2).  If the page is not both referenced and
8351 * modified by this pmap, returns its physical address so that the caller can
8352 * find other mappings.
8353 */
8354int
8355pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8356{
8357	pt_entry_t *pte, tpte;
8358	vm_paddr_t mask, pa;
8359	int lvl, val;
8360	bool managed;
8361
8362	PMAP_ASSERT_STAGE1(pmap);
8363	PMAP_LOCK(pmap);
8364	pte = pmap_pte(pmap, addr, &lvl);
8365	if (pte != NULL) {
8366		tpte = pmap_load(pte);
8367
8368		switch (lvl) {
8369		case 3:
8370			mask = L3_OFFSET;
8371			break;
8372		case 2:
8373			mask = L2_OFFSET;
8374			break;
8375		case 1:
8376			mask = L1_OFFSET;
8377			break;
8378		default:
8379			panic("pmap_mincore: invalid level %d", lvl);
8380		}
8381
8382		managed = (tpte & ATTR_SW_MANAGED) != 0;
8383		val = MINCORE_INCORE;
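		/*
		 * Report superpage mappings: a level 2 entry is a 2MB
		 * mapping (psind 1) and a level 1 entry is a 1GB mapping
		 * (psind 2).
		 */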
8384		if (lvl != 3)
8385			val |= MINCORE_PSIND(3 - lvl);
8386		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8387		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8388			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8389		if ((tpte & ATTR_AF) == ATTR_AF)
8390			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8391
8392		pa = PTE_TO_PHYS(tpte) | (addr & mask);
8393	} else {
8394		managed = false;
8395		val = 0;
8396	}
8397
8398	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8399	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8400		*pap = pa;
8401	}
8402	PMAP_UNLOCK(pmap);
8403	return (val);
8404}
8405
8406/*
8407 * Garbage collect every ASID that is neither active on a processor nor
8408 * reserved.
8409 */
8410static void
8411pmap_reset_asid_set(pmap_t pmap)
8412{
8413	pmap_t curpmap;
8414	int asid, cpuid, epoch;
8415	struct asid_set *set;
8416	enum pmap_stage stage;
8417
	set = pmap->pm_asid_set;
	stage = pmap->pm_stage;

	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8423	mtx_assert(&set->asid_set_mutex, MA_OWNED);
8424
8425	/*
8426	 * Ensure that the store to asid_epoch is globally visible before the
8427	 * loads from pc_curpmap are performed.
8428	 */
8429	epoch = set->asid_epoch + 1;
8430	if (epoch == INT_MAX)
8431		epoch = 0;
8432	set->asid_epoch = epoch;
8433	dsb(ishst);
8434	if (stage == PM_STAGE1) {
8435		__asm __volatile("tlbi vmalle1is");
8436	} else {
8437		KASSERT(pmap_clean_stage2_tlbi != NULL,
8438		    ("%s: Unset stage 2 tlb invalidation callback\n",
8439		    __func__));
8440		pmap_clean_stage2_tlbi();
8441	}
8442	dsb(ish);
8443	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8444	    set->asid_set_size - 1);
8445	CPU_FOREACH(cpuid) {
8446		if (cpuid == curcpu)
8447			continue;
8448		if (stage == PM_STAGE1) {
8449			curpmap = pcpu_find(cpuid)->pc_curpmap;
8450			PMAP_ASSERT_STAGE1(pmap);
8451		} else {
8452			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8453			if (curpmap == NULL)
8454				continue;
8455			PMAP_ASSERT_STAGE2(pmap);
8456		}
8457		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8458		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8459		if (asid == -1)
8460			continue;
8461		bit_set(set->asid_set, asid);
8462		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8463	}
8464}
8465
8466/*
8467 * Allocate a new ASID for the specified pmap.
8468 */
8469static void
8470pmap_alloc_asid(pmap_t pmap)
8471{
8472	struct asid_set *set;
8473	int new_asid;
8474
8475	set = pmap->pm_asid_set;
8476	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8477
8478	mtx_lock_spin(&set->asid_set_mutex);
8479
8480	/*
8481	 * While this processor was waiting to acquire the asid set mutex,
8482	 * pmap_reset_asid_set() running on another processor might have
8483	 * updated this pmap's cookie to the current epoch.  In which case, we
8484	 * don't need to allocate a new ASID.
8485	 */
8486	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8487		goto out;
8488
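	/*
	 * Find a free ASID at or after "asid_next".  If that fails, wrap
	 * around and search from the first available ASID.  If the set is
	 * exhausted, reset it and search again.
	 */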
8489	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8490	    &new_asid);
8491	if (new_asid == -1) {
8492		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8493		    set->asid_next, &new_asid);
8494		if (new_asid == -1) {
8495			pmap_reset_asid_set(pmap);
8496			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8497			    set->asid_set_size, &new_asid);
8498			KASSERT(new_asid != -1, ("ASID allocation failure"));
8499		}
8500	}
8501	bit_set(set->asid_set, new_asid);
8502	set->asid_next = new_asid + 1;
8503	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8504out:
8505	mtx_unlock_spin(&set->asid_set_mutex);
8506}
8507
8508static uint64_t __read_mostly ttbr_flags;
8509
8510/*
8511 * Compute the value that should be stored in ttbr0 to activate the specified
8512 * pmap.  This value may change from time to time.
8513 */
8514uint64_t
8515pmap_to_ttbr0(pmap_t pmap)
8516{
8517	uint64_t ttbr;
8518
8519	ttbr = pmap->pm_ttbr;
8520	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8521	ttbr |= ttbr_flags;
8522
8523	return (ttbr);
8524}
8525
8526static void
8527pmap_set_cnp(void *arg)
8528{
8529	uint64_t ttbr0, ttbr1;
8530	u_int cpuid;
8531
8532	cpuid = *(u_int *)arg;
8533	if (cpuid == curcpu) {
8534		/*
		 * Set the flags while all CPUs are handling the
		 * smp_rendezvous so they will not call pmap_to_ttbr0.  Any
		 * calls to pmap_to_ttbr0 after this will have the CnP flag
		 * set.
8538		 * The dsb after invalidating the TLB will act as a barrier
8539		 * to ensure all CPUs can observe this change.
8540		 */
8541		ttbr_flags |= TTBR_CnP;
8542	}
8543
8544	ttbr0 = READ_SPECIALREG(ttbr0_el1);
8545	ttbr0 |= TTBR_CnP;
8546
8547	ttbr1 = READ_SPECIALREG(ttbr1_el1);
8548	ttbr1 |= TTBR_CnP;
8549
8550	/* Update ttbr{0,1}_el1 with the CnP flag */
8551	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8552	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8553	isb();
8554	__asm __volatile("tlbi vmalle1is");
8555	dsb(ish);
8556	isb();
8557}
8558
8559/*
8560 * Defer enabling some features until we have read the ID registers to know
8561 * if they are supported on all CPUs.
8562 */
8563static void
8564pmap_init_mp(void *dummy __unused)
8565{
8566	uint64_t reg;
8567
8568	if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8569		if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8570			if (bootverbose)
8571				printf("Enabling BTI\n");
8572			pmap_bti_support = true;
8573
8574			pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8575			    sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8576			    UMA_ALIGN_PTR, 0);
8577		}
8578	}
8579}
8580SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8581
8582/*
8583 * Defer enabling CnP until we have read the ID registers to know if it's
8584 * supported on all CPUs.
8585 */
8586static void
8587pmap_init_cnp(void *dummy __unused)
8588{
8589	uint64_t reg;
8590	u_int cpuid;
8591
8592	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8593		return;
8594
8595	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8596		if (bootverbose)
8597			printf("Enabling CnP\n");
8598		cpuid = curcpu;
8599		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8600	}
8601
8602}
8603SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8604
8605static bool
8606pmap_activate_int(pmap_t pmap)
8607{
8608	struct asid_set *set;
8609	int epoch;
8610
8611	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8612	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8613
8614	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8615	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8616		/*
8617		 * Handle the possibility that the old thread was preempted
8618		 * after an "ic" or "tlbi" instruction but before it performed
8619		 * a "dsb" instruction.  If the old thread migrates to a new
8620		 * processor, its completion of a "dsb" instruction on that
8621		 * new processor does not guarantee that the "ic" or "tlbi"
8622		 * instructions performed on the old processor have completed.
8623		 */
8624		dsb(ish);
8625		return (false);
8626	}
8627
8628	set = pmap->pm_asid_set;
8629	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8630
8631	/*
8632	 * Ensure that the store to curpmap is globally visible before the
8633	 * load from asid_epoch is performed.
8634	 */
8635	if (pmap->pm_stage == PM_STAGE1)
8636		PCPU_SET(curpmap, pmap);
8637	else
8638		PCPU_SET(curvmpmap, pmap);
8639	dsb(ish);
8640	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8641	if (epoch >= 0 && epoch != set->asid_epoch)
8642		pmap_alloc_asid(pmap);
8643
8644	if (pmap->pm_stage == PM_STAGE1) {
8645		set_ttbr0(pmap_to_ttbr0(pmap));
8646		if (PCPU_GET(bcast_tlbi_workaround) != 0)
8647			invalidate_local_icache();
8648	}
8649	return (true);
8650}
8651
8652void
8653pmap_activate_vm(pmap_t pmap)
8654{
8655
8656	PMAP_ASSERT_STAGE2(pmap);
8657
8658	(void)pmap_activate_int(pmap);
8659}
8660
8661void
8662pmap_activate(struct thread *td)
8663{
8664	pmap_t	pmap;
8665
8666	pmap = vmspace_pmap(td->td_proc->p_vmspace);
8667	PMAP_ASSERT_STAGE1(pmap);
8668	critical_enter();
8669	(void)pmap_activate_int(pmap);
8670	critical_exit();
8671}
8672
8673/*
8674 * Activate the thread we are switching to.
 * To simplify the assembly in cpu_throw, return the new thread's pcb.
8676 */
8677struct pcb *
8678pmap_switch(struct thread *new)
8679{
8680	pcpu_bp_harden bp_harden;
8681	struct pcb *pcb;
8682
8683	/* Store the new curthread */
8684	PCPU_SET(curthread, new);
8685
8686	/* And the new pcb */
8687	pcb = new->td_pcb;
8688	PCPU_SET(curpcb, pcb);
8689
8690	/*
8691	 * TODO: We may need to flush the cache here if switching
8692	 * to a user process.
8693	 */
8694
8695	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8696		/*
8697		 * Stop userspace from training the branch predictor against
8698		 * other processes. This will call into a CPU specific
8699		 * function that clears the branch predictor state.
8700		 */
8701		bp_harden = PCPU_GET(bp_harden);
8702		if (bp_harden != NULL)
8703			bp_harden();
8704	}
8705
8706	return (pcb);
8707}
8708
8709void
8710pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8711{
8712
8713	PMAP_ASSERT_STAGE1(pmap);
8714	KASSERT(ADDR_IS_CANONICAL(va),
8715	    ("%s: Address not in canonical form: %lx", __func__, va));
8716
8717	if (ADDR_IS_KERNEL(va)) {
8718		cpu_icache_sync_range((void *)va, sz);
8719	} else {
8720		u_int len, offset;
8721		vm_paddr_t pa;
8722
8723		/* Find the length of data in this page to flush */
8724		offset = va & PAGE_MASK;
8725		len = imin(PAGE_SIZE - offset, sz);
8726
8727		while (sz != 0) {
8728			/* Extract the physical address & find it in the DMAP */
8729			pa = pmap_extract(pmap, va);
8730			if (pa != 0)
8731				cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
8732				    len);
8733
8734			/* Move to the next page */
8735			sz -= len;
8736			va += len;
8737			/* Set the length for the next iteration */
8738			len = imin(PAGE_SIZE, sz);
8739		}
8740	}
8741}
8742
8743static int
8744pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8745{
8746	pd_entry_t *pdep;
8747	pt_entry_t *ptep, pte;
8748	int rv, lvl, dfsc;
8749
8750	PMAP_ASSERT_STAGE2(pmap);
8751	rv = KERN_FAILURE;
8752
8753	/* Data and insn aborts use same encoding for FSC field. */
8754	dfsc = esr & ISS_DATA_DFSC_MASK;
8755	switch (dfsc) {
8756	case ISS_DATA_DFSC_TF_L0:
8757	case ISS_DATA_DFSC_TF_L1:
8758	case ISS_DATA_DFSC_TF_L2:
8759	case ISS_DATA_DFSC_TF_L3:
8760		PMAP_LOCK(pmap);
8761		pdep = pmap_pde(pmap, far, &lvl);
8762		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
8763			PMAP_UNLOCK(pmap);
8764			break;
8765		}
8766
8767		switch (lvl) {
8768		case 0:
8769			ptep = pmap_l0_to_l1(pdep, far);
8770			break;
8771		case 1:
8772			ptep = pmap_l1_to_l2(pdep, far);
8773			break;
8774		case 2:
8775			ptep = pmap_l2_to_l3(pdep, far);
8776			break;
8777		default:
8778			panic("%s: Invalid pde level %d", __func__,lvl);
8779		}
8780		goto fault_exec;
8781
8782	case ISS_DATA_DFSC_AFF_L1:
8783	case ISS_DATA_DFSC_AFF_L2:
8784	case ISS_DATA_DFSC_AFF_L3:
8785		PMAP_LOCK(pmap);
8786		ptep = pmap_pte(pmap, far, &lvl);
8787fault_exec:
8788		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
8789			if (icache_vmid) {
8790				pmap_invalidate_vpipt_icache();
8791			} else {
8792				/*
8793				 * If accessing an executable page invalidate
8794				 * the I-cache so it will be valid when we
8795				 * continue execution in the guest. The D-cache
8796				 * is assumed to already be clean to the Point
8797				 * of Coherency.
8798				 */
8799				if ((pte & ATTR_S2_XN_MASK) !=
8800				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
8801					invalidate_icache();
8802				}
8803			}
8804			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
8805			rv = KERN_SUCCESS;
8806		}
8807		PMAP_UNLOCK(pmap);
8808		break;
8809	}
8810
8811	return (rv);
8812}
8813
8814int
8815pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8816{
8817	pt_entry_t pte, *ptep;
8818	register_t intr;
8819	uint64_t ec, par;
8820	int lvl, rv;
8821
8822	rv = KERN_FAILURE;
8823
8824	ec = ESR_ELx_EXCEPTION(esr);
8825	switch (ec) {
8826	case EXCP_INSN_ABORT_L:
8827	case EXCP_INSN_ABORT:
8828	case EXCP_DATA_ABORT_L:
8829	case EXCP_DATA_ABORT:
8830		break;
8831	default:
8832		return (rv);
8833	}
8834
8835	if (pmap->pm_stage == PM_STAGE2)
8836		return (pmap_stage2_fault(pmap, esr, far));
8837
8838	/* Data and insn aborts use same encoding for FSC field. */
8839	switch (esr & ISS_DATA_DFSC_MASK) {
8840	case ISS_DATA_DFSC_AFF_L1:
8841	case ISS_DATA_DFSC_AFF_L2:
8842	case ISS_DATA_DFSC_AFF_L3:
8843		PMAP_LOCK(pmap);
8844		ptep = pmap_pte(pmap, far, &lvl);
8845		if (ptep != NULL) {
8846			pmap_set_bits(ptep, ATTR_AF);
8847			rv = KERN_SUCCESS;
8848			/*
8849			 * XXXMJ as an optimization we could mark the entry
8850			 * dirty if this is a write fault.
8851			 */
8852		}
8853		PMAP_UNLOCK(pmap);
8854		break;
8855	case ISS_DATA_DFSC_PF_L1:
8856	case ISS_DATA_DFSC_PF_L2:
8857	case ISS_DATA_DFSC_PF_L3:
8858		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
8859		    (esr & ISS_DATA_WnR) == 0)
8860			return (rv);
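		/*
		 * A permission fault on a mapping with ATTR_SW_DBM set is how
		 * software dirty bit management detects the first write:
		 * grant write access by clearing the read-only bit.
		 */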
8861		PMAP_LOCK(pmap);
8862		ptep = pmap_pte(pmap, far, &lvl);
8863		if (ptep != NULL &&
8864		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
8865			if ((pte & ATTR_S1_AP_RW_BIT) ==
8866			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
8867				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
8868				pmap_s1_invalidate_page(pmap, far, true);
8869			}
8870			rv = KERN_SUCCESS;
8871		}
8872		PMAP_UNLOCK(pmap);
8873		break;
8874	case ISS_DATA_DFSC_TF_L0:
8875	case ISS_DATA_DFSC_TF_L1:
8876	case ISS_DATA_DFSC_TF_L2:
8877	case ISS_DATA_DFSC_TF_L3:
8878		/*
8879		 * Retry the translation.  A break-before-make sequence can
8880		 * produce a transient fault.
8881		 */
8882		if (pmap == kernel_pmap) {
8883			/*
8884			 * The translation fault may have occurred within a
8885			 * critical section.  Therefore, we must check the
8886			 * address without acquiring the kernel pmap's lock.
8887			 */
8888			if (pmap_klookup(far, NULL))
8889				rv = KERN_SUCCESS;
8890		} else {
8891			PMAP_LOCK(pmap);
8892			/* Ask the MMU to check the address. */
8893			intr = intr_disable();
8894			par = arm64_address_translate_s1e0r(far);
8895			intr_restore(intr);
8896			PMAP_UNLOCK(pmap);
8897
8898			/*
8899			 * If the translation was successful, then we can
8900			 * return success to the trap handler.
8901			 */
8902			if (PAR_SUCCESS(par))
8903				rv = KERN_SUCCESS;
8904		}
8905		break;
8906	}
8907
8908	return (rv);
8909}
8910
8911/*
8912 *	Increase the starting virtual address of the given mapping if a
8913 *	different alignment might result in more superpage mappings.
8914 */
8915void
8916pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
8917    vm_offset_t *addr, vm_size_t size)
8918{
8919	vm_offset_t superpage_offset;
8920
8921	if (size < L2_SIZE)
8922		return;
8923	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
8924		offset += ptoa(object->pg_color);
8925	superpage_offset = offset & L2_OFFSET;
8926	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
8927	    (*addr & L2_OFFSET) == superpage_offset)
8928		return;
8929	if ((*addr & L2_OFFSET) < superpage_offset)
8930		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
8931	else
8932		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
8933}
8934
8935/**
8936 * Get the kernel virtual address of a set of physical pages. If there are
8937 * physical addresses not covered by the DMAP perform a transient mapping
8938 * that will be removed when calling pmap_unmap_io_transient.
8939 *
 * \param page        The pages for which the caller wishes to obtain kernel
 *                    virtual addresses.
8942 * \param vaddr       On return contains the kernel virtual memory address
8943 *                    of the pages passed in the page parameter.
8944 * \param count       Number of pages passed in.
8945 * \param can_fault   true if the thread using the mapped pages can take
8946 *                    page faults, false otherwise.
8947 *
8948 * \returns true if the caller must call pmap_unmap_io_transient when
8949 *          finished or false otherwise.
8950 *
8951 */
8952bool
8953pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8954    bool can_fault)
8955{
8956	vm_paddr_t paddr;
8957	bool needs_mapping;
8958	int error __diagused, i;
8959
8960	/*
8961	 * Allocate any KVA space that we need, this is done in a separate
8962	 * loop to prevent calling vmem_alloc while pinned.
8963	 */
	needs_mapping = false;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = true;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (false);

	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic(
			   "pmap_map_io_transient: TODO: Map out of DMAP data");
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    bool can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
		}
	}
}

bool
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
}

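/*
 * Illustrative caller pattern (a sketch, not lifted from an in-tree
 * consumer; "m" stands for a vm_page_t obtained elsewhere):
 *
 *	vm_offset_t va[1];
 *	bool mapped;
 *
 *	mapped = pmap_map_io_transient(&m, va, 1, false);
 *	... access the page through (void *)va[0] ...
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, va, 1, false);
 */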
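/*
 * Rangeset callbacks for the per-pmap set of BTI-enabled address ranges:
 * bti_dup_range() duplicates a range node (e.g., when a pmap's ranges are
 * copied) and bti_free_range() releases one.
 */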
static void *
bti_dup_range(void *ctx __unused, void *data)
{
	struct rs_el *node, *new_node;

	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (new_node == NULL)
		return (NULL);
	node = data;
	memcpy(new_node, node, sizeof(*node));
	return (new_node);
}

static void
bti_free_range(void *ctx __unused, void *node)
{

	uma_zfree(pmap_bti_ranges_zone, node);
}

static int
pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rs_el *rs;
	int error;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	MPASS(pmap->pm_bti != NULL);
	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (rs == NULL)
		return (ENOMEM);
	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
	if (error != 0)
		uma_zfree(pmap_bti_ranges_zone, rs);
	return (error);
}

static void
pmap_bti_deassign_all(pmap_t pmap)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove_all(pmap->pm_bti);
}

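/*
 * Return true if the BTI setting is the same across the entire range
 * [sva, eva), i.e., either every page in the range lies within a
 * BTI-enabled range or none does.
 */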
static bool
pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rs_el *prev_rs, *rs;
	vm_offset_t va;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva),
	    ("%s: End address not in canonical form: %lx", __func__, eva));

	if (pmap->pm_bti == NULL || ADDR_IS_KERNEL(sva))
		return (true);
	MPASS(!ADDR_IS_KERNEL(eva));
	for (va = sva; va < eva; prev_rs = rs) {
		rs = rangeset_lookup(pmap->pm_bti, va);
		if (va == sva)
			prev_rs = rs;
		else if ((rs == NULL) ^ (prev_rs == NULL))
			return (false);
		if (rs == NULL) {
			va += PAGE_SIZE;
			continue;
		}
		va = rs->re_end;
	}
	return (true);
}

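/*
 * Compute the guard page (GP) attribute for a new PTE at va: stage 2
 * pmaps get none, the kernel pmap gets ATTR_KERN_GP, and user mappings
 * get ATTR_S1_GP only when va lies within one of the pmap's registered
 * BTI ranges.
 */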
static pt_entry_t
pmap_pte_bti(pmap_t pmap, vm_offset_t va)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(ADDR_IS_CANONICAL(va));

	if (pmap->pm_stage != PM_STAGE1)
		return (0);
	if (pmap == kernel_pmap)
		return (ATTR_KERN_GP);
	if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
		return (ATTR_S1_GP);
	return (0);
}

static void
pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove(pmap->pm_bti, sva, eva);
}

static int
pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
	MPASS(src_pmap->pm_bti != NULL);
	MPASS(dst_pmap->pm_bti != NULL);
	if (src_pmap->pm_bti->rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
}

static void
pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);

	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
	    true);
}

int
pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	if (pmap->pm_bti == NULL)
		return (0);
	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
		return (EINVAL);
	if (pmap->pm_stage != PM_STAGE1)
		return (EINVAL);
	if (eva <= sva || ADDR_IS_KERNEL(eva))
		return (EFAULT);

	sva = trunc_page(sva);
	eva = round_page(eva);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_bti_assign(pmap, sva, eva);
		if (error == 0)
			pmap_bti_update_range(pmap, sva, eva, true);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)
static pd_entry_t	*pmap_san_early_l2;

#define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
#define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
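/*
 * Early (pre-VM) allocators for the shadow map: before the VM system is
 * up, shadow-map memory and page-table pages are carved out of the static
 * buffers below instead of being allocated from the page allocator.
 */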
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map L2 entries",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += L2_SIZE;
	return (addr);
}

/*
 * Bootstrap pages for the SAN shadow map's L1 and L2 tables; L3 entries
 * may be needed here later.
 */
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += (npages * PAGE_SIZE);
	return (addr);
}

static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)
{
	vm_offset_t freemempos;

	/* L1, L2 */
	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
	bs_state.freemempos = freemempos;
	bs_state.va = KASAN_MIN_ADDRESS;
	pmap_bootstrap_l1_table(&bs_state);
	pmap_san_early_l2 = bs_state.l2;
}

static vm_page_t
pmap_san_enter_alloc_l3(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

static vm_page_t
pmap_san_enter_alloc_l2(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
}

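/*
 * Ensure that the shadow-map address va is backed by a valid mapping,
 * allocating page-table pages and shadow memory as needed.  Before
 * pmap_bootstrap() has run (virtual_avail == 0), the bootstrap allocators
 * above are used instead of vm_page_alloc_noobj*().
 */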
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pd_entry_t *l1, *l2;
	pt_entry_t *l3;
	vm_page_t m;

	if (virtual_avail == 0) {
		vm_offset_t block;
		int slot;
		bool first;

		/* Temporary shadow map prior to pmap_bootstrap(). */
		first = pmap_san_early_l2 == NULL;
		if (first)
			pmap_san_enter_bootstrap();

		l2 = pmap_san_early_l2;
		slot = pmap_l2_index(va);

		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
			MPASS(first);
			block = pmap_san_enter_bootstrap_alloc_l2();
			pmap_store(&l2[slot],
			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
			dmb(ishst);
		}

		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	l1 = pmap_l1(kernel_pmap, va);
	MPASS(l1 != NULL);
	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l3();
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	}
	l2 = pmap_l1_to_l2(l1, va);
	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l2();
		if (m != NULL) {
			pmap_store(l2, VM_PAGE_TO_PTE(m) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
		} else {
			m = pmap_san_enter_alloc_l3();
			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
		}
		dmb(ishst);
	}
	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
		return;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
		return;
	m = pmap_san_enter_alloc_l3();
	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
	dmb(ishst);
}
#endif /* KASAN || KMSAN */

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int l3pages;
	int l3contig;
	int l2blocks;
	int l1blocks;
};
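/*
 * With 4 KB base pages these counters record, in declaration order, the
 * number of 4 KB L3 pages, 64 KB contiguous L3 runs, 2 MB L2 blocks, and
 * 1 GB L1 blocks that make up the range.
 */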

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int index;

	if (eva <= range->sva)
		return;

	index = range->attrs & ATTR_S1_IDX_MASK;
	switch (index) {
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
		mode = "DEV-NP";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
		mode = "DEV";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
		mode = "UC";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
		mode = "WB";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
		mode = "WT";
		break;
	default:
		printf(
		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
		    __func__, index, range->sva, eva);
		mode = "??";
		break;
	}

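	/*
	 * One output line per attribute run, e.g. (illustrative values):
	 *
	 *   0xffff000000000000-0xffff000001000000 rw--s-     WB 0 8 0 0
	 *
	 * i.e., the VA range, the permission/guard flags, the memory type,
	 * and the counts of L1 blocks, L2 blocks, contiguous L3 runs, and
	 * L3 pages making up the run.
	 */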
	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
	    (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
	    mode, range->l1blocks, range->l2blocks, range->l3contig,
	    range->l3pages);

	/* Reset to sentinel value. */
	range->sva = 0xfffffffffffffffful;
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{

	return (range->attrs == attrs);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/* Get the block/page attributes that correspond to the table attributes */
static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)
{
	pt_entry_t attrs;

	attrs = 0;
	if ((table & TATTR_UXN_TABLE) != 0)
		attrs |= ATTR_S1_UXN;
	if ((table & TATTR_PXN_TABLE) != 0)
		attrs |= ATTR_S1_PXN;
	if ((table & TATTR_AP_TABLE_RO) != 0)
		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);

	return (attrs);
}

/* Read the block/page attributes we care about */
static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)
{
	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
	    ATTR_S1_GP));
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
    pt_entry_t l3e)
{
	pt_entry_t attrs;

	attrs = sysctl_kmaps_table_attrs(l0e);

	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		attrs |= sysctl_kmaps_block_attrs(l1e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l1e);

	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		attrs |= sysctl_kmaps_block_attrs(l2e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l2e);
	attrs |= sysctl_kmaps_block_attrs(l3e);

done:
	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
	    i++) {
		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");
#ifdef KASAN
		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
			sbuf_printf(sb, "\nKASAN shadow map:\n");
#endif
#ifdef KMSAN
		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN origin map:\n");
#endif

		l0e = kernel_pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L0_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l0e);
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L1_SIZE;
				continue;
			}
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
				    0, 0);
				range.l1blocks++;
				sva += L1_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l1e);
			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += L2_SIZE;
					continue;
				}
				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, 0);
					range.l2blocks++;
					sva += L2_SIZE;
					continue;
				}
				pa = PTE_TO_PHYS(l2e);
				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, l3e);
					if ((l3e & ATTR_CONTIGUOUS) != 0)
						range.l3contig +=
						    l % L3C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l3pages++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
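/*
 * Exported as vm.pmap.kernel_maps.  CTLFLAG_SKIP keeps the node out of a
 * plain "sysctl -a" listing, but it can still be queried by name with
 * "sysctl vm.pmap.kernel_maps".
 */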
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");