1/*	$NetBSD: pmap.c,v 1.151 2024/02/16 21:32:17 andvar Exp $	*/
2
3/*
4 * Copyright (c) 2017 Ryo Shimizu
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
20 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.151 2024/02/16 21:32:17 andvar Exp $");
31
32#include "opt_arm_debug.h"
33#include "opt_cpuoptions.h"
34#include "opt_ddb.h"
35#include "opt_efi.h"
36#include "opt_modular.h"
37#include "opt_multiprocessor.h"
38#include "opt_pmap.h"
39#include "opt_uvmhist.h"
40
41#include <sys/param.h>
42#include <sys/types.h>
43
44#include <sys/asan.h>
45#include <sys/atomic.h>
46#include <sys/cpu.h>
47#include <sys/kmem.h>
48#include <sys/vmem.h>
49
50#include <uvm/uvm.h>
51#include <uvm/pmap/pmap_pvt.h>
52
53#include <arm/cpufunc.h>
54
55#include <aarch64/pmap.h>
56#include <aarch64/pte.h>
57#include <aarch64/armreg.h>
58#include <aarch64/locore.h>
59#include <aarch64/machdep.h>
60#ifdef DDB
61#include <aarch64/db_machdep.h>
62#include <ddb/db_access.h>
63#endif
66
67//#define PMAP_PV_DEBUG
68
69#ifdef VERBOSE_INIT_ARM
70#define VPRINTF(...)	printf(__VA_ARGS__)
71#else
72#define VPRINTF(...)	__nothing
73#endif
74
75#ifdef UVMHIST
76
77#ifndef UVMHIST_PMAPHIST_SIZE
78#define UVMHIST_PMAPHIST_SIZE	(1024 * 4)
79#endif
80
81struct kern_history_ent pmaphistbuf[UVMHIST_PMAPHIST_SIZE];
82UVMHIST_DEFINE(pmaphist) = UVMHIST_INITIALIZER(pmaphist, pmaphistbuf);
83
84static void
85pmap_hist_init(void)
86{
87	static bool inited = false;
88	if (inited == false) {
89		UVMHIST_LINK_STATIC(pmaphist);
90		inited = true;
91	}
92}
93#define PMAP_HIST_INIT()	pmap_hist_init()
94
95#else /* UVMHIST */
96
97#define PMAP_HIST_INIT()	((void)0)
98
99#endif /* UVMHIST */
100
101
102#ifdef PMAPCOUNTERS
103#define PMAP_COUNT(name)		(pmap_evcnt_##name.ev_count++ + 0)
104#define PMAP_COUNTER(name, desc)					\
105	struct evcnt pmap_evcnt_##name =				\
106	    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap", desc);	\
107	EVCNT_ATTACH_STATIC(pmap_evcnt_##name)
108
109PMAP_COUNTER(pdp_alloc_boot, "page table page allocate (uvm_pageboot_alloc)");
110PMAP_COUNTER(pdp_alloc, "page table page allocate (uvm_pagealloc)");
111PMAP_COUNTER(pdp_free, "page table page free (uvm_pagefree)");
112
113PMAP_COUNTER(pv_enter, "pv_entry fill");
114PMAP_COUNTER(pv_remove_dyn, "pv_entry free and unlink dynamic");
115PMAP_COUNTER(pv_remove_emb, "pv_entry clear embedded");
116PMAP_COUNTER(pv_remove_nopv, "no pv_entry found when removing pv");
117
118PMAP_COUNTER(activate, "pmap_activate call");
119PMAP_COUNTER(deactivate, "pmap_deactivate call");
120PMAP_COUNTER(create, "pmap_create call");
121PMAP_COUNTER(destroy, "pmap_destroy call");
122
123PMAP_COUNTER(page_protect, "pmap_page_protect call");
124PMAP_COUNTER(protect, "pmap_protect call");
125PMAP_COUNTER(protect_remove_fallback, "pmap_protect with no-read");
PMAP_COUNTER(protect_none, "pmap_protect on non-existent pages");
127PMAP_COUNTER(protect_managed, "pmap_protect managed pages");
128PMAP_COUNTER(protect_unmanaged, "pmap_protect unmanaged pages");
129PMAP_COUNTER(protect_pvmanaged, "pmap_protect pv-tracked unmanaged pages");
130
131PMAP_COUNTER(clear_modify, "pmap_clear_modify call");
132PMAP_COUNTER(clear_modify_pages, "pmap_clear_modify pages");
133PMAP_COUNTER(clear_reference, "pmap_clear_reference call");
134PMAP_COUNTER(clear_reference_pages, "pmap_clear_reference pages");
135
136PMAP_COUNTER(fixup_referenced, "page reference emulations");
137PMAP_COUNTER(fixup_modified, "page modification emulations");
138
139PMAP_COUNTER(kern_mappings_bad, "kernel pages mapped (bad color)");
140PMAP_COUNTER(kern_mappings_bad_wired, "kernel pages mapped (wired bad color)");
141PMAP_COUNTER(user_mappings_bad, "user pages mapped (bad color, not wired)");
142PMAP_COUNTER(user_mappings_bad_wired, "user pages mapped (bad color, wired)");
143PMAP_COUNTER(kern_mappings, "kernel pages mapped");
144PMAP_COUNTER(user_mappings, "user pages mapped");
145PMAP_COUNTER(user_mappings_changed, "user mapping changed");
146PMAP_COUNTER(kern_mappings_changed, "kernel mapping changed");
147PMAP_COUNTER(uncached_mappings, "uncached pages mapped");
148PMAP_COUNTER(unmanaged_mappings, "unmanaged pages mapped");
149PMAP_COUNTER(pvmanaged_mappings, "pv-tracked unmanaged pages mapped");
150PMAP_COUNTER(managed_mappings, "managed pages mapped");
151PMAP_COUNTER(mappings, "pages mapped (including remapped)");
152PMAP_COUNTER(remappings, "pages remapped");
153
154PMAP_COUNTER(pv_entry_cannotalloc, "pv_entry allocation failure");
155
156PMAP_COUNTER(unwire, "pmap_unwire call");
157PMAP_COUNTER(unwire_failure, "pmap_unwire failure");
158
159#else /* PMAPCOUNTERS */
160#define PMAP_COUNT(name)		__nothing
161#endif /* PMAPCOUNTERS */
162
163/*
164 * invalidate TLB entry for ASID and VA.
165 */
166#define AARCH64_TLBI_BY_ASID_VA(asid, va)			\
167	do {							\
168		if ((asid) == 0)				\
169			aarch64_tlbi_by_va((va));		\
170		else						\
171			aarch64_tlbi_by_asid_va((asid), (va));	\
172	} while (0/*CONSTCOND*/)
173
/*
 * Invalidating the instruction cache requires access permission in the pte.
 * Temporarily make the pte accessible before calling cpu_icache_sync_range().
 * This macro modifies the PTE (*ptep); the caller must update the PTE afterwards.
 */
179#define PTE_ICACHE_SYNC_PAGE(pte, ptep, asid, va)			\
180	do {								\
181		atomic_swap_64((ptep), (pte) | LX_BLKPAG_AF);		\
182		AARCH64_TLBI_BY_ASID_VA((asid), (va));			\
183		cpu_icache_sync_range((va), PAGE_SIZE);			\
184	} while (0/*CONSTCOND*/)
185
186#define VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mdpg_pp)
187
188#define L3INDEXMASK	(L3_SIZE * Ln_ENTRIES - 1)
189#define PDPSWEEP_TRIGGER	512
190
191static pt_entry_t *_pmap_pte_lookup_l3(struct pmap *, vaddr_t);
192static pt_entry_t *_pmap_pte_lookup_bs(struct pmap *, vaddr_t, vsize_t *);
193static pt_entry_t _pmap_pte_adjust_prot(pt_entry_t, vm_prot_t, vm_prot_t, bool);
194static pt_entry_t _pmap_pte_adjust_cacheflags(pt_entry_t, u_int);
195static void _pmap_remove(struct pmap *, vaddr_t, vaddr_t, bool,
196    struct pv_entry **);
197static int _pmap_enter(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int, bool);
198static int _pmap_get_pdp(struct pmap *, vaddr_t, bool, int, paddr_t *,
199    struct vm_page **);
200
201static struct pmap kernel_pmap __cacheline_aligned;
202struct pmap * const kernel_pmap_ptr = &kernel_pmap;
203
204#if defined(EFI_RUNTIME)
205static struct pmap efirt_pmap __cacheline_aligned;
206
207pmap_t
208pmap_efirt(void)
209{
210	return &efirt_pmap;
211}
212#endif
213
214static vaddr_t pmap_maxkvaddr;
215
216vaddr_t virtual_avail, virtual_end;
217vaddr_t virtual_devmap_addr;
218
219static struct pool_cache _pmap_cache;
220static struct pool_cache _pmap_pv_pool;
221
222/* Set to LX_BLKPAG_GP if supported. */
223uint64_t pmap_attr_gp = 0;
224
225static inline void
226pmap_pv_lock(struct pmap_page *pp)
227{
228
229	mutex_enter(&pp->pp_pvlock);
230}
231
232static inline void
233pmap_pv_unlock(struct pmap_page *pp)
234{
235
236	mutex_exit(&pp->pp_pvlock);
237}
238
239
240static inline void
241pm_lock(struct pmap *pm)
242{
243	mutex_enter(&pm->pm_lock);
244}
245
246static inline void
247pm_unlock(struct pmap *pm)
248{
249	mutex_exit(&pm->pm_lock);
250}
251
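/*
 * Acquire pm->pm_lock while pp->pp_pvlock is already held (the reverse of
 * the usual lock order).  If the pmap lock can be taken without blocking,
 * return true with both locks held.  Otherwise temporarily drop the pv lock
 * (holding a reference on the pmap so it cannot be destroyed), wait until
 * the pmap lock becomes available, reacquire the pv lock and return false;
 * in that case the pmap lock is not held and the caller must retry.
 */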
252static bool
253pm_reverse_lock(struct pmap *pm, struct pmap_page *pp)
254{
255
256	KASSERT(mutex_owned(&pp->pp_pvlock));
257
258	if (__predict_true(mutex_tryenter(&pm->pm_lock)))
259		return true;
260
261	if (pm != pmap_kernel())
262		pmap_reference(pm);
263	mutex_exit(&pp->pp_pvlock);
264	mutex_enter(&pm->pm_lock);
265	/* nothing, just wait for lock */
266	mutex_exit(&pm->pm_lock);
267	if (pm != pmap_kernel())
268		pmap_destroy(pm);
269	mutex_enter(&pp->pp_pvlock);
270	return false;
271}
272
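/*
 * Convert a physical address to its pmap_page: the mdpage of a managed
 * vm_page if one exists, otherwise a pv-tracked page when
 * __HAVE_PMAP_PV_TRACK is defined, otherwise NULL.
 */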
273static inline struct pmap_page *
274phys_to_pp(paddr_t pa)
275{
276	struct vm_page *pg;
277
278	pg = PHYS_TO_VM_PAGE(pa);
279	if (pg != NULL)
280		return VM_PAGE_TO_PP(pg);
281
282#ifdef __HAVE_PMAP_PV_TRACK
283	return pmap_pv_tracked(pa);
284#else
285	return NULL;
286#endif /* __HAVE_PMAP_PV_TRACK */
287}
288
289#define IN_RANGE(va, sta, end)	(((sta) <= (va)) && ((va) < (end)))
290
291#define IN_DIRECTMAP_ADDR(va)	\
292	IN_RANGE((va), AARCH64_DIRECTMAP_START, AARCH64_DIRECTMAP_END)
293
294#define	PMAP_EFIVA_P(va) \
295     IN_RANGE((va), EFI_RUNTIME_VA, EFI_RUNTIME_VA + EFI_RUNTIME_SIZE)
296
297#ifdef MODULAR
298#define IN_MODULE_VA(va)	IN_RANGE((va), module_start, module_end)
299#else
300#define IN_MODULE_VA(va)	false
301#endif
302
303#ifdef DIAGNOSTIC
304
305#define KERNEL_ADDR_P(va)						\
306    (IN_RANGE((va), VM_MIN_KERNEL_ADDRESS,  VM_MAX_KERNEL_ADDRESS) ||	\
307     PMAP_EFIVA_P(va))
308
309#define KASSERT_PM_ADDR(pm, va)						\
310    do {								\
311	int space = aarch64_addressspace(va);				\
312	if ((pm) == pmap_kernel()) {					\
313		KASSERTMSG(space == AARCH64_ADDRSPACE_UPPER,		\
314		    "%s: kernel pm %p: va=%016lx"			\
315		    " is out of upper address space",			\
316		    __func__, (pm), (va));				\
317		KASSERTMSG(KERNEL_ADDR_P(va),				\
318		    "%s: kernel pm %p: va=%016lx"			\
319		    " is not kernel address",				\
320		    __func__, (pm), (va));				\
321	} else {							\
322		KASSERTMSG(space == AARCH64_ADDRSPACE_LOWER,		\
323		    "%s: user pm %p: va=%016lx"				\
324		    " is out of lower address space",			\
325		    __func__, (pm), (va));				\
326		KASSERTMSG(IN_RANGE((va),				\
327		    VM_MIN_ADDRESS, VM_MAX_ADDRESS),			\
328		    "%s: user pm %p: va=%016lx"				\
329		    " is not user address",				\
330		    __func__, (pm), (va));				\
331	}								\
332    } while (0 /* CONSTCOND */)
333#else /* DIAGNOSTIC */
334#define KASSERT_PM_ADDR(pm,va)
335#endif /* DIAGNOSTIC */
336
337
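/*
 * Map a contiguous physical range into the kernel during bootstrap via
 * pmapboot_enter_range().  Returns the mapped size (the requested size
 * rounded up to a page boundary).
 */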
338vsize_t
339pmap_kenter_range(vaddr_t va, paddr_t pa, vsize_t size,
340    vm_prot_t prot, u_int flags)
341{
342	pt_entry_t attr;
343	vsize_t resid = round_page(size);
344
345	attr = _pmap_pte_adjust_prot(0, prot, VM_PROT_ALL, false);
346	attr = _pmap_pte_adjust_cacheflags(attr, flags);
347	pmapboot_enter_range(va, pa, resid, attr, printf);
348
349	return resid;
350}
351
352
353void
354pmap_bootstrap(vaddr_t vstart, vaddr_t vend)
355{
356	struct pmap *kpm;
357	pd_entry_t *l0;
358	paddr_t l0pa;
359
360	PMAP_HIST_INIT();	/* init once */
361
362	UVMHIST_FUNC(__func__);
363	UVMHIST_CALLARGS(pmaphist, "vstart=%#jx vend=%#jx", (uintptr_t)vstart,
364	    (uintptr_t)vend, 0, 0);
365
366	uvmexp.ncolors = aarch64_cache_vindexsize / PAGE_SIZE;
367
	/* does the devmap region already occupy the top of kernel VA? */
369	if (virtual_devmap_addr != 0 && virtual_devmap_addr < vend)
370		vend = virtual_devmap_addr;
371
372	virtual_avail = vstart;
373	virtual_end = vend;
374	pmap_maxkvaddr = vstart;
375
376	l0pa = reg_ttbr1_el1_read();
377	l0 = (void *)AARCH64_PA_TO_KVA(l0pa);
378
379	pmap_tlb_info_init(&pmap_tlb0_info);
380
381	memset(&kernel_pmap, 0, sizeof(kernel_pmap));
382
383	kpm = pmap_kernel();
384	struct pmap_asid_info * const pai = PMAP_PAI(kpm, cpu_tlb_info(ci));
385
386	pai->pai_asid = KERNEL_PID;
387	kpm->pm_refcnt = 1;
388	kpm->pm_idlepdp = 0;
389	kpm->pm_l0table = l0;
390	kpm->pm_l0table_pa = l0pa;
391	kpm->pm_onproc = kcpuset_running;
392	kpm->pm_active = kcpuset_running;
393	kpm->pm_activated = true;
394	LIST_INIT(&kpm->pm_vmlist);
395	LIST_INIT(&kpm->pm_pvlist);	/* not used for kernel pmap */
396	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
397
398	CTASSERT(sizeof(kpm->pm_stats.wired_count) == sizeof(long));
399	CTASSERT(sizeof(kpm->pm_stats.resident_count) == sizeof(long));
400
401#if defined(EFI_RUNTIME)
402	memset(&efirt_pmap, 0, sizeof(efirt_pmap));
403	struct pmap * const efipm = &efirt_pmap;
404	struct pmap_asid_info * const efipai = PMAP_PAI(efipm, cpu_tlb_info(ci));
405
406	efipai->pai_asid = KERNEL_PID;
407	efipm->pm_refcnt = 1;
408
409	vaddr_t efi_l0va = uvm_pageboot_alloc(Ln_TABLE_SIZE);
410	KASSERT((efi_l0va & PAGE_MASK) == 0);
411
412	efipm->pm_l0table = (pd_entry_t *)efi_l0va;
413	memset(efipm->pm_l0table, 0, Ln_TABLE_SIZE);
414
415	efipm->pm_l0table_pa = AARCH64_KVA_TO_PA(efi_l0va);
416
417	efipm->pm_activated = false;
418	LIST_INIT(&efipm->pm_vmlist);
419	LIST_INIT(&efipm->pm_pvlist);	/* not used for efi pmap */
420	mutex_init(&efipm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
421#endif
422}
423
424#ifdef MULTIPROCESSOR
425void
426pmap_md_tlb_info_attach(struct pmap_tlb_info *ti, struct cpu_info *ci)
427{
428	/* nothing */
429}
430#endif /* MULTIPROCESSOR */
431
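/*
 * Adjust the pmap's wired/resident page counters.  The kernel pmap is not
 * protected by pm_lock, so its counters are updated atomically; user pmap
 * counters are updated under pm_lock.
 */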
432static inline void
433_pmap_adj_wired_count(struct pmap *pm, int adj)
434{
435
436	if (pm == pmap_kernel()) {
437		atomic_add_long(&pm->pm_stats.wired_count, adj);
438	} else {
439		KASSERT(mutex_owned(&pm->pm_lock));
440		pm->pm_stats.wired_count += adj;
441	}
442}
443
444static inline void
445_pmap_adj_resident_count(struct pmap *pm, int adj)
446{
447
448	if (pm == pmap_kernel()) {
449		atomic_add_long(&pm->pm_stats.resident_count, adj);
450	} else {
451		KASSERT(mutex_owned(&pm->pm_lock));
452		pm->pm_stats.resident_count += adj;
453	}
454}
455
456inline static int
457_pmap_color(vaddr_t addr)	/* or paddr_t */
458{
459	return (addr >> PGSHIFT) & (uvmexp.ncolors - 1);
460}
461
462static int
463_pmap_pmap_ctor(void *arg, void *v, int flags)
464{
465	memset(v, 0, sizeof(struct pmap));
466	return 0;
467}
468
469static int
470_pmap_pv_ctor(void *arg, void *v, int flags)
471{
472	memset(v, 0, sizeof(struct pv_entry));
473	return 0;
474}
475
476pd_entry_t *
477pmap_l0table(struct pmap *pm)
478{
479
480	return pm->pm_l0table;
481}
482
483void
484pmap_init(void)
485{
486
487	pool_cache_bootstrap(&_pmap_cache, sizeof(struct pmap),
488	    coherency_unit, 0, 0, "pmappl", NULL, IPL_NONE, _pmap_pmap_ctor,
489	    NULL, NULL);
490
491	pool_cache_bootstrap(&_pmap_pv_pool, sizeof(struct pv_entry),
492	    32, 0, PR_LARGECACHE, "pvpl", NULL, IPL_NONE, _pmap_pv_ctor,
493	    NULL, NULL);
494
495	pmap_tlb_info_evcnt_attach(&pmap_tlb0_info);
496}
497
498void
499pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
500{
501	*vstartp = virtual_avail;
502	*vendp = virtual_end;
503}
504
505vaddr_t
506pmap_steal_memory(vsize_t size, vaddr_t *vstartp, vaddr_t *vendp)
507{
508	int npage;
509	paddr_t pa;
510	vaddr_t va;
511	psize_t bank_npage;
512	uvm_physseg_t bank;
513
514	UVMHIST_FUNC(__func__);
515	UVMHIST_CALLARGS(pmaphist, "size=%llu, *vstartp=%llx, *vendp=%llx",
516	    size, *vstartp, *vendp, 0);
517
518	size = round_page(size);
519	npage = atop(size);
520
521	for (bank = uvm_physseg_get_first(); uvm_physseg_valid_p(bank);
522	    bank = uvm_physseg_get_next(bank)) {
523
524		bank_npage = uvm_physseg_get_avail_end(bank) -
525		    uvm_physseg_get_avail_start(bank);
526		if (npage <= bank_npage)
527			break;
528	}
529
530	if (!uvm_physseg_valid_p(bank)) {
531		panic("%s: no memory", __func__);
532	}
533
534	/* Steal pages */
535	pa = ptoa(uvm_physseg_get_avail_start(bank));
536	va = AARCH64_PA_TO_KVA(pa);
537	uvm_physseg_unplug(atop(pa), npage);
538
539	for (; npage > 0; npage--, pa += PAGE_SIZE)
540		pmap_zero_page(pa);
541
542	return va;
543}
544
545void
546pmap_reference(struct pmap *pm)
547{
548	atomic_inc_uint(&pm->pm_refcnt);
549}
550
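/*
 * Allocate a page descriptor (page table) page.  Once uvm is initialized
 * the page comes from uvm_pagealloc() and is linked onto pm_vmlist with a
 * wire_count of 1; before that it is taken from uvm_pageboot_alloc() and
 * has no vm_page.  Returns the physical address, or POOL_PADDR_INVALID on
 * failure.
 */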
551static paddr_t
552pmap_alloc_pdp(struct pmap *pm, struct vm_page **pgp, int flags, bool waitok)
553{
554	paddr_t pa;
555	struct vm_page *pg;
556
557	UVMHIST_FUNC(__func__);
558	UVMHIST_CALLARGS(pmaphist, "pm=%p, flags=%08x, waitok=%d",
559	    pm, flags, waitok, 0);
560
561	if (uvm.page_init_done) {
562		int aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
563		    UVM_PGA_ZERO;
564 retry:
565		pg = uvm_pagealloc(NULL, 0, NULL, aflags);
566		if (pg == NULL) {
567			if (waitok) {
568				uvm_wait("pmap_alloc_pdp");
569				goto retry;
570			}
571			return POOL_PADDR_INVALID;
572		}
573
574		LIST_INSERT_HEAD(&pm->pm_vmlist, pg, pageq.list);
575		pg->flags &= ~PG_BUSY;	/* never busy */
576		pg->wire_count = 1;	/* max = 1 + Ln_ENTRIES = 513 */
577		pa = VM_PAGE_TO_PHYS(pg);
578		PMAP_COUNT(pdp_alloc);
579		PMAP_PAGE_INIT(VM_PAGE_TO_PP(pg));
580	} else {
581		/* uvm_pageboot_alloc() returns a direct mapping address */
582		pg = NULL;
583		pa = AARCH64_KVA_TO_PA(
584		    uvm_pageboot_alloc(Ln_TABLE_SIZE));
585		PMAP_COUNT(pdp_alloc_boot);
586	}
587	if (pgp != NULL)
588		*pgp = pg;
589
590	UVMHIST_LOG(pmaphist, "pa=%llx, pg=%llx",
591	    pa, pg, 0, 0);
592
593	return pa;
594}
595
596static void
597pmap_free_pdp(struct pmap *pm, struct vm_page *pg)
598{
599
600	KASSERT(pm != pmap_kernel());
601	KASSERT(VM_PAGE_TO_PP(pg)->pp_pv.pv_pmap == NULL);
602	KASSERT(VM_PAGE_TO_PP(pg)->pp_pv.pv_next == NULL);
603
604	LIST_REMOVE(pg, pageq.list);
605	pg->wire_count = 0;
606	uvm_pagefree(pg);
607	PMAP_COUNT(pdp_free);
608}
609
610/* free empty page table pages */
611static void
612_pmap_sweep_pdp(struct pmap *pm)
613{
614	struct vm_page *pg, *tmp;
615	pd_entry_t *ptep_in_parent, opte __diagused;
616	paddr_t pa, pdppa;
617	uint16_t wirecount __diagused;
618
619	KASSERT(mutex_owned(&pm->pm_lock) || pm->pm_refcnt == 0);
620
621	LIST_FOREACH_SAFE(pg, &pm->pm_vmlist, pageq.list, tmp) {
622		if (pg->wire_count != 1)
623			continue;
624
625		pa = VM_PAGE_TO_PHYS(pg);
626		if (pa == pm->pm_l0table_pa)
627			continue;
628
629		ptep_in_parent = VM_PAGE_TO_MD(pg)->mdpg_ptep_parent;
630		if (ptep_in_parent == NULL) {
631			/* no parent */
632			pmap_free_pdp(pm, pg);
633			continue;
634		}
635
636		/* unlink from parent */
637		opte = atomic_swap_64(ptep_in_parent, 0);
638		KASSERT(lxpde_valid(opte));
639		wirecount = --pg->wire_count; /* 1 -> 0 */
640		KASSERT(wirecount == 0);
641		pmap_free_pdp(pm, pg);
642
643		/* L3->L2->L1. no need for L0 */
644		pdppa = AARCH64_KVA_TO_PA(trunc_page((vaddr_t)ptep_in_parent));
645		if (pdppa == pm->pm_l0table_pa)
646			continue;
647
648		pg = PHYS_TO_VM_PAGE(pdppa);
649		KASSERT(pg != NULL);
650		KASSERTMSG(pg->wire_count >= 1,
651		    "wire_count=%d", pg->wire_count);
652		/* decrement wire_count of parent */
653		wirecount = --pg->wire_count;
654		KASSERTMSG(pg->wire_count <= (Ln_ENTRIES + 1),
655		    "pm=%p, pg=%p, wire_count=%d",
656		    pm, pg, pg->wire_count);
657	}
658	pm->pm_idlepdp = 0;
659}
660
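/*
 * Free every page descriptor page belonging to the pmap; the L0 table page
 * is kept unless free_l0 is true.
 */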
661static void
662_pmap_free_pdp_all(struct pmap *pm, bool free_l0)
663{
664	struct vm_page *pg, *pgtmp, *pg_reserve;
665
666	pg_reserve = free_l0 ? NULL : PHYS_TO_VM_PAGE(pm->pm_l0table_pa);
667	LIST_FOREACH_SAFE(pg, &pm->pm_vmlist, pageq.list, pgtmp) {
668		if (pg == pg_reserve)
669			continue;
670		pmap_free_pdp(pm, pg);
671	}
672}
673
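/*
 * Grow the kernel page tables: allocate the intermediate page table pages
 * needed to cover kernel VA up to maxkvaddr and extend the KASAN shadow
 * accordingly.
 */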
674vaddr_t
675pmap_growkernel(vaddr_t maxkvaddr)
676{
677	struct pmap *pm = pmap_kernel();
678	struct vm_page *pg;
679	int error;
680	vaddr_t va;
681	paddr_t pa;
682
683	UVMHIST_FUNC(__func__);
684	UVMHIST_CALLARGS(pmaphist, "maxkvaddr=%llx, pmap_maxkvaddr=%llx",
685	    maxkvaddr, pmap_maxkvaddr, 0, 0);
686
687	mutex_enter(&pm->pm_lock);
688	for (va = pmap_maxkvaddr & L2_FRAME; va <= maxkvaddr; va += L2_SIZE) {
689		error = _pmap_get_pdp(pm, va, false, 0, &pa, &pg);
690		if (error != 0) {
691			panic("%s: cannot allocate L3 table error=%d",
692			    __func__, error);
693		}
694	}
695	kasan_shadow_map((void *)pmap_maxkvaddr,
696	    (size_t)(va - pmap_maxkvaddr));
697	pmap_maxkvaddr = va;
698	mutex_exit(&pm->pm_lock);
699
700	return va;
701}
702
703bool
704pmap_extract(struct pmap *pm, vaddr_t va, paddr_t *pap)
705{
706
707	return pmap_extract_coherency(pm, va, pap, NULL);
708}
709
710bool
711pmap_extract_coherency(struct pmap *pm, vaddr_t va, paddr_t *pap,
712    bool *coherencyp)
713{
714	pt_entry_t *ptep, pte;
715	paddr_t pa;
716	vsize_t blocksize = 0;
717	int space;
718	bool coherency, valid;
719	extern char __kernel_text[];
720	extern char _end[];
721
722	coherency = false;
723
724	space = aarch64_addressspace(va);
725	if (pm == pmap_kernel()) {
726		if (space != AARCH64_ADDRSPACE_UPPER)
727			return false;
728
729		if (IN_RANGE(va, (vaddr_t)__kernel_text, (vaddr_t)_end)) {
730			/* kernel text/data/bss are definitely linear mapped */
731			pa = KERN_VTOPHYS(va);
732			goto mapped;
733		} else if (IN_DIRECTMAP_ADDR(va)) {
			/*
			 * The direct map is also linearly mapped, but ranges
			 * with no physical memory behind them are not mapped.
			 * Do a fast lookup using the AT S1E1R instruction and
			 * the PAR_EL1 register.
			 */
739			register_t s = daif_disable(DAIF_I | DAIF_F);
740			reg_s1e1r_write(va);
741			isb();
742			uint64_t par = reg_par_el1_read();
743			reg_daif_write(s);
744
745			if (par & PAR_F)
746				return false;
747			pa = (__SHIFTOUT(par, PAR_PA) << PAR_PA_SHIFT) +
748			    (va & __BITS(PAR_PA_SHIFT - 1, 0));
749			goto mapped;
750		}
751	} else {
752		if (space != AARCH64_ADDRSPACE_LOWER)
753			return false;
754	}
755
	/*
	 * Other areas cannot be examined via the PAR_EL1 register because
	 * the page may be in an access-fault state due to reference-bit
	 * emulation, so walk the page tables instead.
	 */
761	if (pm != pmap_kernel())
762		mutex_enter(&pm->pm_lock);
763	ptep = _pmap_pte_lookup_bs(pm, va, &blocksize);
764	valid = (ptep != NULL && lxpde_valid(pte = *ptep));
765	if (pm != pmap_kernel())
766		mutex_exit(&pm->pm_lock);
767
768	if (!valid) {
769		return false;
770	}
771
772	pa = lxpde_pa(pte) + (va & (blocksize - 1));
773
774	switch (pte & LX_BLKPAG_ATTR_MASK) {
775	case LX_BLKPAG_ATTR_NORMAL_NC:
776	case LX_BLKPAG_ATTR_DEVICE_MEM:
777	case LX_BLKPAG_ATTR_DEVICE_MEM_NP:
778		coherency = true;
779		break;
780	}
781
782 mapped:
783	if (pap != NULL)
784		*pap = pa;
785	if (coherencyp != NULL)
786		*coherencyp = coherency;
787	return true;
788}
789
790paddr_t
791vtophys(vaddr_t va)
792{
793	struct pmap *pm;
794	paddr_t pa;
795
796	/* even if TBI is disabled, AARCH64_ADDRTOP_TAG means KVA */
797	if ((uint64_t)va & AARCH64_ADDRTOP_TAG)
798		pm = pmap_kernel();
799	else
800		pm = curlwp->l_proc->p_vmspace->vm_map.pmap;
801
802	if (pmap_extract(pm, va, &pa) == false)
803		return VTOPHYS_FAILED;
804	return pa;
805}
806
/*
 * Return a pointer to the pte for va, regardless of whether the entry is
 * valid.  The block size covered by the returned entry is stored in *bs.
 */
810static pt_entry_t *
811_pmap_pte_lookup_bs(struct pmap *pm, vaddr_t va, vsize_t *bs)
812{
813	pt_entry_t *ptep;
814	pd_entry_t *l0, *l1, *l2, *l3;
815	pd_entry_t pde;
816	vsize_t blocksize;
817	unsigned int idx;
818
819	KASSERT(pm == pmap_kernel() || mutex_owned(&pm->pm_lock));
820
821	/*
822	 * traverse L0 -> L1 -> L2 -> L3
823	 */
824	blocksize = L0_SIZE;
825	l0 = pm->pm_l0table;
826	idx = l0pde_index(va);
827	ptep = &l0[idx];
828	pde = *ptep;
829	if (!l0pde_valid(pde))
830		goto done;
831
832	blocksize = L1_SIZE;
833	l1 = (pd_entry_t *)AARCH64_PA_TO_KVA(l0pde_pa(pde));
834	idx = l1pde_index(va);
835	ptep = &l1[idx];
836	pde = *ptep;
837	if (!l1pde_valid(pde) || l1pde_is_block(pde))
838		goto done;
839
840	blocksize = L2_SIZE;
841	l2 = (pd_entry_t *)AARCH64_PA_TO_KVA(l1pde_pa(pde));
842	idx = l2pde_index(va);
843	ptep = &l2[idx];
844	pde = *ptep;
845	if (!l2pde_valid(pde) || l2pde_is_block(pde))
846		goto done;
847
848	blocksize = L3_SIZE;
849	l3 = (pd_entry_t *)AARCH64_PA_TO_KVA(l2pde_pa(pde));
850	idx = l3pte_index(va);
851	ptep = &l3[idx];
852
853 done:
854	if (bs != NULL)
855		*bs = blocksize;
856	return ptep;
857}
858
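/*
 * Return a pointer to the L3 pte for va, or NULL if va is not covered by
 * an L3 table (e.g. it is mapped by an L1/L2 block or not mapped at all).
 */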
859static pt_entry_t *
860_pmap_pte_lookup_l3(struct pmap *pm, vaddr_t va)
861{
862	pt_entry_t *ptep;
863	vsize_t blocksize = 0;
864
865	ptep = _pmap_pte_lookup_bs(pm, va, &blocksize);
866	if ((ptep != NULL) && (blocksize == L3_SIZE))
867		return ptep;
868
869	return NULL;
870}
871
872void
873pmap_icache_sync_range(pmap_t pm, vaddr_t sva, vaddr_t eva)
874{
875	pt_entry_t *ptep = NULL, pte;
876	vaddr_t va;
877	vsize_t blocksize = 0;
878
879	KASSERT_PM_ADDR(pm, sva);
880
881	pm_lock(pm);
882
883	for (va = sva; va < eva; va = (va + blocksize) & ~(blocksize - 1)) {
		/* does va belong to the same L3 table as in the previous iteration? */
885		if ((blocksize == L3_SIZE) && ((va & L3INDEXMASK) != 0)) {
886			ptep++;
887		} else {
888			ptep = _pmap_pte_lookup_bs(pm, va, &blocksize);
889			if (ptep == NULL)
890				break;
891		}
892
893		pte = *ptep;
894		if (!lxpde_valid(pte))
895			continue;
896
897		vaddr_t eob = (va + blocksize) & ~(blocksize - 1);
898		vsize_t len = ulmin(eva, eob) - va;
899
900		if (l3pte_readable(pte)) {
901			cpu_icache_sync_range(va, len);
902		} else {
			/*
			 * temporarily make the page accessible so that
			 * cpu_icache_sync_range() can be performed
			 */
907			struct pmap_asid_info * const pai = PMAP_PAI(pm,
908			    cpu_tlb_info(ci));
909
910			atomic_swap_64(ptep, pte | LX_BLKPAG_AF);
911			AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
912			cpu_icache_sync_range(va, len);
913			atomic_swap_64(ptep, pte);
914			AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
915		}
916	}
917
918	pm_unlock(pm);
919}
920
921/*
922 * Routine:	pmap_procwr
923 *
924 * Function:
925 *	Synchronize caches corresponding to [addr, addr+len) in p.
926 *
927 */
928void
929pmap_procwr(struct proc *p, vaddr_t sva, int len)
930{
931
932	if (__predict_true(p == curproc))
933		cpu_icache_sync_range(sva, len);
934	else {
935		struct pmap *pm = p->p_vmspace->vm_map.pmap;
936		paddr_t pa;
937		vaddr_t va, eva;
938		int tlen;
939
940		for (va = sva; len > 0; va = eva, len -= tlen) {
941			eva = uimin(va + len, trunc_page(va + PAGE_SIZE));
942			tlen = eva - va;
943			if (!pmap_extract(pm, va, &pa))
944				continue;
945			va = AARCH64_PA_TO_KVA(pa);
946			cpu_icache_sync_range(va, tlen);
947		}
948	}
949}
950
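/*
 * Compute the access bits of a pte.  'prot' is the protection the mapping
 * should ultimately have and 'refmod' the access currently permitted by the
 * referenced/modified state; the AF and AP bits are derived from their
 * intersection so that the first reference/modification can be trapped and
 * emulated, while the full protection is kept in the OS_{READ,WRITE}
 * software bits.
 */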
951static pt_entry_t
952_pmap_pte_adjust_prot(pt_entry_t pte, vm_prot_t prot, vm_prot_t refmod,
953    bool user)
954{
955	vm_prot_t masked;
956	pt_entry_t xn;
957
958	masked = prot & refmod;
959	pte &= ~(LX_BLKPAG_OS_RWMASK | LX_BLKPAG_AF | LX_BLKPAG_DBM | LX_BLKPAG_AP);
960
961	/*
962	 * keep actual prot in the pte as OS_{READ|WRITE} for ref/mod emulation,
963	 * and set the DBM bit for HAFDBS if it has write permission.
964	 */
965	pte |= LX_BLKPAG_OS_READ;	/* a valid pte can always be readable */
966	if (prot & VM_PROT_WRITE)
967		pte |= LX_BLKPAG_OS_WRITE | LX_BLKPAG_DBM;
968
969	switch (masked & (VM_PROT_READ | VM_PROT_WRITE)) {
970	case 0:
971	default:
972		/*
973		 * it cannot be accessed because there is no AF bit,
974		 * but the AF bit will be added by fixup() or HAFDBS.
975		 */
976		pte |= LX_BLKPAG_AP_RO;
977		break;
978	case VM_PROT_READ:
979		/*
980		 * as it is RO, it cannot be written as is,
981		 * but it may be changed to RW by fixup() or HAFDBS.
982		 */
983		pte |= LX_BLKPAG_AF;
984		pte |= LX_BLKPAG_AP_RO;
985		break;
986	case VM_PROT_WRITE:
987	case VM_PROT_READ | VM_PROT_WRITE:
988		/* fully readable and writable */
989		pte |= LX_BLKPAG_AF;
990		pte |= LX_BLKPAG_AP_RW;
991		break;
992	}
993
	/* start with execute-never for both user (UXN) and kernel (PXN)... */
	pte |= (LX_BLKPAG_UXN | LX_BLKPAG_PXN);
	/* ...then allow execution on the appropriate side if requested */
997	xn = user ? LX_BLKPAG_UXN : LX_BLKPAG_PXN;
998	if (prot & VM_PROT_EXECUTE)
999		pte &= ~xn;
1000
1001	return pte;
1002}
1003
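/* set the memory attribute (cacheability) bits of a pte from the PMAP_* flags */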
1004static pt_entry_t
1005_pmap_pte_adjust_cacheflags(pt_entry_t pte, u_int flags)
1006{
1007
1008	pte &= ~LX_BLKPAG_ATTR_MASK;
1009
1010	switch (flags & (PMAP_CACHE_MASK | PMAP_DEV_MASK)) {
1011	case PMAP_DEV_NP ... PMAP_DEV_NP | PMAP_CACHE_MASK:
1012		pte |= LX_BLKPAG_ATTR_DEVICE_MEM_NP;	/* Device-nGnRnE */
1013		break;
1014	case PMAP_DEV ... PMAP_DEV | PMAP_CACHE_MASK:
1015		pte |= LX_BLKPAG_ATTR_DEVICE_MEM;	/* Device-nGnRE */
1016		break;
1017	case PMAP_NOCACHE:
1018	case PMAP_NOCACHE_OVR:
1019	case PMAP_WRITE_COMBINE:
1020		pte |= LX_BLKPAG_ATTR_NORMAL_NC;	/* only no-cache */
1021		break;
1022	case PMAP_WRITE_BACK:
1023	case 0:
1024	default:
1025		pte |= LX_BLKPAG_ATTR_NORMAL_WB;
1026		break;
1027	}
1028
1029	return pte;
1030}
1031
1032#ifdef ARMV81_HAFDBS
1033static inline void
1034_pmap_reflect_refmod_in_pp(pt_entry_t pte, struct pmap_page *pp)
1035{
1036	if (!lxpde_valid(pte))
1037		return;
1038
	/*
	 * To retain the referenced/modified information recorded by the
	 * hardware, propagate it from the pte into the pmap_page.
	 */
1043	if (pte & LX_BLKPAG_AF)
1044		pp->pp_pv.pv_va |= VM_PROT_READ;
1045	if ((pte & LX_BLKPAG_AP) == LX_BLKPAG_AP_RW)
1046		pp->pp_pv.pv_va |= VM_PROT_WRITE;
1047}
1048#endif
1049
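/*
 * Unlink the pv_entry for (pm, va) from the pmap_page's list.  A
 * dynamically allocated entry is returned so the caller can free it after
 * dropping the locks; NULL is returned when the embedded entry was used or
 * no matching entry was found.
 */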
1050static struct pv_entry *
1051_pmap_remove_pv(struct pmap_page *pp, struct pmap *pm, vaddr_t va,
1052    pt_entry_t pte)
1053{
1054	struct pv_entry *pv, *ppv;
1055
1056	UVMHIST_FUNC(__func__);
1057	UVMHIST_CALLARGS(pmaphist, "pp=%p, pm=%p, va=%llx, pte=%llx",
1058	    pp, pm, va, pte);
1059
1060	KASSERT(mutex_owned(&pm->pm_lock));	/* for pv_proc */
1061	KASSERT(mutex_owned(&pp->pp_pvlock));
1062
1063#ifdef ARMV81_HAFDBS
1064	if (aarch64_hafdbs_enabled != ID_AA64MMFR1_EL1_HAFDBS_NONE)
1065		_pmap_reflect_refmod_in_pp(pte, pp);
1066#endif
1067
1068	for (ppv = NULL, pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
1069		if (pv->pv_pmap == pm && trunc_page(pv->pv_va) == va) {
1070			break;
1071		}
1072		ppv = pv;
1073	}
1074
1075	if (pm != pmap_kernel() && pv != NULL)
1076		LIST_REMOVE(pv, pv_proc);
1077
1078	if (ppv == NULL) {
1079		/* embedded in pmap_page */
1080		pv->pv_pmap = NULL;
1081		pv = NULL;
1082		PMAP_COUNT(pv_remove_emb);
1083	} else if (pv != NULL) {
1084		/* dynamically allocated */
1085		ppv->pv_next = pv->pv_next;
1086		PMAP_COUNT(pv_remove_dyn);
1087	} else {
1088		PMAP_COUNT(pv_remove_nopv);
1089	}
1090
1091	return pv;
1092}
1093
1094#if defined(PMAP_PV_DEBUG) || defined(DDB)
1095
1096static char *
1097str_vmflags(uint32_t flags)
1098{
1099	static int idx = 0;
1100	static char buf[4][32];	/* XXX */
1101	char *p;
1102
1103	p = buf[idx];
1104	idx = (idx + 1) & 3;
1105
1106	p[0] = (flags & VM_PROT_READ) ? 'R' : '-';
1107	p[1] = (flags & VM_PROT_WRITE) ? 'W' : '-';
1108	p[2] = (flags & VM_PROT_EXECUTE) ? 'X' : '-';
1109	if (flags & PMAP_WIRED)
1110		memcpy(&p[3], ",WIRED\0", 7);
1111	else
1112		p[3] = '\0';
1113
1114	return p;
1115}
1116
1117void
1118pmap_db_mdpg_print(struct vm_page *pg, void (*pr)(const char *, ...) __printflike(1, 2))
1119{
1120	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
1121	struct pv_entry *pv;
1122	int i, flags;
1123
1124	i = 0;
1125	flags = pp->pp_pv.pv_va & (PAGE_SIZE - 1);
1126
1127	pr("pp=%p\n", pp);
1128	pr(" pp flags=%08x %s\n", flags, str_vmflags(flags));
1129
1130	for (pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
1131		if (pv->pv_pmap == NULL) {
1132			KASSERT(pv == &pp->pp_pv);
1133			continue;
1134		}
1135		struct pmap * const pm = pv->pv_pmap;
1136		struct pmap_asid_info * const pai = PMAP_PAI(pm,
1137		    cpu_tlb_info(ci));
1138
1139		pr("  pv[%d] pv=%p\n", i, pv);
1140		pr("    pv[%d].pv_pmap = %p (asid=%d)\n", i, pm, pai->pai_asid);
1141		pr("    pv[%d].pv_va   = %016lx (color=%d)\n", i,
1142		    trunc_page(pv->pv_va), _pmap_color(pv->pv_va));
1143		pr("    pv[%d].pv_ptep = %p\n", i, pv->pv_ptep);
1144		i++;
1145	}
1146}
1147#endif /* PMAP_PV_DEBUG & DDB */
1148
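/*
 * Record the mapping (pm, va) on the pmap_page.  The pv_entry embedded in
 * the pmap_page is used for the first mapping; additional mappings consume
 * the pre-allocated entry passed in *pvp.  Returns ENOMEM if a new entry is
 * needed but none was supplied.
 */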
1149static int
1150_pmap_enter_pv(struct pmap_page *pp, struct pmap *pm, struct pv_entry **pvp,
1151    vaddr_t va, pt_entry_t *ptep, paddr_t pa, u_int flags)
1152{
1153	struct pv_entry *pv;
1154
1155	UVMHIST_FUNC(__func__);
1156	UVMHIST_CALLARGS(pmaphist, "pp=%p, pm=%p, va=%llx, pa=%llx", pp, pm, va,
1157	    pa);
1158	UVMHIST_LOG(pmaphist, "ptep=%p, flags=%08x", ptep, flags, 0, 0);
1159
1160	KASSERT(mutex_owned(&pp->pp_pvlock));
1161	KASSERT(trunc_page(va) == va);
1162
	/*
	 * The caller guarantees that no mapping is registered at this VA yet.
	 */
1166	if (pp->pp_pv.pv_pmap == NULL) {
1167		/*
1168		 * claim pv_entry embedded in pmap_page.
1169		 * take care not to wipe out acc/mod flags.
1170		 */
1171		pv = &pp->pp_pv;
1172		pv->pv_va = (pv->pv_va & (PAGE_SIZE - 1)) | va;
1173	} else {
1174		/*
1175		 * create and link new pv.
1176		 * pv is already allocated at beginning of _pmap_enter().
1177		 */
1178		pv = *pvp;
1179		if (pv == NULL)
1180			return ENOMEM;
1181		*pvp = NULL;
1182		pv->pv_next = pp->pp_pv.pv_next;
1183		pp->pp_pv.pv_next = pv;
1184		pv->pv_va = va;
1185	}
1186	pv->pv_pmap = pm;
1187	pv->pv_ptep = ptep;
1188	PMAP_COUNT(pv_enter);
1189
1190	if (pm != pmap_kernel())
1191		LIST_INSERT_HEAD(&pm->pm_pvlist, pv, pv_proc);
1192
1193#ifdef PMAP_PV_DEBUG
1194	printf("pv %p alias added va=%016lx -> pa=%016lx\n", pv, va, pa);
1195	pmap_db_mdpg_print(PHYS_TO_VM_PAGE(pa), printf);
1196#endif
1197
1198	return 0;
1199}
1200
1201void
1202pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1203{
1204
1205	_pmap_enter(pmap_kernel(), va, pa, prot, flags | PMAP_WIRED, true);
1206}
1207
1208void
1209pmap_kremove(vaddr_t va, vsize_t size)
1210{
1211	struct pmap *kpm = pmap_kernel();
1212
1213	UVMHIST_FUNC(__func__);
1214	UVMHIST_CALLARGS(pmaphist, "va=%llx, size=%llx", va, size, 0, 0);
1215
1216	KDASSERT((va & PGOFSET) == 0);
1217	KDASSERT((size & PGOFSET) == 0);
1218
1219	KDASSERT(!IN_DIRECTMAP_ADDR(va));
1220	KDASSERT(IN_RANGE(va, VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS));
1221
1222	_pmap_remove(kpm, va, va + size, true, NULL);
1223}
1224
1225static void
1226_pmap_protect_pv(struct pmap_page *pp, struct pv_entry *pv, vm_prot_t prot)
1227{
1228	pt_entry_t *ptep, pte;
1229	vm_prot_t pteprot;
1230	uint32_t mdattr;
1231	const bool user = (pv->pv_pmap != pmap_kernel());
1232
1233	UVMHIST_FUNC(__func__);
1234	UVMHIST_CALLARGS(pmaphist, "pp=%p, pv=%p, prot=%08x", pp, pv, prot, 0);
1235
1236	KASSERT(mutex_owned(&pv->pv_pmap->pm_lock));
1237
1238	ptep = pv->pv_ptep;
1239	pte = *ptep;
1240
1241	/* get prot mask from pte */
1242	pteprot = VM_PROT_READ;	/* a valid pte can always be readable */
1243	if ((pte & (LX_BLKPAG_OS_WRITE | LX_BLKPAG_DBM)) != 0)
1244		pteprot |= VM_PROT_WRITE;
1245	if (l3pte_executable(pte, user))
1246		pteprot |= VM_PROT_EXECUTE;
1247
1248#ifdef ARMV81_HAFDBS
1249	if (aarch64_hafdbs_enabled != ID_AA64MMFR1_EL1_HAFDBS_NONE)
1250		_pmap_reflect_refmod_in_pp(pte, pp);
1251#endif
1252	/* get prot mask from referenced/modified */
1253	mdattr = pp->pp_pv.pv_va & (VM_PROT_READ | VM_PROT_WRITE);
1254
1255	/* new prot = prot & pteprot & mdattr */
1256	pte = _pmap_pte_adjust_prot(pte, prot & pteprot, mdattr, user);
1257	atomic_swap_64(ptep, pte);
1258
1259	struct pmap * const pm = pv->pv_pmap;
1260	struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
1261
1262	AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, trunc_page(pv->pv_va));
1263}
1264
1265void
1266pmap_protect(struct pmap *pm, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1267{
1268	pt_entry_t *ptep = NULL, pte;
1269	vaddr_t va;
1270	vsize_t blocksize = 0;
1271	const bool user = (pm != pmap_kernel());
1272
1273	KASSERT((prot & VM_PROT_READ) || !(prot & VM_PROT_WRITE));
1274
1275	UVMHIST_FUNC(__func__);
1276	UVMHIST_CALLARGS(pmaphist, "pm=%p, sva=%016lx, eva=%016lx, prot=%08x",
1277	    pm, sva, eva, prot);
1278
1279	KASSERT_PM_ADDR(pm, sva);
1280	KASSERT(!IN_DIRECTMAP_ADDR(sva));
1281
1282	/* PROT_EXEC requires implicit PROT_READ */
1283	if (prot & VM_PROT_EXECUTE)
1284		prot |= VM_PROT_READ;
1285
1286	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1287		PMAP_COUNT(protect_remove_fallback);
1288		pmap_remove(pm, sva, eva);
1289		return;
1290	}
1291	PMAP_COUNT(protect);
1292
1293	KDASSERT((sva & PAGE_MASK) == 0);
1294	KDASSERT((eva & PAGE_MASK) == 0);
1295
1296	pm_lock(pm);
1297
1298	for (va = sva; va < eva; va = (va + blocksize) & ~(blocksize - 1)) {
1299#ifdef UVMHIST
1300		pt_entry_t opte;
1301#endif
1302		struct pmap_page *pp;
1303		uint32_t mdattr;
1304		bool executable;
1305
		/* does va belong to the same L3 table as in the previous iteration? */
1307		if ((blocksize == L3_SIZE) && ((va & L3INDEXMASK) != 0))
1308			ptep++;
1309		else
1310			ptep = _pmap_pte_lookup_bs(pm, va, &blocksize);
1311
1312		pte = *ptep;
1313		if (!lxpde_valid(pte)) {
1314			PMAP_COUNT(protect_none);
1315			continue;
1316		}
1317
1318		if ((pte & LX_BLKPAG_OS_WIRED) == 0) {
1319			const paddr_t pa = lxpde_pa(pte);
1320			struct vm_page *const pg = PHYS_TO_VM_PAGE(pa);
1321
1322			if (pg != NULL) {
1323				pp = VM_PAGE_TO_PP(pg);
1324				PMAP_COUNT(protect_managed);
1325			} else {
1326#ifdef __HAVE_PMAP_PV_TRACK
1327				pp = pmap_pv_tracked(pa);
1328#ifdef PMAPCOUNTERS
1329				if (pp != NULL)
1330					PMAP_COUNT(protect_pvmanaged);
1331				else
1332					PMAP_COUNT(protect_unmanaged);
1333#endif
1334#else
1335				pp = NULL;
1336				PMAP_COUNT(protect_unmanaged);
1337#endif /* __HAVE_PMAP_PV_TRACK */
1338			}
1339		} else {	/* kenter */
1340			pp = NULL;
1341			PMAP_COUNT(protect_unmanaged);
1342		}
1343
1344		if (pp != NULL) {
1345#ifdef ARMV81_HAFDBS
1346			if (aarch64_hafdbs_enabled != ID_AA64MMFR1_EL1_HAFDBS_NONE)
1347				_pmap_reflect_refmod_in_pp(pte, pp);
1348#endif
1349			/* get prot mask from referenced/modified */
1350			mdattr = pp->pp_pv.pv_va &
1351			    (VM_PROT_READ | VM_PROT_WRITE);
1352		} else {
1353			/* unmanaged page */
1354			mdattr = VM_PROT_ALL;
1355		}
1356
1357#ifdef UVMHIST
1358		opte = pte;
1359#endif
1360		executable = l3pte_executable(pte, user);
1361		pte = _pmap_pte_adjust_prot(pte, prot, mdattr, user);
1362
1363		struct pmap_asid_info * const pai = PMAP_PAI(pm,
1364		    cpu_tlb_info(ci));
1365		if (!executable && (prot & VM_PROT_EXECUTE)) {
1366			/* non-exec -> exec */
1367			UVMHIST_LOG(pmaphist, "icache_sync: "
1368			    "pm=%p, va=%016lx, pte: %016lx -> %016lx",
1369			    pm, va, opte, pte);
1370
1371			if (!l3pte_readable(pte)) {
1372				PTE_ICACHE_SYNC_PAGE(pte, ptep, pai->pai_asid,
1373				    va);
1374				atomic_swap_64(ptep, pte);
1375				AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
1376			} else {
1377				atomic_swap_64(ptep, pte);
1378				AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
1379				cpu_icache_sync_range(va, PAGE_SIZE);
1380			}
1381		} else {
1382			atomic_swap_64(ptep, pte);
1383			AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
1384		}
1385	}
1386
1387	pm_unlock(pm);
1388}
1389
1390#if defined(EFI_RUNTIME)
1391void
1392pmap_activate_efirt(void)
1393{
1394	struct cpu_info *ci = curcpu();
1395	struct pmap *pm = &efirt_pmap;
1396	struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
1397
1398	UVMHIST_FUNC(__func__);
1399	UVMHIST_CALLARGS(pmaphist, " (pm=%#jx)", (uintptr_t)pm, 0, 0, 0);
1400
1401	KASSERT(kpreempt_disabled());
1402
1403	ci->ci_pmap_asid_cur = pai->pai_asid;
1404	UVMHIST_LOG(pmaphist, "setting asid to %#jx", pai->pai_asid,
1405	    0, 0, 0);
1406	tlb_set_asid(pai->pai_asid, pm);
1407
1408	/* Re-enable translation table walks using TTBR0 */
1409	uint64_t tcr = reg_tcr_el1_read();
1410	reg_tcr_el1_write(tcr & ~TCR_EPD0);
1411	isb();
1412	pm->pm_activated = true;
1413
1414	PMAP_COUNT(activate);
1415}
1416#endif
1417
1418void
1419pmap_activate(struct lwp *l)
1420{
1421	struct pmap *pm = l->l_proc->p_vmspace->vm_map.pmap;
1422	uint64_t tcr;
1423
1424	UVMHIST_FUNC(__func__);
1425	UVMHIST_CALLARGS(pmaphist, "lwp=%p (pid=%d, kernel=%u)", l,
1426	    l->l_proc->p_pid, pm == pmap_kernel() ? 1 : 0, 0);
1427
1428	KASSERT(kpreempt_disabled());
1429	KASSERT((reg_tcr_el1_read() & TCR_EPD0) != 0);
1430
1431	if (pm == pmap_kernel())
1432		return;
1433	if (l != curlwp)
1434		return;
1435
1436	KASSERT(pm->pm_l0table != NULL);
1437
1438	/* this calls tlb_set_asid which calls cpu_set_ttbr0 */
1439	pmap_tlb_asid_acquire(pm, l);
1440
1441	UVMHIST_LOG(pmaphist, "lwp=%p, asid=%d", l,
1442	    PMAP_PAI(pm, cpu_tlb_info(ci))->pai_asid, 0, 0);
1443
1444	/* Re-enable translation table walks using TTBR0 */
1445	tcr = reg_tcr_el1_read();
1446	reg_tcr_el1_write(tcr & ~TCR_EPD0);
1447	isb();
1448
1449	pm->pm_activated = true;
1450
1451	PMAP_COUNT(activate);
1452}
1453
1454#if defined(EFI_RUNTIME)
1455void
1456pmap_deactivate_efirt(void)
1457{
1458	struct cpu_info * const ci = curcpu();
1459	struct pmap * const pm = &efirt_pmap;
1460
1461	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pmaphist);
1462
1463	KASSERT(kpreempt_disabled());
1464
1465	/* Disable translation table walks using TTBR0 */
1466	uint64_t tcr = reg_tcr_el1_read();
1467	reg_tcr_el1_write(tcr | TCR_EPD0);
1468	isb();
1469
1470	UVMHIST_LOG(pmaphist, "setting asid to %#jx", KERNEL_PID,
1471	    0, 0, 0);
1472
1473	ci->ci_pmap_asid_cur = KERNEL_PID;
	tlb_set_asid(KERNEL_PID, pmap_kernel());
1475
1476	pm->pm_activated = false;
1477
1478	PMAP_COUNT(deactivate);
1479}
1480#endif
1481
1482void
1483pmap_deactivate(struct lwp *l)
1484{
1485	struct pmap *pm = l->l_proc->p_vmspace->vm_map.pmap;
1486	uint64_t tcr;
1487
1488	UVMHIST_FUNC(__func__);
1489	UVMHIST_CALLARGS(pmaphist, "lwp=%p (pid=%d, (kernel=%u))", l,
1490	    l->l_proc->p_pid, pm == pmap_kernel() ? 1 : 0, 0);
1491
1492	KASSERT(kpreempt_disabled());
1493
1494	/* Disable translation table walks using TTBR0 */
1495	tcr = reg_tcr_el1_read();
1496	reg_tcr_el1_write(tcr | TCR_EPD0);
1497	isb();
1498
1499	UVMHIST_LOG(pmaphist, "lwp=%p, asid=%d", l,
1500	    PMAP_PAI(pm, cpu_tlb_info(ci))->pai_asid, 0, 0);
1501
1502	pmap_tlb_asid_deactivate(pm);
1503
1504	pm->pm_activated = false;
1505
1506	PMAP_COUNT(deactivate);
1507}
1508
1509struct pmap *
1510pmap_create(void)
1511{
1512	struct pmap *pm;
1513
1514	UVMHIST_FUNC(__func__);
1515	UVMHIST_CALLED(pmaphist);
1516
1517	pm = pool_cache_get(&_pmap_cache, PR_WAITOK);
1518	memset(pm, 0, sizeof(*pm));
1519	pm->pm_refcnt = 1;
1520	pm->pm_idlepdp = 0;
1521	LIST_INIT(&pm->pm_vmlist);
1522	LIST_INIT(&pm->pm_pvlist);
1523	mutex_init(&pm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1524
1525	kcpuset_create(&pm->pm_active, true);
1526	kcpuset_create(&pm->pm_onproc, true);
1527
1528	pm->pm_l0table_pa = pmap_alloc_pdp(pm, NULL, 0, true);
1529	KASSERT(pm->pm_l0table_pa != POOL_PADDR_INVALID);
1530	pm->pm_l0table = (pd_entry_t *)AARCH64_PA_TO_KVA(pm->pm_l0table_pa);
1531	KASSERT(((vaddr_t)pm->pm_l0table & (PAGE_SIZE - 1)) == 0);
1532
1533	UVMHIST_LOG(pmaphist, "pm=%p, pm_l0table=%016lx, pm_l0table_pa=%016lx",
1534	    pm, pm->pm_l0table, pm->pm_l0table_pa, 0);
1535
1536	PMAP_COUNT(create);
1537	return pm;
1538}
1539
1540void
1541pmap_destroy(struct pmap *pm)
1542{
1543	unsigned int refcnt;
1544
1545	UVMHIST_FUNC(__func__);
1546	UVMHIST_CALLARGS(pmaphist, "pm=%p, pm_l0table=%016lx, refcnt=%jd",
1547	    pm, pm->pm_l0table, pm->pm_refcnt, 0);
1548
1549	if (pm == NULL)
1550		return;
1551
1552	if (pm == pmap_kernel())
1553		panic("cannot destroy kernel pmap");
1554
1555	membar_release();
1556	refcnt = atomic_dec_uint_nv(&pm->pm_refcnt);
1557	if (refcnt > 0)
1558		return;
1559	membar_acquire();
1560
1561	KASSERT(LIST_EMPTY(&pm->pm_pvlist));
1562	pmap_tlb_asid_release_all(pm);
1563
1564	_pmap_free_pdp_all(pm, true);
1565	mutex_destroy(&pm->pm_lock);
1566
1567	kcpuset_destroy(pm->pm_active);
1568	kcpuset_destroy(pm->pm_onproc);
1569
1570	pool_cache_put(&_pmap_cache, pm);
1571
1572	PMAP_COUNT(destroy);
1573}
1574
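/* remember which parent pte points at this page descriptor page */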
1575static inline void
1576_pmap_pdp_setparent(struct pmap *pm, struct vm_page *pg, pt_entry_t *ptep)
1577{
1578
1579	if ((pm != pmap_kernel()) && (pg != NULL)) {
1580		KASSERT(mutex_owned(&pm->pm_lock));
1581		VM_PAGE_TO_MD(pg)->mdpg_ptep_parent = ptep;
1582	}
1583}
1584
/*
 * Increment the reference counter of the page descriptor page.
 * The reference counter should equal 1 + the number of valid entries
 * in the page.
 */
1590static inline void
1591_pmap_pdp_addref(struct pmap *pm, paddr_t pdppa, struct vm_page *pdppg_hint)
1592{
1593	struct vm_page *pg;
1594
1595	/* kernel L0-L3 pages will never be freed */
1596	if (pm == pmap_kernel())
1597		return;
1598
1599#if defined(EFI_RUNTIME)
	/* EFI runtime L0-L3 pages will never be freed */
1601	if (pm == pmap_efirt())
1602		return;
1603#endif
1604
1605	KASSERT(mutex_owned(&pm->pm_lock));
1606
1607	/* no need for L0 page */
1608	if (pm->pm_l0table_pa == pdppa)
1609		return;
1610
1611	pg = pdppg_hint;
1612	if (pg == NULL)
1613		pg = PHYS_TO_VM_PAGE(pdppa);
1614	KASSERT(pg != NULL);
1615
1616	pg->wire_count++;
1617
1618	KASSERTMSG(pg->wire_count <= (Ln_ENTRIES + 1),
1619	    "pg=%p, wire_count=%d", pg, pg->wire_count);
1620}
1621
/*
 * Decrement the reference counter of the page descriptor page.
 * If the counter drops to 1 (i.e. the page is empty) and do_free_pdp is
 * set, the page (and any parent that becomes empty) is freed and true is
 * returned; otherwise false is returned.
 * Kernel pages and L0 page descriptor pages are never freed.
 */
1628static bool
1629_pmap_pdp_delref(struct pmap *pm, paddr_t pdppa, bool do_free_pdp)
1630{
1631	struct vm_page *pg;
1632	bool removed;
1633	uint16_t wirecount;
1634
1635	/* kernel L0-L3 pages will never be freed */
1636	if (pm == pmap_kernel())
1637		return false;
1638
1639#if defined(EFI_RUNTIME)
	/* EFI runtime L0-L3 pages will never be freed */
1641	if (pm == pmap_efirt())
1642		return false;
1643#endif
1644
1645	KASSERT(mutex_owned(&pm->pm_lock));
1646
1647	/* no need for L0 page */
1648	if (pm->pm_l0table_pa == pdppa)
1649		return false;
1650
1651	pg = PHYS_TO_VM_PAGE(pdppa);
1652	KASSERT(pg != NULL);
1653
1654	wirecount = --pg->wire_count;
1655
1656	if (!do_free_pdp) {
		/*
		 * pm_idlepdp is only incremented on behalf of
		 * pmap_page_protect() with VM_PROT_NONE.  The count is not
		 * exact because pmap_enter() is not taken into account, but
		 * it is a useful hint for deciding when to sweep.
		 */
1662		if (wirecount == 1)
1663			pm->pm_idlepdp++;
1664		return false;
1665	}
1666
1667	/* if no reference, free pdp */
1668	removed = false;
1669	while (wirecount == 1) {
1670		pd_entry_t *ptep_in_parent, opte __diagused;
1671		ptep_in_parent = VM_PAGE_TO_MD(pg)->mdpg_ptep_parent;
1672		if (ptep_in_parent == NULL) {
1673			/* no parent */
1674			pmap_free_pdp(pm, pg);
1675			removed = true;
1676			break;
1677		}
1678
1679		/* unlink from parent */
1680		opte = atomic_swap_64(ptep_in_parent, 0);
1681		KASSERT(lxpde_valid(opte));
1682		wirecount = atomic_add_32_nv(&pg->wire_count, -1); /* 1 -> 0 */
1683		KASSERT(wirecount == 0);
1684		pmap_free_pdp(pm, pg);
1685		removed = true;
1686
1687		/* L3->L2->L1. no need for L0 */
1688		pdppa = AARCH64_KVA_TO_PA(trunc_page((vaddr_t)ptep_in_parent));
1689		if (pdppa == pm->pm_l0table_pa)
1690			break;
1691
1692		pg = PHYS_TO_VM_PAGE(pdppa);
1693		KASSERT(pg != NULL);
1694		KASSERTMSG(pg->wire_count >= 1,
1695		    "wire_count=%d", pg->wire_count);
1696		/* decrement wire_count of parent */
1697		wirecount = atomic_add_32_nv(&pg->wire_count, -1);
1698		KASSERTMSG(pg->wire_count <= (Ln_ENTRIES + 1),
1699		    "pm=%p, pg=%p, wire_count=%d",
1700		    pm, pg, pg->wire_count);
1701	}
1702
1703	return removed;
1704}
1705
/*
 * Traverse the L0 -> L1 -> L2 tables, allocating page descriptor pages as
 * needed, and return the physical address of the L3 table covering va
 * (and its vm_page if it was newly allocated).
 */
1709static int
1710_pmap_get_pdp(struct pmap *pm, vaddr_t va, bool kenter, int flags,
1711    paddr_t *pap, struct vm_page **pgp)
1712{
1713	pd_entry_t *l0, *l1, *l2;
1714	struct vm_page *pdppg, *pdppg0;
1715	paddr_t pdppa, pdppa0;
1716	unsigned int idx;
1717	pd_entry_t pde;
1718
1719	KASSERT(kenter || mutex_owned(&pm->pm_lock));
1720
1721	l0 = pm->pm_l0table;
1722
1723	idx = l0pde_index(va);
1724	pde = l0[idx];
1725	if (!l0pde_valid(pde)) {
1726		KASSERTMSG(!kenter || IN_MODULE_VA(va) || PMAP_EFIVA_P(va),
1727		    "%s va %" PRIxVADDR, kenter ? "kernel" : "user", va);
1728		/* no need to increment L0 occupancy. L0 page never freed */
1729		pdppa = pmap_alloc_pdp(pm, &pdppg, flags, false);  /* L1 pdp */
1730		if (pdppa == POOL_PADDR_INVALID) {
1731			return ENOMEM;
1732		}
1733		atomic_swap_64(&l0[idx], pdppa | L0_TABLE);
1734		_pmap_pdp_setparent(pm, pdppg, &l0[idx]);
1735	} else {
1736		pdppa = l0pde_pa(pde);
1737		pdppg = NULL;
1738	}
1739	l1 = (void *)AARCH64_PA_TO_KVA(pdppa);
1740
1741	idx = l1pde_index(va);
1742	pde = l1[idx];
1743	if (!l1pde_valid(pde)) {
1744		KASSERTMSG(!kenter || IN_MODULE_VA(va) || PMAP_EFIVA_P(va),
1745		    "%s va %" PRIxVADDR, kenter ? "kernel" : "user", va);
1746		pdppa0 = pdppa;
1747		pdppg0 = pdppg;
1748		pdppa = pmap_alloc_pdp(pm, &pdppg, flags, false);  /* L2 pdp */
1749		if (pdppa == POOL_PADDR_INVALID) {
1750			return ENOMEM;
1751		}
1752		atomic_swap_64(&l1[idx], pdppa | L1_TABLE);
1753		_pmap_pdp_addref(pm, pdppa0, pdppg0);	/* L1 occupancy++ */
1754		_pmap_pdp_setparent(pm, pdppg, &l1[idx]);
1755	} else {
1756		pdppa = l1pde_pa(pde);
1757		pdppg = NULL;
1758	}
1759	l2 = (void *)AARCH64_PA_TO_KVA(pdppa);
1760
1761	idx = l2pde_index(va);
1762	pde = l2[idx];
1763	if (!l2pde_valid(pde)) {
1764		KASSERTMSG(!kenter || IN_MODULE_VA(va) || PMAP_EFIVA_P(va),
1765		    "%s va %" PRIxVADDR, kenter ? "kernel" : "user", va);
1766		pdppa0 = pdppa;
1767		pdppg0 = pdppg;
1768		pdppa = pmap_alloc_pdp(pm, &pdppg, flags, false);  /* L3 pdp */
1769		if (pdppa == POOL_PADDR_INVALID) {
1770			return ENOMEM;
1771		}
1772		atomic_swap_64(&l2[idx], pdppa | L2_TABLE);
1773		_pmap_pdp_addref(pm, pdppa0, pdppg0);	/* L2 occupancy++ */
1774		_pmap_pdp_setparent(pm, pdppg, &l2[idx]);
1775	} else {
1776		pdppa = l2pde_pa(pde);
1777		pdppg = NULL;
1778	}
1779	*pap = pdppa;
1780	*pgp = pdppg;
1781	return 0;
1782}
1783
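/*
 * Common body of pmap_enter() and pmap_kenter_pa().  With 'kenter' set the
 * mapping is treated as unmanaged: no pv_entry is registered, the page's
 * referenced/modified state is not tracked, and pm_lock is not taken
 * (pmap_kenter_pa() only operates on the kernel pmap).
 */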
1784static int
1785_pmap_enter(struct pmap *pm, vaddr_t va, paddr_t pa, vm_prot_t prot,
1786    u_int flags, bool kenter)
1787{
1788	struct vm_page *pdppg;
1789	struct pmap_page *pp, *opp, *pps[2];
1790	struct pv_entry *spv, *opv = NULL;
1791	pt_entry_t attr, pte, opte, *ptep;
1792	pd_entry_t *l3;
1793	paddr_t pdppa;
1794	uint32_t mdattr;
1795	unsigned int idx;
1796	int error = 0;
1797#if defined(EFI_RUNTIME)
1798	const bool efirt_p = pm == pmap_efirt();
1799#else
1800	const bool efirt_p = false;
1801#endif
1802	const bool kernel_p = pm == pmap_kernel();
1803	const bool user = !kernel_p && !efirt_p;
1804	bool need_sync_icache, need_enter_pv;
1805
1806	UVMHIST_FUNC(__func__);
1807	UVMHIST_CALLARGS(pmaphist, "pm=%p, kentermode=%d", pm, kenter, 0, 0);
1808	UVMHIST_LOG(pmaphist, "va=%016lx, pa=%016lx, prot=%08x, flags=%08x",
1809	    va, pa, prot, flags);
1810
1811	KASSERT_PM_ADDR(pm, va);
1812	KASSERT(!IN_DIRECTMAP_ADDR(va));
1813	KASSERT((prot & VM_PROT_ALL) != VM_PROT_NONE);
1814	KASSERT(pa < AARCH64_MAX_PA);
1815
1816#ifdef PMAPCOUNTERS
1817	PMAP_COUNT(mappings);
1818	if (_pmap_color(va) == _pmap_color(pa)) {
1819		if (user) {
1820			PMAP_COUNT(user_mappings);
1821		} else {
1822			PMAP_COUNT(kern_mappings);
1823		}
1824	} else if (flags & PMAP_WIRED) {
1825		if (user) {
1826			PMAP_COUNT(user_mappings_bad_wired);
1827		} else {
1828			PMAP_COUNT(kern_mappings_bad_wired);
1829		}
1830	} else {
1831		if (user) {
1832			PMAP_COUNT(user_mappings_bad);
1833		} else {
1834			PMAP_COUNT(kern_mappings_bad);
1835		}
1836	}
1837#endif
1838
1839	if (kenter) {
1840		pp = NULL;
1841		spv = NULL;
1842		need_enter_pv = false;
1843	} else {
1844		struct vm_page *pg = PHYS_TO_VM_PAGE(pa);
1845		if (pg != NULL) {
1846			pp = VM_PAGE_TO_PP(pg);
1847			PMAP_COUNT(managed_mappings);
1848		} else {
1849#ifdef __HAVE_PMAP_PV_TRACK
1850			pp = pmap_pv_tracked(pa);
1851#ifdef PMAPCOUNTERS
1852			if (pp != NULL)
1853				PMAP_COUNT(pvmanaged_mappings);
1854			else
1855				PMAP_COUNT(unmanaged_mappings);
1856#endif
1857#else
1858			pp = NULL;
1859			PMAP_COUNT(unmanaged_mappings);
1860#endif /* __HAVE_PMAP_PV_TRACK */
1861		}
1862
1863		if (pp != NULL) {
1864			/*
1865			 * allocate pv in advance of pm_lock().
1866			 */
1867			spv = pool_cache_get(&_pmap_pv_pool, PR_NOWAIT);
1868			need_enter_pv = true;
1869		} else {
1870			spv = NULL;
1871			need_enter_pv = false;
1872		}
1873
1874		pm_lock(pm);
1875		if (pm->pm_idlepdp >= PDPSWEEP_TRIGGER) {
1876			_pmap_sweep_pdp(pm);
1877		}
1878	}
1879
1880	/*
1881	 * traverse L0 -> L1 -> L2 -> L3 table with growing pdp if needed.
1882	 */
1883	error = _pmap_get_pdp(pm, va, kenter, flags, &pdppa, &pdppg);
1884	if (error != 0) {
1885		if (flags & PMAP_CANFAIL) {
1886			goto fail0;
1887		}
1888		panic("%s: cannot allocate L3 table error=%d", __func__,
1889		    error);
1890	}
1891
1892	l3 = (void *)AARCH64_PA_TO_KVA(pdppa);
1893
1894	idx = l3pte_index(va);
1895	ptep = &l3[idx];	/* as PTE */
1896	opte = *ptep;
1897	need_sync_icache = (prot & VM_PROT_EXECUTE) && !efirt_p;
1898
1899	/* for lock ordering for old page and new page */
1900	pps[0] = pp;
1901	pps[1] = NULL;
1902
1903	/* remap? */
1904	if (l3pte_valid(opte)) {
1905		bool need_remove_pv;
1906
1907		KASSERT(!kenter);	/* pmap_kenter_pa() cannot override */
1908		if (opte & LX_BLKPAG_OS_WIRED) {
1909			_pmap_adj_wired_count(pm, -1);
1910		}
1911		_pmap_adj_resident_count(pm, -1);
1912#ifdef PMAPCOUNTERS
1913		PMAP_COUNT(remappings);
1914		if (user) {
1915			PMAP_COUNT(user_mappings_changed);
1916		} else {
1917			PMAP_COUNT(kern_mappings_changed);
1918		}
1919#endif
1920		UVMHIST_LOG(pmaphist,
		    "va=%016lx is already mapped."
1922		    " old-pa=%016lx new-pa=%016lx, old-pte=%016llx",
1923		    va, l3pte_pa(opte), pa, opte);
1924
1925		if (pa == l3pte_pa(opte)) {
1926			/* old and new pte have same pa, no need to update pv */
1927			need_remove_pv = (pp == NULL);
1928			need_enter_pv = false;
1929			if (need_sync_icache && l3pte_executable(opte, user))
1930				need_sync_icache = false;
1931		} else {
1932			need_remove_pv = true;
1933		}
1934
1935		if (need_remove_pv &&
1936		    ((opp = phys_to_pp(l3pte_pa(opte))) != NULL)) {
			/*
			 * Both pp and opp (the old pp) must be locked in a
			 * consistent order to avoid deadlock; note that 'pp'
			 * may be NULL.
			 */
1941			if (pp < opp) {
1942				pps[0] = pp;
1943				pps[1] = opp;
1944			} else {
1945				pps[0] = opp;
1946				pps[1] = pp;
1947			}
1948			if (pps[0] != NULL)
1949				pmap_pv_lock(pps[0]);
1950			if (pps[1] != NULL)
1951				pmap_pv_lock(pps[1]);
1952			opv = _pmap_remove_pv(opp, pm, va, opte);
1953		} else {
1954			if (pp != NULL)
1955				pmap_pv_lock(pp);
1956		}
1957		opte = atomic_swap_64(ptep, 0);
1958	} else {
1959		if (pp != NULL)
1960			pmap_pv_lock(pp);
1961	}
1962
1963	if (!l3pte_valid(opte))
1964		_pmap_pdp_addref(pm, pdppa, pdppg);	/* L3 occupancy++ */
1965
	/*
	 * Read permission is treated as the access permission internally,
	 * so PROT_READ must be added even if only PROT_WRITE or PROT_EXEC
	 * was requested.
	 */
1970	if (prot & (VM_PROT_WRITE | VM_PROT_EXECUTE))
1971		prot |= VM_PROT_READ;
1972	if (flags & (VM_PROT_WRITE | VM_PROT_EXECUTE))
1973		flags |= VM_PROT_READ;
1974
1975	mdattr = VM_PROT_READ | VM_PROT_WRITE;
1976	if (need_enter_pv) {
1977		KASSERT(!kenter);
1978		error = _pmap_enter_pv(pp, pm, &spv, va, ptep, pa, flags);
1979		if (error != 0) {
1980			/*
1981			 * If pmap_enter() fails,
1982			 * it must not leave behind an existing pmap entry.
1983			 */
1984			if (lxpde_valid(opte)) {
1985				KASSERT((vaddr_t)l3 == trunc_page((vaddr_t)ptep));
1986				_pmap_pdp_delref(pm, AARCH64_KVA_TO_PA((vaddr_t)l3),
1987				    true);
1988				struct pmap_asid_info * const pai = PMAP_PAI(pm,
1989				    cpu_tlb_info(ci));
1990
1991				AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
1992			}
1993			PMAP_COUNT(pv_entry_cannotalloc);
1994			if (flags & PMAP_CANFAIL)
1995				goto fail1;
1996			panic("pmap_enter: failed to allocate pv_entry");
1997		}
1998	}
1999
2000	if (pp != NULL) {
2001		/* update referenced/modified flags */
2002		KASSERT(!kenter);
2003		pp->pp_pv.pv_va |= (flags & (VM_PROT_READ | VM_PROT_WRITE));
2004		mdattr &= (uint32_t)pp->pp_pv.pv_va;
2005	}
2006
2007#ifdef PMAPCOUNTERS
2008	switch (flags & PMAP_CACHE_MASK) {
2009	case PMAP_NOCACHE:
2010	case PMAP_NOCACHE_OVR:
2011		PMAP_COUNT(uncached_mappings);
2012		break;
2013	}
2014#endif
2015
2016	attr = L3_PAGE | (kenter ? 0 : LX_BLKPAG_NG);
2017	attr = _pmap_pte_adjust_prot(attr, prot, mdattr, user);
2018	attr = _pmap_pte_adjust_cacheflags(attr, flags);
2019	if (VM_MAXUSER_ADDRESS > va && !efirt_p)
2020		attr |= LX_BLKPAG_APUSER;
2021	if (flags & PMAP_WIRED)
2022		attr |= LX_BLKPAG_OS_WIRED;
2023#ifdef MULTIPROCESSOR
2024	attr |= LX_BLKPAG_SH_IS;
2025#endif
2026
2027	pte = pa | attr;
2028
2029	struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2030	const tlb_asid_t asid = pai->pai_asid;
2031
2032	if (need_sync_icache) {
2033		/* non-exec -> exec */
2034		UVMHIST_LOG(pmaphist,
2035		    "icache_sync: pm=%p, va=%016lx, pte: %016lx -> %016lx",
2036		    pm, va, opte, pte);
2037
2038		if (!l3pte_readable(pte)) {
2039			PTE_ICACHE_SYNC_PAGE(pte, ptep, asid, va);
2040			atomic_swap_64(ptep, pte);
2041			AARCH64_TLBI_BY_ASID_VA(asid, va);
2042		} else {
2043			atomic_swap_64(ptep, pte);
2044			AARCH64_TLBI_BY_ASID_VA(asid, va);
2045			cpu_icache_sync_range(va, PAGE_SIZE);
2046		}
2047	} else {
2048		atomic_swap_64(ptep, pte);
2049		AARCH64_TLBI_BY_ASID_VA(asid, va);
2050	}
2051
2052	if (pte & LX_BLKPAG_OS_WIRED) {
2053		_pmap_adj_wired_count(pm, 1);
2054	}
2055	_pmap_adj_resident_count(pm, 1);
2056
2057 fail1:
2058	if (pps[1] != NULL)
2059		pmap_pv_unlock(pps[1]);
2060	if (pps[0] != NULL)
2061		pmap_pv_unlock(pps[0]);
2062 fail0:
2063	if (!kenter) {
2064		pm_unlock(pm);
2065
		/* the spare pv was not used; discard it */
2067		if (spv != NULL)
2068			pool_cache_put(&_pmap_pv_pool, spv);
2069
2070		if (opv != NULL)
2071			pool_cache_put(&_pmap_pv_pool, opv);
2072	}
2073
2074	return error;
2075}
2076
2077int
2078pmap_enter(struct pmap *pm, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2079{
2080	return _pmap_enter(pm, va, pa, prot, flags, false);
2081}
2082
2083
2084bool
2085pmap_remove_all(struct pmap *pm)
2086{
2087	struct pmap_page *pp;
2088	struct pv_entry *pv, *pvtmp, *opv, *pvtofree = NULL;
2089	pt_entry_t pte, *ptep;
2090	paddr_t pa;
2091
2092	UVMHIST_FUNC(__func__);
2093	UVMHIST_CALLARGS(pmaphist, "pm=%p", pm, 0, 0, 0);
2094
2095	KASSERT(pm != pmap_kernel());
2096
2097	UVMHIST_LOG(pmaphist, "pm=%p, asid=%d", pm,
2098	    PMAP_PAI(pm, cpu_tlb_info(ci))->pai_asid, 0, 0);
2099
2100	pm_lock(pm);
2101
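	/* detach every pv entry of this pmap from its pmap_page */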
2102	LIST_FOREACH_SAFE(pv, &pm->pm_pvlist, pv_proc, pvtmp) {
2103		ptep = pv->pv_ptep;
2104		pte = *ptep;
2105
2106		KASSERTMSG(lxpde_valid(pte),
2107		    "pte is not valid: pmap=%p, va=%016lx",
2108		    pm, pv->pv_va);
2109
2110		pa = lxpde_pa(pte);
2111		pp = phys_to_pp(pa);
2112
2113		KASSERTMSG(pp != NULL,
		    "no pmap_page of physical address: %016lx, "
2115		    "pmap=%p, va=%016lx",
2116		    pa, pm, pv->pv_va);
2117
2118		pmap_pv_lock(pp);
2119		opv = _pmap_remove_pv(pp, pm, trunc_page(pv->pv_va), pte);
2120		pmap_pv_unlock(pp);
2121		if (opv != NULL) {
2122			opv->pv_next = pvtofree;
2123			pvtofree = opv;
2124		}
2125	}
	/* all PTEs should now be cleared */
2127	pm->pm_stats.wired_count = 0;
2128	pm->pm_stats.resident_count = 0;
2129
2130	/* clear L0 page table page */
2131	pmap_zero_page(pm->pm_l0table_pa);
2132
2133	aarch64_tlbi_by_asid(PMAP_PAI(pm, cpu_tlb_info(ci))->pai_asid);
2134
2135	/* free L1-L3 page table pages, but not L0 */
2136	_pmap_free_pdp_all(pm, false);
2137
2138	pm_unlock(pm);
2139
2140	for (pv = pvtofree; pv != NULL; pv = pvtmp) {
2141		pvtmp = pv->pv_next;
2142		pool_cache_put(&_pmap_pv_pool, pv);
2143	}
2144
2145	return true;
2146}
2147
2148static void
2149_pmap_remove(struct pmap *pm, vaddr_t sva, vaddr_t eva, bool kremove,
2150    struct pv_entry **pvtofree)
2151{
2152	pt_entry_t pte, *ptep = NULL;
2153	struct pmap_page *pp;
2154	struct pv_entry *opv;
2155	paddr_t pa;
2156	vaddr_t va;
2157	vsize_t blocksize = 0;
2158	bool pdpremoved;
2159
2160	UVMHIST_FUNC(__func__);
2161	UVMHIST_CALLARGS(pmaphist, "pm=%p, sva=%016lx, eva=%016lx, kremove=%d",
2162	    pm, sva, eva, kremove);
2163
2164	KASSERT(kremove || mutex_owned(&pm->pm_lock));
2165
2166	for (va = sva; (va < eva) && (pm->pm_stats.resident_count != 0);
2167	    va = (va + blocksize) & ~(blocksize - 1)) {
2168
		/* does va belong to the same L3 table as before? */
2170		if ((blocksize == L3_SIZE) && ((va & L3INDEXMASK) != 0))
2171			ptep++;
2172		else
2173			ptep = _pmap_pte_lookup_bs(pm, va, &blocksize);
2174
2175		pte = *ptep;
2176		if (!lxpde_valid(pte))
2177			continue;
2178
2179		if (!kremove) {
2180			pa = lxpde_pa(pte);
2181			pp = phys_to_pp(pa);
2182			if (pp != NULL) {
2183
2184				pmap_pv_lock(pp);
2185				opv = _pmap_remove_pv(pp, pm, va, pte);
2186				pmap_pv_unlock(pp);
2187				if (opv != NULL) {
2188					opv->pv_next = *pvtofree;
2189					*pvtofree = opv;
2190				}
2191			}
2192		}
2193
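		/*
		 * clear the PTE, drop the page table page reference,
		 * and flush the TLB entry for this va.
		 */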
2194		pte = atomic_swap_64(ptep, 0);
2195		if (!lxpde_valid(pte))
2196			continue;
2197		struct pmap_asid_info * const pai = PMAP_PAI(pm,
2198		    cpu_tlb_info(ci));
2199
2200		pdpremoved = _pmap_pdp_delref(pm,
2201		    AARCH64_KVA_TO_PA(trunc_page((vaddr_t)ptep)), true);
2202		AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
2203
2204		if (pdpremoved) {
			/*
			 * This Ln page table page has been removed;
			 * skip to the next Ln table.
			 */
2209			blocksize *= Ln_ENTRIES;
2210		}
2211
2212		if ((pte & LX_BLKPAG_OS_WIRED) != 0) {
2213			_pmap_adj_wired_count(pm, -1);
2214		}
2215		_pmap_adj_resident_count(pm, -1);
2216	}
2217}
2218
2219void
2220pmap_remove(struct pmap *pm, vaddr_t sva, vaddr_t eva)
2221{
2222	struct pv_entry *pvtofree = NULL;
2223	struct pv_entry *pv, *pvtmp;
2224
2225	KASSERT_PM_ADDR(pm, sva);
2226	KASSERT(!IN_DIRECTMAP_ADDR(sva));
2227
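	/*
	 * Tear down the range under the pmap lock; the collected pv
	 * entries are freed after the lock is dropped.
	 */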
2228	pm_lock(pm);
2229	_pmap_remove(pm, sva, eva, false, &pvtofree);
2230	pm_unlock(pm);
2231
2232	for (pv = pvtofree; pv != NULL; pv = pvtmp) {
2233		pvtmp = pv->pv_next;
2234		pool_cache_put(&_pmap_pv_pool, pv);
2235	}
2236}
2237
2238static void
2239pmap_page_remove(struct pmap_page *pp, vm_prot_t prot)
2240{
2241	struct pv_entry *pv, *pvtmp;
2242	struct pv_entry *pvtofree = NULL;
2243	struct pmap *pm;
2244	pt_entry_t opte;
2245
	/* remove all mappings that reference this physical page */
2247	pmap_pv_lock(pp);
2248	for (pv = &pp->pp_pv; pv != NULL;) {
2249		if ((pm = pv->pv_pmap) == NULL) {
2250			KASSERT(pv == &pp->pp_pv);
2251			pv = pp->pp_pv.pv_next;
2252			continue;
2253		}
2254		if (!pm_reverse_lock(pm, pp)) {
2255			/* now retry */
2256			pv = &pp->pp_pv;
2257			continue;
2258		}
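		/* both pp and pm are now locked; clear the PTE */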
2259		opte = atomic_swap_64(pv->pv_ptep, 0);
2260		struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2261		const vaddr_t va = trunc_page(pv->pv_va);
2262
2263		if (lxpde_valid(opte)) {
2264			_pmap_pdp_delref(pm,
2265			    AARCH64_KVA_TO_PA(trunc_page(
2266			    (vaddr_t)pv->pv_ptep)), false);
2267			AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
2268
2269			if ((opte & LX_BLKPAG_OS_WIRED) != 0) {
2270				_pmap_adj_wired_count(pm, -1);
2271			}
2272			_pmap_adj_resident_count(pm, -1);
2273		}
2274		pvtmp = _pmap_remove_pv(pp, pm, va, opte);
2275		if (pvtmp == NULL) {
2276			KASSERT(pv == &pp->pp_pv);
2277		} else {
2278			KASSERT(pv == pvtmp);
2279			KASSERT(pp->pp_pv.pv_next == pv->pv_next);
2280			pv->pv_next = pvtofree;
2281			pvtofree = pv;
2282		}
2283		pm_unlock(pm);
2284		pv = pp->pp_pv.pv_next;
2285	}
2286	pmap_pv_unlock(pp);
2287
2288	for (pv = pvtofree; pv != NULL; pv = pvtmp) {
2289		pvtmp = pv->pv_next;
2290		pool_cache_put(&_pmap_pv_pool, pv);
2291	}
2292}
2293
2294#ifdef __HAVE_PMAP_PV_TRACK
2295void
2296pmap_pv_protect(paddr_t pa, vm_prot_t prot)
2297{
2298	struct pmap_page *pp;
2299
2300	UVMHIST_FUNC(__func__);
2301	UVMHIST_CALLARGS(pmaphist, "pa=%016lx, prot=%08x", pa, prot, 0, 0);
2302
2303	pp = pmap_pv_tracked(pa);
2304	if (pp == NULL)
2305		panic("pmap_pv_protect: page not pv-tracked: %#" PRIxPADDR, pa);
2306
2307	KASSERT(prot == VM_PROT_NONE);
2308	pmap_page_remove(pp, prot);
2309}
2310#endif
2311
2312void
2313pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
2314{
2315	struct pv_entry *pv;
2316	struct pmap_page *pp;
2317	struct pmap *pm;
2318
2319	KASSERT((prot & VM_PROT_READ) || !(prot & VM_PROT_WRITE));
2320
2321	pp = VM_PAGE_TO_PP(pg);
2322
2323	UVMHIST_FUNC(__func__);
2324	UVMHIST_CALLARGS(pmaphist, "pg=%p, pp=%p, pa=%016lx, prot=%08x",
2325	    pg, pp, VM_PAGE_TO_PHYS(pg), prot);
2326
2327	/* do an unlocked check first */
2328	if (atomic_load_relaxed(&pp->pp_pv.pv_pmap) == NULL &&
2329	    atomic_load_relaxed(&pp->pp_pv.pv_next) == NULL) {
2330		return;
2331	}
2332
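	/*
	 * Revoking all access removes every mapping of the page;
	 * otherwise each existing mapping has its protection reduced
	 * in place.
	 */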
2333	if ((prot & (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2334	    VM_PROT_NONE) {
2335		pmap_page_remove(pp, prot);
2336	} else {
2337		pmap_pv_lock(pp);
2338		pv = &pp->pp_pv;
2339		while (pv != NULL) {
2340			if ((pm = pv->pv_pmap) == NULL) {
2341				KASSERT(pv == &pp->pp_pv);
2342				pv = pv->pv_next;
2343				continue;
2344			}
2345			if (!pm_reverse_lock(pm, pp)) {
2346				/* retry */
2347				pv = &pp->pp_pv;
2348				continue;
2349			}
2350			_pmap_protect_pv(pp, pv, prot);
2351			pm_unlock(pm);
2352			pv = pv->pv_next;
2353		}
2354		pmap_pv_unlock(pp);
2355	}
2356}
2357
2358void
2359pmap_unwire(struct pmap *pm, vaddr_t va)
2360{
2361	pt_entry_t pte, *ptep;
2362
2363	UVMHIST_FUNC(__func__);
2364	UVMHIST_CALLARGS(pmaphist, "pm=%p, va=%016lx", pm, va, 0, 0);
2365
2366	PMAP_COUNT(unwire);
2367
2368	KASSERT_PM_ADDR(pm, va);
2369	KASSERT(!IN_DIRECTMAP_ADDR(va));
2370
2371	pm_lock(pm);
2372	ptep = _pmap_pte_lookup_l3(pm, va);
2373	if (ptep != NULL) {
2374		pte = *ptep;
2375		if (!l3pte_valid(pte) ||
2376		    ((pte & LX_BLKPAG_OS_WIRED) == 0)) {
2377			/* invalid pte, or pte is not wired */
2378			PMAP_COUNT(unwire_failure);
2379			pm_unlock(pm);
2380			return;
2381		}
2382
2383		pte &= ~LX_BLKPAG_OS_WIRED;
2384		atomic_swap_64(ptep, pte);
2385
2386		_pmap_adj_wired_count(pm, -1);
2387	}
2388	pm_unlock(pm);
2389}
2390
2391bool
2392pmap_fault_fixup(struct pmap *pm, vaddr_t va, vm_prot_t accessprot, bool user)
2393{
2394	struct pmap_page *pp;
2395	pt_entry_t *ptep, pte;
2396	vm_prot_t pmap_prot;
2397	paddr_t pa;
2398	bool fixed = false;
2399
2400	UVMHIST_FUNC(__func__);
2401	UVMHIST_CALLARGS(pmaphist, "pm=%p, va=%016lx, accessprot=%08x",
2402	    pm, va, accessprot, 0);
2403
2404#if 0
2405	KASSERT_PM_ADDR(pm, va);
2406#else
2407	if (((pm == pmap_kernel()) &&
2408	    !(IN_RANGE(va, VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS))) ||
2409	    ((pm != pmap_kernel()) &&
2410	    !(IN_RANGE(va, VM_MIN_ADDRESS, VM_MAX_ADDRESS)))) {
2411
2412		UVMHIST_LOG(pmaphist,
2413		    "pmap space and va mismatch: kernel=%jd, va=%016lx",
2414		    pm == pmap_kernel(), va, 0, 0);
2415		return false;
2416	}
2417#endif
2418
2419	pm_lock(pm);
2420
2421	ptep = _pmap_pte_lookup_l3(pm, va);
2422	if (ptep == NULL) {
2423		UVMHIST_LOG(pmaphist, "pte_lookup failure: va=%016lx",
2424		    va, 0, 0, 0);
2425		goto done;
2426	}
2427
2428	pte = *ptep;
2429	if (!l3pte_valid(pte)) {
2430		UVMHIST_LOG(pmaphist, "invalid pte: %016llx: va=%016lx",
2431		    pte, va, 0, 0);
2432		goto done;
2433	}
2434
2435	pa = l3pte_pa(*ptep);
2436	pp = phys_to_pp(pa);
2437	if (pp == NULL) {
2438		UVMHIST_LOG(pmaphist, "pmap_page not found: va=%016lx", va, 0, 0, 0);
2439		goto done;
2440	}
2441
	/*
	 * Reconstruct the prot specified by pmap_enter().
	 * A valid pte is considered readable; if the OS_WRITE or DBM
	 * bit is set, it is also considered writable.
	 */
2447	pmap_prot = VM_PROT_READ;
2448	if ((pte & (LX_BLKPAG_OS_WRITE | LX_BLKPAG_DBM)) != 0)
2449		pmap_prot |= VM_PROT_WRITE;
2450
2451	if (l3pte_executable(pte, pm != pmap_kernel()))
2452		pmap_prot |= VM_PROT_EXECUTE;
2453
2454	UVMHIST_LOG(pmaphist, "va=%016lx, pmapprot=%08x, accessprot=%08x",
2455	    va, pmap_prot, accessprot, 0);
2456
	/* ignore everything except read/write/execute */
2458	accessprot &= (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE);
2459
2460	/* PROT_EXEC requires implicit PROT_READ */
2461	if (accessprot & VM_PROT_EXECUTE)
2462		accessprot |= VM_PROT_READ;
2463
2464	/* no permission to read/write/execute for this page */
2465	if ((pmap_prot & accessprot) != accessprot) {
2466		UVMHIST_LOG(pmaphist, "no permission to access", 0, 0, 0, 0);
2467		goto done;
2468	}
2469
	/* pte is readable and writable, but a fault occurred? probably copy(9) */
2471	if ((pte & LX_BLKPAG_AF) && ((pte & LX_BLKPAG_AP) == LX_BLKPAG_AP_RW))
2472		goto done;
2473
2474	pmap_pv_lock(pp);
2475	if ((pte & LX_BLKPAG_AF) == 0) {
		/* the AF bit is not set; mark the page referenced and set AF */
2477		UVMHIST_LOG(pmaphist,
2478		    "REFERENCED:"
2479		    " va=%016lx, pa=%016lx, pte_prot=%08x, accessprot=%08x",
2480		    va, pa, pmap_prot, accessprot);
2481		pp->pp_pv.pv_va |= VM_PROT_READ;	/* set referenced */
2482		pte |= LX_BLKPAG_AF;
2483
2484		PMAP_COUNT(fixup_referenced);
2485	}
2486	if ((accessprot & VM_PROT_WRITE) &&
2487	    ((pte & LX_BLKPAG_AP) == LX_BLKPAG_AP_RO)) {
		/* pte is not RW; mark the page modified and make the pte RW */
2489
2490		UVMHIST_LOG(pmaphist, "MODIFIED:"
2491		    " va=%016lx, pa=%016lx, pte_prot=%08x, accessprot=%08x",
2492		    va, pa, pmap_prot, accessprot);
2493		pp->pp_pv.pv_va |= VM_PROT_WRITE;	/* set modified */
2494		pte &= ~LX_BLKPAG_AP;
2495		pte |= LX_BLKPAG_AP_RW;
2496
2497		PMAP_COUNT(fixup_modified);
2498	}
2499	pmap_pv_unlock(pp);
2500
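	/* install the fixed-up PTE and flush the stale TLB entry */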
2501	atomic_swap_64(ptep, pte);
2502	struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2503	AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
2504
2505	fixed = true;
2506
2507 done:
2508	pm_unlock(pm);
2509	return fixed;
2510}
2511
2512bool
2513pmap_clear_modify(struct vm_page *pg)
2514{
2515	struct pv_entry *pv;
2516	struct pmap_page * const pp = VM_PAGE_TO_PP(pg);
2517	pt_entry_t *ptep, pte, opte;
2518	vaddr_t va;
2519#ifdef ARMV81_HAFDBS
2520	bool modified;
2521#endif
2522
2523	UVMHIST_FUNC(__func__);
2524	UVMHIST_CALLARGS(pmaphist, "pg=%p, flags=%08x",
2525	    pg, (int)(pp->pp_pv.pv_va & (PAGE_SIZE - 1)), 0, 0);
2526
2527	PMAP_COUNT(clear_modify);
2528
2529	/*
2530	 * if this is a new page, assert it has no mappings and simply zap
2531	 * the stored attributes without taking any locks.
2532	 */
2533	if ((pg->flags & PG_FAKE) != 0) {
2534		KASSERT(atomic_load_relaxed(&pp->pp_pv.pv_pmap) == NULL);
2535		KASSERT(atomic_load_relaxed(&pp->pp_pv.pv_next) == NULL);
2536		atomic_store_relaxed(&pp->pp_pv.pv_va, 0);
2537		return false;
2538	}
2539
2540	pmap_pv_lock(pp);
2541
2542	if (
2543#ifdef ARMV81_HAFDBS
2544	    aarch64_hafdbs_enabled != ID_AA64MMFR1_EL1_HAFDBS_AD &&
2545#endif
2546	    (pp->pp_pv.pv_va & VM_PROT_WRITE) == 0) {
2547		pmap_pv_unlock(pp);
2548		return false;
2549	}
2550#ifdef ARMV81_HAFDBS
2551	modified = ((pp->pp_pv.pv_va & VM_PROT_WRITE) != 0);
2552#endif
2553	pp->pp_pv.pv_va &= ~(vaddr_t)VM_PROT_WRITE;
2554
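	/* downgrade every writable mapping of this page to read-only */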
2555	for (pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
2556		if (pv->pv_pmap == NULL) {
2557			KASSERT(pv == &pp->pp_pv);
2558			continue;
2559		}
2560
2561		PMAP_COUNT(clear_modify_pages);
2562
2563		va = trunc_page(pv->pv_va);
2564
2565		ptep = pv->pv_ptep;
2566		opte = pte = *ptep;
2567 tryagain:
2568		if (!l3pte_valid(pte))
2569			continue;
2570		if ((pte & LX_BLKPAG_AP) == LX_BLKPAG_AP_RO)
2571			continue;
2572#ifdef ARMV81_HAFDBS
2573		modified = true;
2574#endif
2575		/* clear write permission */
2576		pte &= ~LX_BLKPAG_AP;
2577		pte |= LX_BLKPAG_AP_RO;
2578
2579		/* XXX: possible deadlock if using PM_LOCK(). this is racy */
2580		if ((pte = atomic_cas_64(ptep, opte, pte)) != opte) {
2581			opte = pte;
2582			goto tryagain;
2583		}
2584
2585		struct pmap * const pm = pv->pv_pmap;
2586		struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2587		AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
2588
2589		UVMHIST_LOG(pmaphist,
2590		    "va=%016llx, ptep=%p, pa=%016lx, RW -> RO",
2591		    va, ptep, l3pte_pa(pte), 0);
2592	}
2593
2594	pmap_pv_unlock(pp);
2595
2596#ifdef ARMV81_HAFDBS
2597	return modified;
2598#else
2599	return true;
2600#endif
2601}
2602
2603bool
2604pmap_clear_reference(struct vm_page *pg)
2605{
2606	struct pv_entry *pv;
2607	struct pmap_page * const pp = VM_PAGE_TO_PP(pg);
2608	pt_entry_t *ptep, pte, opte;
2609	vaddr_t va;
2610#ifdef ARMV81_HAFDBS
2611	bool referenced;
2612#endif
2613
2614	UVMHIST_FUNC(__func__);
2615	UVMHIST_CALLARGS(pmaphist, "pg=%p, pp=%p, flags=%08x",
2616	    pg, pp, (int)(pp->pp_pv.pv_va & (PAGE_SIZE - 1)), 0);
2617
2618	pmap_pv_lock(pp);
2619
2620	if (
2621#ifdef ARMV81_HAFDBS
2622	    aarch64_hafdbs_enabled == ID_AA64MMFR1_EL1_HAFDBS_NONE &&
2623#endif
2624	    (pp->pp_pv.pv_va & VM_PROT_READ) == 0) {
2625		pmap_pv_unlock(pp);
2626		return false;
2627	}
2628#ifdef ARMV81_HAFDBS
2629	referenced = ((pp->pp_pv.pv_va & VM_PROT_READ) != 0);
2630#endif
2631	pp->pp_pv.pv_va &= ~(vaddr_t)VM_PROT_READ;
2632
2633	PMAP_COUNT(clear_reference);
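	/* clear the access flag (AF) in every mapping of this page */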
2634	for (pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
2635		if (pv->pv_pmap == NULL) {
2636			KASSERT(pv == &pp->pp_pv);
2637			continue;
2638		}
2639
2640		PMAP_COUNT(clear_reference_pages);
2641
2642		va = trunc_page(pv->pv_va);
2643
2644		ptep = pv->pv_ptep;
2645		opte = pte = *ptep;
2646 tryagain:
2647		if (!l3pte_valid(pte))
2648			continue;
2649		if ((pte & LX_BLKPAG_AF) == 0)
2650			continue;
2651#ifdef ARMV81_HAFDBS
2652		referenced = true;
2653#endif
		/* clear the access flag */
2655		pte &= ~LX_BLKPAG_AF;
2656
2657		/* XXX: possible deadlock if using PM_LOCK(). this is racy */
2658		if ((pte = atomic_cas_64(ptep, opte, pte)) != opte) {
2659			opte = pte;
2660			goto tryagain;
2661		}
2662
2663		struct pmap * const pm = pv->pv_pmap;
2664		struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2665		AARCH64_TLBI_BY_ASID_VA(pai->pai_asid, va);
2666
		UVMHIST_LOG(pmaphist, "va=%016llx, ptep=%p, pa=%016lx, unset AF",
2668		    va, ptep, l3pte_pa(pte), 0);
2669	}
2670
2671	pmap_pv_unlock(pp);
2672
2673#ifdef ARMV81_HAFDBS
2674	return referenced;
2675#else
2676	return true;
2677#endif
2678}
2679
2680bool
2681pmap_is_modified(struct vm_page *pg)
2682{
2683	struct pmap_page * const pp = VM_PAGE_TO_PP(pg);
2684
2685	if (pp->pp_pv.pv_va & VM_PROT_WRITE)
2686		return true;
2687
2688#ifdef ARMV81_HAFDBS
2689	/* check hardware dirty flag on each pte */
2690	if (aarch64_hafdbs_enabled == ID_AA64MMFR1_EL1_HAFDBS_AD) {
2691		struct pv_entry *pv;
2692		pt_entry_t *ptep, pte;
2693
2694		pmap_pv_lock(pp);
2695		for (pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
2696			if (pv->pv_pmap == NULL) {
2697				KASSERT(pv == &pp->pp_pv);
2698				continue;
2699			}
2700
2701			ptep = pv->pv_ptep;
2702			pte = *ptep;
2703			if (!l3pte_valid(pte))
2704				continue;
2705
2706			if ((pte & LX_BLKPAG_AP) == LX_BLKPAG_AP_RW) {
2707				pp->pp_pv.pv_va |= VM_PROT_WRITE;
2708				pmap_pv_unlock(pp);
2709				return true;
2710			}
2711		}
2712		pmap_pv_unlock(pp);
2713	}
2714#endif
2715
2716	return false;
2717}
2718
2719bool
2720pmap_is_referenced(struct vm_page *pg)
2721{
2722	struct pmap_page * const pp = VM_PAGE_TO_PP(pg);
2723
2724	if (pp->pp_pv.pv_va & VM_PROT_READ)
2725		return true;
2726
2727#ifdef ARMV81_HAFDBS
2728	/* check hardware access flag on each pte */
2729	if (aarch64_hafdbs_enabled != ID_AA64MMFR1_EL1_HAFDBS_NONE) {
2730		struct pv_entry *pv;
2731		pt_entry_t *ptep, pte;
2732
2733		pmap_pv_lock(pp);
2734		for (pv = &pp->pp_pv; pv != NULL; pv = pv->pv_next) {
2735			if (pv->pv_pmap == NULL) {
2736				KASSERT(pv == &pp->pp_pv);
2737				continue;
2738			}
2739
2740			ptep = pv->pv_ptep;
2741			pte = *ptep;
2742			if (!l3pte_valid(pte))
2743				continue;
2744
2745			if (pte & LX_BLKPAG_AF) {
2746				pp->pp_pv.pv_va |= VM_PROT_READ;
2747				pmap_pv_unlock(pp);
2748				return true;
2749			}
2750		}
2751		pmap_pv_unlock(pp);
2752	}
2753#endif
2754
2755	return false;
2756}
2757
2758/* get pointer to kernel segment L2 or L3 table entry */
2759pt_entry_t *
2760kvtopte(vaddr_t va)
2761{
2762	KASSERT(IN_RANGE(va, VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS));
2763
2764	return _pmap_pte_lookup_bs(pmap_kernel(), va, NULL);
2765}
2766
2767#ifdef DDB
2768void
2769pmap_db_pmap_print(struct pmap *pm,
2770    void (*pr)(const char *, ...) __printflike(1, 2))
2771{
2772	struct pmap_asid_info * const pai = PMAP_PAI(pm, cpu_tlb_info(ci));
2773
2774	pr(" pm_asid       = %d\n", pai->pai_asid);
2775	pr(" pm_l0table    = %p\n", pm->pm_l0table);
2776	pr(" pm_l0table_pa = %lx\n", pm->pm_l0table_pa);
2777	pr(" pm_activated  = %d\n\n", pm->pm_activated);
2778}
2779#endif /* DDB */
2780