pmap.c revision 305879
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 *    notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 *    notice, this list of conditions and the following disclaimer in the
31 *    documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 *    must display the following acknowledgement:
34 *	This product includes software developed by the University of
35 *	California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 *    may be used to endorse or promote products derived from this software
38 *    without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53 */
54/*-
55 * Copyright (c) 2003 Networks Associates Technology, Inc.
56 * All rights reserved.
57 *
58 * This software was developed for the FreeBSD Project by Jake Burkholder,
59 * Safeport Network Services, and Network Associates Laboratories, the
60 * Security Research Division of Network Associates, Inc. under
61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62 * CHATS research program.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 *    notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 *    notice, this list of conditions and the following disclaimer in the
71 *    documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86#include <sys/cdefs.h>
87__FBSDID("$FreeBSD: stable/11/sys/arm64/arm64/pmap.c 305879 2016-09-16 12:17:01Z andrew $");
88
89/*
90 *	Manages physical address maps.
91 *
92 *	Since the information managed by this module is
93 *	also stored by the logical address mapping module,
94 *	this module may throw away valid virtual-to-physical
95 *	mappings at almost any time.  However, invalidations
96 *	of virtual-to-physical mappings must be done as
97 *	requested.
98 *
99 *	In order to cope with hardware architectures which
100 *	make virtual-to-physical map invalidates expensive,
101 *	this module may delay invalidate or reduced protection
102 *	operations until such time as they are actually
103 *	necessary.  This module is given full information as
104 *	to which processors are currently using which maps,
105 *	and to when physical maps must be made correct.
106 */
107
108#include <sys/param.h>
109#include <sys/bus.h>
110#include <sys/systm.h>
111#include <sys/kernel.h>
112#include <sys/ktr.h>
113#include <sys/lock.h>
114#include <sys/malloc.h>
115#include <sys/mman.h>
116#include <sys/msgbuf.h>
117#include <sys/mutex.h>
118#include <sys/proc.h>
119#include <sys/rwlock.h>
120#include <sys/sx.h>
121#include <sys/vmem.h>
122#include <sys/vmmeter.h>
123#include <sys/sched.h>
124#include <sys/sysctl.h>
125#include <sys/_unrhdr.h>
126#include <sys/smp.h>
127
128#include <vm/vm.h>
129#include <vm/vm_param.h>
130#include <vm/vm_kern.h>
131#include <vm/vm_page.h>
132#include <vm/vm_map.h>
133#include <vm/vm_object.h>
134#include <vm/vm_extern.h>
135#include <vm/vm_pageout.h>
136#include <vm/vm_pager.h>
137#include <vm/vm_radix.h>
138#include <vm/vm_reserv.h>
139#include <vm/uma.h>
140
141#include <machine/machdep.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144
145#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
146#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
147#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
148#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
149
150#define	NUL0E		L0_ENTRIES
151#define	NUL1E		(NUL0E * NL1PG)
152#define	NUL2E		(NUL1E * NL2PG)
153
154#if !defined(DIAGNOSTIC)
155#ifdef __GNUC_GNU_INLINE__
156#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
157#else
158#define PMAP_INLINE	extern inline
159#endif
160#else
161#define PMAP_INLINE
162#endif
163
164/*
165 * These are configured by the mair_el1 register. This is set up in locore.S
166 */
167#define	DEVICE_MEMORY	0
168#define	UNCACHED_MEMORY	1
169#define	CACHED_MEMORY	2
170
171
172#ifdef PV_STATS
173#define PV_STAT(x)	do { x ; } while (0)
174#else
175#define PV_STAT(x)	do { } while (0)
176#endif
177
178#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
179
180#define	NPV_LIST_LOCKS	MAXCPU
181
182#define	PHYS_TO_PV_LIST_LOCK(pa)	\
183			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
184
185#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
186	struct rwlock **_lockp = (lockp);		\
187	struct rwlock *_new_lock;			\
188							\
189	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
190	if (_new_lock != *_lockp) {			\
191		if (*_lockp != NULL)			\
192			rw_wunlock(*_lockp);		\
193		*_lockp = _new_lock;			\
194		rw_wlock(*_lockp);			\
195	}						\
196} while (0)
197
198#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
199			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
200
201#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
202	struct rwlock **_lockp = (lockp);		\
203							\
204	if (*_lockp != NULL) {				\
205		rw_wunlock(*_lockp);			\
206		*_lockp = NULL;				\
207	}						\
208} while (0)
209
210#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
211			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
212
213struct pmap kernel_pmap_store;
214
215vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
216vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
217vm_offset_t kernel_vm_end = 0;
218
219struct msgbuf *msgbufp = NULL;
220
221vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
222vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
223vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
224
225/* This code assumes all L1 DMAP entries will be used */
226CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
227CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
228
229#define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
230extern pt_entry_t pagetable_dmap[];
231
232/*
233 * Data for the pv entry allocation mechanism
234 */
235static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
236static struct mtx pv_chunks_mutex;
237static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
238
239static void	free_pv_chunk(struct pv_chunk *pc);
240static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
241static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
242static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
243static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
244static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
245		    vm_offset_t va);
246static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
247    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
248static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
249    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
250static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
251    vm_page_t m, struct rwlock **lockp);
252
253static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
254		struct rwlock **lockp);
255
256static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
257    struct spglist *free);
258static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
259
260/*
261 * These load the old table data and store the new value.
262 * They need to be atomic as the System MMU may write to the table at
263 * the same time as the CPU.
264 */
265#define	pmap_load_store(table, entry) atomic_swap_64(table, entry)
266#define	pmap_set(table, mask) atomic_set_64(table, mask)
267#define	pmap_load_clear(table) atomic_swap_64(table, 0)
268#define	pmap_load(table) (*table)
269
270/********************/
271/* Inline functions */
272/********************/
273
274static __inline void
275pagecopy(void *s, void *d)
276{
277
278	memcpy(d, s, PAGE_SIZE);
279}
280
281#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & L0_ADDR_MASK)
282#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
283#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
284#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
285
286static __inline pd_entry_t *
287pmap_l0(pmap_t pmap, vm_offset_t va)
288{
289
290	return (&pmap->pm_l0[pmap_l0_index(va)]);
291}
292
293static __inline pd_entry_t *
294pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
295{
296	pd_entry_t *l1;
297
298	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
299	return (&l1[pmap_l1_index(va)]);
300}
301
302static __inline pd_entry_t *
303pmap_l1(pmap_t pmap, vm_offset_t va)
304{
305	pd_entry_t *l0;
306
307	l0 = pmap_l0(pmap, va);
308	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
309		return (NULL);
310
311	return (pmap_l0_to_l1(l0, va));
312}
313
314static __inline pd_entry_t *
315pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
316{
317	pd_entry_t *l2;
318
319	l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
320	return (&l2[pmap_l2_index(va)]);
321}
322
323static __inline pd_entry_t *
324pmap_l2(pmap_t pmap, vm_offset_t va)
325{
326	pd_entry_t *l1;
327
328	l1 = pmap_l1(pmap, va);
329	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
330		return (NULL);
331
332	return (pmap_l1_to_l2(l1, va));
333}
334
335static __inline pt_entry_t *
336pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
337{
338	pt_entry_t *l3;
339
340	l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
341	return (&l3[pmap_l3_index(va)]);
342}
343
344/*
345 * Returns the lowest valid pde for a given virtual address.
346 * The next level may or may not point to a valid page or block.
347 */
348static __inline pd_entry_t *
349pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
350{
351	pd_entry_t *l0, *l1, *l2, desc;
352
353	l0 = pmap_l0(pmap, va);
354	desc = pmap_load(l0) & ATTR_DESCR_MASK;
355	if (desc != L0_TABLE) {
356		*level = -1;
357		return (NULL);
358	}
359
360	l1 = pmap_l0_to_l1(l0, va);
361	desc = pmap_load(l1) & ATTR_DESCR_MASK;
362	if (desc != L1_TABLE) {
363		*level = 0;
364		return (l0);
365	}
366
367	l2 = pmap_l1_to_l2(l1, va);
368	desc = pmap_load(l2) & ATTR_DESCR_MASK;
369	if (desc != L2_TABLE) {
370		*level = 1;
371		return (l1);
372	}
373
374	*level = 2;
375	return (l2);
376}
377
378/*
379 * Returns the lowest valid pte block or table entry for a given virtual
380 * address. If there are no valid entries return NULL and set the level to
381 * the first invalid level.
382 */
383static __inline pt_entry_t *
384pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
385{
386	pd_entry_t *l1, *l2, desc;
387	pt_entry_t *l3;
388
389	l1 = pmap_l1(pmap, va);
390	if (l1 == NULL) {
391		*level = 0;
392		return (NULL);
393	}
394	desc = pmap_load(l1) & ATTR_DESCR_MASK;
395	if (desc == L1_BLOCK) {
396		*level = 1;
397		return (l1);
398	}
399
400	if (desc != L1_TABLE) {
401		*level = 1;
402		return (NULL);
403	}
404
405	l2 = pmap_l1_to_l2(l1, va);
406	desc = pmap_load(l2) & ATTR_DESCR_MASK;
407	if (desc == L2_BLOCK) {
408		*level = 2;
409		return (l2);
410	}
411
412	if (desc != L2_TABLE) {
413		*level = 2;
414		return (NULL);
415	}
416
417	*level = 3;
418	l3 = pmap_l2_to_l3(l2, va);
419	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
420		return (NULL);
421
422	return (l3);
423}
424
425bool
426pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
427    pd_entry_t **l2, pt_entry_t **l3)
428{
429	pd_entry_t *l0p, *l1p, *l2p;
430
431	if (pmap->pm_l0 == NULL)
432		return (false);
433
434	l0p = pmap_l0(pmap, va);
435	*l0 = l0p;
436
437	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
438		return (false);
439
440	l1p = pmap_l0_to_l1(l0p, va);
441	*l1 = l1p;
442
443	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
444		*l2 = NULL;
445		*l3 = NULL;
446		return (true);
447	}
448
449	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
450		return (false);
451
452	l2p = pmap_l1_to_l2(l1p, va);
453	*l2 = l2p;
454
455	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
456		*l3 = NULL;
457		return (true);
458	}
459
460	*l3 = pmap_l2_to_l3(l2p, va);
461
462	return (true);
463}
464
465static __inline int
466pmap_is_current(pmap_t pmap)
467{
468
469	return ((pmap == pmap_kernel()) ||
470	    (pmap == curthread->td_proc->p_vmspace->vm_map.pmap));
471}
472
473static __inline int
474pmap_l3_valid(pt_entry_t l3)
475{
476
477	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
478}
479
480static __inline int
481pmap_l3_valid_cacheable(pt_entry_t l3)
482{
483
484	return (((l3 & ATTR_DESCR_MASK) == L3_PAGE) &&
485	    ((l3 & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY)));
486}
487
488#define	PTE_SYNC(pte)	cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte))
489
490/*
491 * Checks if the page is dirty. We currently lack proper tracking of this on
492 * arm64 so for now assume is a page mapped as rw was accessed it is.
493 */
494static inline int
495pmap_page_dirty(pt_entry_t pte)
496{
497
498	return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
499	    (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
500}
501
502static __inline void
503pmap_resident_count_inc(pmap_t pmap, int count)
504{
505
506	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
507	pmap->pm_stats.resident_count += count;
508}
509
510static __inline void
511pmap_resident_count_dec(pmap_t pmap, int count)
512{
513
514	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
515	KASSERT(pmap->pm_stats.resident_count >= count,
516	    ("pmap %p resident count underflow %ld %d", pmap,
517	    pmap->pm_stats.resident_count, count));
518	pmap->pm_stats.resident_count -= count;
519}
520
521static pt_entry_t *
522pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
523    u_int *l2_slot)
524{
525	pt_entry_t *l2;
526	pd_entry_t *l1;
527
528	l1 = (pd_entry_t *)l1pt;
529	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
530
531	/* Check locore has used a table L1 map */
532	KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
533	   ("Invalid bootstrap L1 table"));
534	/* Find the address of the L2 table */
535	l2 = (pt_entry_t *)init_pt_va;
536	*l2_slot = pmap_l2_index(va);
537
538	return (l2);
539}
540
541static vm_paddr_t
542pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
543{
544	u_int l1_slot, l2_slot;
545	pt_entry_t *l2;
546
547	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
548
549	return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
550}
551
552static void
553pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
554{
555	vm_offset_t va;
556	vm_paddr_t pa;
557	u_int l1_slot;
558
559	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
560	va = DMAP_MIN_ADDRESS;
561	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
562	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
563		l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
564
565		pmap_load_store(&pagetable_dmap[l1_slot],
566		    (pa & ~L1_OFFSET) | ATTR_DEFAULT |
567		    ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
568	}
569
570	/* Set the upper limit of the DMAP region */
571	dmap_phys_max = pa;
572	dmap_max_addr = va;
573
574	cpu_dcache_wb_range((vm_offset_t)pagetable_dmap,
575	    PAGE_SIZE * DMAP_TABLES);
576	cpu_tlb_flushID();
577}
578
579static vm_offset_t
580pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
581{
582	vm_offset_t l2pt;
583	vm_paddr_t pa;
584	pd_entry_t *l1;
585	u_int l1_slot;
586
587	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
588
589	l1 = (pd_entry_t *)l1pt;
590	l1_slot = pmap_l1_index(va);
591	l2pt = l2_start;
592
593	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
594		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
595
596		pa = pmap_early_vtophys(l1pt, l2pt);
597		pmap_load_store(&l1[l1_slot],
598		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
599		l2pt += PAGE_SIZE;
600	}
601
602	/* Clean the L2 page table */
603	memset((void *)l2_start, 0, l2pt - l2_start);
604	cpu_dcache_wb_range(l2_start, l2pt - l2_start);
605
606	/* Flush the l1 table to ram */
607	cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE);
608
609	return l2pt;
610}
611
612static vm_offset_t
613pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
614{
615	vm_offset_t l2pt, l3pt;
616	vm_paddr_t pa;
617	pd_entry_t *l2;
618	u_int l2_slot;
619
620	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
621
622	l2 = pmap_l2(kernel_pmap, va);
623	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
624	l2pt = (vm_offset_t)l2;
625	l2_slot = pmap_l2_index(va);
626	l3pt = l3_start;
627
628	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
629		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
630
631		pa = pmap_early_vtophys(l1pt, l3pt);
632		pmap_load_store(&l2[l2_slot],
633		    (pa & ~Ln_TABLE_MASK) | L2_TABLE);
634		l3pt += PAGE_SIZE;
635	}
636
637	/* Clean the L2 page table */
638	memset((void *)l3_start, 0, l3pt - l3_start);
639	cpu_dcache_wb_range(l3_start, l3pt - l3_start);
640
641	cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE);
642
643	return l3pt;
644}
645
646/*
647 *	Bootstrap the system enough to run with virtual memory.
648 */
649void
650pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
651    vm_size_t kernlen)
652{
653	u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot;
654	uint64_t kern_delta;
655	pt_entry_t *l2;
656	vm_offset_t va, freemempos;
657	vm_offset_t dpcpu, msgbufpv;
658	vm_paddr_t pa, max_pa, min_pa;
659	int i;
660
661	kern_delta = KERNBASE - kernstart;
662	physmem = 0;
663
664	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
665	printf("%lx\n", l1pt);
666	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
667
668	/* Set this early so we can use the pagetable walking functions */
669	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
670	PMAP_LOCK_INIT(kernel_pmap);
671
672	/* Assume the address we were loaded to is a valid physical address */
673	min_pa = max_pa = KERNBASE - kern_delta;
674
675	/*
676	 * Find the minimum physical address. physmap is sorted,
677	 * but may contain empty ranges.
678	 */
679	for (i = 0; i < (physmap_idx * 2); i += 2) {
680		if (physmap[i] == physmap[i + 1])
681			continue;
682		if (physmap[i] <= min_pa)
683			min_pa = physmap[i];
684		if (physmap[i + 1] > max_pa)
685			max_pa = physmap[i + 1];
686	}
687
688	/* Create a direct map region early so we can use it for pa -> va */
689	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
690
691	va = KERNBASE;
692	pa = KERNBASE - kern_delta;
693
694	/*
695	 * Start to initialise phys_avail by copying from physmap
696	 * up to the physical address KERNBASE points at.
697	 */
698	map_slot = avail_slot = 0;
699	for (; map_slot < (physmap_idx * 2) &&
700	    avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) {
701		if (physmap[map_slot] == physmap[map_slot + 1])
702			continue;
703
704		if (physmap[map_slot] <= pa &&
705		    physmap[map_slot + 1] > pa)
706			break;
707
708		phys_avail[avail_slot] = physmap[map_slot];
709		phys_avail[avail_slot + 1] = physmap[map_slot + 1];
710		physmem += (phys_avail[avail_slot + 1] -
711		    phys_avail[avail_slot]) >> PAGE_SHIFT;
712		avail_slot += 2;
713	}
714
715	/* Add the memory before the kernel */
716	if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) {
717		phys_avail[avail_slot] = physmap[map_slot];
718		phys_avail[avail_slot + 1] = pa;
719		physmem += (phys_avail[avail_slot + 1] -
720		    phys_avail[avail_slot]) >> PAGE_SHIFT;
721		avail_slot += 2;
722	}
723	used_map_slot = map_slot;
724
725	/*
726	 * Read the page table to find out what is already mapped.
727	 * This assumes we have mapped a block of memory from KERNBASE
728	 * using a single L1 entry.
729	 */
730	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
731
732	/* Sanity check the index, KERNBASE should be the first VA */
733	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
734
735	/* Find how many pages we have mapped */
736	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
737		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
738			break;
739
740		/* Check locore used L2 blocks */
741		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
742		    ("Invalid bootstrap L2 table"));
743		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
744		    ("Incorrect PA in L2 table"));
745
746		va += L2_SIZE;
747		pa += L2_SIZE;
748	}
749
750	va = roundup2(va, L1_SIZE);
751
752	freemempos = KERNBASE + kernlen;
753	freemempos = roundup2(freemempos, PAGE_SIZE);
754	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
755	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
756	/* And the l3 tables for the early devmap */
757	freemempos = pmap_bootstrap_l3(l1pt,
758	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
759
760	cpu_tlb_flushID();
761
762#define alloc_pages(var, np)						\
763	(var) = freemempos;						\
764	freemempos += (np * PAGE_SIZE);					\
765	memset((char *)(var), 0, ((np) * PAGE_SIZE));
766
767	/* Allocate dynamic per-cpu area. */
768	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
769	dpcpu_init((void *)dpcpu, 0);
770
771	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
772	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
773	msgbufp = (void *)msgbufpv;
774
775	virtual_avail = roundup2(freemempos, L1_SIZE);
776	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
777	kernel_vm_end = virtual_avail;
778
779	pa = pmap_early_vtophys(l1pt, freemempos);
780
781	/* Finish initialising physmap */
782	map_slot = used_map_slot;
783	for (; avail_slot < (PHYS_AVAIL_SIZE - 2) &&
784	    map_slot < (physmap_idx * 2); map_slot += 2) {
785		if (physmap[map_slot] == physmap[map_slot + 1])
786			continue;
787
788		/* Have we used the current range? */
789		if (physmap[map_slot + 1] <= pa)
790			continue;
791
792		/* Do we need to split the entry? */
793		if (physmap[map_slot] < pa) {
794			phys_avail[avail_slot] = pa;
795			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
796		} else {
797			phys_avail[avail_slot] = physmap[map_slot];
798			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
799		}
800		physmem += (phys_avail[avail_slot + 1] -
801		    phys_avail[avail_slot]) >> PAGE_SHIFT;
802
803		avail_slot += 2;
804	}
805	phys_avail[avail_slot] = 0;
806	phys_avail[avail_slot + 1] = 0;
807
808	/*
809	 * Maxmem isn't the "maximum memory", it's one larger than the
810	 * highest page of the physical address space.  It should be
811	 * called something like "Maxphyspage".
812	 */
813	Maxmem = atop(phys_avail[avail_slot - 1]);
814
815	cpu_tlb_flushID();
816}
817
818/*
819 *	Initialize a vm_page's machine-dependent fields.
820 */
821void
822pmap_page_init(vm_page_t m)
823{
824
825	TAILQ_INIT(&m->md.pv_list);
826	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
827}
828
829/*
830 *	Initialize the pmap module.
831 *	Called by vm_init, to initialize any structures that the pmap
832 *	system needs to map virtual memory.
833 */
834void
835pmap_init(void)
836{
837	int i;
838
839	/*
840	 * Initialize the pv chunk list mutex.
841	 */
842	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
843
844	/*
845	 * Initialize the pool of pv list locks.
846	 */
847	for (i = 0; i < NPV_LIST_LOCKS; i++)
848		rw_init(&pv_list_locks[i], "pmap pv list");
849}
850
851/*
852 * Invalidate a single TLB entry.
853 */
854PMAP_INLINE void
855pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
856{
857
858	sched_pin();
859	__asm __volatile(
860	    "dsb  ishst		\n"
861	    "tlbi vaae1is, %0	\n"
862	    "dsb  ish		\n"
863	    "isb		\n"
864	    : : "r"(va >> PAGE_SHIFT));
865	sched_unpin();
866}
867
868PMAP_INLINE void
869pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
870{
871	vm_offset_t addr;
872
873	sched_pin();
874	dsb(ishst);
875	for (addr = sva; addr < eva; addr += PAGE_SIZE) {
876		__asm __volatile(
877		    "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
878	}
879	__asm __volatile(
880	    "dsb  ish	\n"
881	    "isb	\n");
882	sched_unpin();
883}
884
885PMAP_INLINE void
886pmap_invalidate_all(pmap_t pmap)
887{
888
889	sched_pin();
890	__asm __volatile(
891	    "dsb  ishst		\n"
892	    "tlbi vmalle1is	\n"
893	    "dsb  ish		\n"
894	    "isb		\n");
895	sched_unpin();
896}
897
898/*
899 *	Routine:	pmap_extract
900 *	Function:
901 *		Extract the physical page address associated
902 *		with the given map/virtual_address pair.
903 */
904vm_paddr_t
905pmap_extract(pmap_t pmap, vm_offset_t va)
906{
907	pt_entry_t *pte, tpte;
908	vm_paddr_t pa;
909	int lvl;
910
911	pa = 0;
912	PMAP_LOCK(pmap);
913	/*
914	 * Find the block or page map for this virtual address. pmap_pte
915	 * will return either a valid block/page entry, or NULL.
916	 */
917	pte = pmap_pte(pmap, va, &lvl);
918	if (pte != NULL) {
919		tpte = pmap_load(pte);
920		pa = tpte & ~ATTR_MASK;
921		switch(lvl) {
922		case 1:
923			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
924			    ("pmap_extract: Invalid L1 pte found: %lx",
925			    tpte & ATTR_DESCR_MASK));
926			pa |= (va & L1_OFFSET);
927			break;
928		case 2:
929			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
930			    ("pmap_extract: Invalid L2 pte found: %lx",
931			    tpte & ATTR_DESCR_MASK));
932			pa |= (va & L2_OFFSET);
933			break;
934		case 3:
935			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
936			    ("pmap_extract: Invalid L3 pte found: %lx",
937			    tpte & ATTR_DESCR_MASK));
938			pa |= (va & L3_OFFSET);
939			break;
940		}
941	}
942	PMAP_UNLOCK(pmap);
943	return (pa);
944}
945
946/*
947 *	Routine:	pmap_extract_and_hold
948 *	Function:
949 *		Atomically extract and hold the physical page
950 *		with the given pmap and virtual address pair
951 *		if that mapping permits the given protection.
952 */
953vm_page_t
954pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
955{
956	pt_entry_t *pte, tpte;
957	vm_paddr_t pa;
958	vm_page_t m;
959	int lvl;
960
961	pa = 0;
962	m = NULL;
963	PMAP_LOCK(pmap);
964retry:
965	pte = pmap_pte(pmap, va, &lvl);
966	if (pte != NULL) {
967		tpte = pmap_load(pte);
968
969		KASSERT(lvl > 0 && lvl <= 3,
970		    ("pmap_extract_and_hold: Invalid level %d", lvl));
971		CTASSERT(L1_BLOCK == L2_BLOCK);
972		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
973		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
974		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
975		     tpte & ATTR_DESCR_MASK));
976		if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
977		    ((prot & VM_PROT_WRITE) == 0)) {
978			if (vm_page_pa_tryrelock(pmap, tpte & ~ATTR_MASK, &pa))
979				goto retry;
980			m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
981			vm_page_hold(m);
982		}
983	}
984	PA_UNLOCK_COND(pa);
985	PMAP_UNLOCK(pmap);
986	return (m);
987}
988
989vm_paddr_t
990pmap_kextract(vm_offset_t va)
991{
992	pt_entry_t *pte, tpte;
993	vm_paddr_t pa;
994	int lvl;
995
996	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
997		pa = DMAP_TO_PHYS(va);
998	} else {
999		pa = 0;
1000		pte = pmap_pte(kernel_pmap, va, &lvl);
1001		if (pte != NULL) {
1002			tpte = pmap_load(pte);
1003			pa = tpte & ~ATTR_MASK;
1004			switch(lvl) {
1005			case 1:
1006				KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1007				    ("pmap_kextract: Invalid L1 pte found: %lx",
1008				    tpte & ATTR_DESCR_MASK));
1009				pa |= (va & L1_OFFSET);
1010				break;
1011			case 2:
1012				KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1013				    ("pmap_kextract: Invalid L2 pte found: %lx",
1014				    tpte & ATTR_DESCR_MASK));
1015				pa |= (va & L2_OFFSET);
1016				break;
1017			case 3:
1018				KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1019				    ("pmap_kextract: Invalid L3 pte found: %lx",
1020				    tpte & ATTR_DESCR_MASK));
1021				pa |= (va & L3_OFFSET);
1022				break;
1023			}
1024		}
1025	}
1026	return (pa);
1027}
1028
1029/***************************************************
1030 * Low level mapping routines.....
1031 ***************************************************/
1032
1033static void
1034pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1035{
1036	pd_entry_t *pde;
1037	pt_entry_t *pte;
1038	vm_offset_t va;
1039	int lvl;
1040
1041	KASSERT((pa & L3_OFFSET) == 0,
1042	   ("pmap_kenter: Invalid physical address"));
1043	KASSERT((sva & L3_OFFSET) == 0,
1044	   ("pmap_kenter: Invalid virtual address"));
1045	KASSERT((size & PAGE_MASK) == 0,
1046	    ("pmap_kenter: Mapping is not page-sized"));
1047
1048	va = sva;
1049	while (size != 0) {
1050		pde = pmap_pde(kernel_pmap, va, &lvl);
1051		KASSERT(pde != NULL,
1052		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1053		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1054
1055		pte = pmap_l2_to_l3(pde, va);
1056		pmap_load_store(pte, (pa & ~L3_OFFSET) | ATTR_DEFAULT |
1057		    ATTR_IDX(mode) | L3_PAGE);
1058		PTE_SYNC(pte);
1059
1060		va += PAGE_SIZE;
1061		pa += PAGE_SIZE;
1062		size -= PAGE_SIZE;
1063	}
1064	pmap_invalidate_range(kernel_pmap, sva, va);
1065}
1066
1067void
1068pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1069{
1070
1071	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
1072}
1073
1074/*
1075 * Remove a page from the kernel pagetables.
1076 */
1077PMAP_INLINE void
1078pmap_kremove(vm_offset_t va)
1079{
1080	pt_entry_t *pte;
1081	int lvl;
1082
1083	pte = pmap_pte(kernel_pmap, va, &lvl);
1084	KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1085	KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1086
1087	if (pmap_l3_valid_cacheable(pmap_load(pte)))
1088		cpu_dcache_wb_range(va, L3_SIZE);
1089	pmap_load_clear(pte);
1090	PTE_SYNC(pte);
1091	pmap_invalidate_page(kernel_pmap, va);
1092}
1093
1094void
1095pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1096{
1097	pt_entry_t *pte;
1098	vm_offset_t va;
1099	int lvl;
1100
1101	KASSERT((sva & L3_OFFSET) == 0,
1102	   ("pmap_kremove_device: Invalid virtual address"));
1103	KASSERT((size & PAGE_MASK) == 0,
1104	    ("pmap_kremove_device: Mapping is not page-sized"));
1105
1106	va = sva;
1107	while (size != 0) {
1108		pte = pmap_pte(kernel_pmap, va, &lvl);
1109		KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1110		KASSERT(lvl == 3,
1111		    ("Invalid device pagetable level: %d != 3", lvl));
1112		pmap_load_clear(pte);
1113		PTE_SYNC(pte);
1114
1115		va += PAGE_SIZE;
1116		size -= PAGE_SIZE;
1117	}
1118	pmap_invalidate_range(kernel_pmap, sva, va);
1119}
1120
1121/*
1122 *	Used to map a range of physical addresses into kernel
1123 *	virtual address space.
1124 *
1125 *	The value passed in '*virt' is a suggested virtual address for
1126 *	the mapping. Architectures which can support a direct-mapped
1127 *	physical to virtual region can return the appropriate address
1128 *	within that region, leaving '*virt' unchanged. Other
1129 *	architectures should map the pages starting at '*virt' and
1130 *	update '*virt' with the first usable address after the mapped
1131 *	region.
1132 */
1133vm_offset_t
1134pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1135{
1136	return PHYS_TO_DMAP(start);
1137}
1138
1139
1140/*
1141 * Add a list of wired pages to the kva
1142 * this routine is only used for temporary
1143 * kernel mappings that do not need to have
1144 * page modification or references recorded.
1145 * Note that old mappings are simply written
1146 * over.  The page *must* be wired.
1147 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1148 */
1149void
1150pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1151{
1152	pd_entry_t *pde;
1153	pt_entry_t *pte, pa;
1154	vm_offset_t va;
1155	vm_page_t m;
1156	int i, lvl;
1157
1158	va = sva;
1159	for (i = 0; i < count; i++) {
1160		pde = pmap_pde(kernel_pmap, va, &lvl);
1161		KASSERT(pde != NULL,
1162		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1163		KASSERT(lvl == 2,
1164		    ("pmap_qenter: Invalid level %d", lvl));
1165
1166		m = ma[i];
1167		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
1168		    ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
1169		pte = pmap_l2_to_l3(pde, va);
1170		pmap_load_store(pte, pa);
1171		PTE_SYNC(pte);
1172
1173		va += L3_SIZE;
1174	}
1175	pmap_invalidate_range(kernel_pmap, sva, va);
1176}
1177
1178/*
1179 * This routine tears out page mappings from the
1180 * kernel -- it is meant only for temporary mappings.
1181 */
1182void
1183pmap_qremove(vm_offset_t sva, int count)
1184{
1185	pt_entry_t *pte;
1186	vm_offset_t va;
1187	int lvl;
1188
1189	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1190
1191	va = sva;
1192	while (count-- > 0) {
1193		pte = pmap_pte(kernel_pmap, va, &lvl);
1194		KASSERT(lvl == 3,
1195		    ("Invalid device pagetable level: %d != 3", lvl));
1196		if (pte != NULL) {
1197			if (pmap_l3_valid_cacheable(pmap_load(pte)))
1198				cpu_dcache_wb_range(va, L3_SIZE);
1199			pmap_load_clear(pte);
1200			PTE_SYNC(pte);
1201		}
1202
1203		va += PAGE_SIZE;
1204	}
1205	pmap_invalidate_range(kernel_pmap, sva, va);
1206}
1207
1208/***************************************************
1209 * Page table page management routines.....
1210 ***************************************************/
1211static __inline void
1212pmap_free_zero_pages(struct spglist *free)
1213{
1214	vm_page_t m;
1215
1216	while ((m = SLIST_FIRST(free)) != NULL) {
1217		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1218		/* Preserve the page's PG_ZERO setting. */
1219		vm_page_free_toq(m);
1220	}
1221}
1222
1223/*
1224 * Schedule the specified unused page table page to be freed.  Specifically,
1225 * add the page to the specified list of pages that will be released to the
1226 * physical memory manager after the TLB has been updated.
1227 */
1228static __inline void
1229pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1230    boolean_t set_PG_ZERO)
1231{
1232
1233	if (set_PG_ZERO)
1234		m->flags |= PG_ZERO;
1235	else
1236		m->flags &= ~PG_ZERO;
1237	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1238}
1239
1240/*
1241 * Decrements a page table page's wire count, which is used to record the
1242 * number of valid page table entries within the page.  If the wire count
1243 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1244 * page table page was unmapped and FALSE otherwise.
1245 */
1246static inline boolean_t
1247pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1248{
1249
1250	--m->wire_count;
1251	if (m->wire_count == 0) {
1252		_pmap_unwire_l3(pmap, va, m, free);
1253		return (TRUE);
1254	} else
1255		return (FALSE);
1256}
1257
1258static void
1259_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1260{
1261
1262	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1263	/*
1264	 * unmap the page table page
1265	 */
1266	if (m->pindex >= (NUL2E + NUL1E)) {
1267		/* l1 page */
1268		pd_entry_t *l0;
1269
1270		l0 = pmap_l0(pmap, va);
1271		pmap_load_clear(l0);
1272		PTE_SYNC(l0);
1273	} else if (m->pindex >= NUL2E) {
1274		/* l2 page */
1275		pd_entry_t *l1;
1276
1277		l1 = pmap_l1(pmap, va);
1278		pmap_load_clear(l1);
1279		PTE_SYNC(l1);
1280	} else {
1281		/* l3 page */
1282		pd_entry_t *l2;
1283
1284		l2 = pmap_l2(pmap, va);
1285		pmap_load_clear(l2);
1286		PTE_SYNC(l2);
1287	}
1288	pmap_resident_count_dec(pmap, 1);
1289	if (m->pindex < NUL2E) {
1290		/* We just released an l3, unhold the matching l2 */
1291		pd_entry_t *l1, tl1;
1292		vm_page_t l2pg;
1293
1294		l1 = pmap_l1(pmap, va);
1295		tl1 = pmap_load(l1);
1296		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1297		pmap_unwire_l3(pmap, va, l2pg, free);
1298	} else if (m->pindex < (NUL2E + NUL1E)) {
1299		/* We just released an l2, unhold the matching l1 */
1300		pd_entry_t *l0, tl0;
1301		vm_page_t l1pg;
1302
1303		l0 = pmap_l0(pmap, va);
1304		tl0 = pmap_load(l0);
1305		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1306		pmap_unwire_l3(pmap, va, l1pg, free);
1307	}
1308	pmap_invalidate_page(pmap, va);
1309
1310	/*
1311	 * This is a release store so that the ordinary store unmapping
1312	 * the page table page is globally performed before TLB shoot-
1313	 * down is begun.
1314	 */
1315	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1316
1317	/*
1318	 * Put page on a list so that it is released after
1319	 * *ALL* TLB shootdown is done
1320	 */
1321	pmap_add_delayed_free_list(m, free, TRUE);
1322}
1323
1324/*
1325 * After removing an l3 entry, this routine is used to
1326 * conditionally free the page, and manage the hold/wire counts.
1327 */
1328static int
1329pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1330    struct spglist *free)
1331{
1332	vm_page_t mpte;
1333
1334	if (va >= VM_MAXUSER_ADDRESS)
1335		return (0);
1336	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1337	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1338	return (pmap_unwire_l3(pmap, va, mpte, free));
1339}
1340
1341void
1342pmap_pinit0(pmap_t pmap)
1343{
1344
1345	PMAP_LOCK_INIT(pmap);
1346	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1347	pmap->pm_l0 = kernel_pmap->pm_l0;
1348}
1349
1350int
1351pmap_pinit(pmap_t pmap)
1352{
1353	vm_paddr_t l0phys;
1354	vm_page_t l0pt;
1355
1356	/*
1357	 * allocate the l0 page
1358	 */
1359	while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1360	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1361		VM_WAIT;
1362
1363	l0phys = VM_PAGE_TO_PHYS(l0pt);
1364	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
1365
1366	if ((l0pt->flags & PG_ZERO) == 0)
1367		pagezero(pmap->pm_l0);
1368
1369	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1370
1371	return (1);
1372}
1373
1374/*
1375 * This routine is called if the desired page table page does not exist.
1376 *
1377 * If page table page allocation fails, this routine may sleep before
1378 * returning NULL.  It sleeps only if a lock pointer was given.
1379 *
1380 * Note: If a page allocation fails at page table level two or three,
1381 * one or two pages may be held during the wait, only to be released
1382 * afterwards.  This conservative approach is easily argued to avoid
1383 * race conditions.
1384 */
1385static vm_page_t
1386_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1387{
1388	vm_page_t m, l1pg, l2pg;
1389
1390	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1391
1392	/*
1393	 * Allocate a page table page.
1394	 */
1395	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1396	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1397		if (lockp != NULL) {
1398			RELEASE_PV_LIST_LOCK(lockp);
1399			PMAP_UNLOCK(pmap);
1400			VM_WAIT;
1401			PMAP_LOCK(pmap);
1402		}
1403
1404		/*
1405		 * Indicate the need to retry.  While waiting, the page table
1406		 * page may have been allocated.
1407		 */
1408		return (NULL);
1409	}
1410	if ((m->flags & PG_ZERO) == 0)
1411		pmap_zero_page(m);
1412
1413	/*
1414	 * Map the pagetable page into the process address space, if
1415	 * it isn't already there.
1416	 */
1417
1418	if (ptepindex >= (NUL2E + NUL1E)) {
1419		pd_entry_t *l0;
1420		vm_pindex_t l0index;
1421
1422		l0index = ptepindex - (NUL2E + NUL1E);
1423		l0 = &pmap->pm_l0[l0index];
1424		pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
1425		PTE_SYNC(l0);
1426	} else if (ptepindex >= NUL2E) {
1427		vm_pindex_t l0index, l1index;
1428		pd_entry_t *l0, *l1;
1429		pd_entry_t tl0;
1430
1431		l1index = ptepindex - NUL2E;
1432		l0index = l1index >> L0_ENTRIES_SHIFT;
1433
1434		l0 = &pmap->pm_l0[l0index];
1435		tl0 = pmap_load(l0);
1436		if (tl0 == 0) {
1437			/* recurse for allocating page dir */
1438			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
1439			    lockp) == NULL) {
1440				--m->wire_count;
1441				/* XXX: release mem barrier? */
1442				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1443				vm_page_free_zero(m);
1444				return (NULL);
1445			}
1446		} else {
1447			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1448			l1pg->wire_count++;
1449		}
1450
1451		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
1452		l1 = &l1[ptepindex & Ln_ADDR_MASK];
1453		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1454		PTE_SYNC(l1);
1455	} else {
1456		vm_pindex_t l0index, l1index;
1457		pd_entry_t *l0, *l1, *l2;
1458		pd_entry_t tl0, tl1;
1459
1460		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
1461		l0index = l1index >> L0_ENTRIES_SHIFT;
1462
1463		l0 = &pmap->pm_l0[l0index];
1464		tl0 = pmap_load(l0);
1465		if (tl0 == 0) {
1466			/* recurse for allocating page dir */
1467			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1468			    lockp) == NULL) {
1469				--m->wire_count;
1470				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1471				vm_page_free_zero(m);
1472				return (NULL);
1473			}
1474			tl0 = pmap_load(l0);
1475			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1476			l1 = &l1[l1index & Ln_ADDR_MASK];
1477		} else {
1478			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1479			l1 = &l1[l1index & Ln_ADDR_MASK];
1480			tl1 = pmap_load(l1);
1481			if (tl1 == 0) {
1482				/* recurse for allocating page dir */
1483				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1484				    lockp) == NULL) {
1485					--m->wire_count;
1486					/* XXX: release mem barrier? */
1487					atomic_subtract_int(
1488					    &vm_cnt.v_wire_count, 1);
1489					vm_page_free_zero(m);
1490					return (NULL);
1491				}
1492			} else {
1493				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1494				l2pg->wire_count++;
1495			}
1496		}
1497
1498		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
1499		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1500		pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
1501		PTE_SYNC(l2);
1502	}
1503
1504	pmap_resident_count_inc(pmap, 1);
1505
1506	return (m);
1507}
1508
1509static vm_page_t
1510pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1511{
1512	vm_pindex_t ptepindex;
1513	pd_entry_t *pde, tpde;
1514	vm_page_t m;
1515	int lvl;
1516
1517	/*
1518	 * Calculate pagetable page index
1519	 */
1520	ptepindex = pmap_l2_pindex(va);
1521retry:
1522	/*
1523	 * Get the page directory entry
1524	 */
1525	pde = pmap_pde(pmap, va, &lvl);
1526
1527	/*
1528	 * If the page table page is mapped, we just increment the hold count,
1529	 * and activate it. If we get a level 2 pde it will point to a level 3
1530	 * table.
1531	 */
1532	if (lvl == 2) {
1533		tpde = pmap_load(pde);
1534		if (tpde != 0) {
1535			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
1536			m->wire_count++;
1537			return (m);
1538		}
1539	}
1540
1541	/*
1542	 * Here if the pte page isn't mapped, or if it has been deallocated.
1543	 */
1544	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1545	if (m == NULL && lockp != NULL)
1546		goto retry;
1547
1548	return (m);
1549}
1550
1551
1552/***************************************************
1553 * Pmap allocation/deallocation routines.
1554 ***************************************************/
1555
1556/*
1557 * Release any resources held by the given physical map.
1558 * Called when a pmap initialized by pmap_pinit is being released.
1559 * Should only be called if the map contains no valid mappings.
1560 */
1561void
1562pmap_release(pmap_t pmap)
1563{
1564	vm_page_t m;
1565
1566	KASSERT(pmap->pm_stats.resident_count == 0,
1567	    ("pmap_release: pmap resident count %ld != 0",
1568	    pmap->pm_stats.resident_count));
1569
1570	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
1571
1572	m->wire_count--;
1573	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1574	vm_page_free_zero(m);
1575}
1576
1577#if 0
1578static int
1579kvm_size(SYSCTL_HANDLER_ARGS)
1580{
1581	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1582
1583	return sysctl_handle_long(oidp, &ksize, 0, req);
1584}
1585SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1586    0, 0, kvm_size, "LU", "Size of KVM");
1587
1588static int
1589kvm_free(SYSCTL_HANDLER_ARGS)
1590{
1591	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1592
1593	return sysctl_handle_long(oidp, &kfree, 0, req);
1594}
1595SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1596    0, 0, kvm_free, "LU", "Amount of KVM free");
1597#endif /* 0 */
1598
1599/*
1600 * grow the number of kernel page table entries, if needed
1601 */
1602void
1603pmap_growkernel(vm_offset_t addr)
1604{
1605	vm_paddr_t paddr;
1606	vm_page_t nkpg;
1607	pd_entry_t *l0, *l1, *l2;
1608
1609	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1610
1611	addr = roundup2(addr, L2_SIZE);
1612	if (addr - 1 >= kernel_map->max_offset)
1613		addr = kernel_map->max_offset;
1614	while (kernel_vm_end < addr) {
1615		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
1616		KASSERT(pmap_load(l0) != 0,
1617		    ("pmap_growkernel: No level 0 kernel entry"));
1618
1619		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
1620		if (pmap_load(l1) == 0) {
1621			/* We need a new PDP entry */
1622			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1623			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1624			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1625			if (nkpg == NULL)
1626				panic("pmap_growkernel: no memory to grow kernel");
1627			if ((nkpg->flags & PG_ZERO) == 0)
1628				pmap_zero_page(nkpg);
1629			paddr = VM_PAGE_TO_PHYS(nkpg);
1630			pmap_load_store(l1, paddr | L1_TABLE);
1631			PTE_SYNC(l1);
1632			continue; /* try again */
1633		}
1634		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1635		if ((pmap_load(l2) & ATTR_AF) != 0) {
1636			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1637			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1638				kernel_vm_end = kernel_map->max_offset;
1639				break;
1640			}
1641			continue;
1642		}
1643
1644		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1645		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1646		    VM_ALLOC_ZERO);
1647		if (nkpg == NULL)
1648			panic("pmap_growkernel: no memory to grow kernel");
1649		if ((nkpg->flags & PG_ZERO) == 0)
1650			pmap_zero_page(nkpg);
1651		paddr = VM_PAGE_TO_PHYS(nkpg);
1652		pmap_load_store(l2, paddr | L2_TABLE);
1653		PTE_SYNC(l2);
1654		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1655
1656		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1657		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1658			kernel_vm_end = kernel_map->max_offset;
1659			break;
1660		}
1661	}
1662}
1663
1664
1665/***************************************************
1666 * page management routines.
1667 ***************************************************/
1668
1669CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1670CTASSERT(_NPCM == 3);
1671CTASSERT(_NPCPV == 168);
1672
1673static __inline struct pv_chunk *
1674pv_to_chunk(pv_entry_t pv)
1675{
1676
1677	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1678}
1679
1680#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1681
1682#define	PC_FREE0	0xfffffffffffffffful
1683#define	PC_FREE1	0xfffffffffffffffful
1684#define	PC_FREE2	0x000000fffffffffful
1685
1686static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1687
1688#if 0
1689#ifdef PV_STATS
1690static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1691
1692SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1693	"Current number of pv entry chunks");
1694SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1695	"Current number of pv entry chunks allocated");
1696SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1697	"Current number of pv entry chunks frees");
1698SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1699	"Number of times tried to get a chunk page but failed.");
1700
1701static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1702static int pv_entry_spare;
1703
1704SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1705	"Current number of pv entry frees");
1706SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1707	"Current number of pv entry allocs");
1708SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1709	"Current number of pv entries");
1710SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1711	"Current number of spare pv entries");
1712#endif
1713#endif /* 0 */
1714
1715/*
1716 * We are in a serious low memory condition.  Resort to
1717 * drastic measures to free some pages so we can allocate
1718 * another pv entry chunk.
1719 *
1720 * Returns NULL if PV entries were reclaimed from the specified pmap.
1721 *
1722 * We do not, however, unmap 2mpages because subsequent accesses will
1723 * allocate per-page pv entries until repromotion occurs, thereby
1724 * exacerbating the shortage of free pv entries.
1725 */
1726static vm_page_t
1727reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1728{
1729
1730	panic("ARM64TODO: reclaim_pv_chunk");
1731}
1732
1733/*
1734 * free the pv_entry back to the free list
1735 */
1736static void
1737free_pv_entry(pmap_t pmap, pv_entry_t pv)
1738{
1739	struct pv_chunk *pc;
1740	int idx, field, bit;
1741
1742	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1743	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1744	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1745	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1746	pc = pv_to_chunk(pv);
1747	idx = pv - &pc->pc_pventry[0];
1748	field = idx / 64;
1749	bit = idx % 64;
1750	pc->pc_map[field] |= 1ul << bit;
1751	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1752	    pc->pc_map[2] != PC_FREE2) {
1753		/* 98% of the time, pc is already at the head of the list. */
1754		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1755			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1756			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1757		}
1758		return;
1759	}
1760	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1761	free_pv_chunk(pc);
1762}
1763
1764static void
1765free_pv_chunk(struct pv_chunk *pc)
1766{
1767	vm_page_t m;
1768
1769	mtx_lock(&pv_chunks_mutex);
1770 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1771	mtx_unlock(&pv_chunks_mutex);
1772	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1773	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1774	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1775	/* entire chunk is free, return it */
1776	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1777	dump_drop_page(m->phys_addr);
1778	vm_page_unwire(m, PQ_NONE);
1779	vm_page_free(m);
1780}
1781
1782/*
1783 * Returns a new PV entry, allocating a new PV chunk from the system when
1784 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1785 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1786 * returned.
1787 *
1788 * The given PV list lock may be released.
1789 */
1790static pv_entry_t
1791get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1792{
1793	int bit, field;
1794	pv_entry_t pv;
1795	struct pv_chunk *pc;
1796	vm_page_t m;
1797
1798	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1799	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1800retry:
1801	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1802	if (pc != NULL) {
1803		for (field = 0; field < _NPCM; field++) {
1804			if (pc->pc_map[field]) {
1805				bit = ffsl(pc->pc_map[field]) - 1;
1806				break;
1807			}
1808		}
1809		if (field < _NPCM) {
1810			pv = &pc->pc_pventry[field * 64 + bit];
1811			pc->pc_map[field] &= ~(1ul << bit);
1812			/* If this was the last item, move it to tail */
1813			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1814			    pc->pc_map[2] == 0) {
1815				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1816				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1817				    pc_list);
1818			}
1819			PV_STAT(atomic_add_long(&pv_entry_count, 1));
1820			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1821			return (pv);
1822		}
1823	}
1824	/* No free items, allocate another chunk */
1825	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1826	    VM_ALLOC_WIRED);
1827	if (m == NULL) {
1828		if (lockp == NULL) {
1829			PV_STAT(pc_chunk_tryfail++);
1830			return (NULL);
1831		}
1832		m = reclaim_pv_chunk(pmap, lockp);
1833		if (m == NULL)
1834			goto retry;
1835	}
1836	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1837	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1838	dump_add_page(m->phys_addr);
1839	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1840	pc->pc_pmap = pmap;
1841	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1842	pc->pc_map[1] = PC_FREE1;
1843	pc->pc_map[2] = PC_FREE2;
1844	mtx_lock(&pv_chunks_mutex);
1845	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1846	mtx_unlock(&pv_chunks_mutex);
1847	pv = &pc->pc_pventry[0];
1848	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1849	PV_STAT(atomic_add_long(&pv_entry_count, 1));
1850	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1851	return (pv);
1852}
1853
1854/*
1855 * First find and then remove the pv entry for the specified pmap and virtual
1856 * address from the specified pv list.  Returns the pv entry if found and NULL
1857 * otherwise.  This operation can be performed on pv lists for either 4KB or
1858 * 2MB page mappings.
1859 */
1860static __inline pv_entry_t
1861pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1862{
1863	pv_entry_t pv;
1864
1865	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
1866		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1867			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
1868			pvh->pv_gen++;
1869			break;
1870		}
1871	}
1872	return (pv);
1873}
1874
1875/*
1876 * First find and then destroy the pv entry for the specified pmap and virtual
1877 * address.  This operation can be performed on pv lists for either 4KB or 2MB
1878 * page mappings.
1879 */
1880static void
1881pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1882{
1883	pv_entry_t pv;
1884
1885	pv = pmap_pvh_remove(pvh, pmap, va);
1886	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1887	free_pv_entry(pmap, pv);
1888}
1889
1890/*
1891 * Conditionally create the PV entry for a 4KB page mapping if the required
1892 * memory can be allocated without resorting to reclamation.
1893 */
1894static boolean_t
1895pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1896    struct rwlock **lockp)
1897{
1898	pv_entry_t pv;
1899
1900	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1901	/* Pass NULL instead of the lock pointer to disable reclamation. */
1902	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1903		pv->pv_va = va;
1904		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1905		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
1906		m->md.pv_gen++;
1907		return (TRUE);
1908	} else
1909		return (FALSE);
1910}
1911
1912/*
1913 * pmap_remove_l3: Remove a single 4KB page mapping from the given pmap.
1914 */
1915static int
1916pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
1917    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
1918{
1919	pt_entry_t old_l3;
1920	vm_page_t m;
1921
1922	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1923	if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3)))
1924		cpu_dcache_wb_range(va, L3_SIZE);
1925	old_l3 = pmap_load_clear(l3);
1926	PTE_SYNC(l3);
1927	pmap_invalidate_page(pmap, va);
1928	if (old_l3 & ATTR_SW_WIRED)
1929		pmap->pm_stats.wired_count -= 1;
1930	pmap_resident_count_dec(pmap, 1);
1931	if (old_l3 & ATTR_SW_MANAGED) {
1932		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
1933		if (pmap_page_dirty(old_l3))
1934			vm_page_dirty(m);
1935		if (old_l3 & ATTR_AF)
1936			vm_page_aflag_set(m, PGA_REFERENCED);
1937		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1938		pmap_pvh_free(&m->md, pmap, va);
1939	}
1940	return (pmap_unuse_l3(pmap, va, l2e, free));
1941}
1942
1943/*
1944 *	Remove the given range of addresses from the specified map.
1945 *
1946 *	It is assumed that the start and end are properly
1947 *	rounded to the page size.
1948 */
1949void
1950pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1951{
1952	struct rwlock *lock;
1953	vm_offset_t va, va_next;
1954	pd_entry_t *l0, *l1, *l2;
1955	pt_entry_t l3_paddr, *l3;
1956	struct spglist free;
1957	int anyvalid;
1958
1959	/*
1960	 * Perform an unsynchronized read.  This is, however, safe.
1961	 */
1962	if (pmap->pm_stats.resident_count == 0)
1963		return;
1964
1965	anyvalid = 0;
1966	SLIST_INIT(&free);
1967
1968	PMAP_LOCK(pmap);
1969
1970	lock = NULL;
1971	for (; sva < eva; sva = va_next) {
1972
1973		if (pmap->pm_stats.resident_count == 0)
1974			break;
1975
1976		l0 = pmap_l0(pmap, sva);
1977		if (pmap_load(l0) == 0) {
1978			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
1979			if (va_next < sva)
1980				va_next = eva;
1981			continue;
1982		}
1983
1984		l1 = pmap_l0_to_l1(l0, sva);
1985		if (pmap_load(l1) == 0) {
1986			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
1987			if (va_next < sva)
1988				va_next = eva;
1989			continue;
1990		}
1991
1992		/*
1993		 * Calculate index for next page table.
1994		 */
1995		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
1996		if (va_next < sva)
1997			va_next = eva;
1998
1999		l2 = pmap_l1_to_l2(l1, sva);
2000		if (l2 == NULL)
2001			continue;
2002
2003		l3_paddr = pmap_load(l2);
2004
2005		/*
2006		 * Weed out invalid mappings.
2007		 */
2008		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
2009			continue;
2010
2011		/*
2012		 * Limit our scan to either the end of the va represented
2013		 * by the current page table page, or to the end of the
2014		 * range being removed.
2015		 */
2016		if (va_next > eva)
2017			va_next = eva;
2018
2019		va = va_next;
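		/*
		 * Here, "va" tracks the start of the current run of valid
		 * mappings, so that the TLB can be invalidated for the whole
		 * run at once when the run ends.
		 */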
2020		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2021		    sva += L3_SIZE) {
2022			if (l3 == NULL)
2023				panic("l3 == NULL");
2024			if (pmap_load(l3) == 0) {
2025				if (va != va_next) {
2026					pmap_invalidate_range(pmap, va, sva);
2027					va = va_next;
2028				}
2029				continue;
2030			}
2031			if (va == va_next)
2032				va = sva;
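			/*
			 * pmap_remove_l3() returns non-zero when it frees the
			 * last mapping in this page table page, so the rest of
			 * this range can be skipped.
			 */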
2033			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
2034			    &lock)) {
2035				sva += L3_SIZE;
2036				break;
2037			}
2038		}
2039		if (va != va_next)
2040			pmap_invalidate_range(pmap, va, sva);
2041	}
2042	if (lock != NULL)
2043		rw_wunlock(lock);
2044	if (anyvalid)
2045		pmap_invalidate_all(pmap);
2046	PMAP_UNLOCK(pmap);
2047	pmap_free_zero_pages(&free);
2048}
2049
2050/*
2051 *	Routine:	pmap_remove_all
2052 *	Function:
2053 *		Removes this physical page from
2054 *		all physical maps in which it resides.
2055 *		Reflects back modify bits to the pager.
2056 *
2057 *	Notes:
2058 *		Original versions of this routine were very
2059 *		inefficient because they iteratively called
2060 *		pmap_remove (slow...)
2061 */
2062
2063void
2064pmap_remove_all(vm_page_t m)
2065{
2066	pv_entry_t pv;
2067	pmap_t pmap;
2068	struct rwlock *lock;
2069	pd_entry_t *pde, tpde;
2070	pt_entry_t *pte, tpte;
2071	struct spglist free;
2072	int lvl, md_gen;
2073
2074	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2075	    ("pmap_remove_all: page %p is not managed", m));
2076	SLIST_INIT(&free);
2077	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2078retry:
2079	rw_wlock(lock);
2080	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2081		pmap = PV_PMAP(pv);
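		/*
		 * The pmap lock is ordered before the PV list lock.  If the
		 * trylock fails, drop the list lock, take both locks in
		 * order, and recheck the generation count in case the list
		 * changed while it was unlocked.
		 */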
2082		if (!PMAP_TRYLOCK(pmap)) {
2083			md_gen = m->md.pv_gen;
2084			rw_wunlock(lock);
2085			PMAP_LOCK(pmap);
2086			rw_wlock(lock);
2087			if (md_gen != m->md.pv_gen) {
2088				rw_wunlock(lock);
2089				PMAP_UNLOCK(pmap);
2090				goto retry;
2091			}
2092		}
2093		pmap_resident_count_dec(pmap, 1);
2094
2095		pde = pmap_pde(pmap, pv->pv_va, &lvl);
2096		KASSERT(pde != NULL,
2097		    ("pmap_remove_all: no page directory entry found"));
2098		KASSERT(lvl == 2,
2099		    ("pmap_remove_all: invalid pde level %d", lvl));
2100		tpde = pmap_load(pde);
2101
2102		pte = pmap_l2_to_l3(pde, pv->pv_va);
2103		tpte = pmap_load(pte);
2104		if (pmap_is_current(pmap) &&
2105		    pmap_l3_valid_cacheable(tpte))
2106			cpu_dcache_wb_range(pv->pv_va, L3_SIZE);
2107		pmap_load_clear(pte);
2108		PTE_SYNC(pte);
2109		pmap_invalidate_page(pmap, pv->pv_va);
2110		if (tpte & ATTR_SW_WIRED)
2111			pmap->pm_stats.wired_count--;
2112		if ((tpte & ATTR_AF) != 0)
2113			vm_page_aflag_set(m, PGA_REFERENCED);
2114
2115		/*
2116		 * Update the vm_page_t clean and reference bits.
2117		 */
2118		if (pmap_page_dirty(tpte))
2119			vm_page_dirty(m);
2120		pmap_unuse_l3(pmap, pv->pv_va, tpde, &free);
2121		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2122		m->md.pv_gen++;
2123		free_pv_entry(pmap, pv);
2124		PMAP_UNLOCK(pmap);
2125	}
2126	vm_page_aflag_clear(m, PGA_WRITEABLE);
2127	rw_wunlock(lock);
2128	pmap_free_zero_pages(&free);
2129}
2130
2131/*
2132 *	Set the physical protection on the
2133 *	specified range of this map as requested.
2134 */
2135void
2136pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2137{
2138	vm_offset_t va, va_next;
2139	pd_entry_t *l0, *l1, *l2;
2140	pt_entry_t *l3p, l3;
2141
2142	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2143		pmap_remove(pmap, sva, eva);
2144		return;
2145	}
2146
2147	if ((prot & VM_PROT_WRITE) == VM_PROT_WRITE)
2148		return;
2149
2150	PMAP_LOCK(pmap);
2151	for (; sva < eva; sva = va_next) {
2152
2153		l0 = pmap_l0(pmap, sva);
2154		if (pmap_load(l0) == 0) {
2155			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2156			if (va_next < sva)
2157				va_next = eva;
2158			continue;
2159		}
2160
2161		l1 = pmap_l0_to_l1(l0, sva);
2162		if (pmap_load(l1) == 0) {
2163			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2164			if (va_next < sva)
2165				va_next = eva;
2166			continue;
2167		}
2168
2169		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2170		if (va_next < sva)
2171			va_next = eva;
2172
2173		l2 = pmap_l1_to_l2(l1, sva);
2174		if (l2 == NULL || (pmap_load(l2) & ATTR_DESCR_MASK) != L2_TABLE)
2175			continue;
2176
2177		if (va_next > eva)
2178			va_next = eva;
2179
2180		va = va_next;
2181		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
2182		    sva += L3_SIZE) {
2183			l3 = pmap_load(l3p);
2184			if (pmap_l3_valid(l3)) {
2185				pmap_set(l3p, ATTR_AP(ATTR_AP_RO));
2186				PTE_SYNC(l3p);
2187				/* XXX: Use pmap_invalidate_range */
2188				pmap_invalidate_page(pmap, sva);
2189			}
2190		}
2191	}
2192	PMAP_UNLOCK(pmap);
2193
2194	/* TODO: Only invalidate entries we are touching */
2195	pmap_invalidate_all(pmap);
2196}
2197
2198/*
2199 *	Insert the given physical page (p) at
2200 *	the specified virtual address (v) in the
2201 *	target physical map with the protection requested.
2202 *
2203 *	If specified, the page will be wired down, meaning
2204 *	that the related pte can not be reclaimed.
2205 *
2206 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2207 *	or lose information.  That is, this routine must actually
2208 *	insert this page into the given map NOW.
2209 */
2210int
2211pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2212    u_int flags, int8_t psind __unused)
2213{
2214	struct rwlock *lock;
2215	pd_entry_t *pde;
2216	pt_entry_t new_l3, orig_l3;
2217	pt_entry_t *l3;
2218	pv_entry_t pv;
2219	vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
2220	vm_page_t mpte, om, l1_m, l2_m, l3_m;
2221	boolean_t nosleep;
2222	int lvl;
2223
2224	va = trunc_page(va);
2225	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
2226		VM_OBJECT_ASSERT_LOCKED(m->object);
2227	pa = VM_PAGE_TO_PHYS(m);
2228	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
2229	    L3_PAGE);
2230	if ((prot & VM_PROT_WRITE) == 0)
2231		new_l3 |= ATTR_AP(ATTR_AP_RO);
2232	if ((flags & PMAP_ENTER_WIRED) != 0)
2233		new_l3 |= ATTR_SW_WIRED;
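	/*
	 * Virtual addresses with bit 63 clear are user (TTBR0) addresses;
	 * grant EL0 access to them.
	 */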
2234	if ((va >> 63) == 0)
2235		new_l3 |= ATTR_AP(ATTR_AP_USER);
2236
2237	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2238
2239	mpte = NULL;
2240
2241	lock = NULL;
2242	PMAP_LOCK(pmap);
2243
2244	if (va < VM_MAXUSER_ADDRESS) {
2245		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2246		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2247		if (mpte == NULL && nosleep) {
2248			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2249			if (lock != NULL)
2250				rw_wunlock(lock);
2251			PMAP_UNLOCK(pmap);
2252			return (KERN_RESOURCE_SHORTAGE);
2253		}
2254		pde = pmap_pde(pmap, va, &lvl);
2255		KASSERT(pde != NULL,
2256		    ("pmap_enter: Invalid page entry, va: 0x%lx", va));
2257		KASSERT(lvl == 2,
2258		    ("pmap_enter: Invalid level %d", lvl));
2259
2260		l3 = pmap_l2_to_l3(pde, va);
2261	} else {
2262		pde = pmap_pde(pmap, va, &lvl);
2263		/*
2264		 * If we get a level 2 pde it must point to a level 3 entry;
2265		 * otherwise we will need to create the intermediate tables.
2266		 */
2267		if (lvl < 2) {
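			/*
			 * Allocate the missing table pages one level at a
			 * time; each case falls through to fill in the next
			 * lower level.
			 */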
2268			switch(lvl) {
2269			default:
2270			case -1:
2271				/* Get the l0 pde to update */
2272				pde = pmap_l0(pmap, va);
2273				KASSERT(pde != NULL, ("..."));
2274
2275				l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2276				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2277				    VM_ALLOC_ZERO);
2278				if (l1_m == NULL)
2279					panic("pmap_enter: l1 pte_m == NULL");
2280				if ((l1_m->flags & PG_ZERO) == 0)
2281					pmap_zero_page(l1_m);
2282
2283				l1_pa = VM_PAGE_TO_PHYS(l1_m);
2284				pmap_load_store(pde, l1_pa | L0_TABLE);
2285				PTE_SYNC(pde);
2286				/* FALLTHROUGH */
2287			case 0:
2288				/* Get the l1 pde to update */
2289				pde = pmap_l0_to_l1(pde, va);
2290				KASSERT(pde != NULL, ("..."));
2291
2292				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2293				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2294				    VM_ALLOC_ZERO);
2295				if (l2_m == NULL)
2296					panic("pmap_enter: l2 pte_m == NULL");
2297				if ((l2_m->flags & PG_ZERO) == 0)
2298					pmap_zero_page(l2_m);
2299
2300				l2_pa = VM_PAGE_TO_PHYS(l2_m);
2301				pmap_load_store(pde, l2_pa | L1_TABLE);
2302				PTE_SYNC(pde);
2303				/* FALLTHROUGH */
2304			case 1:
2305				/* Get the l2 pde to update */
2306				pde = pmap_l1_to_l2(pde, va);
2307
2308				l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2309				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2310				    VM_ALLOC_ZERO);
2311				if (l3_m == NULL)
2312					panic("pmap_enter: l3 pte_m == NULL");
2313				if ((l3_m->flags & PG_ZERO) == 0)
2314					pmap_zero_page(l3_m);
2315
2316				l3_pa = VM_PAGE_TO_PHYS(l3_m);
2317				pmap_load_store(pde, l3_pa | L2_TABLE);
2318				PTE_SYNC(pde);
2319				break;
2320			}
2321		}
2322		l3 = pmap_l2_to_l3(pde, va);
2323		pmap_invalidate_page(pmap, va);
2324	}
2325
2326	om = NULL;
2327	orig_l3 = pmap_load(l3);
2328	opa = orig_l3 & ~ATTR_MASK;
2329
2330	/*
2331	 * Is the specified virtual address already mapped?
2332	 */
2333	if (pmap_l3_valid(orig_l3)) {
2334		/*
2335		 * Wiring change, just update stats. We don't worry about
2336		 * wiring PT pages as they remain resident as long as there
2337		 * are valid mappings in them. Hence, if a user page is wired,
2338		 * the PT page will be also.
2339		 */
2340		if ((flags & PMAP_ENTER_WIRED) != 0 &&
2341		    (orig_l3 & ATTR_SW_WIRED) == 0)
2342			pmap->pm_stats.wired_count++;
2343		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
2344		    (orig_l3 & ATTR_SW_WIRED) != 0)
2345			pmap->pm_stats.wired_count--;
2346
2347		/*
2348		 * Remove the extra PT page reference.
2349		 */
2350		if (mpte != NULL) {
2351			mpte->wire_count--;
2352			KASSERT(mpte->wire_count > 0,
2353			    ("pmap_enter: missing reference to page table page,"
2354			     " va: 0x%lx", va));
2355		}
2356
2357		/*
2358		 * Has the physical page changed?
2359		 */
2360		if (opa == pa) {
2361			/*
2362			 * No, might be a protection or wiring change.
2363			 */
2364			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
2365				new_l3 |= ATTR_SW_MANAGED;
2366				if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
2367				    ATTR_AP(ATTR_AP_RW)) {
2368					vm_page_aflag_set(m, PGA_WRITEABLE);
2369				}
2370			}
2371			goto validate;
2372		}
2373
2374		/* Flush the cache, there might be uncommitted data in it */
2375		if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3))
2376			cpu_dcache_wb_range(va, L3_SIZE);
2377	} else {
2378		/*
2379		 * Increment the counters.
2380		 */
2381		if ((new_l3 & ATTR_SW_WIRED) != 0)
2382			pmap->pm_stats.wired_count++;
2383		pmap_resident_count_inc(pmap, 1);
2384	}
2385	/*
2386	 * Enter on the PV list if part of our managed memory.
2387	 */
2388	if ((m->oflags & VPO_UNMANAGED) == 0) {
2389		new_l3 |= ATTR_SW_MANAGED;
2390		pv = get_pv_entry(pmap, &lock);
2391		pv->pv_va = va;
2392		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
2393		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2394		m->md.pv_gen++;
2395		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
2396			vm_page_aflag_set(m, PGA_WRITEABLE);
2397	}
2398
2399	/*
2400	 * Update the L3 entry.
2401	 */
2402	if (orig_l3 != 0) {
2403validate:
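		/*
		 * pmap_load_store() returns the previous entry so that any
		 * dirty or referenced state it carried can be transferred to
		 * the underlying vm_page below.
		 */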
2404		orig_l3 = pmap_load_store(l3, new_l3);
2405		PTE_SYNC(l3);
2406		opa = orig_l3 & ~ATTR_MASK;
2407
2408		if (opa != pa) {
2409			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
2410				om = PHYS_TO_VM_PAGE(opa);
2411				if (pmap_page_dirty(orig_l3))
2412					vm_page_dirty(om);
2413				if ((orig_l3 & ATTR_AF) != 0)
2414					vm_page_aflag_set(om, PGA_REFERENCED);
2415				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
2416				pmap_pvh_free(&om->md, pmap, va);
2417			}
2418		} else if (pmap_page_dirty(orig_l3)) {
2419			if ((orig_l3 & ATTR_SW_MANAGED) != 0)
2420				vm_page_dirty(m);
2421		}
2422	} else {
2423		pmap_load_store(l3, new_l3);
2424		PTE_SYNC(l3);
2425	}
2426	pmap_invalidate_page(pmap, va);
2427	if ((pmap != pmap_kernel()) && (pmap == &curproc->p_vmspace->vm_pmap))
2428	    cpu_icache_sync_range(va, PAGE_SIZE);
2429
2430	if (lock != NULL)
2431		rw_wunlock(lock);
2432	PMAP_UNLOCK(pmap);
2433	return (KERN_SUCCESS);
2434}
2435
2436/*
2437 * Maps a sequence of resident pages belonging to the same object.
2438 * The sequence begins with the given page m_start.  This page is
2439 * mapped at the given virtual address start.  Each subsequent page is
2440 * mapped at a virtual address that is offset from start by the same
2441 * amount as the page is offset from m_start within the object.  The
2442 * last page in the sequence is the page with the largest offset from
2443 * m_start that can be mapped at a virtual address less than the given
2444 * virtual address end.  Not every virtual page between start and end
2445 * is mapped; only those for which a resident page exists with the
2446 * corresponding offset from m_start are mapped.
2447 */
2448void
2449pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2450    vm_page_t m_start, vm_prot_t prot)
2451{
2452	struct rwlock *lock;
2453	vm_offset_t va;
2454	vm_page_t m, mpte;
2455	vm_pindex_t diff, psize;
2456
2457	VM_OBJECT_ASSERT_LOCKED(m_start->object);
2458
2459	psize = atop(end - start);
2460	mpte = NULL;
2461	m = m_start;
2462	lock = NULL;
2463	PMAP_LOCK(pmap);
2464	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2465		va = start + ptoa(diff);
2466		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
2467		m = TAILQ_NEXT(m, listq);
2468	}
2469	if (lock != NULL)
2470		rw_wunlock(lock);
2471	PMAP_UNLOCK(pmap);
2472}
2473
2474/*
2475 * This code makes some *MAJOR* assumptions:
2476 * 1. The current pmap and the target pmap exist.
2477 * 2. Not wired.
2478 * 3. Read access.
2479 * 4. No page table pages.
2480 * but is *MUCH* faster than pmap_enter...
2481 */
2482
2483void
2484pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2485{
2486	struct rwlock *lock;
2487
2488	lock = NULL;
2489	PMAP_LOCK(pmap);
2490	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
2491	if (lock != NULL)
2492		rw_wunlock(lock);
2493	PMAP_UNLOCK(pmap);
2494}
2495
2496static vm_page_t
2497pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2498    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
2499{
2500	struct spglist free;
2501	pd_entry_t *pde;
2502	pt_entry_t *l3;
2503	vm_paddr_t pa;
2504	int lvl;
2505
2506	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2507	    (m->oflags & VPO_UNMANAGED) != 0,
2508	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2509	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2510
2511	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
2512	/*
2513	 * In the case that a page table page is not
2514	 * resident, we are creating it here.
2515	 */
2516	if (va < VM_MAXUSER_ADDRESS) {
2517		vm_pindex_t l2pindex;
2518
2519		/*
2520		 * Calculate pagetable page index
2521		 */
2522		l2pindex = pmap_l2_pindex(va);
2523		if (mpte && (mpte->pindex == l2pindex)) {
2524			mpte->wire_count++;
2525		} else {
2526			/*
2527			 * Get the l2 entry
2528			 */
2529			pde = pmap_pde(pmap, va, &lvl);
2530
2531			/*
2532			 * If the page table page is mapped, we just increment
2533			 * the hold count, and activate it.  Otherwise, we
2534			 * attempt to allocate a page table page.  If this
2535			 * attempt fails, we don't retry.  Instead, we give up.
2536			 */
2537			if (lvl == 2 && pmap_load(pde) != 0) {
2538				mpte =
2539				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
2540				mpte->wire_count++;
2541			} else {
2542				/*
2543				 * Pass NULL instead of the PV list lock
2544				 * pointer, because we don't intend to sleep.
2545				 */
2546				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
2547				if (mpte == NULL)
2548					return (mpte);
2549			}
2550		}
2551		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
2552		l3 = &l3[pmap_l3_index(va)];
2553	} else {
2554		mpte = NULL;
2555		pde = pmap_pde(kernel_pmap, va, &lvl);
2556		KASSERT(pde != NULL,
2557		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
2558		     va));
2559		KASSERT(lvl == 2,
2560		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
2561		l3 = pmap_l2_to_l3(pde, va);
2562	}
2563
2564	if (pmap_load(l3) != 0) {
2565		if (mpte != NULL) {
2566			mpte->wire_count--;
2567			mpte = NULL;
2568		}
2569		return (mpte);
2570	}
2571
2572	/*
2573	 * Enter on the PV list if part of our managed memory.
2574	 */
2575	if ((m->oflags & VPO_UNMANAGED) == 0 &&
2576	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
2577		if (mpte != NULL) {
2578			SLIST_INIT(&free);
2579			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
2580				pmap_invalidate_page(pmap, va);
2581				pmap_free_zero_pages(&free);
2582			}
2583			mpte = NULL;
2584		}
2585		return (mpte);
2586	}
2587
2588	/*
2589	 * Increment counters
2590	 */
2591	pmap_resident_count_inc(pmap, 1);
2592
2593	pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
2594	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
2595
2596	/*
2597	 * Now validate mapping with RO protection
2598	 */
2599	if ((m->oflags & VPO_UNMANAGED) == 0)
2600		pa |= ATTR_SW_MANAGED;
2601	pmap_load_store(l3, pa);
2602	PTE_SYNC(l3);
2603	pmap_invalidate_page(pmap, va);
2604	return (mpte);
2605}
2606
2607/*
2608 * This code maps large physical mmap regions into the
2609 * processor address space.  Note that some shortcuts
2610 * are taken, but the code works.
2611 */
2612void
2613pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
2614    vm_pindex_t pindex, vm_size_t size)
2615{
2616
2617	VM_OBJECT_ASSERT_WLOCKED(object);
2618	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
2619	    ("pmap_object_init_pt: non-device object"));
2620}
2621
2622/*
2623 *	Clear the wired attribute from the mappings for the specified range of
2624 *	addresses in the given pmap.  Every valid mapping within that range
2625 *	must have the wired attribute set.  In contrast, invalid mappings
2626 *	cannot have the wired attribute set, so they are ignored.
2627 *
2628 *	The wired attribute of the page table entry is not a hardware feature,
2629 *	so there is no need to invalidate any TLB entries.
2630 */
2631void
2632pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2633{
2634	vm_offset_t va_next;
2635	pd_entry_t *l0, *l1, *l2;
2636	pt_entry_t *l3;
2637
2638	PMAP_LOCK(pmap);
2639	for (; sva < eva; sva = va_next) {
2640		l0 = pmap_l0(pmap, sva);
2641		if (pmap_load(l0) == 0) {
2642			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2643			if (va_next < sva)
2644				va_next = eva;
2645			continue;
2646		}
2647
2648		l1 = pmap_l0_to_l1(l0, sva);
2649		if (pmap_load(l1) == 0) {
2650			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2651			if (va_next < sva)
2652				va_next = eva;
2653			continue;
2654		}
2655
2656		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2657		if (va_next < sva)
2658			va_next = eva;
2659
2660		l2 = pmap_l1_to_l2(l1, sva);
2661		if (pmap_load(l2) == 0)
2662			continue;
2663
2664		if (va_next > eva)
2665			va_next = eva;
2666		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2667		    sva += L3_SIZE) {
2668			if (pmap_load(l3) == 0)
2669				continue;
2670			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
2671				panic("pmap_unwire: l3 %#jx is missing "
2672				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
2673
2674			/*
2675			 * The wired flag (ATTR_SW_WIRED) must be cleared atomically:
2676			 * although the pmap lock synchronizes access to it, another
2677			 * processor could be updating other bits in the entry.
2678			 */
2679			atomic_clear_long(l3, ATTR_SW_WIRED);
2680			pmap->pm_stats.wired_count--;
2681		}
2682	}
2683	PMAP_UNLOCK(pmap);
2684}
2685
2686/*
2687 *	Copy the range specified by src_addr/len
2688 *	from the source map to the range dst_addr/len
2689 *	in the destination map.
2690 *
2691 *	This routine is only advisory and need not do anything.
2692 */
2693
2694void
2695pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2696    vm_offset_t src_addr)
2697{
2698}
2699
2700/*
2701 *	pmap_zero_page zeros the specified hardware page by mapping
2702 *	the page into KVM and using bzero to clear its contents.
2703 */
2704void
2705pmap_zero_page(vm_page_t m)
2706{
2707	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2708
2709	pagezero((void *)va);
2710}
2711
2712/*
2713 *	pmap_zero_page_area zeros the specified hardware page by mapping
2714 *	the page into KVM and using bzero to clear its contents.
2715 *
2716 *	off and size may not cover an area beyond a single hardware page.
2717 */
2718void
2719pmap_zero_page_area(vm_page_t m, int off, int size)
2720{
2721	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2722
2723	if (off == 0 && size == PAGE_SIZE)
2724		pagezero((void *)va);
2725	else
2726		bzero((char *)va + off, size);
2727}
2728
2729/*
2730 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2731 *	the page into KVM and using bzero to clear its contents.  This
2732 *	is intended to be called from the vm_pagezero process only and
2733 *	outside of Giant.
2734 */
2735void
2736pmap_zero_page_idle(vm_page_t m)
2737{
2738	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2739
2740	pagezero((void *)va);
2741}
2742
2743/*
2744 *	pmap_copy_page copies the specified (machine independent)
2745 *	page by mapping the page into virtual memory and using
2746 *	bcopy to copy the page, one machine dependent page at a
2747 *	time.
2748 */
2749void
2750pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2751{
2752	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2753	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2754
2755	pagecopy((void *)src, (void *)dst);
2756}
2757
2758int unmapped_buf_allowed = 1;
2759
2760void
2761pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2762    vm_offset_t b_offset, int xfersize)
2763{
2764	void *a_cp, *b_cp;
2765	vm_page_t m_a, m_b;
2766	vm_paddr_t p_a, p_b;
2767	vm_offset_t a_pg_offset, b_pg_offset;
2768	int cnt;
2769
2770	while (xfersize > 0) {
2771		a_pg_offset = a_offset & PAGE_MASK;
2772		m_a = ma[a_offset >> PAGE_SHIFT];
2773		p_a = m_a->phys_addr;
2774		b_pg_offset = b_offset & PAGE_MASK;
2775		m_b = mb[b_offset >> PAGE_SHIFT];
2776		p_b = m_b->phys_addr;
2777		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2778		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2779		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
2780			panic("!DMAP a %lx", p_a);
2781		} else {
2782			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
2783		}
2784		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
2785			panic("!DMAP b %lx", p_b);
2786		} else {
2787			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
2788		}
2789		bcopy(a_cp, b_cp, cnt);
2790		a_offset += cnt;
2791		b_offset += cnt;
2792		xfersize -= cnt;
2793	}
2794}
2795
2796vm_offset_t
2797pmap_quick_enter_page(vm_page_t m)
2798{
2799
2800	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
2801}
2802
2803void
2804pmap_quick_remove_page(vm_offset_t addr)
2805{
2806}
2807
2808/*
2809 * Returns true if the pmap's pv is one of the first
2810 * 16 pvs linked to from this page.  This count may
2811 * be changed upwards or downwards in the future; it
2812 * is only necessary that true be returned for a small
2813 * subset of pmaps for proper page aging.
2814 */
2815boolean_t
2816pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2817{
2818	struct rwlock *lock;
2819	pv_entry_t pv;
2820	int loops = 0;
2821	boolean_t rv;
2822
2823	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2824	    ("pmap_page_exists_quick: page %p is not managed", m));
2825	rv = FALSE;
2826	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2827	rw_rlock(lock);
2828	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2829		if (PV_PMAP(pv) == pmap) {
2830			rv = TRUE;
2831			break;
2832		}
2833		loops++;
2834		if (loops >= 16)
2835			break;
2836	}
2837	rw_runlock(lock);
2838	return (rv);
2839}
2840
2841/*
2842 *	pmap_page_wired_mappings:
2843 *
2844 *	Return the number of managed mappings to the given physical page
2845 *	that are wired.
2846 */
2847int
2848pmap_page_wired_mappings(vm_page_t m)
2849{
2850	struct rwlock *lock;
2851	pmap_t pmap;
2852	pt_entry_t *pte;
2853	pv_entry_t pv;
2854	int count, lvl, md_gen;
2855
2856	if ((m->oflags & VPO_UNMANAGED) != 0)
2857		return (0);
2858	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2859	rw_rlock(lock);
2860restart:
2861	count = 0;
2862	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2863		pmap = PV_PMAP(pv);
2864		if (!PMAP_TRYLOCK(pmap)) {
2865			md_gen = m->md.pv_gen;
2866			rw_runlock(lock);
2867			PMAP_LOCK(pmap);
2868			rw_rlock(lock);
2869			if (md_gen != m->md.pv_gen) {
2870				PMAP_UNLOCK(pmap);
2871				goto restart;
2872			}
2873		}
2874		pte = pmap_pte(pmap, pv->pv_va, &lvl);
2875		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
2876			count++;
2877		PMAP_UNLOCK(pmap);
2878	}
2879	rw_runlock(lock);
2880	return (count);
2881}
2882
2883/*
2884 * Destroy all managed, non-wired mappings in the given user-space
2885 * pmap.  This pmap cannot be active on any processor besides the
2886 * caller.
2887 *
2888 * This function cannot be applied to the kernel pmap.  Moreover, it
2889 * is not intended for general use.  It is only to be used during
2890 * process termination.  Consequently, it can be implemented in ways
2891 * that make it faster than pmap_remove().  First, it can more quickly
2892 * destroy mappings by iterating over the pmap's collection of PV
2893 * entries, rather than searching the page table.  Second, it doesn't
2894 * have to test and clear the page table entries atomically, because
2895 * no processor is currently accessing the user address space.  In
2896 * particular, a page table entry's dirty bit won't change state once
2897 * this function starts.
2898 */
2899void
2900pmap_remove_pages(pmap_t pmap)
2901{
2902	pd_entry_t *pde;
2903	pt_entry_t *pte, tpte;
2904	struct spglist free;
2905	vm_page_t m;
2906	pv_entry_t pv;
2907	struct pv_chunk *pc, *npc;
2908	struct rwlock *lock;
2909	int64_t bit;
2910	uint64_t inuse, bitmask;
2911	int allfree, field, freed, idx, lvl;
2912	vm_paddr_t pa;
2913
2914	lock = NULL;
2915
2916	SLIST_INIT(&free);
2917	PMAP_LOCK(pmap);
2918	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
2919		allfree = 1;
2920		freed = 0;
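		/*
		 * A clear bit in pc_map marks an allocated PV entry, so the
		 * inverted bitmap below walks only the entries in use.
		 */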
2921		for (field = 0; field < _NPCM; field++) {
2922			inuse = ~pc->pc_map[field] & pc_freemask[field];
2923			while (inuse != 0) {
2924				bit = ffsl(inuse) - 1;
2925				bitmask = 1UL << bit;
2926				idx = field * 64 + bit;
2927				pv = &pc->pc_pventry[idx];
2928				inuse &= ~bitmask;
2929
2930				pde = pmap_pde(pmap, pv->pv_va, &lvl);
2931				KASSERT(pde != NULL,
2932				    ("Attempting to remove an unmapped page"));
2933				KASSERT(lvl == 2,
2934				    ("Invalid page directory level: %d", lvl));
2935
2936				pte = pmap_l2_to_l3(pde, pv->pv_va);
2937				KASSERT(pte != NULL,
2938				    ("Attempting to remove an unmapped page"));
2939
2940				tpte = pmap_load(pte);
2941
2942/*
2943 * We cannot remove wired pages from a process' mapping at this time
2944 */
2945				if (tpte & ATTR_SW_WIRED) {
2946					allfree = 0;
2947					continue;
2948				}
2949
2950				pa = tpte & ~ATTR_MASK;
2951
2952				m = PHYS_TO_VM_PAGE(pa);
2953				KASSERT(m->phys_addr == pa,
2954				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2955				    m, (uintmax_t)m->phys_addr,
2956				    (uintmax_t)tpte));
2957
2958				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
2959				    m < &vm_page_array[vm_page_array_size],
2960				    ("pmap_remove_pages: bad pte %#jx",
2961				    (uintmax_t)tpte));
2962
2963				/* XXX: assumes tpte is level 3 */
2964				if (pmap_is_current(pmap) &&
2965				    pmap_l3_valid_cacheable(tpte))
2966					cpu_dcache_wb_range(pv->pv_va, L3_SIZE);
2967				pmap_load_clear(pte);
2968				PTE_SYNC(pte);
2969				pmap_invalidate_page(pmap, pv->pv_va);
2970
2971				/*
2972				 * Update the vm_page_t clean/reference bits.
2973				 */
2974				if ((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
2975					vm_page_dirty(m);
2976
2977				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
2978
2979				/* Mark free */
2980				pc->pc_map[field] |= bitmask;
2981
2982				pmap_resident_count_dec(pmap, 1);
2983				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2984				m->md.pv_gen++;
2985
2986				pmap_unuse_l3(pmap, pv->pv_va, pmap_load(pde),
2987				    &free);
2988				freed++;
2989			}
2990		}
2991		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2992		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2993		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2994		if (allfree) {
2995			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2996			free_pv_chunk(pc);
2997		}
2998	}
2999	pmap_invalidate_all(pmap);
3000	if (lock != NULL)
3001		rw_wunlock(lock);
3002	PMAP_UNLOCK(pmap);
3003	pmap_free_zero_pages(&free);
3004}
3005
3006/*
3007 * This is used to check if a page has been accessed or modified. As we
3008 * don't have a bit to see if it has been modified, we have to assume it
3009 * has been if the page is read/write.
3010 */
3011static boolean_t
3012pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3013{
3014	struct rwlock *lock;
3015	pv_entry_t pv;
3016	pt_entry_t *pte, mask, value;
3017	pmap_t pmap;
3018	int lvl, md_gen;
3019	boolean_t rv;
3020
3021	rv = FALSE;
3022	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3023	rw_rlock(lock);
3024restart:
3025	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3026		pmap = PV_PMAP(pv);
3027		if (!PMAP_TRYLOCK(pmap)) {
3028			md_gen = m->md.pv_gen;
3029			rw_runlock(lock);
3030			PMAP_LOCK(pmap);
3031			rw_rlock(lock);
3032			if (md_gen != m->md.pv_gen) {
3033				PMAP_UNLOCK(pmap);
3034				goto restart;
3035			}
3036		}
3037		pte = pmap_pte(pmap, pv->pv_va, &lvl);
3038		KASSERT(lvl == 3,
3039		    ("pmap_page_test_mappings: Invalid level %d", lvl));
3040		mask = 0;
3041		value = 0;
3042		if (modified) {
3043			mask |= ATTR_AP_RW_BIT;
3044			value |= ATTR_AP(ATTR_AP_RW);
3045		}
3046		if (accessed) {
3047			mask |= ATTR_AF | ATTR_DESCR_MASK;
3048			value |= ATTR_AF | L3_PAGE;
3049		}
3050		rv = (pmap_load(pte) & mask) == value;
3051		PMAP_UNLOCK(pmap);
3052		if (rv)
3053			goto out;
3054	}
3055out:
3056	rw_runlock(lock);
3057	return (rv);
3058}
3059
3060/*
3061 *	pmap_is_modified:
3062 *
3063 *	Return whether or not the specified physical page was modified
3064 *	in any physical maps.
3065 */
3066boolean_t
3067pmap_is_modified(vm_page_t m)
3068{
3069
3070	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3071	    ("pmap_is_modified: page %p is not managed", m));
3072
3073	/*
3074	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3075	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
3076	 * is clear, no PTEs can have PG_M set.
3077	 */
3078	VM_OBJECT_ASSERT_WLOCKED(m->object);
3079	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3080		return (FALSE);
3081	return (pmap_page_test_mappings(m, FALSE, TRUE));
3082}
3083
3084/*
3085 *	pmap_is_prefaultable:
3086 *
3087 *	Return whether or not the specified virtual address is eligible
3088 *	for prefault.
3089 */
3090boolean_t
3091pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3092{
3093	pt_entry_t *pte;
3094	boolean_t rv;
3095	int lvl;
3096
3097	rv = FALSE;
3098	PMAP_LOCK(pmap);
3099	pte = pmap_pte(pmap, addr, &lvl);
3100	if (pte != NULL && pmap_load(pte) != 0) {
3101		rv = TRUE;
3102	}
3103	PMAP_UNLOCK(pmap);
3104	return (rv);
3105}
3106
3107/*
3108 *	pmap_is_referenced:
3109 *
3110 *	Return whether or not the specified physical page was referenced
3111 *	in any physical maps.
3112 */
3113boolean_t
3114pmap_is_referenced(vm_page_t m)
3115{
3116
3117	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3118	    ("pmap_is_referenced: page %p is not managed", m));
3119	return (pmap_page_test_mappings(m, TRUE, FALSE));
3120}
3121
3122/*
3123 * Clear the write and modified bits in each of the given page's mappings.
3124 */
3125void
3126pmap_remove_write(vm_page_t m)
3127{
3128	pmap_t pmap;
3129	struct rwlock *lock;
3130	pv_entry_t pv;
3131	pt_entry_t oldpte, *pte;
3132	int lvl, md_gen;
3133
3134	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3135	    ("pmap_remove_write: page %p is not managed", m));
3136
3137	/*
3138	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3139	 * set by another thread while the object is locked.  Thus,
3140	 * if PGA_WRITEABLE is clear, no page table entries need updating.
3141	 */
3142	VM_OBJECT_ASSERT_WLOCKED(m->object);
3143	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3144		return;
3145	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3146retry_pv_loop:
3147	rw_wlock(lock);
3148	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3149		pmap = PV_PMAP(pv);
3150		if (!PMAP_TRYLOCK(pmap)) {
3151			md_gen = m->md.pv_gen;
3152			rw_wunlock(lock);
3153			PMAP_LOCK(pmap);
3154			rw_wlock(lock);
3155			if (md_gen != m->md.pv_gen) {
3156				PMAP_UNLOCK(pmap);
3157				rw_wunlock(lock);
3158				goto retry_pv_loop;
3159			}
3160		}
3161		pte = pmap_pte(pmap, pv->pv_va, &lvl);
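		/*
		 * Clear write access with a compare-and-set so a concurrent
		 * update of the entry is not lost; retry if the entry changed.
		 */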
3162retry:
3163		oldpte = pmap_load(pte);
3164		if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
3165			if (!atomic_cmpset_long(pte, oldpte,
3166			    oldpte | ATTR_AP(ATTR_AP_RO)))
3167				goto retry;
3168			if ((oldpte & ATTR_AF) != 0)
3169				vm_page_dirty(m);
3170			pmap_invalidate_page(pmap, pv->pv_va);
3171		}
3172		PMAP_UNLOCK(pmap);
3173	}
3174	rw_wunlock(lock);
3175	vm_page_aflag_clear(m, PGA_WRITEABLE);
3176}
3177
3178static __inline boolean_t
3179safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
3180{
3181
3182	return (FALSE);
3183}
3184
3185#define	PMAP_TS_REFERENCED_MAX	5
3186
3187/*
3188 *	pmap_ts_referenced:
3189 *
3190 *	Return a count of reference bits for a page, clearing those bits.
3191 *	It is not necessary for every reference bit to be cleared, but it
3192 *	is necessary that 0 only be returned when there are truly no
3193 *	reference bits set.
3194 *
3195 *	XXX: The exact number of bits to check and clear is a matter that
3196 *	should be tested and standardized at some point in the future for
3197 *	optimal aging of shared pages.
3198 */
3199int
3200pmap_ts_referenced(vm_page_t m)
3201{
3202	pv_entry_t pv, pvf;
3203	pmap_t pmap;
3204	struct rwlock *lock;
3205	pd_entry_t *pde, tpde;
3206	pt_entry_t *pte, tpte;
3207	vm_paddr_t pa;
3208	int cleared, md_gen, not_cleared, lvl;
3209	struct spglist free;
3210
3211	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3212	    ("pmap_ts_referenced: page %p is not managed", m));
3213	SLIST_INIT(&free);
3214	cleared = 0;
3215	pa = VM_PAGE_TO_PHYS(m);
3216	lock = PHYS_TO_PV_LIST_LOCK(pa);
3217	rw_wlock(lock);
3218retry:
3219	not_cleared = 0;
3220	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3221		goto out;
3222	pv = pvf;
3223	do {
3224		if (pvf == NULL)
3225			pvf = pv;
3226		pmap = PV_PMAP(pv);
3227		if (!PMAP_TRYLOCK(pmap)) {
3228			md_gen = m->md.pv_gen;
3229			rw_wunlock(lock);
3230			PMAP_LOCK(pmap);
3231			rw_wlock(lock);
3232			if (md_gen != m->md.pv_gen) {
3233				PMAP_UNLOCK(pmap);
3234				goto retry;
3235			}
3236		}
3237		pde = pmap_pde(pmap, pv->pv_va, &lvl);
3238		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
3239		KASSERT(lvl == 2,
3240		    ("pmap_ts_referenced: invalid pde level %d", lvl));
3241		tpde = pmap_load(pde);
3242		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
3243		    ("pmap_ts_referenced: found an invalid l2 table"));
3244		pte = pmap_l2_to_l3(pde, pv->pv_va);
3245		tpte = pmap_load(pte);
3246		if ((tpte & ATTR_AF) != 0) {
3247			if (safe_to_clear_referenced(pmap, tpte)) {
3248				/*
3249				 * TODO: We don't handle the access flag
3250				 * at all. We need to be able to set it in
3251				 * the exception handler.
3252				 */
3253				panic("ARM64TODO: safe_to_clear_referenced\n");
3254			} else if ((tpte & ATTR_SW_WIRED) == 0) {
3255				/*
3256				 * Wired pages cannot be paged out so
3257				 * doing accessed bit emulation for
3258				 * them is wasted effort. We do the
3259				 * hard work for unwired pages only.
3260				 */
3261				pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
3262				    &free, &lock);
3263				pmap_invalidate_page(pmap, pv->pv_va);
3264				cleared++;
3265				if (pvf == pv)
3266					pvf = NULL;
3267				pv = NULL;
3268				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3269				    ("inconsistent pv lock %p %p for page %p",
3270				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3271			} else
3272				not_cleared++;
3273		}
3274		PMAP_UNLOCK(pmap);
3275		/* Rotate the PV list if it has more than one entry. */
3276		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
3277			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3278			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3279			m->md.pv_gen++;
3280		}
3281	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
3282	    not_cleared < PMAP_TS_REFERENCED_MAX);
3283out:
3284	rw_wunlock(lock);
3285	pmap_free_zero_pages(&free);
3286	return (cleared + not_cleared);
3287}
3288
3289/*
3290 *	Apply the given advice to the specified range of addresses within the
3291 *	given pmap.  Depending on the advice, clear the referenced and/or
3292 *	modified flags in each mapping and set the mapped page's dirty field.
3293 */
3294void
3295pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
3296{
3297}
3298
3299/*
3300 *	Clear the modify bits on the specified physical page.
3301 */
3302void
3303pmap_clear_modify(vm_page_t m)
3304{
3305
3306	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3307	    ("pmap_clear_modify: page %p is not managed", m));
3308	VM_OBJECT_ASSERT_WLOCKED(m->object);
3309	KASSERT(!vm_page_xbusied(m),
3310	    ("pmap_clear_modify: page %p is exclusive busied", m));
3311
3312	/*
3313	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
3314	 * If the object containing the page is locked and the page is not
3315	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
3316	 */
3317	if ((m->aflags & PGA_WRITEABLE) == 0)
3318		return;
3319
3320	/* ARM64TODO: We lack support for tracking if a page is modified */
3321}
3322
3323void *
3324pmap_mapbios(vm_paddr_t pa, vm_size_t size)
3325{
3326
3327	return ((void *)PHYS_TO_DMAP(pa));
3328}
3329
3330void
3331pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
3332{
3333}
3334
3335/*
3336 * Sets the memory attribute for the specified page.
3337 */
3338void
3339pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
3340{
3341
3342	m->md.pv_memattr = ma;
3343
3344	/*
3345	 * ARM64TODO: Implement the below (from the amd64 pmap)
3346	 * If "m" is a normal page, update its direct mapping.  This update
3347	 * can be relied upon to perform any cache operations that are
3348	 * required for data coherence.
3349	 */
3350	if ((m->flags & PG_FICTITIOUS) == 0 &&
3351	    PHYS_IN_DMAP(VM_PAGE_TO_PHYS(m)))
3352		panic("ARM64TODO: pmap_page_set_memattr");
3353}
3354
3355/*
3356 * perform the pmap work for mincore
3357 */
3358int
3359pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
3360{
3361	pd_entry_t *l1p, l1;
3362	pd_entry_t *l2p, l2;
3363	pt_entry_t *l3p, l3;
3364	vm_paddr_t pa;
3365	bool managed;
3366	int val;
3367
3368	PMAP_LOCK(pmap);
3369retry:
3370	pa = 0;
3371	val = 0;
3372	managed = false;
3373
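	/*
	 * Walk down the page tables, stopping at the first block or page
	 * mapping that translates addr.
	 */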
3374	l1p = pmap_l1(pmap, addr);
3375	if (l1p == NULL) /* No l1 */
3376		goto done;
3377
3378	l1 = pmap_load(l1p);
3379	if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
3380		goto done;
3381
3382	if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
3383		pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
3384		managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
3385		val = MINCORE_SUPER | MINCORE_INCORE;
3386		if (pmap_page_dirty(l1))
3387			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
3388		if ((l1 & ATTR_AF) == ATTR_AF)
3389			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
3390		goto done;
3391	}
3392
3393	l2p = pmap_l1_to_l2(l1p, addr);
3394	if (l2p == NULL) /* No l2 */
3395		goto done;
3396
3397	l2 = pmap_load(l2p);
3398	if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
3399		goto done;
3400
3401	if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
3402		pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
3403		managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
3404		val = MINCORE_SUPER | MINCORE_INCORE;
3405		if (pmap_page_dirty(l2))
3406			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
3407		if ((l2 & ATTR_AF) == ATTR_AF)
3408			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
3409		goto done;
3410	}
3411
3412	l3p = pmap_l2_to_l3(l2p, addr);
3413	if (l3p == NULL) /* No l3 */
3414		goto done;
3415
3416	l3 = pmap_load(l3p);
3417	if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
3418		goto done;
3419
3420	if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
3421		pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
3422		managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
3423		val = MINCORE_INCORE;
3424		if (pmap_page_dirty(l3))
3425			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
3426		if ((l3 & ATTR_AF) == ATTR_AF)
3427			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
3428	}
3429
3430done:
3431	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
3432	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
3433		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
3434		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
3435			goto retry;
3436	} else
3437		PA_UNLOCK_COND(*locked_pa);
3438	PMAP_UNLOCK(pmap);
3439
3440	return (val);
3441}
3442
3443void
3444pmap_activate(struct thread *td)
3445{
3446	pmap_t	pmap;
3447
3448	critical_enter();
3449	pmap = vmspace_pmap(td->td_proc->p_vmspace);
3450	td->td_pcb->pcb_l0addr = vtophys(pmap->pm_l0);
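	/*
	 * Install the new user translation table base (TTBR0) and invalidate
	 * any stale TLB entries for this pmap.
	 */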
3451	__asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l0addr));
3452	pmap_invalidate_all(pmap);
3453	critical_exit();
3454}
3455
3456void
3457pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
3458{
3459
3460	if (va >= VM_MIN_KERNEL_ADDRESS) {
3461		cpu_icache_sync_range(va, sz);
3462	} else {
3463		u_int len, offset;
3464		vm_paddr_t pa;
3465
3466		/* Find the length of data in this page to flush */
3467		offset = va & PAGE_MASK;
3468		len = imin(PAGE_SIZE - offset, sz);
3469
3470		while (sz != 0) {
3471			/* Extract the physical address & find it in the DMAP */
3472			pa = pmap_extract(pmap, va);
3473			if (pa != 0)
3474				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
3475
3476			/* Move to the next page */
3477			sz -= len;
3478			va += len;
3479			/* Set the length for the next iteration */
3480			len = imin(PAGE_SIZE, sz);
3481		}
3482	}
3483}
3484
3485/*
3486 *	Increase the starting virtual address of the given mapping if a
3487 *	different alignment might result in more superpage mappings.
3488 */
3489void
3490pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
3491    vm_offset_t *addr, vm_size_t size)
3492{
3493}
3494
3495/**
3496 * Get the kernel virtual address of a set of physical pages. If there are
3497 * physical addresses not covered by the DMAP perform a transient mapping
3498 * that will be removed when calling pmap_unmap_io_transient.
3499 *
3500 * \param page        The pages for which the caller wishes to obtain
3501 *                    kernel virtual addresses.
3502 * \param vaddr       On return contains the kernel virtual memory address
3503 *                    of the pages passed in the page parameter.
3504 * \param count       Number of pages passed in.
3505 * \param can_fault   TRUE if the thread using the mapped pages can take
3506 *                    page faults, FALSE otherwise.
3507 *
3508 * \returns TRUE if the caller must call pmap_unmap_io_transient when
3509 *          finished or FALSE otherwise.
3510 *
3511 */
3512boolean_t
3513pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
3514    boolean_t can_fault)
3515{
3516	vm_paddr_t paddr;
3517	boolean_t needs_mapping;
3518	int error, i;
3519
3520	/*
3521	 * Allocate any KVA space that we need, this is done in a separate
3522	 * loop to prevent calling vmem_alloc while pinned.
3523	 */
3524	needs_mapping = FALSE;
3525	for (i = 0; i < count; i++) {
3526		paddr = VM_PAGE_TO_PHYS(page[i]);
3527		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
3528			error = vmem_alloc(kernel_arena, PAGE_SIZE,
3529			    M_BESTFIT | M_WAITOK, &vaddr[i]);
3530			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
3531			needs_mapping = TRUE;
3532		} else {
3533			vaddr[i] = PHYS_TO_DMAP(paddr);
3534		}
3535	}
3536
3537	/* Exit early if everything is covered by the DMAP */
3538	if (!needs_mapping)
3539		return (FALSE);
3540
3541	if (!can_fault)
3542		sched_pin();
3543	for (i = 0; i < count; i++) {
3544		paddr = VM_PAGE_TO_PHYS(page[i]);
3545		if (!PHYS_IN_DMAP(paddr)) {
3546			panic(
3547			   "pmap_map_io_transient: TODO: Map out of DMAP data");
3548		}
3549	}
3550
3551	return (needs_mapping);
3552}
3553
3554void
3555pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
3556    boolean_t can_fault)
3557{
3558	vm_paddr_t paddr;
3559	int i;
3560
3561	if (!can_fault)
3562		sched_unpin();
3563	for (i = 0; i < count; i++) {
3564		paddr = VM_PAGE_TO_PHYS(page[i]);
3565		if (!PHYS_IN_DMAP(paddr)) {
3566			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
3567		}
3568	}
3569}
3570