1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 *    notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 *    notice, this list of conditions and the following disclaimer in the
31 *    documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 *    must display the following acknowledgement:
34 *	This product includes software developed by the University of
35 *	California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 *    may be used to endorse or promote products derived from this software
38 *    without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53 */
54/*-
55 * Copyright (c) 2003 Networks Associates Technology, Inc.
56 * All rights reserved.
57 *
58 * This software was developed for the FreeBSD Project by Jake Burkholder,
59 * Safeport Network Services, and Network Associates Laboratories, the
60 * Security Research Division of Network Associates, Inc. under
61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62 * CHATS research program.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 *    notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 *    notice, this list of conditions and the following disclaimer in the
71 *    documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86#define	AMD64_NPT_AWARE
87
88#include <sys/cdefs.h>
89__FBSDID("$FreeBSD: head/sys/arm64/arm64/pmap.c 282221 2015-04-29 15:00:43Z andrew $");
90
91/*
92 *	Manages physical address maps.
93 *
94 *	Since the information managed by this module is
95 *	also stored by the logical address mapping module,
96 *	this module may throw away valid virtual-to-physical
97 *	mappings at almost any time.  However, invalidations
98 *	of virtual-to-physical mappings must be done as
99 *	requested.
100 *
101 *	In order to cope with hardware architectures which
102 *	make virtual-to-physical map invalidates expensive,
103 *	this module may delay invalidate or reduced protection
104 *	operations until such time as they are actually
105 *	necessary.  This module is given full information as
106 *	to which processors are currently using which maps,
107 *	and to when physical maps must be made correct.
108 */
109
110#include <sys/param.h>
111#include <sys/bus.h>
112#include <sys/systm.h>
113#include <sys/kernel.h>
114#include <sys/ktr.h>
115#include <sys/lock.h>
116#include <sys/malloc.h>
117#include <sys/mman.h>
118#include <sys/msgbuf.h>
119#include <sys/mutex.h>
120#include <sys/proc.h>
121#include <sys/rwlock.h>
122#include <sys/sx.h>
123#include <sys/vmem.h>
124#include <sys/vmmeter.h>
125#include <sys/sched.h>
126#include <sys/sysctl.h>
127#include <sys/_unrhdr.h>
128#include <sys/smp.h>
129
130#include <vm/vm.h>
131#include <vm/vm_param.h>
132#include <vm/vm_kern.h>
133#include <vm/vm_page.h>
134#include <vm/vm_map.h>
135#include <vm/vm_object.h>
136#include <vm/vm_extern.h>
137#include <vm/vm_pageout.h>
138#include <vm/vm_pager.h>
139#include <vm/vm_radix.h>
140#include <vm/vm_reserv.h>
141#include <vm/uma.h>
142
143#include <machine/machdep.h>
144#include <machine/md_var.h>
145#include <machine/pcb.h>
146
147#define	NPDEPG		(PAGE_SIZE/(sizeof (pd_entry_t)))
148#define	NUPDE			(NPDEPG * NPDEPG)
149#define	NUSERPGTBLS		(NUPDE + NPDEPG)
150
151#if !defined(DIAGNOSTIC)
152#ifdef __GNUC_GNU_INLINE__
153#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
154#else
155#define PMAP_INLINE	extern inline
156#endif
157#else
158#define PMAP_INLINE
159#endif
160
161/*
162 * These are configured by the mair_el1 register. This is set up in locore.S
163 */
164#define	DEVICE_MEMORY	0
165#define	UNCACHED_MEMORY	1
166#define	CACHED_MEMORY	2
167
168
169#ifdef PV_STATS
170#define PV_STAT(x)	do { x ; } while (0)
171#else
172#define PV_STAT(x)	do { } while (0)
173#endif
174
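/*
 * The level 2 "pindex" of a virtual address: the index of the L3 page-table
 * page that would map it.  Used as the pindex of the vm_page backing that
 * page-table page.
 */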
175#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
176
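/*
 * PV list locking.  A page's PV list lock is chosen by hashing its physical
 * address into a fixed pool of rwlocks.  The CHANGE_PV_LIST_LOCK_* macros
 * switch the held lock to the one covering a new page, dropping any lock
 * already held; RELEASE_PV_LIST_LOCK drops the held lock, if any.
 */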
177#define	NPV_LIST_LOCKS	MAXCPU
178
179#define	PHYS_TO_PV_LIST_LOCK(pa)	\
180			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
181
182#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
183	struct rwlock **_lockp = (lockp);		\
184	struct rwlock *_new_lock;			\
185							\
186	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
187	if (_new_lock != *_lockp) {			\
188		if (*_lockp != NULL)			\
189			rw_wunlock(*_lockp);		\
190		*_lockp = _new_lock;			\
191		rw_wlock(*_lockp);			\
192	}						\
193} while (0)
194
195#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
196			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
197
198#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
199	struct rwlock **_lockp = (lockp);		\
200							\
201	if (*_lockp != NULL) {				\
202		rw_wunlock(*_lockp);			\
203		*_lockp = NULL;				\
204	}						\
205} while (0)
206
207#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
208			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
209
210struct pmap kernel_pmap_store;
211
212vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
213vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
214vm_offset_t kernel_vm_end = 0;
215
216struct msgbuf *msgbufp = NULL;
217
218static struct rwlock_padalign pvh_global_lock;
219
220/*
221 * Data for the pv entry allocation mechanism
222 */
223static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
224static struct mtx pv_chunks_mutex;
225static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
226
227static void	free_pv_chunk(struct pv_chunk *pc);
228static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
229static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
230static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
231static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
232static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
233		    vm_offset_t va);
234static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
235    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
236static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
237    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
238static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
239    vm_page_t m, struct rwlock **lockp);
240
241static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
242		struct rwlock **lockp);
243
244static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
245    struct spglist *free);
246static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
247
248/********************/
249/* Inline functions */
250/********************/
251
252static __inline void
253pagecopy(void *s, void *d)
254{
255
256	memcpy(d, s, PAGE_SIZE);
257}
258
259static __inline void
260pagezero(void *p)
261{
262
263	bzero(p, PAGE_SIZE);
264}
265
266#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
267#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
268#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
269
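/*
 * Page-table walking helpers.  pmap_l1() returns the L1 entry for a virtual
 * address; pmap_l1_to_l2() and pmap_l2_to_l3() descend one level through the
 * table referenced by the given entry, using the direct map; pmap_l2() and
 * pmap_l3() combine these and return NULL when an intermediate entry is not
 * a table descriptor.
 */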
270static __inline pd_entry_t *
271pmap_l1(pmap_t pmap, vm_offset_t va)
272{
273
274	return (&pmap->pm_l1[pmap_l1_index(va)]);
275}
276
277static __inline pd_entry_t *
278pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
279{
280	pd_entry_t *l2;
281
282	l2 = (pd_entry_t *)PHYS_TO_DMAP(*l1 & ~ATTR_MASK);
283	return (&l2[pmap_l2_index(va)]);
284}
285
286static __inline pd_entry_t *
287pmap_l2(pmap_t pmap, vm_offset_t va)
288{
289	pd_entry_t *l1;
290
291	l1 = pmap_l1(pmap, va);
292	if ((*l1 & ATTR_DESCR_MASK) != L1_TABLE)
293		return (NULL);
294
295	return (pmap_l1_to_l2(l1, va));
296}
297
298static __inline pt_entry_t *
299pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
300{
301	pt_entry_t *l3;
302
303	l3 = (pt_entry_t *)PHYS_TO_DMAP(*l2 & ~ATTR_MASK);
304	return (&l3[pmap_l3_index(va)]);
305}
306
307static __inline pt_entry_t *
308pmap_l3(pmap_t pmap, vm_offset_t va)
309{
310	pd_entry_t *l2;
311
312	l2 = pmap_l2(pmap, va);
313	if (l2 == NULL || (*l2 & ATTR_DESCR_MASK) != L2_TABLE)
314		return (NULL);
315
316	return (pmap_l2_to_l3(l2, va));
317}
318
319/*
320 * These load the old table data and store the new value.
321 * They need to be atomic as the System MMU may write to the table at
322 * the same time as the CPU.
323 */
324#define	pmap_load_store(table, entry) atomic_swap_64(table, entry)
325#define	pmap_set(table, mask) atomic_set_64(table, mask)
326#define	pmap_load_clear(table) atomic_swap_64(table, 0)
327#define	pmap_load(table) (*table)
328
329static __inline int
330pmap_is_current(pmap_t pmap)
331{
332
333	return ((pmap == pmap_kernel()) ||
334	    (pmap == curthread->td_proc->p_vmspace->vm_map.pmap));
335}
336
337static __inline int
338pmap_l3_valid(pt_entry_t l3)
339{
340
341	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
342}
343
344static __inline int
345pmap_l3_valid_cacheable(pt_entry_t l3)
346{
347
348	return (((l3 & ATTR_DESCR_MASK) == L3_PAGE) &&
349	    ((l3 & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY)));
350}
351
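/*
 * Write the cache lines holding a page-table entry back to memory.  This is
 * presumably needed so that observers that do not snoop the data cache, such
 * as a System MMU, see the updated entry.
 */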
352#define	PTE_SYNC(pte)	cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte))
353
354/*
355 * Checks if the page is dirty. We currently lack proper tracking of this on
356 * arm64, so for now assume that a page mapped read/write and accessed is dirty.
357 */
358static inline int
359pmap_page_dirty(pt_entry_t pte)
360{
361
362	return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
363	    (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
364}
365
366static __inline void
367pmap_resident_count_inc(pmap_t pmap, int count)
368{
369
370	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
371	pmap->pm_stats.resident_count += count;
372}
373
374static __inline void
375pmap_resident_count_dec(pmap_t pmap, int count)
376{
377
378	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
379	KASSERT(pmap->pm_stats.resident_count >= count,
380	    ("pmap %p resident count underflow %ld %d", pmap,
381	    pmap->pm_stats.resident_count, count));
382	pmap->pm_stats.resident_count -= count;
383}
384
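/*
 * Return the bootstrap L2 table used to map the given virtual address, along
 * with the L1 and L2 slot indices for it.  This relies on the page tables
 * set up in locore.S, and asserts that the L1 entry is a table descriptor.
 */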
385static pt_entry_t *
386pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
387    u_int *l2_slot)
388{
389	pt_entry_t *l2;
390	pd_entry_t *l1;
391
392	l1 = (pd_entry_t *)l1pt;
393	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
394
395	/* Check that locore used an L1 table entry here */
396	KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
397	   ("Invalid bootstrap L1 table"));
398	/* Find the address of the L2 table */
399	l2 = (pt_entry_t *)init_pt_va;
400	*l2_slot = pmap_l2_index(va);
401
402	return (l2);
403}
404
405static vm_paddr_t
406pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
407{
408	u_int l1_slot, l2_slot;
409	pt_entry_t *l2;
410
411	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
412
413	return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
414}
415
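/*
 * Build the direct map (DMAP) by filling the L1 table with L1_SIZE block
 * entries, starting at physical address 0.
 */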
416static void
417pmap_bootstrap_dmap(vm_offset_t l1pt)
418{
419	vm_offset_t va;
420	vm_paddr_t pa;
421	pd_entry_t *l1;
422	u_int l1_slot;
423
424	va = DMAP_MIN_ADDRESS;
425	l1 = (pd_entry_t *)l1pt;
426	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);
427
428	for (pa = 0; va < DMAP_MAX_ADDRESS;
429	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
430		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
431
432		/*
433		 * TODO: Turn the cache on here when we have cache
434		 * flushing code.
435		 */
436		pmap_load_store(&l1[l1_slot],
437		    (pa & ~L1_OFFSET) | ATTR_AF | L1_BLOCK |
438		    ATTR_IDX(CACHED_MEMORY));
439	}
440
441	cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE);
442	cpu_tlb_flushID();
443}
444
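/*
 * Carve L2 tables out of the memory starting at l2_start and install them in
 * the L1 entries covering va up to VM_MAX_KERNEL_ADDRESS.  Returns the first
 * unused address after the new tables.
 */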
445static vm_offset_t
446pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
447{
448	vm_offset_t l2pt;
449	vm_paddr_t pa;
450	pd_entry_t *l1;
451	u_int l1_slot;
452
453	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
454
455	l1 = (pd_entry_t *)l1pt;
456	l1_slot = pmap_l1_index(va);
457	l2pt = l2_start;
458
459	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
460		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
461
462		pa = pmap_early_vtophys(l1pt, l2pt);
463		pmap_load_store(&l1[l1_slot],
464		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
465		l2pt += PAGE_SIZE;
466	}
467
468	/* Clean the L2 page table */
469	memset((void *)l2_start, 0, l2pt - l2_start);
470	cpu_dcache_wb_range(l2_start, l2pt - l2_start);
471
472	/* Flush the l1 table to ram */
473	cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE);
474
475	return l2pt;
476}
477
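/*
 * Carve L3 tables out of the memory starting at l3_start and install them in
 * the L2 entries covering va up to VM_MAX_KERNEL_ADDRESS, e.g. for the early
 * devmap.  Returns the first unused address after the new tables.
 */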
478static vm_offset_t
479pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
480{
481	vm_offset_t l2pt, l3pt;
482	vm_paddr_t pa;
483	pd_entry_t *l2;
484	u_int l2_slot;
485
486	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
487
488	l2 = pmap_l2(kernel_pmap, va);
489	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
490	l2pt = (vm_offset_t)l2;
491	l2_slot = pmap_l2_index(va);
492	l3pt = l3_start;
493
494	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
495		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
496
497		pa = pmap_early_vtophys(l1pt, l3pt);
498		pmap_load_store(&l2[l2_slot],
499		    (pa & ~Ln_TABLE_MASK) | L2_TABLE);
500		l3pt += PAGE_SIZE;
501	}
502
503	/* Clean the L3 page tables */
504	memset((void *)l3_start, 0, l3pt - l3_start);
505	cpu_dcache_wb_range(l3_start, l3pt - l3_start);
506
507	cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE);
508
509	return l3pt;
510}
511
512/*
513 *	Bootstrap the system enough to run with virtual memory.
514 */
515void
516pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
517{
518	u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot;
519	uint64_t kern_delta;
520	pt_entry_t *l2;
521	vm_offset_t va, freemempos;
522	vm_offset_t dpcpu, msgbufpv;
523	vm_paddr_t pa;
524
525	kern_delta = KERNBASE - kernstart;
526	physmem = 0;
527
528	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
529	printf("%lx\n", l1pt);
530	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
531
532	/* Set this early so we can use the pagetable walking functions */
533	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
534	PMAP_LOCK_INIT(kernel_pmap);
535
536 	/*
537	 * Initialize the global pv list lock.
538	 */
539	rw_init(&pvh_global_lock, "pmap pv global");
540
541	/* Create a direct map region early so we can use it for pa -> va */
542	pmap_bootstrap_dmap(l1pt);
543
544	va = KERNBASE;
545	pa = KERNBASE - kern_delta;
546
547	/*
548	 * Start to initialise phys_avail by copying from physmap
549	 * up to the physical address KERNBASE points at.
550	 */
551	map_slot = avail_slot = 0;
552	for (; map_slot < (physmap_idx * 2); map_slot += 2) {
553		if (physmap[map_slot] == physmap[map_slot + 1])
554			continue;
555
556		if (physmap[map_slot] <= pa &&
557		    physmap[map_slot + 1] > pa)
558			break;
559
560		phys_avail[avail_slot] = physmap[map_slot];
561		phys_avail[avail_slot + 1] = physmap[map_slot + 1];
562		physmem += (phys_avail[avail_slot + 1] -
563		    phys_avail[avail_slot]) >> PAGE_SHIFT;
564		avail_slot += 2;
565	}
566
567	/* Add the memory before the kernel */
568	if (physmap[map_slot] < pa) {
569		phys_avail[avail_slot] = physmap[map_slot];
570		phys_avail[avail_slot + 1] = pa;
571		physmem += (phys_avail[avail_slot + 1] -
572		    phys_avail[avail_slot]) >> PAGE_SHIFT;
573		avail_slot += 2;
574	}
575	used_map_slot = map_slot;
576
577	/*
578	 * Read the page table to find out what is already mapped.
579	 * This assumes we have mapped a block of memory from KERNBASE
580	 * using a single L1 entry.
581	 */
582	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
583
584	/* Sanity check the index, KERNBASE should be the first VA */
585	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
586
587	/* Find how many pages we have mapped */
588	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
589		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
590			break;
591
592		/* Check locore used L2 blocks */
593		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
594		    ("Invalid bootstrap L2 table"));
595		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
596		    ("Incorrect PA in L2 table"));
597
598		va += L2_SIZE;
599		pa += L2_SIZE;
600	}
601
602	va = roundup2(va, L1_SIZE);
603
604	freemempos = KERNBASE + kernlen;
605	freemempos = roundup2(freemempos, PAGE_SIZE);
606	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
607	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
608	/* And the l3 tables for the early devmap */
609	freemempos = pmap_bootstrap_l3(l1pt,
610	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
611
612	cpu_tlb_flushID();
613
614#define alloc_pages(var, np)						\
615	(var) = freemempos;						\
616	freemempos += (np * PAGE_SIZE);					\
617	memset((char *)(var), 0, ((np) * PAGE_SIZE));
618
619	/* Allocate dynamic per-cpu area. */
620	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
621	dpcpu_init((void *)dpcpu, 0);
622
623	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
624	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
625	msgbufp = (void *)msgbufpv;
626
627	virtual_avail = roundup2(freemempos, L1_SIZE);
628	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
629	kernel_vm_end = virtual_avail;
630
631	pa = pmap_early_vtophys(l1pt, freemempos);
632
633	/* Finish initialising phys_avail from the remaining physmap entries */
634	map_slot = used_map_slot;
635	for (; avail_slot < (PHYS_AVAIL_SIZE - 2) &&
636	    map_slot < (physmap_idx * 2); map_slot += 2) {
637		if (physmap[map_slot] == physmap[map_slot + 1])
638			continue;
639
640		/* Have we used the current range? */
641		if (physmap[map_slot + 1] <= pa)
642			continue;
643
644		/* Do we need to split the entry? */
645		if (physmap[map_slot] < pa) {
646			phys_avail[avail_slot] = pa;
647			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
648		} else {
649			phys_avail[avail_slot] = physmap[map_slot];
650			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
651		}
652		physmem += (phys_avail[avail_slot + 1] -
653		    phys_avail[avail_slot]) >> PAGE_SHIFT;
654
655		avail_slot += 2;
656	}
657	phys_avail[avail_slot] = 0;
658	phys_avail[avail_slot + 1] = 0;
659
660	/*
661	 * Maxmem isn't the "maximum memory", it's one larger than the
662	 * highest page of the physical address space.  It should be
663	 * called something like "Maxphyspage".
664	 */
665	Maxmem = atop(phys_avail[avail_slot - 1]);
666
667	cpu_tlb_flushID();
668}
669
670/*
671 *	Initialize a vm_page's machine-dependent fields.
672 */
673void
674pmap_page_init(vm_page_t m)
675{
676
677	TAILQ_INIT(&m->md.pv_list);
678	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
679}
680
681/*
682 *	Initialize the pmap module.
683 *	Called by vm_init, to initialize any structures that the pmap
684 *	system needs to map virtual memory.
685 */
686void
687pmap_init(void)
688{
689	int i;
690
691	/*
692	 * Initialize the pv chunk list mutex.
693	 */
694	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
695
696	/*
697	 * Initialize the pool of pv list locks.
698	 */
699	for (i = 0; i < NPV_LIST_LOCKS; i++)
700		rw_init(&pv_list_locks[i], "pmap pv list");
701}
702
703/*
704 * Normal, non-SMP, invalidation functions.
705 * We inline these within pmap.c for speed.
706 */
707PMAP_INLINE void
708pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
709{
710
711	sched_pin();
712	__asm __volatile(
713	    "dsb  sy		\n"
714	    "tlbi vaae1is, %0	\n"
715	    "dsb  sy		\n"
716	    "isb		\n"
717	    : : "r"(va >> PAGE_SHIFT));
718	sched_unpin();
719}
720
721PMAP_INLINE void
722pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
723{
724	vm_offset_t addr;
725
726	sched_pin();
727	sva >>= PAGE_SHIFT;
728	eva >>= PAGE_SHIFT;
729	__asm __volatile("dsb	sy");
730	for (addr = sva; addr < eva; addr++) {
731		__asm __volatile(
732		    "tlbi vaae1is, %0" : : "r"(addr));
733	}
734	__asm __volatile(
735	    "dsb  sy	\n"
736	    "isb	\n");
737	sched_unpin();
738}
739
740PMAP_INLINE void
741pmap_invalidate_all(pmap_t pmap)
742{
743
744	sched_pin();
745	__asm __volatile(
746	    "dsb  sy		\n"
747	    "tlbi vmalle1is	\n"
748	    "dsb  sy		\n"
749	    "isb		\n");
750	sched_unpin();
751}
752
753/*
754 *	Routine:	pmap_extract
755 *	Function:
756 *		Extract the physical page address associated
757 *		with the given map/virtual_address pair.
758 */
759vm_paddr_t
760pmap_extract(pmap_t pmap, vm_offset_t va)
761{
762	pd_entry_t *l2p, l2;
763	pt_entry_t *l3p, l3;
764	vm_paddr_t pa;
765
766	pa = 0;
767	PMAP_LOCK(pmap);
768	/*
769	 * Start with the l2 table. We are unable to allocate
770	 * pages in the l1 table.
771	 */
772	l2p = pmap_l2(pmap, va);
773	if (l2p != NULL) {
774		l2 = *l2p;
775		if ((l2 & ATTR_DESCR_MASK) == L2_TABLE) {
776			l3p = pmap_l2_to_l3(l2p, va);
777			if (l3p != NULL) {
778				l3 = *l3p;
779
780				if ((l3 & ATTR_DESCR_MASK) == L3_PAGE)
781					pa = (l3 & ~ATTR_MASK) |
782					    (va & L3_OFFSET);
783			}
784		} else if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK)
785			pa = (l2 & ~ATTR_MASK) | (va & L2_OFFSET);
786	}
787	PMAP_UNLOCK(pmap);
788	return (pa);
789}
790
791/*
792 *	Routine:	pmap_extract_and_hold
793 *	Function:
794 *		Atomically extract and hold the physical page
795 *		with the given pmap and virtual address pair
796 *		if that mapping permits the given protection.
797 */
798vm_page_t
799pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
800{
801	pt_entry_t *l3p, l3;
802	vm_paddr_t pa;
803	vm_page_t m;
804
805	pa = 0;
806	m = NULL;
807	PMAP_LOCK(pmap);
808retry:
809	l3p = pmap_l3(pmap, va);
810	if (l3p != NULL && (l3 = *l3p) != 0) {
811		if (((l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
812		    ((prot & VM_PROT_WRITE) == 0)) {
813			if (vm_page_pa_tryrelock(pmap, l3 & ~ATTR_MASK, &pa))
814				goto retry;
815			m = PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK);
816			vm_page_hold(m);
817		}
818	}
819	PA_UNLOCK_COND(pa);
820	PMAP_UNLOCK(pmap);
821	return (m);
822}
823
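/*
 * Extract the physical address backing a kernel virtual address.  Addresses
 * within the direct map are translated arithmetically; anything else is
 * looked up in the kernel page tables.
 */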
824vm_paddr_t
825pmap_kextract(vm_offset_t va)
826{
827	pd_entry_t *l2;
828	pt_entry_t *l3;
829	vm_paddr_t pa;
830
831	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
832		pa = DMAP_TO_PHYS(va);
833	} else {
834		l2 = pmap_l2(kernel_pmap, va);
835		if (l2 == NULL)
836			panic("pmap_kextract: No l2");
837		if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK)
838			return ((*l2 & ~ATTR_MASK) | (va & L2_OFFSET));
839
840		l3 = pmap_l2_to_l3(l2, va);
841		if (l3 == NULL)
842			panic("pmap_kextract: No l3...");
843		pa = (*l3 & ~ATTR_MASK) | (va & PAGE_MASK);
844	}
845	return (pa);
846}
847
848/***************************************************
849 * Low level mapping routines.....
850 ***************************************************/
851
852void
853pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa)
854{
855	pt_entry_t *l3;
856
857	KASSERT((pa & L3_OFFSET) == 0,
858	   ("pmap_kenter_device: Invalid physical address"));
859	KASSERT((va & L3_OFFSET) == 0,
860	   ("pmap_kenter_device: Invalid virtual address"));
861	KASSERT((size & PAGE_MASK) == 0,
862	    ("pmap_kenter_device: Mapping is not page-sized"));
863
864	while (size != 0) {
865		l3 = pmap_l3(kernel_pmap, va);
866		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
867		pmap_load_store(l3, (pa & ~L3_OFFSET) | ATTR_AF | L3_PAGE |
868		    ATTR_IDX(DEVICE_MEMORY));
869		PTE_SYNC(l3);
870
871		va += PAGE_SIZE;
872		pa += PAGE_SIZE;
873		size -= PAGE_SIZE;
874	}
875}
876
877/*
878 * Remove a page from the kernel pagetables.
879 * Note: not SMP coherent.
880 */
881PMAP_INLINE void
882pmap_kremove(vm_offset_t va)
883{
884	pt_entry_t *l3;
885
886	l3 = pmap_l3(kernel_pmap, va);
887	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
888
889	if (pmap_l3_valid_cacheable(pmap_load(l3)))
890		cpu_dcache_wb_range(va, L3_SIZE);
891	pmap_load_clear(l3);
892	PTE_SYNC(l3);
893}
894
895void
896pmap_kremove_device(vm_offset_t va, vm_size_t size)
897{
898	pt_entry_t *l3;
899
900	KASSERT((va & L3_OFFSET) == 0,
901	   ("pmap_kremove_device: Invalid virtual address"));
902	KASSERT((size & PAGE_MASK) == 0,
903	    ("pmap_kremove_device: Mapping is not page-sized"));
904
905	while (size != 0) {
906		l3 = pmap_l3(kernel_pmap, va);
907		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
908		pmap_load_clear(l3);
909		PTE_SYNC(l3);
910
911		va += PAGE_SIZE;
912		size -= PAGE_SIZE;
913	}
914}
915
916/*
917 *	Used to map a range of physical addresses into kernel
918 *	virtual address space.
919 *
920 *	The value passed in '*virt' is a suggested virtual address for
921 *	the mapping. Architectures which can support a direct-mapped
922 *	physical to virtual region can return the appropriate address
923 *	within that region, leaving '*virt' unchanged. Other
924 *	architectures should map the pages starting at '*virt' and
925 *	update '*virt' with the first usable address after the mapped
926 *	region.
927 */
928vm_offset_t
929pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
930{
931	return PHYS_TO_DMAP(start);
932}
933
934
935/*
936 * Add a list of wired pages to the kva.
937 * This routine is only used for temporary
938 * kernel mappings that do not need to have
939 * page modification or references recorded.
940 * Note that old mappings are simply written
941 * over.  The page *must* be wired.
942 * Note: SMP coherent.  Uses a ranged shootdown IPI.
943 */
944void
945pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
946{
947	pt_entry_t *l3, pa;
948	vm_offset_t va;
949	vm_page_t m;
950	int i;
951
952	va = sva;
953	for (i = 0; i < count; i++) {
954		m = ma[i];
955		pa = VM_PAGE_TO_PHYS(m) | ATTR_AF |
956		    ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RW) | L3_PAGE;
957		l3 = pmap_l3(kernel_pmap, va);
958		pmap_load_store(l3, pa);
959		PTE_SYNC(l3);
960
961		va += L3_SIZE;
962	}
963}
964
965/*
966 * This routine tears out page mappings from the
967 * kernel -- it is meant only for temporary mappings.
968 * Note: SMP coherent.  Uses a ranged shootdown IPI.
969 */
970void
971pmap_qremove(vm_offset_t sva, int count)
972{
973	vm_offset_t va;
974
975	va = sva;
976	while (count-- > 0) {
977		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
978		pmap_kremove(va);
979		va += PAGE_SIZE;
980	}
981	pmap_invalidate_range(kernel_pmap, sva, va);
982}
983
984/***************************************************
985 * Page table page management routines.....
986 ***************************************************/
987static __inline void
988pmap_free_zero_pages(struct spglist *free)
989{
990	vm_page_t m;
991
992	while ((m = SLIST_FIRST(free)) != NULL) {
993		SLIST_REMOVE_HEAD(free, plinks.s.ss);
994		/* Preserve the page's PG_ZERO setting. */
995		vm_page_free_toq(m);
996	}
997}
998
999/*
1000 * Schedule the specified unused page table page to be freed.  Specifically,
1001 * add the page to the specified list of pages that will be released to the
1002 * physical memory manager after the TLB has been updated.
1003 */
1004static __inline void
1005pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1006    boolean_t set_PG_ZERO)
1007{
1008
1009	if (set_PG_ZERO)
1010		m->flags |= PG_ZERO;
1011	else
1012		m->flags &= ~PG_ZERO;
1013	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1014}
1015
1016/*
1017 * Decrements a page table page's wire count, which is used to record the
1018 * number of valid page table entries within the page.  If the wire count
1019 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1020 * page table page was unmapped and FALSE otherwise.
1021 */
1022static inline boolean_t
1023pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1024{
1025
1026	--m->wire_count;
1027	if (m->wire_count == 0) {
1028		_pmap_unwire_l3(pmap, va, m, free);
1029		return (TRUE);
1030	} else
1031		return (FALSE);
1032}
1033
1034static void
1035_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1036{
1037
1038	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1039	/*
1040	 * unmap the page table page
1041	 */
1042	if (m->pindex >= NUPDE) {
1043		/* PD page */
1044		pd_entry_t *l1;
1045		l1 = pmap_l1(pmap, va);
1046		pmap_load_clear(l1);
1047		PTE_SYNC(l1);
1048	} else {
1049		/* PTE page */
1050		pd_entry_t *l2;
1051		l2 = pmap_l2(pmap, va);
1052		pmap_load_clear(l2);
1053		PTE_SYNC(l2);
1054	}
1055	pmap_resident_count_dec(pmap, 1);
1056	if (m->pindex < NUPDE) {
1057		/* We just released a PT, unhold the matching PD */
1058		vm_page_t pdpg;
1059
1060		pdpg = PHYS_TO_VM_PAGE(*pmap_l1(pmap, va) & ~ATTR_MASK);
1061		pmap_unwire_l3(pmap, va, pdpg, free);
1062	}
1063
1064	/*
1065	 * This is a release store so that the ordinary store unmapping
1066	 * the page table page is globally performed before TLB shoot-
1067	 * down is begun.
1068	 */
1069	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1070
1071	/*
1072	 * Put page on a list so that it is released after
1073	 * *ALL* TLB shootdown is done
1074	 */
1075	pmap_add_delayed_free_list(m, free, TRUE);
1076}
1077
1078/*
1079 * After removing an l3 entry, this routine is used to
1080 * conditionally free the page, and manage the hold/wire counts.
1081 */
1082static int
1083pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1084    struct spglist *free)
1085{
1086	vm_page_t mpte;
1087
1088	if (va >= VM_MAXUSER_ADDRESS)
1089		return (0);
1090	KASSERT(ptepde != 0, ("pmap_unuse_l3: ptepde != 0"));
1091	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1092	return (pmap_unwire_l3(pmap, va, mpte, free));
1093}
1094
1095void
1096pmap_pinit0(pmap_t pmap)
1097{
1098
1099	PMAP_LOCK_INIT(pmap);
1100	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1101	pmap->pm_l1 = kernel_pmap->pm_l1;
1102}
1103
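/*
 * Initialize a preallocated and zeroed pmap structure, allocating the page
 * that will hold its L1 table.
 */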
1104int
1105pmap_pinit(pmap_t pmap)
1106{
1107	vm_paddr_t l1phys;
1108	vm_page_t l1pt;
1109
1110	/*
1111	 * allocate the l1 page
1112	 */
1113	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
1114	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1115		VM_WAIT;
1116
1117	l1phys = VM_PAGE_TO_PHYS(l1pt);
1118	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
1119
1120	if ((l1pt->flags & PG_ZERO) == 0)
1121		pagezero(pmap->pm_l1);
1122
1123	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1124
1125	return (1);
1126}
1127
1128/*
1129 * This routine is called if the desired page table page does not exist.
1130 *
1131 * If page table page allocation fails, this routine may sleep before
1132 * returning NULL.  It sleeps only if a lock pointer was given.
1133 *
1134 * Note: If a page allocation fails at page table level two or three,
1135 * one or two pages may be held during the wait, only to be released
1136 * afterwards.  This conservative approach is easily argued to avoid
1137 * race conditions.
1138 */
1139static vm_page_t
1140_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1141{
1142	vm_page_t m, /*pdppg, */pdpg;
1143
1144	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1145
1146	/*
1147	 * Allocate a page table page.
1148	 */
1149	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1150	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1151		if (lockp != NULL) {
1152			RELEASE_PV_LIST_LOCK(lockp);
1153			PMAP_UNLOCK(pmap);
1154			rw_runlock(&pvh_global_lock);
1155			VM_WAIT;
1156			rw_rlock(&pvh_global_lock);
1157			PMAP_LOCK(pmap);
1158		}
1159
1160		/*
1161		 * Indicate the need to retry.  While waiting, the page table
1162		 * page may have been allocated.
1163		 */
1164		return (NULL);
1165	}
1166	if ((m->flags & PG_ZERO) == 0)
1167		pmap_zero_page(m);
1168
1169	/*
1170	 * Map the pagetable page into the process address space, if
1171	 * it isn't already there.
1172	 */
1173
1174	if (ptepindex >= NUPDE) {
1175		pd_entry_t *l1;
1176		vm_pindex_t l1index;
1177
1178		l1index = ptepindex - NUPDE;
1179		l1 = &pmap->pm_l1[l1index];
1180		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1181		PTE_SYNC(l1);
1182
1183	} else {
1184		vm_pindex_t l1index;
1185		pd_entry_t *l1, *l2;
1186
1187		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1188		l1 = &pmap->pm_l1[l1index];
1189		if (*l1 == 0) {
1190			/* recurse for allocating page dir */
1191			if (_pmap_alloc_l3(pmap, NUPDE + l1index,
1192			    lockp) == NULL) {
1193				--m->wire_count;
1194				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1195				vm_page_free_zero(m);
1196				return (NULL);
1197			}
1198		} else {
1199			pdpg = PHYS_TO_VM_PAGE(*l1 & ~ATTR_MASK);
1200			pdpg->wire_count++;
1201		}
1202
1203		l2 = (pd_entry_t *)PHYS_TO_DMAP(*l1 & ~ATTR_MASK);
1204		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1205		pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | ATTR_AF |
1206		    ATTR_IDX(CACHED_MEMORY) | L2_TABLE);
1207		PTE_SYNC(l2);
1208	}
1209
1210	pmap_resident_count_inc(pmap, 1);
1211
1212	return (m);
1213}
1214
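/*
 * Return the page-table page that maps va at level 3, bumping its wire count
 * if it already exists or allocating it via _pmap_alloc_l3() otherwise.  The
 * allocation is retried if it slept; NULL is returned only when sleeping is
 * not permitted (lockp == NULL).
 */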
1215static vm_page_t
1216pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1217{
1218	vm_pindex_t ptepindex;
1219	pd_entry_t *l2;
1220	vm_page_t m;
1221
1222	/*
1223	 * Calculate pagetable page index
1224	 */
1225	ptepindex = pmap_l2_pindex(va);
1226retry:
1227	/*
1228	 * Get the page directory entry
1229	 */
1230	l2 = pmap_l2(pmap, va);
1231
1232	/*
1233	 * If the page table page is mapped, we just increment the
1234	 * hold count, and activate it.
1235	 */
1236	if (l2 != NULL && *l2 != 0) {
1237		m = PHYS_TO_VM_PAGE(*l2 & ~ATTR_MASK);
1238		m->wire_count++;
1239	} else {
1240		/*
1241		 * Here if the pte page isn't mapped, or if it has been
1242		 * deallocated.
1243		 */
1244		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1245		if (m == NULL && lockp != NULL)
1246			goto retry;
1247	}
1248	/*
1249	 * XXXARM64: I'm not sure why we need this but it fixes a crash
1250	 * when running things from a shell script.
1251	 */
1252	pmap_invalidate_all(pmap);
1253	return (m);
1254}
1255
1256
1257/***************************************************
1258 * Pmap allocation/deallocation routines.
1259 ***************************************************/
1260
1261/*
1262 * Release any resources held by the given physical map.
1263 * Called when a pmap initialized by pmap_pinit is being released.
1264 * Should only be called if the map contains no valid mappings.
1265 */
1266void
1267pmap_release(pmap_t pmap)
1268{
1269	vm_page_t m;
1270
1271	KASSERT(pmap->pm_stats.resident_count == 0,
1272	    ("pmap_release: pmap resident count %ld != 0",
1273	    pmap->pm_stats.resident_count));
1274
1275	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
1276
1277	m->wire_count--;
1278	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1279	vm_page_free_zero(m);
1280}
1281
1282#if 0
1283static int
1284kvm_size(SYSCTL_HANDLER_ARGS)
1285{
1286	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1287
1288	return sysctl_handle_long(oidp, &ksize, 0, req);
1289}
1290SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1291    0, 0, kvm_size, "LU", "Size of KVM");
1292
1293static int
1294kvm_free(SYSCTL_HANDLER_ARGS)
1295{
1296	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1297
1298	return sysctl_handle_long(oidp, &kfree, 0, req);
1299}
1300SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1301    0, 0, kvm_free, "LU", "Amount of KVM free");
1302#endif /* 0 */
1303
1304/*
1305 * grow the number of kernel page table entries, if needed
1306 */
1307void
1308pmap_growkernel(vm_offset_t addr)
1309{
1310	vm_paddr_t paddr;
1311	vm_page_t nkpg;
1312	pd_entry_t *l1, *l2;
1313
1314	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1315
1316	addr = roundup2(addr, L2_SIZE);
1317	if (addr - 1 >= kernel_map->max_offset)
1318		addr = kernel_map->max_offset;
1319	while (kernel_vm_end < addr) {
1320		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1321		if (*l1 == 0) {
1322			/* We need a new PDP entry */
1323			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1324			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1325			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1326			if (nkpg == NULL)
1327				panic("pmap_growkernel: no memory to grow kernel");
1328			if ((nkpg->flags & PG_ZERO) == 0)
1329				pmap_zero_page(nkpg);
1330			paddr = VM_PAGE_TO_PHYS(nkpg);
1331			pmap_load_store(l1, paddr | L1_TABLE);
1332			PTE_SYNC(l1);
1333			continue; /* try again */
1334		}
1335		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1336		if ((*l2 & ATTR_AF) != 0) {
1337			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1338			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1339				kernel_vm_end = kernel_map->max_offset;
1340				break;
1341			}
1342			continue;
1343		}
1344
1345		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1346		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1347		    VM_ALLOC_ZERO);
1348		if (nkpg == NULL)
1349			panic("pmap_growkernel: no memory to grow kernel");
1350		if ((nkpg->flags & PG_ZERO) == 0)
1351			pmap_zero_page(nkpg);
1352		paddr = VM_PAGE_TO_PHYS(nkpg);
1353		pmap_load_store(l2, paddr | L2_TABLE);
1354		PTE_SYNC(l2);
1355
1356		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1357		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1358			kernel_vm_end = kernel_map->max_offset;
1359			break;
1360		}
1361	}
1362}
1363
1364
1365/***************************************************
1366 * page management routines.
1367 ***************************************************/
1368
1369CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1370CTASSERT(_NPCM == 3);
1371CTASSERT(_NPCPV == 168);
1372
1373static __inline struct pv_chunk *
1374pv_to_chunk(pv_entry_t pv)
1375{
1376
1377	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1378}
1379
1380#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1381
1382#define	PC_FREE0	0xfffffffffffffffful
1383#define	PC_FREE1	0xfffffffffffffffful
1384#define	PC_FREE2	0x000000fffffffffful
1385
1386static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1387
1388#if 0
1389#ifdef PV_STATS
1390static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1391
1392SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1393	"Current number of pv entry chunks");
1394SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1395	"Current number of pv entry chunks allocated");
1396SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1397	"Current number of pv entry chunks frees");
1398SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1399	"Number of times tried to get a chunk page but failed.");
1400
1401static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1402static int pv_entry_spare;
1403
1404SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1405	"Current number of pv entry frees");
1406SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1407	"Current number of pv entry allocs");
1408SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1409	"Current number of pv entries");
1410SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1411	"Current number of spare pv entries");
1412#endif
1413#endif /* 0 */
1414
1415/*
1416 * We are in a serious low memory condition.  Resort to
1417 * drastic measures to free some pages so we can allocate
1418 * another pv entry chunk.
1419 *
1420 * Returns NULL if PV entries were reclaimed from the specified pmap.
1421 *
1422 * We do not, however, unmap 2mpages because subsequent accesses will
1423 * allocate per-page pv entries until repromotion occurs, thereby
1424 * exacerbating the shortage of free pv entries.
1425 */
1426static vm_page_t
1427reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1428{
1429
1430	panic("reclaim_pv_chunk");
1431}
1432
1433/*
1434 * free the pv_entry back to the free list
1435 */
1436static void
1437free_pv_entry(pmap_t pmap, pv_entry_t pv)
1438{
1439	struct pv_chunk *pc;
1440	int idx, field, bit;
1441
1442	rw_assert(&pvh_global_lock, RA_LOCKED);
1443	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1444	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1445	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1446	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1447	pc = pv_to_chunk(pv);
1448	idx = pv - &pc->pc_pventry[0];
1449	field = idx / 64;
1450	bit = idx % 64;
1451	pc->pc_map[field] |= 1ul << bit;
1452	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1453	    pc->pc_map[2] != PC_FREE2) {
1454		/* 98% of the time, pc is already at the head of the list. */
1455		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1456			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1457			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1458		}
1459		return;
1460	}
1461	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1462	free_pv_chunk(pc);
1463}
1464
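/*
 * Free an entirely empty PV chunk: remove it from the global chunk list and
 * return its page to the VM system.
 */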
1465static void
1466free_pv_chunk(struct pv_chunk *pc)
1467{
1468	vm_page_t m;
1469
1470	mtx_lock(&pv_chunks_mutex);
1471 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1472	mtx_unlock(&pv_chunks_mutex);
1473	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1474	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1475	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1476	/* entire chunk is free, return it */
1477	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1478#if 0 /* TODO: For minidump */
1479	dump_drop_page(m->phys_addr);
1480#endif
1481	vm_page_unwire(m, PQ_INACTIVE);
1482	vm_page_free(m);
1483}
1484
1485/*
1486 * Returns a new PV entry, allocating a new PV chunk from the system when
1487 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1488 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1489 * returned.
1490 *
1491 * The given PV list lock may be released.
1492 */
1493static pv_entry_t
1494get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1495{
1496	int bit, field;
1497	pv_entry_t pv;
1498	struct pv_chunk *pc;
1499	vm_page_t m;
1500
1501	rw_assert(&pvh_global_lock, RA_LOCKED);
1502	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1503	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1504retry:
1505	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1506	if (pc != NULL) {
1507		for (field = 0; field < _NPCM; field++) {
1508			if (pc->pc_map[field]) {
1509				bit = ffsl(pc->pc_map[field]) - 1;
1510				break;
1511			}
1512		}
1513		if (field < _NPCM) {
1514			pv = &pc->pc_pventry[field * 64 + bit];
1515			pc->pc_map[field] &= ~(1ul << bit);
1516			/* If this was the last item, move it to tail */
1517			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1518			    pc->pc_map[2] == 0) {
1519				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1520				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1521				    pc_list);
1522			}
1523			PV_STAT(atomic_add_long(&pv_entry_count, 1));
1524			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1525			return (pv);
1526		}
1527	}
1528	/* No free items, allocate another chunk */
1529	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1530	    VM_ALLOC_WIRED);
1531	if (m == NULL) {
1532		if (lockp == NULL) {
1533			PV_STAT(pc_chunk_tryfail++);
1534			return (NULL);
1535		}
1536		m = reclaim_pv_chunk(pmap, lockp);
1537		if (m == NULL)
1538			goto retry;
1539	}
1540	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1541	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1542#if 0 /* TODO: This is for minidump */
1543	dump_add_page(m->phys_addr);
1544#endif
1545	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1546	pc->pc_pmap = pmap;
1547	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1548	pc->pc_map[1] = PC_FREE1;
1549	pc->pc_map[2] = PC_FREE2;
1550	mtx_lock(&pv_chunks_mutex);
1551	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1552	mtx_unlock(&pv_chunks_mutex);
1553	pv = &pc->pc_pventry[0];
1554	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1555	PV_STAT(atomic_add_long(&pv_entry_count, 1));
1556	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1557	return (pv);
1558}
1559
1560/*
1561 * First find and then remove the pv entry for the specified pmap and virtual
1562 * address from the specified pv list.  Returns the pv entry if found and NULL
1563 * otherwise.  This operation can be performed on pv lists for either 4KB or
1564 * 2MB page mappings.
1565 */
1566static __inline pv_entry_t
1567pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1568{
1569	pv_entry_t pv;
1570
1571	rw_assert(&pvh_global_lock, RA_LOCKED);
1572	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
1573		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1574			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
1575			pvh->pv_gen++;
1576			break;
1577		}
1578	}
1579	return (pv);
1580}
1581
1582/*
1583 * First find and then destroy the pv entry for the specified pmap and virtual
1584 * address.  This operation can be performed on pv lists for either 4KB or 2MB
1585 * page mappings.
1586 */
1587static void
1588pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1589{
1590	pv_entry_t pv;
1591
1592	pv = pmap_pvh_remove(pvh, pmap, va);
1593	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1594	free_pv_entry(pmap, pv);
1595}
1596
1597/*
1598 * Conditionally create the PV entry for a 4KB page mapping if the required
1599 * memory can be allocated without resorting to reclamation.
1600 */
1601static boolean_t
1602pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1603    struct rwlock **lockp)
1604{
1605	pv_entry_t pv;
1606
1607	rw_assert(&pvh_global_lock, RA_LOCKED);
1608	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1609	/* Pass NULL instead of the lock pointer to disable reclamation. */
1610	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1611		pv->pv_va = va;
1612		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1613		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
1614		m->md.pv_gen++;
1615		return (TRUE);
1616	} else
1617		return (FALSE);
1618}
1619
1620/*
1621 * pmap_remove_l3: do the things to unmap a page in a process
1622 */
1623static int
1624pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
1625    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
1626{
1627	pt_entry_t old_l3;
1628	vm_page_t m;
1629
1630	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1631	if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3)))
1632		cpu_dcache_wb_range(va, L3_SIZE);
1633	old_l3 = pmap_load_clear(l3);
1634	PTE_SYNC(l3);
1635	if (old_l3 & ATTR_SW_WIRED)
1636		pmap->pm_stats.wired_count -= 1;
1637	pmap_resident_count_dec(pmap, 1);
1638	if (old_l3 & ATTR_SW_MANAGED) {
1639		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
1640		if (pmap_page_dirty(old_l3))
1641			vm_page_dirty(m);
1642		if (old_l3 & ATTR_AF)
1643			vm_page_aflag_set(m, PGA_REFERENCED);
1644		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1645		pmap_pvh_free(&m->md, pmap, va);
1646	}
1647	return (pmap_unuse_l3(pmap, va, l2e, free));
1648}
1649
1650/*
1651 *	Remove the given range of addresses from the specified map.
1652 *
1653 *	It is assumed that the start and end are properly
1654 *	rounded to the page size.
1655 */
1656void
1657pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1658{
1659	struct rwlock *lock;
1660	vm_offset_t va, va_next;
1661	pd_entry_t *l1, *l2;
1662	pt_entry_t l3_paddr, *l3;
1663	struct spglist free;
1664	int anyvalid;
1665
1666	/*
1667	 * Perform an unsynchronized read.  This is, however, safe.
1668	 */
1669	if (pmap->pm_stats.resident_count == 0)
1670		return;
1671
1672	anyvalid = 0;
1673	SLIST_INIT(&free);
1674
1675	rw_rlock(&pvh_global_lock);
1676	PMAP_LOCK(pmap);
1677
1678	lock = NULL;
1679	for (; sva < eva; sva = va_next) {
1680
1681		if (pmap->pm_stats.resident_count == 0)
1682			break;
1683
1684		l1 = pmap_l1(pmap, sva);
1685		if (*l1 == 0) {
1686			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
1687			if (va_next < sva)
1688				va_next = eva;
1689			continue;
1690		}
1691
1692		/*
1693		 * Calculate index for next page table.
1694		 */
1695		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
1696		if (va_next < sva)
1697			va_next = eva;
1698
1699		l2 = pmap_l1_to_l2(l1, sva);
1700		if (l2 == NULL)
1701			continue;
1702
1703		l3_paddr = *l2;
1704
1705		/*
1706		 * Weed out invalid mappings.
1707		 */
1708		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
1709			continue;
1710
1711		/*
1712		 * Limit our scan to either the end of the va represented
1713		 * by the current page table page, or to the end of the
1714		 * range being removed.
1715		 */
1716		if (va_next > eva)
1717			va_next = eva;
1718
1719		va = va_next;
1720		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
1721		    sva += L3_SIZE) {
1722			if (l3 == NULL)
1723				panic("l3 == NULL");
1724			if (*l3 == 0) {
1725				if (va != va_next) {
1726					pmap_invalidate_range(pmap, va, sva);
1727					va = va_next;
1728				}
1729				continue;
1730			}
1731			if (va == va_next)
1732				va = sva;
1733			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
1734			    &lock)) {
1735				sva += L3_SIZE;
1736				break;
1737			}
1738		}
1739		if (va != va_next)
1740			pmap_invalidate_range(pmap, va, sva);
1741	}
1742	if (lock != NULL)
1743		rw_wunlock(lock);
1744	if (anyvalid)
1745		pmap_invalidate_all(pmap);
1746	rw_runlock(&pvh_global_lock);
1747	PMAP_UNLOCK(pmap);
1748	pmap_free_zero_pages(&free);
1749}
1750
1751/*
1752 *	Routine:	pmap_remove_all
1753 *	Function:
1754 *		Removes this physical page from
1755 *		all physical maps in which it resides.
1756 *		Reflects back modify bits to the pager.
1757 *
1758 *	Notes:
1759 *		Original versions of this routine were very
1760 *		inefficient because they iteratively called
1761 *		pmap_remove (slow...)
1762 */
1763
1764void
1765pmap_remove_all(vm_page_t m)
1766{
1767	pv_entry_t pv;
1768	pmap_t pmap;
1769	pt_entry_t *l3, tl3;
1770	pd_entry_t *l2;
1771	struct spglist free;
1772
1773	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1774	    ("pmap_remove_all: page %p is not managed", m));
1775	SLIST_INIT(&free);
1776	rw_wlock(&pvh_global_lock);
1777	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1778		pmap = PV_PMAP(pv);
1779		PMAP_LOCK(pmap);
1780		pmap_resident_count_dec(pmap, 1);
1781		l2 = pmap_l2(pmap, pv->pv_va);
1782		KASSERT((*l2 & ATTR_DESCR_MASK) == L2_TABLE,
1783		    ("pmap_remove_all: found a block when expecting "
1784		     "a table in %p's pv list", m));
1785		l3 = pmap_l2_to_l3(l2, pv->pv_va);
1786		if (pmap_is_current(pmap) &&
1787		    pmap_l3_valid_cacheable(pmap_load(l3)))
1788			cpu_dcache_wb_range(pv->pv_va, L3_SIZE);
1789		tl3 = pmap_load_clear(l3);
1790		PTE_SYNC(l3);
1791		if (tl3 & ATTR_SW_WIRED)
1792			pmap->pm_stats.wired_count--;
1793		if ((tl3 & ATTR_AF) != 0)
1794			vm_page_aflag_set(m, PGA_REFERENCED);
1795
1796		/*
1797		 * Update the vm_page_t clean and reference bits.
1798		 */
1799		if (pmap_page_dirty(tl3))
1800			vm_page_dirty(m);
1801		pmap_unuse_l3(pmap, pv->pv_va, *l2, &free);
1802		pmap_invalidate_page(pmap, pv->pv_va);
1803		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
1804		m->md.pv_gen++;
1805		free_pv_entry(pmap, pv);
1806		PMAP_UNLOCK(pmap);
1807	}
1808	vm_page_aflag_clear(m, PGA_WRITEABLE);
1809	rw_wunlock(&pvh_global_lock);
1810	pmap_free_zero_pages(&free);
1811}
1812
1813/*
1814 *	Set the physical protection on the
1815 *	specified range of this map as requested.
1816 */
1817void
1818pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1819{
1820	vm_offset_t va, va_next;
1821	pd_entry_t *l1, *l2;
1822	pt_entry_t *l3p, l3;
1823
1824	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1825		pmap_remove(pmap, sva, eva);
1826		return;
1827	}
1828
1829	if ((prot & VM_PROT_WRITE) == VM_PROT_WRITE)
1830		return;
1831
1832	PMAP_LOCK(pmap);
1833	for (; sva < eva; sva = va_next) {
1834
1835		l1 = pmap_l1(pmap, sva);
1836		if (*l1 == 0) {
1837			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
1838			if (va_next < sva)
1839				va_next = eva;
1840			continue;
1841		}
1842
1843		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
1844		if (va_next < sva)
1845			va_next = eva;
1846
1847		l2 = pmap_l1_to_l2(l1, sva);
1848		if (l2 == NULL || (*l2 & ATTR_DESCR_MASK) != L2_TABLE)
1849			continue;
1850
1851		if (va_next > eva)
1852			va_next = eva;
1853
1854		va = va_next;
1855		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
1856		    sva += L3_SIZE) {
1857			l3 = pmap_load(l3p);
1858			if (pmap_l3_valid(l3)) {
1859				pmap_set(l3p, ATTR_AP(ATTR_AP_RO));
1860				PTE_SYNC(l3p);
1861			}
1862		}
1863	}
1864	PMAP_UNLOCK(pmap);
1865
1866	/* TODO: Only invalidate entries we are touching */
1867	pmap_invalidate_all(pmap);
1868}
1869
1870/*
1871 *	Insert the given physical page (p) at
1872 *	the specified virtual address (v) in the
1873 *	target physical map with the protection requested.
1874 *
1875 *	If specified, the page will be wired down, meaning
1876 *	that the related pte can not be reclaimed.
1877 *
1878 *	NB:  This is the only routine which MAY NOT lazy-evaluate
1879 *	or lose information.  That is, this routine must actually
1880 *	insert this page into the given map NOW.
1881 */
1882int
1883pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1884    u_int flags, int8_t psind __unused)
1885{
1886	struct rwlock *lock;
1887	pd_entry_t *l1, *l2;
1888	pt_entry_t new_l3, orig_l3;
1889	pt_entry_t *l3;
1890	pv_entry_t pv;
1891	vm_paddr_t opa, pa, l2_pa, l3_pa;
1892	vm_page_t mpte, om, l2_m, l3_m;
1893	boolean_t nosleep;
1894
1895	va = trunc_page(va);
1896	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
1897		VM_OBJECT_ASSERT_LOCKED(m->object);
1898	pa = VM_PAGE_TO_PHYS(m);
1899	new_l3 = (pt_entry_t)(pa | ATTR_AF | L3_PAGE);
1900	if ((prot & VM_PROT_WRITE) == 0)
1901		new_l3 |= ATTR_AP(ATTR_AP_RO);
1902	if ((flags & PMAP_ENTER_WIRED) != 0)
1903		new_l3 |= ATTR_SW_WIRED;
1904	if ((va >> 63) == 0)
1905		new_l3 |= ATTR_AP(ATTR_AP_USER);
1906	new_l3 |= ATTR_IDX(m->md.pv_memattr);
1907
1908	mpte = NULL;
1909
1910	lock = NULL;
1911	rw_rlock(&pvh_global_lock);
1912	PMAP_LOCK(pmap);
1913
1914	if (va < VM_MAXUSER_ADDRESS) {
1915		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
1916		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
1917		if (mpte == NULL && nosleep) {
1918			if (lock != NULL)
1919				rw_wunlock(lock);
1920			rw_runlock(&pvh_global_lock);
1921			PMAP_UNLOCK(pmap);
1922			return (KERN_RESOURCE_SHORTAGE);
1923		}
1924		l3 = pmap_l3(pmap, va);
1925	} else {
1926		l3 = pmap_l3(pmap, va);
1927		/* TODO: This is not optimal, but should mostly work */
1928		if (l3 == NULL) {
1929			l2 = pmap_l2(pmap, va);
1930
1931			if (l2 == NULL) {
1932				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1933				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1934				    VM_ALLOC_ZERO);
1935				if (l2_m == NULL)
1936					panic("pmap_enter: l2 pte_m == NULL");
1937				if ((l2_m->flags & PG_ZERO) == 0)
1938					pmap_zero_page(l2_m);
1939
1940				l2_pa = VM_PAGE_TO_PHYS(l2_m);
1941				l1 = pmap_l1(pmap, va);
1942				pmap_load_store(l1, l2_pa | L1_TABLE);
1943				PTE_SYNC(l1);
1944				l2 = pmap_l1_to_l2(l1, va);
1945			}
1946
1947			KASSERT(l2 != NULL,
1948			    ("No l2 table after allocating one"));
1949
1950			l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1951			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1952			if (l3_m == NULL)
1953				panic("pmap_enter: l3 pte_m == NULL");
1954			if ((l3_m->flags & PG_ZERO) == 0)
1955				pmap_zero_page(l3_m);
1956
1957			l3_pa = VM_PAGE_TO_PHYS(l3_m);
1958			pmap_load_store(l2, l3_pa | L2_TABLE);
1959			PTE_SYNC(l2);
1960			l3 = pmap_l2_to_l3(l2, va);
1961		}
1962	}
1963
1964	om = NULL;
1965	orig_l3 = pmap_load(l3);
1966	opa = orig_l3 & ~ATTR_MASK;
1967
1968	/*
1969	 * Is the specified virtual address already mapped?
1970	 */
1971	if (pmap_l3_valid(orig_l3)) {
1972		/*
1973		 * Wiring change, just update stats. We don't worry about
1974		 * wiring PT pages as they remain resident as long as there
1975		 * are valid mappings in them. Hence, if a user page is wired,
1976		 * the PT page will be also.
1977		 */
1978		if ((flags & PMAP_ENTER_WIRED) != 0 &&
1979		    (orig_l3 & ATTR_SW_WIRED) == 0)
1980			pmap->pm_stats.wired_count++;
1981		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
1982		    (orig_l3 & ATTR_SW_WIRED) != 0)
1983			pmap->pm_stats.wired_count--;
1984
1985		/*
1986		 * Remove the extra PT page reference.
1987		 */
1988		if (mpte != NULL) {
1989			mpte->wire_count--;
1990			KASSERT(mpte->wire_count > 0,
1991			    ("pmap_enter: missing reference to page table page,"
1992			     " va: 0x%lx", va));
1993		}
1994
1995		/*
1996		 * Has the physical page changed?
1997		 */
1998		if (opa == pa) {
1999			/*
2000			 * No, might be a protection or wiring change.
2001			 */
2002			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
2003				new_l3 |= ATTR_SW_MANAGED;
2004				if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
2005				    ATTR_AP(ATTR_AP_RW)) {
2006					vm_page_aflag_set(m, PGA_WRITEABLE);
2007				}
2008			}
2009			goto validate;
2010		}
2011
2012	/* Flush the cache; there might be uncommitted data in it */
2013		if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3))
2014			cpu_dcache_wb_range(va, L3_SIZE);
2015	} else {
2016		/*
2017		 * Increment the counters.
2018		 */
2019		if ((new_l3 & ATTR_SW_WIRED) != 0)
2020			pmap->pm_stats.wired_count++;
2021		pmap_resident_count_inc(pmap, 1);
2022	}
2023	/*
2024	 * Enter on the PV list if part of our managed memory.
2025	 */
2026	if ((m->oflags & VPO_UNMANAGED) == 0) {
2027		new_l3 |= ATTR_SW_MANAGED;
2028		pv = get_pv_entry(pmap, &lock);
2029		pv->pv_va = va;
2030		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
2031		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2032		m->md.pv_gen++;
2033		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
2034			vm_page_aflag_set(m, PGA_WRITEABLE);
2035	}
2036
2037	/*
2038	 * Update the L3 entry.
2039	 */
2040	if (orig_l3 != 0) {
2041validate:
2042		orig_l3 = pmap_load_store(l3, new_l3);
2043		PTE_SYNC(l3);
2044		opa = orig_l3 & ~ATTR_MASK;
2045
2046		if (opa != pa) {
2047			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
2048				om = PHYS_TO_VM_PAGE(opa);
2049				if (pmap_page_dirty(orig_l3))
2050					vm_page_dirty(om);
2051				if ((orig_l3 & ATTR_AF) != 0)
2052					vm_page_aflag_set(om, PGA_REFERENCED);
2053				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
2054				pmap_pvh_free(&om->md, pmap, va);
2055			}
2056		} else if (pmap_page_dirty(orig_l3)) {
2057			if ((orig_l3 & ATTR_SW_MANAGED) != 0)
2058				vm_page_dirty(m);
2059		}
2060		if ((orig_l3 & ATTR_AF) != 0)
2061			pmap_invalidate_page(pmap, va);
2062	} else {
2063		pmap_load_store(l3, new_l3);
2064		PTE_SYNC(l3);
2065	}
2066	if ((pmap != pmap_kernel()) && (pmap == &curproc->p_vmspace->vm_pmap))
2067	    cpu_icache_sync_range(va, PAGE_SIZE);
2068
2069	if (lock != NULL)
2070		rw_wunlock(lock);
2071	rw_runlock(&pvh_global_lock);
2072	PMAP_UNLOCK(pmap);
2073	return (KERN_SUCCESS);
2074}
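/*
 * Usage sketch (hypothetical, assuming a busied vm_page_t m to be mapped
 * at a user virtual address va): a caller that cannot sleep passes
 * PMAP_ENTER_NOSLEEP in the flags along with the access type and backs
 * off on failure, e.g.,
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_NOSLEEP, 0);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		... back off, wait for free pages and retry ...
 *
 * The psind argument is accepted but unused here, since this pmap does
 * not yet create superpage mappings.
 */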
2075
2076/*
2077 * Maps a sequence of resident pages belonging to the same object.
2078 * The sequence begins with the given page m_start.  This page is
2079 * mapped at the given virtual address start.  Each subsequent page is
2080 * mapped at a virtual address that is offset from start by the same
2081 * amount as the page is offset from m_start within the object.  The
2082 * last page in the sequence is the page with the largest offset from
2083 * m_start that can be mapped at a virtual address less than the given
2084 * virtual address end.  Not every virtual page between start and end
2085 * is mapped; only those for which a resident page exists with the
2086 * corresponding offset from m_start are mapped.
2087 */
2088void
2089pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2090    vm_page_t m_start, vm_prot_t prot)
2091{
2092	struct rwlock *lock;
2093	vm_offset_t va;
2094	vm_page_t m, mpte;
2095	vm_pindex_t diff, psize;
2096
2097	VM_OBJECT_ASSERT_LOCKED(m_start->object);
2098
2099	psize = atop(end - start);
2100	mpte = NULL;
2101	m = m_start;
2102	lock = NULL;
2103	rw_rlock(&pvh_global_lock);
2104	PMAP_LOCK(pmap);
2105	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2106		va = start + ptoa(diff);
2107		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
2108		m = TAILQ_NEXT(m, listq);
2109	}
2110	if (lock != NULL)
2111		rw_wunlock(lock);
2112	rw_runlock(&pvh_global_lock);
2113	PMAP_UNLOCK(pmap);
2114}
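/*
 * Usage sketch (hypothetical): map-entry prefaulting enters every
 * already-resident page of an object-backed range in one pass, e.g.,
 *
 *	m_start = vm_page_find_least(object, pindex);
 *	pmap_enter_object(map->pmap, start, end, m_start, prot);
 *
 * assuming the object is locked and [start, end) covers the entry; holes
 * in the object simply leave the corresponding virtual pages unmapped.
 */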
2115
2116/*
2117 * This code makes some *MAJOR* assumptions:
2118 * 1. The current pmap and the target pmap exist.
2119 * 2. Not wired.
2120 * 3. Read access.
2121 * 4. No page table pages.
2122 * but is *MUCH* faster than pmap_enter...
2123 */
2124
2125void
2126pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2127{
2128	struct rwlock *lock;
2129
2130	lock = NULL;
2131	rw_rlock(&pvh_global_lock);
2132	PMAP_LOCK(pmap);
2133	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
2134	if (lock != NULL)
2135		rw_wunlock(lock);
2136	rw_runlock(&pvh_global_lock);
2137	PMAP_UNLOCK(pmap);
2138}
2139
2140static vm_page_t
2141pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2142    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
2143{
2144	struct spglist free;
2145	pd_entry_t *l2;
2146	pt_entry_t *l3;
2147	vm_paddr_t pa;
2148
2149	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2150	    (m->oflags & VPO_UNMANAGED) != 0,
2151	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2152	rw_assert(&pvh_global_lock, RA_LOCKED);
2153	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2154
2155	/*
2156	 * If the page table page is not resident,
2157	 * we create it here.
2158	 */
2159	if (va < VM_MAXUSER_ADDRESS) {
2160		vm_pindex_t l2pindex;
2161
2162		/*
2163		 * Calculate pagetable page index
2164		 */
2165		l2pindex = pmap_l2_pindex(va);
2166		if (mpte && (mpte->pindex == l2pindex)) {
2167			mpte->wire_count++;
2168		} else {
2169			/*
2170			 * Get the l2 entry
2171			 */
2172			l2 = pmap_l2(pmap, va);
2173
2174			/*
2175			 * If the page table page is mapped, we just increment
2176			 * the hold count, and activate it.  Otherwise, we
2177			 * attempt to allocate a page table page.  If this
2178			 * attempt fails, we don't retry.  Instead, we give up.
2179			 */
2180			if (l2 != NULL && *l2 != 0) {
2181				mpte = PHYS_TO_VM_PAGE(*l2 & ~ATTR_MASK);
2182				mpte->wire_count++;
2183			} else {
2184				/*
2185				 * Pass NULL instead of the PV list lock
2186				 * pointer, because we don't intend to sleep.
2187				 */
2188				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
2189				if (mpte == NULL)
2190					return (mpte);
2191			}
2192		}
2193		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
2194		l3 = &l3[pmap_l3_index(va)];
2195	} else {
2196		mpte = NULL;
2197		l3 = pmap_l3(kernel_pmap, va);
2198	}
2199	if (l3 == NULL)
2200		panic("pmap_enter_quick_locked: No l3");
2201	if (*l3) {
2202		if (mpte != NULL) {
2203			mpte->wire_count--;
2204			mpte = NULL;
2205		}
2206		return (mpte);
2207	}
2208
2209	/*
2210	 * Enter on the PV list if part of our managed memory.
2211	 */
2212	if ((m->oflags & VPO_UNMANAGED) == 0 &&
2213	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
2214		if (mpte != NULL) {
2215			SLIST_INIT(&free);
2216			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
2217				pmap_invalidate_page(pmap, va);
2218				pmap_free_zero_pages(&free);
2219			}
2220			mpte = NULL;
2221		}
2222		return (mpte);
2223	}
2224
2225	/*
2226	 * Increment counters
2227	 */
2228	pmap_resident_count_inc(pmap, 1);
2229
2230	pa = VM_PAGE_TO_PHYS(m) | ATTR_AF | ATTR_IDX(m->md.pv_memattr) |
2231	    ATTR_AP(ATTR_AP_RW) | L3_PAGE;
2232
2233	/*
2234	 * Now validate mapping with RO protection
2235	 */
2236	if ((m->oflags & VPO_UNMANAGED) == 0)
2237		pa |= ATTR_SW_MANAGED;
2238	pmap_load_store(l3, pa);
2239	PTE_SYNC(l3);
2240	pmap_invalidate_page(pmap, va);
2241	return (mpte);
2242}
2243
2244/*
2245 * This would map large physical mmap regions into the processor
2246 * address space; this implementation only asserts that the backing
2247 * object is a device or SG object and does no pre-mapping.
2248 */
2249void
2250pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
2251    vm_pindex_t pindex, vm_size_t size)
2252{
2253
2254	VM_OBJECT_ASSERT_WLOCKED(object);
2255	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
2256	    ("pmap_object_init_pt: non-device object"));
2257}
2258
2259/*
2260 *	Clear the wired attribute from the mappings for the specified range of
2261 *	addresses in the given pmap.  Every valid mapping within that range
2262 *	must have the wired attribute set.  In contrast, invalid mappings
2263 *	cannot have the wired attribute set, so they are ignored.
2264 *
2265 *	The wired attribute of the page table entry is not a hardware feature,
2266 *	so there is no need to invalidate any TLB entries.
2267 */
2268void
2269pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2270{
2271	vm_offset_t va_next;
2272	pd_entry_t *l1, *l2;
2273	pt_entry_t *l3;
2274	boolean_t pv_lists_locked;
2275
2276	pv_lists_locked = FALSE;
2277	PMAP_LOCK(pmap);
2278	for (; sva < eva; sva = va_next) {
2279		l1 = pmap_l1(pmap, sva);
2280		if (*l1 == 0) {
2281			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2282			if (va_next < sva)
2283				va_next = eva;
2284			continue;
2285		}
2286
2287		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2288		if (va_next < sva)
2289			va_next = eva;
2290
2291		l2 = pmap_l1_to_l2(l1, sva);
2292		if (*l2 == 0)
2293			continue;
2294
2295		if (va_next > eva)
2296			va_next = eva;
2297		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2298		    sva += L3_SIZE) {
2299			if (*l3 == 0)
2300				continue;
2301			if ((*l3 & ATTR_SW_WIRED) == 0)
2302				panic("pmap_unwire: l3 %#jx is missing "
2303				    "ATTR_SW_WIRED", (uintmax_t)*l3);
2304
2305			/*
2306			 * ATTR_SW_WIRED must be cleared atomically.  Although the
2307			 * pmap lock synchronizes access to it, another processor
2308			 * could be updating the access or dirty state concurrently.
2309			 */
2310			atomic_clear_long(l3, ATTR_SW_WIRED);
2311			pmap->pm_stats.wired_count--;
2312		}
2313	}
2314	if (pv_lists_locked)
2315		rw_runlock(&pvh_global_lock);
2316	PMAP_UNLOCK(pmap);
2317}
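/*
 * Usage sketch (hypothetical): the MI unwiring code clears the wired
 * attribute for a whole map entry at once, e.g.,
 *
 *	pmap_unwire(map->pmap, entry->start, entry->end);
 *
 * Every valid mapping in the range must currently be wired; otherwise
 * the panic above fires.
 */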
2318
2319/*
2320 *	Copy the range specified by src_addr/len
2321 *	from the source map to the range dst_addr/len
2322 *	in the destination map.
2323 *
2324 *	This routine is only advisory and need not do anything.
2325 */
2326
2327void
2328pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2329    vm_offset_t src_addr)
2330{
2331}
2332
2333/*
2334 *	pmap_zero_page zeros the specified hardware page by mapping
2335 *	the page into KVM and using bzero to clear its contents.
2336 */
2337void
2338pmap_zero_page(vm_page_t m)
2339{
2340	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2341
2342	pagezero((void *)va);
2343}
2344
2345/*
2346 *	pmap_zero_page_area zeros the specified hardware page by mapping
2347 *	the page into KVM and using bzero to clear its contents.
2348 *
2349 *	off and size may not cover an area beyond a single hardware page.
2350 */
2351void
2352pmap_zero_page_area(vm_page_t m, int off, int size)
2353{
2354	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2355
2356	if (off == 0 && size == PAGE_SIZE)
2357		pagezero((void *)va);
2358	else
2359		bzero((char *)va + off, size);
2360}
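/*
 * Usage sketch (hypothetical values): zeroing only the tail of a page,
 *
 *	pmap_zero_page_area(m, 512, PAGE_SIZE - 512);
 *
 * clears bytes 512 through PAGE_SIZE - 1 of m through its direct map
 * address; the off == 0 && size == PAGE_SIZE case takes the full-page
 * pagezero() path instead.
 */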
2361
2362/*
2363 *	pmap_zero_page_idle zeros the specified hardware page by mapping
2364 *	the page into KVM and using bzero to clear its contents.  This
2365 *	is intended to be called from the vm_pagezero process only and
2366 *	outside of Giant.
2367 */
2368void
2369pmap_zero_page_idle(vm_page_t m)
2370{
2371	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2372
2373	pagezero((void *)va);
2374}
2375
2376/*
2377 *	pmap_copy_page copies the specified (machine independent)
2378 *	page by mapping the page into virtual memory and using
2379 *	bcopy to copy the page, one machine dependent page at a
2380 *	time.
2381 */
2382void
2383pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2384{
2385	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2386	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2387
2388	pagecopy((void *)src, (void *)dst);
2389}
2390
2391int unmapped_buf_allowed = 1;
2392
2393void
2394pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2395    vm_offset_t b_offset, int xfersize)
2396{
2397	void *a_cp, *b_cp;
2398	vm_page_t m_a, m_b;
2399	vm_paddr_t p_a, p_b;
2400	vm_offset_t a_pg_offset, b_pg_offset;
2401	int cnt;
2402
2403	while (xfersize > 0) {
2404		a_pg_offset = a_offset & PAGE_MASK;
2405		m_a = ma[a_offset >> PAGE_SHIFT];
2406		p_a = m_a->phys_addr;
2407		b_pg_offset = b_offset & PAGE_MASK;
2408		m_b = mb[b_offset >> PAGE_SHIFT];
2409		p_b = m_b->phys_addr;
2410		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2411		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2412		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
2413			panic("!DMAP a %lx", p_a);
2414		} else {
2415			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
2416		}
2417		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
2418			panic("!DMAP b %lx", p_b);
2419		} else {
2420			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
2421		}
2422		bcopy(a_cp, b_cp, cnt);
2423		a_offset += cnt;
2424		b_offset += cnt;
2425		xfersize -= cnt;
2426	}
2427}
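/*
 * Worked example (hypothetical values, 4K pages): with
 *
 *	pmap_copy_pages(ma, 3840, mb, 4000, 512);
 *
 * the loop above splits the transfer at whichever page boundary comes
 * first, source or destination, so the 512 bytes are copied as three
 * bcopy() calls of 96, 160 and 256 bytes.
 */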
2428
2429/*
2430 * Returns true if the pmap's pv is one of the first
2431 * 16 pvs linked to from this page.  This count may
2432 * be changed upwards or downwards in the future; it
2433 * is only necessary that true be returned for a small
2434 * subset of pmaps for proper page aging.
2435 */
2436boolean_t
2437pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2438{
2439	struct rwlock *lock;
2440	pv_entry_t pv;
2441	int loops = 0;
2442	boolean_t rv;
2443
2444	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2445	    ("pmap_page_exists_quick: page %p is not managed", m));
2446	rv = FALSE;
2447	rw_rlock(&pvh_global_lock);
2448	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2449	rw_rlock(lock);
2450	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2451		if (PV_PMAP(pv) == pmap) {
2452			rv = TRUE;
2453			break;
2454		}
2455		loops++;
2456		if (loops >= 16)
2457			break;
2458	}
2459	rw_runlock(lock);
2460	rw_runlock(&pvh_global_lock);
2461	return (rv);
2462}
2463
2464/*
2465 *	pmap_page_wired_mappings:
2466 *
2467 *	Return the number of managed mappings to the given physical page
2468 *	that are wired.
2469 */
2470int
2471pmap_page_wired_mappings(vm_page_t m)
2472{
2473	struct rwlock *lock;
2474	pmap_t pmap;
2475	pt_entry_t *l3;
2476	pv_entry_t pv;
2477	int count, md_gen;
2478
2479	if ((m->oflags & VPO_UNMANAGED) != 0)
2480		return (0);
2481	rw_rlock(&pvh_global_lock);
2482	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2483	rw_rlock(lock);
2484restart:
2485	count = 0;
2486	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2487		pmap = PV_PMAP(pv);
2488		if (!PMAP_TRYLOCK(pmap)) {
2489			md_gen = m->md.pv_gen;
2490			rw_runlock(lock);
2491			PMAP_LOCK(pmap);
2492			rw_rlock(lock);
2493			if (md_gen != m->md.pv_gen) {
2494				PMAP_UNLOCK(pmap);
2495				goto restart;
2496			}
2497		}
2498		l3 = pmap_l3(pmap, pv->pv_va);
2499		if (l3 != NULL && (*l3 & ATTR_SW_WIRED) != 0)
2500			count++;
2501		PMAP_UNLOCK(pmap);
2502	}
2503	rw_runlock(lock);
2504	rw_runlock(&pvh_global_lock);
2505	return (count);
2506}
2507
2508/*
2509 * Destroy all managed, non-wired mappings in the given user-space
2510 * pmap.  This pmap cannot be active on any processor besides the
2511 * caller.
2512 *
2513 * This function cannot be applied to the kernel pmap.  Moreover, it
2514 * is not intended for general use.  It is only to be used during
2515 * process termination.  Consequently, it can be implemented in ways
2516 * that make it faster than pmap_remove().  First, it can more quickly
2517 * destroy mappings by iterating over the pmap's collection of PV
2518 * entries, rather than searching the page table.  Second, it doesn't
2519 * have to test and clear the page table entries atomically, because
2520 * no processor is currently accessing the user address space.  In
2521 * particular, a page table entry's dirty bit won't change state once
2522 * this function starts.
2523 */
2524void
2525pmap_remove_pages(pmap_t pmap)
2526{
2527	pd_entry_t ptepde, *l2;
2528	pt_entry_t *l3, tl3;
2529	struct spglist free;
2530	vm_page_t m;
2531	pv_entry_t pv;
2532	struct pv_chunk *pc, *npc;
2533	struct rwlock *lock;
2534	int64_t bit;
2535	uint64_t inuse, bitmask;
2536	int allfree, field, freed, idx;
2537	vm_paddr_t pa;
2538
2539	lock = NULL;
2540
2541	SLIST_INIT(&free);
2542	rw_rlock(&pvh_global_lock);
2543	PMAP_LOCK(pmap);
2544	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
2545		allfree = 1;
2546		freed = 0;
2547		for (field = 0; field < _NPCM; field++) {
2548			inuse = ~pc->pc_map[field] & pc_freemask[field];
2549			while (inuse != 0) {
2550				bit = ffsl(inuse) - 1;
2551				bitmask = 1UL << bit;
2552				idx = field * 64 + bit;
2553				pv = &pc->pc_pventry[idx];
2554				inuse &= ~bitmask;
2555
2556				l2 = pmap_l2(pmap, pv->pv_va);
2557				ptepde = pmap_load(l2);
2558				l3 = pmap_l2_to_l3(l2, pv->pv_va);
2559				tl3 = pmap_load(l3);
2560
2561/*
2562 * We cannot remove wired pages from a process' mapping at this time
2563 */
2564				if (tl3 & ATTR_SW_WIRED) {
2565					allfree = 0;
2566					continue;
2567				}
2568
2569				pa = tl3 & ~ATTR_MASK;
2570
2571				m = PHYS_TO_VM_PAGE(pa);
2572				KASSERT(m->phys_addr == pa,
2573				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2574				    m, (uintmax_t)m->phys_addr,
2575				    (uintmax_t)tl3));
2576
2577				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
2578				    m < &vm_page_array[vm_page_array_size],
2579				    ("pmap_remove_pages: bad l3 %#jx",
2580				    (uintmax_t)tl3));
2581
2582				if (pmap_is_current(pmap) &&
2583				    pmap_l3_valid_cacheable(pmap_load(l3)))
2584					cpu_dcache_wb_range(pv->pv_va, L3_SIZE);
2585				pmap_load_clear(l3);
2586				PTE_SYNC(l3);
2587
2588				/*
2589				 * Update the vm_page_t clean/reference bits.
2590				 */
2591				if ((tl3 & ATTR_AP_RW_BIT) ==
2592				    ATTR_AP(ATTR_AP_RW))
2593					vm_page_dirty(m);
2594
2595				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
2596
2597				/* Mark free */
2598				pc->pc_map[field] |= bitmask;
2599
2600				pmap_resident_count_dec(pmap, 1);
2601				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2602				m->md.pv_gen++;
2603
2604				pmap_unuse_l3(pmap, pv->pv_va, ptepde, &free);
2605				freed++;
2606			}
2607		}
2608		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2609		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2610		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2611		if (allfree) {
2612			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2613			free_pv_chunk(pc);
2614		}
2615	}
2616	pmap_invalidate_all(pmap);
2617	if (lock != NULL)
2618		rw_wunlock(lock);
2619	rw_runlock(&pvh_global_lock);
2620	PMAP_UNLOCK(pmap);
2621	pmap_free_zero_pages(&free);
2622}
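/*
 * Usage sketch (hypothetical): this is meant to be called once from the
 * process-teardown path, on a pmap no other CPU is using, e.g.,
 *
 *	pmap_remove_pages(vmspace_pmap(p->p_vmspace));
 *
 * Wired mappings are skipped (allfree is cleared for their chunk), so
 * their PV entries remain until the wiring is dropped.
 */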
2623
2624/*
2625 * This is used to check if a page has been accessed or modified.  Since
2626 * there is no hardware bit that tracks modification, we assume a page
2627 * has been modified if it is mapped read/write.
2628 */
2629static boolean_t
2630pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
2631{
2632	struct rwlock *lock;
2633	pv_entry_t pv;
2634	pt_entry_t *l3, mask, value;
2635	pmap_t pmap;
2636	int md_gen;
2637	boolean_t rv;
2638
2639	rv = FALSE;
2640	rw_rlock(&pvh_global_lock);
2641	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2642	rw_rlock(lock);
2643restart:
2644	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2645		pmap = PV_PMAP(pv);
2646		if (!PMAP_TRYLOCK(pmap)) {
2647			md_gen = m->md.pv_gen;
2648			rw_runlock(lock);
2649			PMAP_LOCK(pmap);
2650			rw_rlock(lock);
2651			if (md_gen != m->md.pv_gen) {
2652				PMAP_UNLOCK(pmap);
2653				goto restart;
2654			}
2655		}
2656		l3 = pmap_l3(pmap, pv->pv_va);
2657		mask = 0;
2658		value = 0;
2659		if (modified) {
2660			mask |= ATTR_AP_RW_BIT;
2661			value |= ATTR_AP(ATTR_AP_RW);
2662		}
2663		if (accessed) {
2664			mask |= ATTR_AF | ATTR_DESCR_MASK;
2665			value |= ATTR_AF | L3_PAGE;
2666		}
2667		rv = (pmap_load(l3) & mask) == value;
2668		PMAP_UNLOCK(pmap);
2669		if (rv)
2670			goto out;
2671	}
2672out:
2673	rw_runlock(lock);
2674	rw_runlock(&pvh_global_lock);
2675	return (rv);
2676}
2677
2678/*
2679 *	pmap_is_modified:
2680 *
2681 *	Return whether or not the specified physical page was modified
2682 *	in any physical maps.
2683 */
2684boolean_t
2685pmap_is_modified(vm_page_t m)
2686{
2687
2688	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2689	    ("pmap_is_modified: page %p is not managed", m));
2690
2691	/*
2692	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
2693	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
2694	 * is clear, no PTEs can be dirty (mapped read/write).
2695	 */
2696	VM_OBJECT_ASSERT_WLOCKED(m->object);
2697	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
2698		return (FALSE);
2699	return (pmap_page_test_mappings(m, FALSE, TRUE));
2700}
2701
2702/*
2703 *	pmap_is_prefaultable:
2704 *
2705 *	Return whether or not the specified virtual address is eligible
2706 *	for prefault.
2707 */
2708boolean_t
2709pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2710{
2711	pt_entry_t *l3;
2712	boolean_t rv;
2713
2714	rv = FALSE;
2715	PMAP_LOCK(pmap);
2716	l3 = pmap_l3(pmap, addr);
2717	if (l3 != NULL && *l3 != 0) {
2718		rv = TRUE;
2719	}
2720	PMAP_UNLOCK(pmap);
2721	return (rv);
2722}
2723
2724/*
2725 *	pmap_is_referenced:
2726 *
2727 *	Return whether or not the specified physical page was referenced
2728 *	in any physical maps.
2729 */
2730boolean_t
2731pmap_is_referenced(vm_page_t m)
2732{
2733
2734	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2735	    ("pmap_is_referenced: page %p is not managed", m));
2736	return (pmap_page_test_mappings(m, TRUE, FALSE));
2737}
2738
2739/*
2740 * Clear the write and modified bits in each of the given page's mappings.
2741 */
2742void
2743pmap_remove_write(vm_page_t m)
2744{
2745	pmap_t pmap;
2746	struct rwlock *lock;
2747	pv_entry_t pv;
2748	pt_entry_t *l3, oldl3;
2749	int md_gen;
2750
2751	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2752	    ("pmap_remove_write: page %p is not managed", m));
2753
2754	/*
2755	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
2756	 * set by another thread while the object is locked.  Thus,
2757	 * if PGA_WRITEABLE is clear, no page table entries need updating.
2758	 */
2759	VM_OBJECT_ASSERT_WLOCKED(m->object);
2760	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
2761		return;
2762	rw_rlock(&pvh_global_lock);
2763	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2764retry_pv_loop:
2765	rw_wlock(lock);
2766	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
2767		pmap = PV_PMAP(pv);
2768		if (!PMAP_TRYLOCK(pmap)) {
2769			md_gen = m->md.pv_gen;
2770			rw_wunlock(lock);
2771			PMAP_LOCK(pmap);
2772			rw_wlock(lock);
2773			if (md_gen != m->md.pv_gen) {
2774				PMAP_UNLOCK(pmap);
2775				rw_wunlock(lock);
2776				goto retry_pv_loop;
2777			}
2778		}
2779		l3 = pmap_l3(pmap, pv->pv_va);
2780retry:
2781		oldl3 = *l3;
2782		if ((oldl3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
2783			if (!atomic_cmpset_long(l3, oldl3,
2784			    oldl3 | ATTR_AP(ATTR_AP_RO)))
2785				goto retry;
2786			if ((oldl3 & ATTR_AF) != 0)
2787				vm_page_dirty(m);
2788			pmap_invalidate_page(pmap, pv->pv_va);
2789		}
2790		PMAP_UNLOCK(pmap);
2791	}
2792	rw_wunlock(lock);
2793	vm_page_aflag_clear(m, PGA_WRITEABLE);
2794	rw_runlock(&pvh_global_lock);
2795}
2796
2797static __inline boolean_t
2798safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
2799{
2800
2801	return (FALSE);
2802}
2803
2804#define	PMAP_TS_REFERENCED_MAX	5
2805
2806/*
2807 *	pmap_ts_referenced:
2808 *
2809 *	Return a count of reference bits for a page, clearing those bits.
2810 *	It is not necessary for every reference bit to be cleared, but it
2811 *	is necessary that 0 only be returned when there are truly no
2812 *	reference bits set.
2813 *
2814 *	XXX: The exact number of bits to check and clear is a matter that
2815 *	should be tested and standardized at some point in the future for
2816 *	optimal aging of shared pages.
2817 */
2818int
2819pmap_ts_referenced(vm_page_t m)
2820{
2821	pv_entry_t pv, pvf;
2822	pmap_t pmap;
2823	struct rwlock *lock;
2824	pd_entry_t *l2;
2825	pt_entry_t *l3;
2826	vm_paddr_t pa;
2827	int cleared, md_gen, not_cleared;
2828	struct spglist free;
2829
2830	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2831	    ("pmap_ts_referenced: page %p is not managed", m));
2832	SLIST_INIT(&free);
2833	cleared = 0;
2834	pa = VM_PAGE_TO_PHYS(m);
2835	lock = PHYS_TO_PV_LIST_LOCK(pa);
2836	rw_rlock(&pvh_global_lock);
2837	rw_wlock(lock);
2838retry:
2839	not_cleared = 0;
2840	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
2841		goto out;
2842	pv = pvf;
2843	do {
2844		if (pvf == NULL)
2845			pvf = pv;
2846		pmap = PV_PMAP(pv);
2847		if (!PMAP_TRYLOCK(pmap)) {
2848			md_gen = m->md.pv_gen;
2849			rw_wunlock(lock);
2850			PMAP_LOCK(pmap);
2851			rw_wlock(lock);
2852			if (md_gen != m->md.pv_gen) {
2853				PMAP_UNLOCK(pmap);
2854				goto retry;
2855			}
2856		}
2857		l2 = pmap_l2(pmap, pv->pv_va);
2858		KASSERT((*l2 & ATTR_DESCR_MASK) == L2_TABLE,
2859		    ("pmap_ts_referenced: found an invalid l2 table"));
2860		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2861		if ((*l3 & ATTR_AF) != 0) {
2862			if (safe_to_clear_referenced(pmap, *l3)) {
2863				/*
2864				 * TODO: We don't handle the access flag
2865				 * at all. We need to be able to set it in
2866				 * the exception handler.
2867				 */
2868				panic("TODO: safe_to_clear_referenced\n");
2869			} else if ((*l3 & ATTR_SW_WIRED) == 0) {
2870				/*
2871				 * Wired pages cannot be paged out so
2872				 * doing accessed bit emulation for
2873				 * them is wasted effort. We do the
2874				 * hard work for unwired pages only.
2875				 */
2876				pmap_remove_l3(pmap, l3, pv->pv_va,
2877				    *l2, &free, &lock);
2878				pmap_invalidate_page(pmap, pv->pv_va);
2879				cleared++;
2880				if (pvf == pv)
2881					pvf = NULL;
2882				pv = NULL;
2883				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
2884				    ("inconsistent pv lock %p %p for page %p",
2885				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
2886			} else
2887				not_cleared++;
2888		}
2889		PMAP_UNLOCK(pmap);
2890		/* Rotate the PV list if it has more than one entry. */
2891		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
2892			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2893			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2894			m->md.pv_gen++;
2895		}
2896	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
2897	    not_cleared < PMAP_TS_REFERENCED_MAX);
2898out:
2899	rw_wunlock(lock);
2900	rw_runlock(&pvh_global_lock);
2901	pmap_free_zero_pages(&free);
2902	return (cleared + not_cleared);
2903}
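/*
 * Usage sketch (hypothetical): the page daemon ages a page with, e.g.,
 *
 *	act_count += pmap_ts_referenced(m);
 *
 * Because safe_to_clear_referenced() always returns FALSE here,
 * referenced, unwired mappings are removed rather than having ATTR_AF
 * cleared in place; referenced wired mappings are only counted as
 * not_cleared.
 */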
2904
2905/*
2906 *	Apply the given advice to the specified range of addresses within the
2907 *	given pmap.  Depending on the advice, clear the referenced and/or
2908 *	modified flags in each mapping and set the mapped page's dirty field.
2909 */
2910void
2911pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
2912{
2913}
2914
2915/*
2916 *	Clear the modify bits on the specified physical page.
2917 */
2918void
2919pmap_clear_modify(vm_page_t m)
2920{
2921
2922	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2923	    ("pmap_clear_modify: page %p is not managed", m));
2924	VM_OBJECT_ASSERT_WLOCKED(m->object);
2925	KASSERT(!vm_page_xbusied(m),
2926	    ("pmap_clear_modify: page %p is exclusive busied", m));
2927
2928	/*
2929	 * If the page is not PGA_WRITEABLE, then no PTEs can be dirty.
2930	 * If the object containing the page is locked and the page is not
2931	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
2932	 */
2933	if ((m->aflags & PGA_WRITEABLE) == 0)
2934		return;
2935
2936	/* TODO: We lack support for tracking if a page is modified */
2937}
2938
2939void *
2940pmap_mapbios(vm_paddr_t pa, vm_size_t size)
2941{
2942
2943	return ((void *)PHYS_TO_DMAP(pa));
2944}
2945
2946void
2947pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
2948{
2949}
2950
2951/*
2952 * Sets the memory attribute for the specified page.
2953 */
2954void
2955pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
2956{
2957
2958	panic("pmap_page_set_memattr");
2959}
2960
2961/*
2962 * perform the pmap work for mincore
2963 */
2964int
2965pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
2966{
2967
2968	panic("pmap_mincore");
2969}
2970
2971void
2972pmap_activate(struct thread *td)
2973{
2974	pmap_t	pmap;
2975
2976	critical_enter();
2977	pmap = vmspace_pmap(td->td_proc->p_vmspace);
2978	td->td_pcb->pcb_l1addr = vtophys(pmap->pm_l1);
2979	__asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l1addr));
2980	critical_exit();
2981}
2982
2983void
2984pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
2985{
2986
2987	panic("pmap_sync_icache");
2988}
2989
2990/*
2991 *	Increase the starting virtual address of the given mapping if a
2992 *	different alignment might result in more superpage mappings.
2993 */
2994void
2995pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
2996    vm_offset_t *addr, vm_size_t size)
2997{
2998}
2999
3000/**
3001 * Get the kernel virtual address of a set of physical pages. If there are
3002 * physical addresses not covered by the DMAP perform a transient mapping
3003 * that will be removed when calling pmap_unmap_io_transient.
3004 *
3005 * \param page        The pages for which the caller wishes to obtain
3006 *                    kernel virtual addresses.
3007 * \param vaddr       On return contains the kernel virtual memory address
3008 *                    of the pages passed in the page parameter.
3009 * \param count       Number of pages passed in.
3010 * \param can_fault   TRUE if the thread using the mapped pages can take
3011 *                    page faults, FALSE otherwise.
3012 *
3013 * \returns TRUE if the caller must call pmap_unmap_io_transient when
3014 *          finished or FALSE otherwise.
3015 *
3016 */
3017boolean_t
3018pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
3019    boolean_t can_fault)
3020{
3021	vm_paddr_t paddr;
3022	boolean_t needs_mapping;
3023	int error, i;
3024
3025	/*
3026	 * Allocate any KVA space that we need, this is done in a separate
3027	 * loop to prevent calling vmem_alloc while pinned.
3028	 */
3029	needs_mapping = FALSE;
3030	for (i = 0; i < count; i++) {
3031		paddr = VM_PAGE_TO_PHYS(page[i]);
3032		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
3033			error = vmem_alloc(kernel_arena, PAGE_SIZE,
3034			    M_BESTFIT | M_WAITOK, &vaddr[i]);
3035			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
3036			needs_mapping = TRUE;
3037		} else {
3038			vaddr[i] = PHYS_TO_DMAP(paddr);
3039		}
3040	}
3041
3042	/* Exit early if everything is covered by the DMAP */
3043	if (!needs_mapping)
3044		return (FALSE);
3045
3046	/*
3047	 * NB:  The sequence of updating a page table followed by accesses
3048	 * to the corresponding pages used in the !DMAP case is subject to
3049	 * the hazard described in the "AMD64 Architecture Programmer's
3050	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
3051	 * Coherency Considerations".  The same caution applies here, so the
3052	 * TLB must be invalidated right after modifying the PTE bits.
3053	 */
3054	if (!can_fault)
3055		sched_pin();
3056	for (i = 0; i < count; i++) {
3057		paddr = VM_PAGE_TO_PHYS(page[i]);
3058		if (paddr >= DMAP_MAX_PHYSADDR) {
3059			panic(
3060			   "pmap_map_io_transient: TODO: Map out of DMAP data");
3061		}
3062	}
3063
3064	return (needs_mapping);
3065}
3066
3067void
3068pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
3069    boolean_t can_fault)
3070{
3071	vm_paddr_t paddr;
3072	int i;
3073
3074	if (!can_fault)
3075		sched_unpin();
3076	for (i = 0; i < count; i++) {
3077		paddr = VM_PAGE_TO_PHYS(page[i]);
3078		if (paddr >= DMAP_MAX_PHYSADDR) {
3079			panic("pmap_unmap_io_transient: TODO: Unmap data");
3080		}
3081	}
3082}
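/*
 * Usage sketch (hypothetical, assuming npages entries in pages[] and
 * vaddrs[]): callers bracket their access with the map/unmap pair and
 * only unmap when a transient mapping was actually created, e.g.,
 *
 *	mapped = pmap_map_io_transient(pages, vaddrs, npages, FALSE);
 *	(access the data through vaddrs[0 .. npages - 1])
 *	if (mapped)
 *		pmap_unmap_io_transient(pages, vaddrs, npages, FALSE);
 *
 * With can_fault == FALSE the thread is pinned between the two calls,
 * so the access in between must not sleep or fault.
 */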
3083