1/* $NetBSD: pmap.c,v 1.257 2012/02/02 18:59:44 para Exp $ */
2
3/*-
4 * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center and by Chris G. Demetriou.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (c) 1991, 1993
35 *	The Regents of the University of California.  All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *	@(#)pmap.c	8.6 (Berkeley) 5/27/94
66 */
67
68/*
69 * DEC Alpha physical map management code.
70 *
71 * History:
72 *
73 *	This pmap started life as a Motorola 68851/68030 pmap,
74 *	written by Mike Hibler at the University of Utah.
75 *
76 *	It was modified for the DEC Alpha by Chris Demetriou
77 *	at Carnegie Mellon University.
78 *
79 *	Support for non-contiguous physical memory was added by
80 *	Jason R. Thorpe of the Numerical Aerospace Simulation
81 *	Facility, NASA Ames Research Center and Chris Demetriou.
82 *
83 *	Page table management and a major cleanup were undertaken
84 *	by Jason R. Thorpe, with lots of help from Ross Harvey of
85 *	Avalon Computer Systems and from Chris Demetriou.
86 *
87 *	Support for the new UVM pmap interface was written by
88 *	Jason R. Thorpe.
89 *
90 *	Support for ASNs was written by Jason R. Thorpe, again
91 *	with help from Chris Demetriou and Ross Harvey.
92 *
93 *	The locking protocol was written by Jason R. Thorpe,
94 *	using Chuck Cranor's i386 pmap for UVM as a model.
95 *
96 *	TLB shootdown code was written by Jason R. Thorpe.
97 *
98 *	Multiprocessor modifications by Andrew Doran.
99 *
100 * Notes:
101 *
102 *	All page table access is done via K0SEG.  The one exception
103 *	to this is for kernel mappings.  Since all kernel page
104 *	tables are pre-allocated, we can use the Virtual Page Table
105 *	to access PTEs that map K1SEG addresses.
106 *
107 *	Kernel page table pages are statically allocated in
108 *	pmap_bootstrap(), and are never freed.  In the future,
109 *	support for dynamically adding additional kernel page
110 *	table pages may be added.  User page table pages are
111 *	dynamically allocated and freed.
112 *
113 * Bugs/misfeatures:
114 *
115 *	- Some things could be optimized.
116 */
117
118/*
119 *	Manages physical address maps.
120 *
121 *	Since the information managed by this module is
122 *	also stored by the logical address mapping module,
123 *	this module may throw away valid virtual-to-physical
124 *	mappings at almost any time.  However, invalidations
125 *	of virtual-to-physical mappings must be done as
126 *	requested.
127 *
128 *	In order to cope with hardware architectures which
129 *	make virtual-to-physical map invalidates expensive,
130 *	this module may delay invalidate or reduced protection
131 *	operations until such time as they are actually
132 *	necessary.  This module is given full information as
133 *	to which processors are currently using which maps,
134 *	and to when physical maps must be made correct.
135 */
136
137#include "opt_lockdebug.h"
138#include "opt_sysv.h"
139#include "opt_multiprocessor.h"
140
141#include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
142
143__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.257 2012/02/02 18:59:44 para Exp $");
144
145#include <sys/param.h>
146#include <sys/systm.h>
147#include <sys/kernel.h>
148#include <sys/proc.h>
149#include <sys/malloc.h>
150#include <sys/pool.h>
151#include <sys/buf.h>
152#include <sys/shm.h>
153#include <sys/atomic.h>
154#include <sys/cpu.h>
155
156#include <uvm/uvm.h>
157
158#if defined(_PMAP_MAY_USE_PROM_CONSOLE) || defined(MULTIPROCESSOR)
159#include <machine/rpb.h>
160#endif
161
162#ifdef DEBUG
163#define	PDB_FOLLOW	0x0001
164#define	PDB_INIT	0x0002
165#define	PDB_ENTER	0x0004
166#define	PDB_REMOVE	0x0008
167#define	PDB_CREATE	0x0010
168#define	PDB_PTPAGE	0x0020
169#define	PDB_ASN		0x0040
170#define	PDB_BITS	0x0080
171#define	PDB_COLLECT	0x0100
172#define	PDB_PROTECT	0x0200
173#define	PDB_BOOTSTRAP	0x1000
174#define	PDB_PARANOIA	0x2000
175#define	PDB_WIRING	0x4000
176#define	PDB_PVDUMP	0x8000
177
178int debugmap = 0;
179int pmapdebug = PDB_PARANOIA;
180#endif
181
182/*
183 * Given a map and a machine independent protection code,
184 * convert to an alpha protection code.
185 */
186#define pte_prot(m, p)	(protection_codes[m == pmap_kernel() ? 0 : 1][p])
187static int	protection_codes[2][8];
188
189/*
190 * kernel_lev1map:
191 *
192 *	Kernel level 1 page table.  This maps all kernel level 2
193 *	page table pages, and is used as a template for all user
194 *	pmap level 1 page tables.  When a new user level 1 page
195 *	table is allocated, all kernel_lev1map PTEs for kernel
196 *	addresses are copied to the new map.
197 *
198 *	The kernel also has an initial set of kernel level 2 page
199 *	table pages.  These map the kernel level 3 page table pages.
200 *	As kernel level 3 page table pages are added, more level 2
201 *	page table pages may be added to map them.  These pages are
202 *	never freed.
203 *
204 *	Finally, the kernel also has an initial set of kernel level
205 *	3 page table pages.  These map pages in K1SEG.  More level
206 *	3 page table pages may be added at run-time if additional
207 *	K1SEG address space is required.  These pages are never freed.
208 *
209 * NOTE: When mappings are inserted into the kernel pmap, all
210 * level 2 and level 3 page table pages must already be allocated
211 * and mapped into the parent page table.
212 */
213pt_entry_t	*kernel_lev1map;
214
215/*
216 * Virtual Page Table.
217 */
218static pt_entry_t *VPT;
219
220static struct pmap	kernel_pmap_store
221	[(PMAP_SIZEOF(ALPHA_MAXPROCS) + sizeof(struct pmap) - 1)
222		/ sizeof(struct pmap)];
223struct pmap *const kernel_pmap_ptr = kernel_pmap_store;
224
225paddr_t    	avail_start;	/* PA of first available physical page */
226paddr_t		avail_end;	/* PA of last available physical page */
227static vaddr_t	virtual_end;	/* VA of last avail page (end of kernel AS) */
228
229static bool pmap_initialized;	/* Has pmap_init completed? */
230
231u_long		pmap_pages_stolen;	/* instrumentation */
232
233/*
234 * This variable contains the number of CPU IDs we need to allocate
235 * space for when allocating the pmap structure.  It is used to
236 * size a per-CPU array of ASN and ASN Generation number.
237 */
238static u_long 	pmap_ncpuids;
239
240#ifndef PMAP_PV_LOWAT
241#define	PMAP_PV_LOWAT	16
242#endif
243int		pmap_pv_lowat = PMAP_PV_LOWAT;
244
245/*
246 * List of all pmaps, used to update them when e.g. additional kernel
247 * page tables are allocated.  This list is kept LRU-ordered by
248 * pmap_activate().
249 */
250static TAILQ_HEAD(, pmap) pmap_all_pmaps;
251
252/*
253 * The pools from which pmap structures and sub-structures are allocated.
254 */
255static struct pool_cache pmap_pmap_cache;
256static struct pool_cache pmap_l1pt_cache;
257static struct pool_cache pmap_pv_cache;
258
259/*
260 * Address Space Numbers.
261 *
262 * On many implementations of the Alpha architecture, the TLB entries and
263 * I-cache blocks are tagged with a unique number within an implementation-
264 * specified range.  When a process context becomes active, the ASN is used
265 * to match TLB entries; if a TLB entry for a particular VA does not match
266 * the current ASN, it is ignored (one could think of the processor as
267 * having a collection of <max ASN> separate TLBs).  This allows operating
268 * system software to skip the TLB flush that would otherwise be necessary
269 * at context switch time.
270 *
271 * Alpha PTEs have a bit in them (PG_ASM - Address Space Match) that
272 * causes TLB entries to match any ASN.  The PALcode also provides
273 * a TBI (Translation Buffer Invalidate) operation that flushes all
274 * TLB entries that _do not_ have PG_ASM.  We use this bit for kernel
275 * mappings, so that invalidation of all user mappings does not invalidate
276 * kernel mappings (which are consistent across all processes).
277 *
278 * pmap_next_asn always indicates to the next ASN to use.  When
279 * pmap_next_asn exceeds pmap_max_asn, we start a new ASN generation.
280 *
281 * When a new ASN generation is created, the per-process (i.e. non-PG_ASM)
282 * TLB entries and the I-cache are flushed, the generation number is bumped,
283 * and pmap_next_asn is changed to indicate the first non-reserved ASN.
284 *
285 * We reserve ASN #0 for pmaps that use the global kernel_lev1map.  This
286 * prevents the following scenario:
287 *
288 *	* New ASN generation starts, and process A is given ASN #0.
289 *
290 *	* A new process B (and thus new pmap) is created.  The ASN,
291 *	  for lack of a better value, is initialized to 0.
292 *
293 *	* Process B runs.  It is now using the TLB entries tagged
294 *	  by process A.  *poof*
295 *
296 * In the scenario above, in addition to the processor using using incorrect
297 * TLB entires, the PALcode might use incorrect information to service a
298 * TLB miss.  (The PALcode uses the recursively mapped Virtual Page Table
299 * to locate the PTE for a faulting address, and tagged TLB entires exist
300 * for the Virtual Page Table addresses in order to speed up this procedure,
301 * as well.)
302 *
303 * By reserving an ASN for kernel_lev1map users, we are guaranteeing that
304 * new pmaps will initially run with no TLB entries for user addresses
305 * or VPT mappings that map user page tables.  Since kernel_lev1map only
306 * contains mappings for kernel addresses, and since those mappings
307 * are always made with PG_ASM, sharing an ASN for kernel_lev1map users is
308 * safe (since PG_ASM mappings match any ASN).
309 *
310 * On processors that do not support ASNs, the PALcode invalidates
311 * the TLB and I-cache automatically on swpctx.  We still still go
312 * through the motions of assigning an ASN (really, just refreshing
313 * the ASN generation in this particular case) to keep the logic sane
314 * in other parts of the code.
315 */
316static u_int	pmap_max_asn;		/* max ASN supported by the system */
317					/* next ASN and cur ASN generation */
318static struct pmap_asn_info pmap_asn_info[ALPHA_MAXPROCS];
319
320/*
321 * Locking:
322 *
323 *	READ/WRITE LOCKS
324 *	----------------
325 *
326 *	* pmap_main_lock - This lock is used to prevent deadlock and/or
327 *	  provide mutex access to the pmap module.  Most operations lock
328 *	  the pmap first, then PV lists as needed.  However, some operations,
329 *	  such as pmap_page_protect(), lock the PV lists before locking
330 *	  the pmaps.  To prevent deadlock, we require a mutex lock on the
331 *	  pmap module if locking in the PV->pmap direction.  This is
332 *	  implemented by acquiring a (shared) read lock on pmap_main_lock
333 *	  if locking pmap->PV and a (exclusive) write lock if locking in
334 *	  the PV->pmap direction.  Since only one thread can hold a write
335 *	  lock at a time, this provides the mutex.
336 *
337 *	MUTEXES
338 *	-------
339 *
340 *	* pm_lock (per-pmap) - This lock protects all of the members
341 *	  of the pmap structure itself.  This lock will be asserted
342 *	  in pmap_activate() and pmap_deactivate() from a critical
343 *	  section of mi_switch(), and must never sleep.  Note that
344 *	  in the case of the kernel pmap, interrupts which cause
345 *	  memory allocation *must* be blocked while this lock is
346 *	  asserted.
347 *
348 *	* pvh_lock (global hash) - These locks protects the PV lists
349 *	  for managed pages.
350 *
351 *	* pmap_all_pmaps_lock - This lock protects the global list of
352 *	  all pmaps.  Note that a pm_lock must never be held while this
353 *	  lock is held.
354 *
355 *	* pmap_growkernel_lock - This lock protects pmap_growkernel()
356 *	  and the virtual_end variable.
357 *
358 *	  There is a lock ordering constraint for pmap_growkernel_lock.
359 *	  pmap_growkernel() acquires the locks in the following order:
360 *
361 *		pmap_growkernel_lock (write) -> pmap_all_pmaps_lock ->
362 *		    pmap->pm_lock
363 *
364 *	  We need to ensure consistency between user pmaps and the
365 *	  kernel_lev1map.  For this reason, pmap_growkernel_lock must
366 *	  be held to prevent kernel_lev1map changing across pmaps
367 *	  being added to / removed from the global pmaps list.
368 *
369 *	Address space number management (global ASN counters and per-pmap
370 *	ASN state) are not locked; they use arrays of values indexed
371 *	per-processor.
372 *
373 *	All internal functions which operate on a pmap are called
374 *	with the pmap already locked by the caller (which will be
375 *	an interface function).
376 */
377static krwlock_t pmap_main_lock;
378static kmutex_t pmap_all_pmaps_lock;
379static krwlock_t pmap_growkernel_lock;
380
381#define	PMAP_MAP_TO_HEAD_LOCK()		rw_enter(&pmap_main_lock, RW_READER)
382#define	PMAP_MAP_TO_HEAD_UNLOCK()	rw_exit(&pmap_main_lock)
383#define	PMAP_HEAD_TO_MAP_LOCK()		rw_enter(&pmap_main_lock, RW_WRITER)
384#define	PMAP_HEAD_TO_MAP_UNLOCK()	rw_exit(&pmap_main_lock)
385
386struct {
387	kmutex_t lock;
388} __aligned(64) static pmap_pvh_locks[64] __aligned(64);
389
390static inline kmutex_t *
391pmap_pvh_lock(struct vm_page *pg)
392{
393
394	/* Cut bits 11-6 out of page address and use directly as offset. */
395	return (kmutex_t *)((uintptr_t)&pmap_pvh_locks +
396	    ((uintptr_t)pg & (63 << 6)));
397}
398
399#if defined(MULTIPROCESSOR)
400/*
401 * TLB Shootdown:
402 *
403 * When a mapping is changed in a pmap, the TLB entry corresponding to
404 * the virtual address must be invalidated on all processors.  In order
405 * to accomplish this on systems with multiple processors, messages are
406 * sent from the processor which performs the mapping change to all
407 * processors on which the pmap is active.  For other processors, the
408 * ASN generation numbers for that processor is invalidated, so that
409 * the next time the pmap is activated on that processor, a new ASN
410 * will be allocated (which implicitly invalidates all TLB entries).
411 *
412 * Note, we can use the pool allocator to allocate job entries
413 * since pool pages are mapped with K0SEG, not with the TLB.
414 */
415struct pmap_tlb_shootdown_job {
416	TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
417	vaddr_t pj_va;			/* virtual address */
418	pmap_t pj_pmap;			/* the pmap which maps the address */
419	pt_entry_t pj_pte;		/* the PTE bits */
420};
421
422static struct pmap_tlb_shootdown_q {
423	TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;	/* queue 16b */
424	kmutex_t pq_lock;		/* spin lock on queue 16b */
425	int pq_pte;			/* aggregate PTE bits 4b */
426	int pq_count;			/* number of pending requests 4b */
427	int pq_tbia;			/* pending global flush 4b */
428	uint8_t pq_pad[64-16-16-4-4-4];	/* pad to 64 bytes */
429} pmap_tlb_shootdown_q[ALPHA_MAXPROCS] __aligned(CACHE_LINE_SIZE);
430
431/* If we have more pending jobs than this, we just nail the whole TLB. */
432#define	PMAP_TLB_SHOOTDOWN_MAXJOBS	6
433
434static struct pool_cache pmap_tlb_shootdown_job_cache;
435#endif /* MULTIPROCESSOR */
436
437/*
438 * Internal routines
439 */
440static void	alpha_protection_init(void);
441static bool	pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool, long);
442static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t, long);
443
444/*
445 * PT page management functions.
446 */
447static int	pmap_lev1map_create(pmap_t, long);
448static void	pmap_lev1map_destroy(pmap_t, long);
449static int	pmap_ptpage_alloc(pmap_t, pt_entry_t *, int);
450static void	pmap_ptpage_free(pmap_t, pt_entry_t *);
451static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *, long);
452static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *, long);
453static void	pmap_l1pt_delref(pmap_t, pt_entry_t *, long);
454
455static void	*pmap_l1pt_alloc(struct pool *, int);
456static void	pmap_l1pt_free(struct pool *, void *);
457
458static struct pool_allocator pmap_l1pt_allocator = {
459	pmap_l1pt_alloc, pmap_l1pt_free, 0,
460};
461
462static int	pmap_l1pt_ctor(void *, void *, int);
463
464/*
465 * PV table management functions.
466 */
467static int	pmap_pv_enter(pmap_t, struct vm_page *, vaddr_t, pt_entry_t *,
468			      bool);
469static void	pmap_pv_remove(pmap_t, struct vm_page *, vaddr_t, bool);
470static void	*pmap_pv_page_alloc(struct pool *, int);
471static void	pmap_pv_page_free(struct pool *, void *);
472
473static struct pool_allocator pmap_pv_page_allocator = {
474	pmap_pv_page_alloc, pmap_pv_page_free, 0,
475};
476
477#ifdef DEBUG
478void	pmap_pv_dump(paddr_t);
479#endif
480
481#define	pmap_pv_alloc()		pool_cache_get(&pmap_pv_cache, PR_NOWAIT)
482#define	pmap_pv_free(pv)	pool_cache_put(&pmap_pv_cache, (pv))
483
484/*
485 * ASN management functions.
486 */
487static void	pmap_asn_alloc(pmap_t, long);
488
489/*
490 * Misc. functions.
491 */
492static bool	pmap_physpage_alloc(int, paddr_t *);
493static void	pmap_physpage_free(paddr_t);
494static int	pmap_physpage_addref(void *);
495static int	pmap_physpage_delref(void *);
496
497/*
498 * PMAP_ISACTIVE{,_TEST}:
499 *
500 *	Check to see if a pmap is active on the current processor.
501 */
502#define	PMAP_ISACTIVE_TEST(pm, cpu_id)					\
503	(((pm)->pm_cpus & (1UL << (cpu_id))) != 0)
504
505#if defined(DEBUG) && !defined(MULTIPROCESSOR)
506#define	PMAP_ISACTIVE(pm, cpu_id)					\
507({									\
508	/*								\
509	 * XXX This test is not MP-safe.				\
510	 */								\
511	int isactive_ = PMAP_ISACTIVE_TEST(pm, cpu_id);			\
512									\
513	if ((curlwp->l_flag & LW_IDLE) != 0 &&				\
514	    curproc->p_vmspace != NULL &&				\
515	   ((curproc->p_sflag & PS_WEXIT) == 0) &&			\
516	   (isactive_ ^ ((pm) == curproc->p_vmspace->vm_map.pmap)))	\
517		panic("PMAP_ISACTIVE");					\
518	(isactive_);							\
519})
520#else
521#define	PMAP_ISACTIVE(pm, cpu_id)	PMAP_ISACTIVE_TEST(pm, cpu_id)
522#endif /* DEBUG && !MULTIPROCESSOR */
523
524/*
525 * PMAP_ACTIVATE_ASN_SANITY:
526 *
527 *	DEBUG sanity checks for ASNs within PMAP_ACTIVATE.
528 */
529#ifdef DEBUG
530#define	PMAP_ACTIVATE_ASN_SANITY(pmap, cpu_id)				\
531do {									\
532	struct pmap_asn_info *__pma = &(pmap)->pm_asni[(cpu_id)];	\
533	struct pmap_asn_info *__cpma = &pmap_asn_info[(cpu_id)];	\
534									\
535	if ((pmap)->pm_lev1map == kernel_lev1map) {			\
536		/*							\
537		 * This pmap implementation also ensures that pmaps	\
538		 * referencing kernel_lev1map use a reserved ASN	\
539		 * ASN to prevent the PALcode from servicing a TLB	\
540		 * miss	with the wrong PTE.				\
541		 */							\
542		if (__pma->pma_asn != PMAP_ASN_RESERVED) {		\
543			printf("kernel_lev1map with non-reserved ASN "	\
544			    "(line %d)\n", __LINE__);			\
545			panic("PMAP_ACTIVATE_ASN_SANITY");		\
546		}							\
547	} else {							\
548		if (__pma->pma_asngen != __cpma->pma_asngen) {		\
549			/*						\
550			 * ASN generation number isn't valid!		\
551			 */						\
552			printf("pmap asngen %lu, current %lu "		\
553			    "(line %d)\n",				\
554			    __pma->pma_asngen,				\
555			    __cpma->pma_asngen,				\
556			    __LINE__);					\
557			panic("PMAP_ACTIVATE_ASN_SANITY");		\
558		}							\
559		if (__pma->pma_asn == PMAP_ASN_RESERVED) {		\
560			/*						\
561			 * DANGER WILL ROBINSON!  We're going to	\
562			 * pollute the VPT TLB entries!			\
563			 */						\
564			printf("Using reserved ASN! (line %d)\n",	\
565			    __LINE__);					\
566			panic("PMAP_ACTIVATE_ASN_SANITY");		\
567		}							\
568	}								\
569} while (/*CONSTCOND*/0)
570#else
571#define	PMAP_ACTIVATE_ASN_SANITY(pmap, cpu_id)	/* nothing */
572#endif
573
574/*
575 * PMAP_ACTIVATE:
576 *
577 *	This is essentially the guts of pmap_activate(), without
578 *	ASN allocation.  This is used by pmap_activate(),
579 *	pmap_lev1map_create(), and pmap_lev1map_destroy().
580 *
581 *	This is called only when it is known that a pmap is "active"
582 *	on the current processor; the ASN must already be valid.
583 */
584#define	PMAP_ACTIVATE(pmap, l, cpu_id)					\
585do {									\
586	struct pcb *pcb = lwp_getpcb(l);				\
587	PMAP_ACTIVATE_ASN_SANITY(pmap, cpu_id);				\
588									\
589	pcb->pcb_hw.apcb_ptbr =				\
590	    ALPHA_K0SEG_TO_PHYS((vaddr_t)(pmap)->pm_lev1map) >> PGSHIFT; \
591	pcb->pcb_hw.apcb_asn = (pmap)->pm_asni[(cpu_id)].pma_asn;	\
592									\
593	if ((l) == curlwp) {						\
594		/*							\
595		 * Page table base register has changed; switch to	\
596		 * our own context again so that it will take effect.	\
597		 */							\
598		(void) alpha_pal_swpctx((u_long)l->l_md.md_pcbpaddr);	\
599	}								\
600} while (/*CONSTCOND*/0)
601
602/*
603 * PMAP_SET_NEEDISYNC:
604 *
605 *	Mark that a user pmap needs an I-stream synch on its
606 *	way back out to userspace.
607 */
608#define	PMAP_SET_NEEDISYNC(pmap)	(pmap)->pm_needisync = ~0UL
609
610/*
611 * PMAP_SYNC_ISTREAM:
612 *
613 *	Synchronize the I-stream for the specified pmap.  For user
614 *	pmaps, this is deferred until a process using the pmap returns
615 *	to userspace.
616 */
617#if defined(MULTIPROCESSOR)
618#define	PMAP_SYNC_ISTREAM_KERNEL()					\
619do {									\
620	alpha_pal_imb();						\
621	alpha_broadcast_ipi(ALPHA_IPI_IMB);				\
622} while (/*CONSTCOND*/0)
623
624#define	PMAP_SYNC_ISTREAM_USER(pmap)					\
625do {									\
626	alpha_multicast_ipi((pmap)->pm_cpus, ALPHA_IPI_AST);		\
627	/* for curcpu, will happen in userret() */			\
628} while (/*CONSTCOND*/0)
629#else
630#define	PMAP_SYNC_ISTREAM_KERNEL()	alpha_pal_imb()
631#define	PMAP_SYNC_ISTREAM_USER(pmap)	/* will happen in userret() */
632#endif /* MULTIPROCESSOR */
633
634#define	PMAP_SYNC_ISTREAM(pmap)						\
635do {									\
636	if ((pmap) == pmap_kernel())					\
637		PMAP_SYNC_ISTREAM_KERNEL();				\
638	else								\
639		PMAP_SYNC_ISTREAM_USER(pmap);				\
640} while (/*CONSTCOND*/0)
641
642/*
643 * PMAP_INVALIDATE_ASN:
644 *
645 *	Invalidate the specified pmap's ASN, so as to force allocation
646 *	of a new one the next time pmap_asn_alloc() is called.
647 *
648 *	NOTE: THIS MUST ONLY BE CALLED IF AT LEAST ONE OF THE FOLLOWING
649 *	CONDITIONS ARE true:
650 *
651 *		(1) The pmap references the global kernel_lev1map.
652 *
653 *		(2) The pmap is not active on the current processor.
654 */
655#define	PMAP_INVALIDATE_ASN(pmap, cpu_id)				\
656do {									\
657	(pmap)->pm_asni[(cpu_id)].pma_asn = PMAP_ASN_RESERVED;		\
658} while (/*CONSTCOND*/0)
659
660/*
661 * PMAP_INVALIDATE_TLB:
662 *
663 *	Invalidate the TLB entry for the pmap/va pair.
664 */
665#define	PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id)		\
666do {									\
667	if ((hadasm) || (isactive)) {					\
668		/*							\
669		 * Simply invalidating the TLB entry and I-cache	\
670		 * works in this case.					\
671		 */							\
672		ALPHA_TBIS((va));					\
673	} else if ((pmap)->pm_asni[(cpu_id)].pma_asngen ==		\
674		   pmap_asn_info[(cpu_id)].pma_asngen) {		\
675		/*							\
676		 * We can't directly invalidate the TLB entry		\
677		 * in this case, so we have to force allocation		\
678		 * of a new ASN the next time this pmap becomes		\
679		 * active.						\
680		 */							\
681		PMAP_INVALIDATE_ASN((pmap), (cpu_id));			\
682	}								\
683		/*							\
684		 * Nothing to do in this case; the next time the	\
685		 * pmap becomes active on this processor, a new		\
686		 * ASN will be allocated anyway.			\
687		 */							\
688} while (/*CONSTCOND*/0)
689
690/*
691 * PMAP_KERNEL_PTE:
692 *
693 *	Get a kernel PTE.
694 *
695 *	If debugging, do a table walk.  If not debugging, just use
696 *	the Virtual Page Table, since all kernel page tables are
697 *	pre-allocated and mapped in.
698 */
699#ifdef DEBUG
700#define	PMAP_KERNEL_PTE(va)						\
701({									\
702	pt_entry_t *l1pte_, *l2pte_;					\
703									\
704	l1pte_ = pmap_l1pte(pmap_kernel(), va);				\
705	if (pmap_pte_v(l1pte_) == 0) {					\
706		printf("kernel level 1 PTE not valid, va 0x%lx "	\
707		    "(line %d)\n", (va), __LINE__);			\
708		panic("PMAP_KERNEL_PTE");				\
709	}								\
710	l2pte_ = pmap_l2pte(pmap_kernel(), va, l1pte_);			\
711	if (pmap_pte_v(l2pte_) == 0) {					\
712		printf("kernel level 2 PTE not valid, va 0x%lx "	\
713		    "(line %d)\n", (va), __LINE__);			\
714		panic("PMAP_KERNEL_PTE");				\
715	}								\
716	pmap_l3pte(pmap_kernel(), va, l2pte_);				\
717})
718#else
719#define	PMAP_KERNEL_PTE(va)	(&VPT[VPT_INDEX((va))])
720#endif
721
722/*
723 * PMAP_SET_PTE:
724 *
725 *	Set a PTE to a specified value.
726 */
727#define	PMAP_SET_PTE(ptep, val)	*(ptep) = (val)
728
729/*
730 * PMAP_STAT_{INCR,DECR}:
731 *
732 *	Increment or decrement a pmap statistic.
733 */
734#define	PMAP_STAT_INCR(s, v)	atomic_add_long((unsigned long *)(&(s)), (v))
735#define	PMAP_STAT_DECR(s, v)	atomic_add_long((unsigned long *)(&(s)), -(v))
736
737/*
738 * pmap_bootstrap:
739 *
740 *	Bootstrap the system to run with virtual memory.
741 *
742 *	Note: no locking is necessary in this function.
743 */
744void
745pmap_bootstrap(paddr_t ptaddr, u_int maxasn, u_long ncpuids)
746{
747	vsize_t lev2mapsize, lev3mapsize;
748	pt_entry_t *lev2map, *lev3map;
749	pt_entry_t pte;
750	vsize_t bufsz;
751	struct pcb *pcb;
752	int i;
753
754#ifdef DEBUG
755	if (pmapdebug & (PDB_FOLLOW|PDB_BOOTSTRAP))
756		printf("pmap_bootstrap(0x%lx, %u)\n", ptaddr, maxasn);
757#endif
758
759	/*
760	 * Compute the number of pages kmem_arena will have.
761	 */
762	kmeminit_nkmempages();
763
764	/*
765	 * Figure out how many initial PTE's are necessary to map the
766	 * kernel.  We also reserve space for kmem_alloc_pageable()
767	 * for vm_fork().
768	 */
769
770	/* Get size of buffer cache and set an upper limit */
771	bufsz = buf_memcalc();
772	buf_setvalimit(bufsz);
773
774	lev3mapsize =
775		(VM_PHYS_SIZE + (ubc_nwins << ubc_winshift) +
776		 bufsz + 16 * NCARGS + pager_map_size) / PAGE_SIZE +
777		(maxproc * UPAGES) + nkmempages;
778
779#ifdef SYSVSHM
780	lev3mapsize += shminfo.shmall;
781#endif
782	lev3mapsize = roundup(lev3mapsize, NPTEPG);
783
784	/*
785	 * Initialize `FYI' variables.  Note we're relying on
786	 * the fact that BSEARCH sorts the vm_physmem[] array
787	 * for us.
788	 */
789	avail_start = ptoa(VM_PHYSMEM_PTR(0)->start);
790	avail_end = ptoa(VM_PHYSMEM_PTR(vm_nphysseg - 1)->end);
791	virtual_end = VM_MIN_KERNEL_ADDRESS + lev3mapsize * PAGE_SIZE;
792
793#if 0
794	printf("avail_start = 0x%lx\n", avail_start);
795	printf("avail_end = 0x%lx\n", avail_end);
796	printf("virtual_end = 0x%lx\n", virtual_end);
797#endif
798
799	/*
800	 * Allocate a level 1 PTE table for the kernel.
801	 * This is always one page long.
802	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
803	 */
804	kernel_lev1map = (pt_entry_t *)
805	    uvm_pageboot_alloc(sizeof(pt_entry_t) * NPTEPG);
806
807	/*
808	 * Allocate a level 2 PTE table for the kernel.
809	 * These must map all of the level3 PTEs.
810	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
811	 */
812	lev2mapsize = roundup(howmany(lev3mapsize, NPTEPG), NPTEPG);
813	lev2map = (pt_entry_t *)
814	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev2mapsize);
815
816	/*
817	 * Allocate a level 3 PTE table for the kernel.
818	 * Contains lev3mapsize PTEs.
819	 */
820	lev3map = (pt_entry_t *)
821	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev3mapsize);
822
823	/*
824	 * Set up level 1 page table
825	 */
826
827	/* Map all of the level 2 pte pages */
828	for (i = 0; i < howmany(lev2mapsize, NPTEPG); i++) {
829		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev2map) +
830		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
831		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
832		kernel_lev1map[l1pte_index(VM_MIN_KERNEL_ADDRESS +
833		    (i*PAGE_SIZE*NPTEPG*NPTEPG))] = pte;
834	}
835
836	/* Map the virtual page table */
837	pte = (ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT)
838	    << PG_SHIFT;
839	pte |= PG_V | PG_KRE | PG_KWE; /* NOTE NO ASM */
840	kernel_lev1map[l1pte_index(VPTBASE)] = pte;
841	VPT = (pt_entry_t *)VPTBASE;
842
843#ifdef _PMAP_MAY_USE_PROM_CONSOLE
844    {
845	extern pt_entry_t prom_pte;			/* XXX */
846	extern int prom_mapped;				/* XXX */
847
848	if (pmap_uses_prom_console()) {
849		/*
850		 * XXX Save old PTE so we can remap the PROM, if
851		 * XXX necessary.
852		 */
853		prom_pte = *(pt_entry_t *)ptaddr & ~PG_ASM;
854	}
855	prom_mapped = 0;
856
857	/*
858	 * Actually, this code lies.  The prom is still mapped, and will
859	 * remain so until the context switch after alpha_init() returns.
860	 */
861    }
862#endif
863
864	/*
865	 * Set up level 2 page table.
866	 */
867	/* Map all of the level 3 pte pages */
868	for (i = 0; i < howmany(lev3mapsize, NPTEPG); i++) {
869		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev3map) +
870		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
871		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
872		lev2map[l2pte_index(VM_MIN_KERNEL_ADDRESS+
873		    (i*PAGE_SIZE*NPTEPG))] = pte;
874	}
875
876	/* Initialize the pmap_growkernel_lock. */
877	rw_init(&pmap_growkernel_lock);
878
879	/*
880	 * Set up level three page table (lev3map)
881	 */
882	/* Nothing to do; it's already zero'd */
883
884	/*
885	 * Initialize the pmap pools and list.
886	 */
887	pmap_ncpuids = ncpuids;
888	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids), 0,
889	    0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
890	pool_cache_bootstrap(&pmap_l1pt_cache, PAGE_SIZE, 0, 0, 0, "pmapl1pt",
891	    &pmap_l1pt_allocator, IPL_NONE, pmap_l1pt_ctor, NULL, NULL);
892	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
893	    PR_LARGECACHE, "pmappv", &pmap_pv_page_allocator, IPL_NONE, NULL,
894	    NULL, NULL);
895
896	TAILQ_INIT(&pmap_all_pmaps);
897
898	/*
899	 * Initialize the ASN logic.
900	 */
901	pmap_max_asn = maxasn;
902	for (i = 0; i < ALPHA_MAXPROCS; i++) {
903		pmap_asn_info[i].pma_asn = 1;
904		pmap_asn_info[i].pma_asngen = 0;
905	}
906
907	/*
908	 * Initialize the locks.
909	 */
910	rw_init(&pmap_main_lock);
911	mutex_init(&pmap_all_pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
912	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
913		mutex_init(&pmap_pvh_locks[i].lock, MUTEX_DEFAULT, IPL_NONE);
914	}
915
916	/*
917	 * Initialize kernel pmap.  Note that all kernel mappings
918	 * have PG_ASM set, so the ASN doesn't really matter for
919	 * the kernel pmap.  Also, since the kernel pmap always
920	 * references kernel_lev1map, it always has an invalid ASN
921	 * generation.
922	 */
923	memset(pmap_kernel(), 0, sizeof(struct pmap));
924	pmap_kernel()->pm_lev1map = kernel_lev1map;
925	pmap_kernel()->pm_count = 1;
926	for (i = 0; i < ALPHA_MAXPROCS; i++) {
927		pmap_kernel()->pm_asni[i].pma_asn = PMAP_ASN_RESERVED;
928		pmap_kernel()->pm_asni[i].pma_asngen =
929		    pmap_asn_info[i].pma_asngen;
930	}
931	mutex_init(&pmap_kernel()->pm_lock, MUTEX_DEFAULT, IPL_NONE);
932	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap_kernel(), pm_list);
933
934#if defined(MULTIPROCESSOR)
935	/*
936	 * Initialize the TLB shootdown queues.
937	 */
938	pool_cache_bootstrap(&pmap_tlb_shootdown_job_cache,
939	    sizeof(struct pmap_tlb_shootdown_job), CACHE_LINE_SIZE,
940	     0, PR_LARGECACHE, "pmaptlb", NULL, IPL_VM, NULL, NULL, NULL);
941	for (i = 0; i < ALPHA_MAXPROCS; i++) {
942		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
943		mutex_init(&pmap_tlb_shootdown_q[i].pq_lock, MUTEX_DEFAULT,
944		    IPL_SCHED);
945	}
946#endif
947
948	/*
949	 * Set up lwp0's PCB such that the ptbr points to the right place
950	 * and has the kernel pmap's (really unused) ASN.
951	 */
952	pcb = lwp_getpcb(&lwp0);
953	pcb->pcb_hw.apcb_ptbr =
954	    ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT;
955	pcb->pcb_hw.apcb_asn = pmap_kernel()->pm_asni[cpu_number()].pma_asn;
956
957	/*
958	 * Mark the kernel pmap `active' on this processor.
959	 */
960	atomic_or_ulong(&pmap_kernel()->pm_cpus,
961	    (1UL << cpu_number()));
962}
963
964#ifdef _PMAP_MAY_USE_PROM_CONSOLE
965int
966pmap_uses_prom_console(void)
967{
968
969	return (cputype == ST_DEC_21000);
970}
971#endif /* _PMAP_MAY_USE_PROM_CONSOLE */
972
973/*
974 * pmap_virtual_space:		[ INTERFACE ]
975 *
976 *	Define the initial bounds of the kernel virtual address space.
977 */
978void
979pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
980{
981
982	*vstartp = VM_MIN_KERNEL_ADDRESS;	/* kernel is in K0SEG */
983	*vendp = VM_MAX_KERNEL_ADDRESS;		/* we use pmap_growkernel */
984}
985
986/*
987 * pmap_steal_memory:		[ INTERFACE ]
988 *
989 *	Bootstrap memory allocator (alternative to vm_bootstrap_steal_memory()).
990 *	This function allows for early dynamic memory allocation until the
991 *	virtual memory system has been bootstrapped.  After that point, either
992 *	kmem_alloc or malloc should be used.  This function works by stealing
993 *	pages from the (to be) managed page pool, then implicitly mapping the
994 *	pages (by using their k0seg addresses) and zeroing them.
995 *
996 *	It may be used once the physical memory segments have been pre-loaded
997 *	into the vm_physmem[] array.  Early memory allocation MUST use this
998 *	interface!  This cannot be used after vm_page_startup(), and will
999 *	generate a panic if tried.
1000 *
1001 *	Note that this memory will never be freed, and in essence it is wired
1002 *	down.
1003 *
1004 *	We must adjust *vstartp and/or *vendp iff we use address space
1005 *	from the kernel virtual address range defined by pmap_virtual_space().
1006 *
1007 *	Note: no locking is necessary in this function.
1008 */
1009vaddr_t
1010pmap_steal_memory(vsize_t size, vaddr_t *vstartp, vaddr_t *vendp)
1011{
1012	int bank, npgs, x;
1013	vaddr_t va;
1014	paddr_t pa;
1015
1016	size = round_page(size);
1017	npgs = atop(size);
1018
1019#if 0
1020	printf("PSM: size 0x%lx (npgs 0x%x)\n", size, npgs);
1021#endif
1022
1023	for (bank = 0; bank < vm_nphysseg; bank++) {
1024		if (uvm.page_init_done == true)
1025			panic("pmap_steal_memory: called _after_ bootstrap");
1026
1027#if 0
1028		printf("     bank %d: avail_start 0x%lx, start 0x%lx, "
1029		    "avail_end 0x%lx\n", bank, VM_PHYSMEM_PTR(bank)->avail_start,
1030		    VM_PHYSMEM_PTR(bank)->start, VM_PHYSMEM_PTR(bank)->avail_end);
1031#endif
1032
1033		if (VM_PHYSMEM_PTR(bank)->avail_start != VM_PHYSMEM_PTR(bank)->start ||
1034		    VM_PHYSMEM_PTR(bank)->avail_start >= VM_PHYSMEM_PTR(bank)->avail_end)
1035			continue;
1036
1037#if 0
1038		printf("             avail_end - avail_start = 0x%lx\n",
1039		    VM_PHYSMEM_PTR(bank)->avail_end - VM_PHYSMEM_PTR(bank)->avail_start);
1040#endif
1041
1042		if ((VM_PHYSMEM_PTR(bank)->avail_end - VM_PHYSMEM_PTR(bank)->avail_start)
1043		    < npgs)
1044			continue;
1045
1046		/*
1047		 * There are enough pages here; steal them!
1048		 */
1049		pa = ptoa(VM_PHYSMEM_PTR(bank)->avail_start);
1050		VM_PHYSMEM_PTR(bank)->avail_start += npgs;
1051		VM_PHYSMEM_PTR(bank)->start += npgs;
1052
1053		/*
1054		 * Have we used up this segment?
1055		 */
1056		if (VM_PHYSMEM_PTR(bank)->avail_start == VM_PHYSMEM_PTR(bank)->end) {
1057			if (vm_nphysseg == 1)
1058				panic("pmap_steal_memory: out of memory!");
1059
1060			/* Remove this segment from the list. */
1061			vm_nphysseg--;
1062			for (x = bank; x < vm_nphysseg; x++) {
1063				/* structure copy */
1064				VM_PHYSMEM_PTR_SWAP(x, x + 1);
1065			}
1066		}
1067
1068		va = ALPHA_PHYS_TO_K0SEG(pa);
1069		memset((void *)va, 0, size);
1070		pmap_pages_stolen += npgs;
1071		return (va);
1072	}
1073
1074	/*
1075	 * If we got here, this was no memory left.
1076	 */
1077	panic("pmap_steal_memory: no memory to steal");
1078}
1079
1080/*
1081 * pmap_init:			[ INTERFACE ]
1082 *
1083 *	Initialize the pmap module.  Called by vm_init(), to initialize any
1084 *	structures that the pmap system needs to map virtual memory.
1085 *
1086 *	Note: no locking is necessary in this function.
1087 */
1088void
1089pmap_init(void)
1090{
1091
1092#ifdef DEBUG
1093	if (pmapdebug & PDB_FOLLOW)
1094	        printf("pmap_init()\n");
1095#endif
1096
1097	/* initialize protection array */
1098	alpha_protection_init();
1099
1100	/*
1101	 * Set a low water mark on the pv_entry pool, so that we are
1102	 * more likely to have these around even in extreme memory
1103	 * starvation.
1104	 */
1105	pool_cache_setlowat(&pmap_pv_cache, pmap_pv_lowat);
1106
1107	/*
1108	 * Now it is safe to enable pv entry recording.
1109	 */
1110	pmap_initialized = true;
1111
1112#if 0
1113	for (bank = 0; bank < vm_nphysseg; bank++) {
1114		printf("bank %d\n", bank);
1115		printf("\tstart = 0x%x\n", ptoa(VM_PHYSMEM_PTR(bank)->start));
1116		printf("\tend = 0x%x\n", ptoa(VM_PHYSMEM_PTR(bank)->end));
1117		printf("\tavail_start = 0x%x\n",
1118		    ptoa(VM_PHYSMEM_PTR(bank)->avail_start));
1119		printf("\tavail_end = 0x%x\n",
1120		    ptoa(VM_PHYSMEM_PTR(bank)->avail_end));
1121	}
1122#endif
1123}
1124
1125/*
1126 * pmap_create:			[ INTERFACE ]
1127 *
1128 *	Create and return a physical map.
1129 *
1130 *	Note: no locking is necessary in this function.
1131 */
1132pmap_t
1133pmap_create(void)
1134{
1135	pmap_t pmap;
1136	int i;
1137
1138#ifdef DEBUG
1139	if (pmapdebug & (PDB_FOLLOW|PDB_CREATE))
1140		printf("pmap_create()\n");
1141#endif
1142
1143	pmap = pool_cache_get(&pmap_pmap_cache, PR_WAITOK);
1144	memset(pmap, 0, sizeof(*pmap));
1145
1146	/*
1147	 * Defer allocation of a new level 1 page table until
1148	 * the first new mapping is entered; just take a reference
1149	 * to the kernel kernel_lev1map.
1150	 */
1151	pmap->pm_lev1map = kernel_lev1map;
1152
1153	pmap->pm_count = 1;
1154	for (i = 0; i < pmap_ncpuids; i++) {
1155		pmap->pm_asni[i].pma_asn = PMAP_ASN_RESERVED;
1156		/* XXX Locking? */
1157		pmap->pm_asni[i].pma_asngen = pmap_asn_info[i].pma_asngen;
1158	}
1159	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1160
1161 try_again:
1162	rw_enter(&pmap_growkernel_lock, RW_READER);
1163
1164	if (pmap_lev1map_create(pmap, cpu_number()) != 0) {
1165		rw_exit(&pmap_growkernel_lock);
1166		(void) kpause("pmap_create", false, hz >> 2, NULL);
1167		goto try_again;
1168	}
1169
1170	mutex_enter(&pmap_all_pmaps_lock);
1171	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap, pm_list);
1172	mutex_exit(&pmap_all_pmaps_lock);
1173
1174	rw_exit(&pmap_growkernel_lock);
1175
1176	return (pmap);
1177}
1178
1179/*
1180 * pmap_destroy:		[ INTERFACE ]
1181 *
1182 *	Drop the reference count on the specified pmap, releasing
1183 *	all resources if the reference count drops to zero.
1184 */
1185void
1186pmap_destroy(pmap_t pmap)
1187{
1188
1189#ifdef DEBUG
1190	if (pmapdebug & PDB_FOLLOW)
1191		printf("pmap_destroy(%p)\n", pmap);
1192#endif
1193
1194	if (atomic_dec_uint_nv(&pmap->pm_count) > 0)
1195		return;
1196
1197	rw_enter(&pmap_growkernel_lock, RW_READER);
1198
1199	/*
1200	 * Remove it from the global list of all pmaps.
1201	 */
1202	mutex_enter(&pmap_all_pmaps_lock);
1203	TAILQ_REMOVE(&pmap_all_pmaps, pmap, pm_list);
1204	mutex_exit(&pmap_all_pmaps_lock);
1205
1206	pmap_lev1map_destroy(pmap, cpu_number());
1207
1208	rw_exit(&pmap_growkernel_lock);
1209
1210	/*
1211	 * Since the pmap is supposed to contain no valid
1212	 * mappings at this point, we should always see
1213	 * kernel_lev1map here.
1214	 */
1215	KASSERT(pmap->pm_lev1map == kernel_lev1map);
1216
1217	mutex_destroy(&pmap->pm_lock);
1218	pool_cache_put(&pmap_pmap_cache, pmap);
1219}
1220
1221/*
1222 * pmap_reference:		[ INTERFACE ]
1223 *
1224 *	Add a reference to the specified pmap.
1225 */
1226void
1227pmap_reference(pmap_t pmap)
1228{
1229
1230#ifdef DEBUG
1231	if (pmapdebug & PDB_FOLLOW)
1232		printf("pmap_reference(%p)\n", pmap);
1233#endif
1234
1235	atomic_inc_uint(&pmap->pm_count);
1236}
1237
1238/*
1239 * pmap_remove:			[ INTERFACE ]
1240 *
1241 *	Remove the given range of addresses from the specified map.
1242 *
1243 *	It is assumed that the start and end are properly
1244 *	rounded to the page size.
1245 */
1246void
1247pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
1248{
1249	pt_entry_t *l1pte, *l2pte, *l3pte;
1250	pt_entry_t *saved_l1pte, *saved_l2pte, *saved_l3pte;
1251	vaddr_t l1eva, l2eva, vptva;
1252	bool needisync = false;
1253	long cpu_id = cpu_number();
1254
1255#ifdef DEBUG
1256	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
1257		printf("pmap_remove(%p, %lx, %lx)\n", pmap, sva, eva);
1258#endif
1259
1260	/*
1261	 * If this is the kernel pmap, we can use a faster method
1262	 * for accessing the PTEs (since the PT pages are always
1263	 * resident).
1264	 *
1265	 * Note that this routine should NEVER be called from an
1266	 * interrupt context; pmap_kremove() is used for that.
1267	 */
1268	if (pmap == pmap_kernel()) {
1269		PMAP_MAP_TO_HEAD_LOCK();
1270		PMAP_LOCK(pmap);
1271
1272		while (sva < eva) {
1273			l3pte = PMAP_KERNEL_PTE(sva);
1274			if (pmap_pte_v(l3pte)) {
1275#ifdef DIAGNOSTIC
1276				if (uvm_pageismanaged(pmap_pte_pa(l3pte)) &&
1277				    pmap_pte_pv(l3pte) == 0)
1278					panic("pmap_remove: managed page "
1279					    "without PG_PVLIST for 0x%lx",
1280					    sva);
1281#endif
1282				needisync |= pmap_remove_mapping(pmap, sva,
1283				    l3pte, true, cpu_id);
1284			}
1285			sva += PAGE_SIZE;
1286		}
1287
1288		PMAP_UNLOCK(pmap);
1289		PMAP_MAP_TO_HEAD_UNLOCK();
1290
1291		if (needisync)
1292			PMAP_SYNC_ISTREAM_KERNEL();
1293		return;
1294	}
1295
1296#ifdef DIAGNOSTIC
1297	if (sva > VM_MAXUSER_ADDRESS || eva > VM_MAXUSER_ADDRESS)
1298		panic("pmap_remove: (0x%lx - 0x%lx) user pmap, kernel "
1299		    "address range", sva, eva);
1300#endif
1301
1302	PMAP_MAP_TO_HEAD_LOCK();
1303	PMAP_LOCK(pmap);
1304
1305	/*
1306	 * If we're already referencing the kernel_lev1map, there
1307	 * is no work for us to do.
1308	 */
1309	if (pmap->pm_lev1map == kernel_lev1map)
1310		goto out;
1311
1312	saved_l1pte = l1pte = pmap_l1pte(pmap, sva);
1313
1314	/*
1315	 * Add a reference to the L1 table to it won't get
1316	 * removed from under us.
1317	 */
1318	pmap_physpage_addref(saved_l1pte);
1319
1320	for (; sva < eva; sva = l1eva, l1pte++) {
1321		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1322		if (pmap_pte_v(l1pte)) {
1323			saved_l2pte = l2pte = pmap_l2pte(pmap, sva, l1pte);
1324
1325			/*
1326			 * Add a reference to the L2 table so it won't
1327			 * get removed from under us.
1328			 */
1329			pmap_physpage_addref(saved_l2pte);
1330
1331			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1332				l2eva =
1333				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1334				if (pmap_pte_v(l2pte)) {
1335					saved_l3pte = l3pte =
1336					    pmap_l3pte(pmap, sva, l2pte);
1337
1338					/*
1339					 * Add a reference to the L3 table so
1340					 * it won't get removed from under us.
1341					 */
1342					pmap_physpage_addref(saved_l3pte);
1343
1344					/*
1345					 * Remember this sva; if the L3 table
1346					 * gets removed, we need to invalidate
1347					 * the VPT TLB entry for it.
1348					 */
1349					vptva = sva;
1350
1351					for (; sva < l2eva && sva < eva;
1352					     sva += PAGE_SIZE, l3pte++) {
1353						if (!pmap_pte_v(l3pte)) {
1354							continue;
1355						}
1356						needisync |=
1357						    pmap_remove_mapping(
1358							pmap, sva,
1359							l3pte, true,
1360							cpu_id);
1361					}
1362
1363					/*
1364					 * Remove the reference to the L3
1365					 * table that we added above.  This
1366					 * may free the L3 table.
1367					 */
1368					pmap_l3pt_delref(pmap, vptva,
1369					    saved_l3pte, cpu_id);
1370				}
1371			}
1372
1373			/*
1374			 * Remove the reference to the L2 table that we
1375			 * added above.  This may free the L2 table.
1376			 */
1377			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, cpu_id);
1378		}
1379	}
1380
1381	/*
1382	 * Remove the reference to the L1 table that we added above.
1383	 * This may free the L1 table.
1384	 */
1385	pmap_l1pt_delref(pmap, saved_l1pte, cpu_id);
1386
1387	if (needisync)
1388		PMAP_SYNC_ISTREAM_USER(pmap);
1389
1390 out:
1391	PMAP_UNLOCK(pmap);
1392	PMAP_MAP_TO_HEAD_UNLOCK();
1393}
1394
1395/*
1396 * pmap_page_protect:		[ INTERFACE ]
1397 *
1398 *	Lower the permission for all mappings to a given page to
1399 *	the permissions specified.
1400 */
1401void
1402pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
1403{
1404	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
1405	pmap_t pmap;
1406	pv_entry_t pv, nextpv;
1407	bool needkisync = false;
1408	long cpu_id = cpu_number();
1409	kmutex_t *lock;
1410	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
1411#ifdef DEBUG
1412	paddr_t pa = VM_PAGE_TO_PHYS(pg);
1413
1414
1415	if ((pmapdebug & (PDB_FOLLOW|PDB_PROTECT)) ||
1416	    (prot == VM_PROT_NONE && (pmapdebug & PDB_REMOVE)))
1417		printf("pmap_page_protect(%p, %x)\n", pg, prot);
1418#endif
1419
1420	switch (prot) {
1421	case VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE:
1422	case VM_PROT_READ|VM_PROT_WRITE:
1423		return;
1424
1425	/* copy_on_write */
1426	case VM_PROT_READ|VM_PROT_EXECUTE:
1427	case VM_PROT_READ:
1428		PMAP_HEAD_TO_MAP_LOCK();
1429		lock = pmap_pvh_lock(pg);
1430		mutex_enter(lock);
1431		for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
1432			PMAP_LOCK(pv->pv_pmap);
1433			if (*pv->pv_pte & (PG_KWE | PG_UWE)) {
1434				*pv->pv_pte &= ~(PG_KWE | PG_UWE);
1435				PMAP_INVALIDATE_TLB(pv->pv_pmap, pv->pv_va,
1436				    pmap_pte_asm(pv->pv_pte),
1437				    PMAP_ISACTIVE(pv->pv_pmap, cpu_id), cpu_id);
1438				PMAP_TLB_SHOOTDOWN(pv->pv_pmap, pv->pv_va,
1439				    pmap_pte_asm(pv->pv_pte));
1440			}
1441			PMAP_UNLOCK(pv->pv_pmap);
1442		}
1443		mutex_exit(lock);
1444		PMAP_HEAD_TO_MAP_UNLOCK();
1445		PMAP_TLB_SHOOTNOW();
1446		return;
1447
1448	/* remove_all */
1449	default:
1450		break;
1451	}
1452
1453	PMAP_HEAD_TO_MAP_LOCK();
1454	lock = pmap_pvh_lock(pg);
1455	mutex_enter(lock);
1456	for (pv = md->pvh_list; pv != NULL; pv = nextpv) {
1457		nextpv = pv->pv_next;
1458		pmap = pv->pv_pmap;
1459
1460		PMAP_LOCK(pmap);
1461#ifdef DEBUG
1462		if (pmap_pte_v(pmap_l2pte(pv->pv_pmap, pv->pv_va, NULL)) == 0 ||
1463		    pmap_pte_pa(pv->pv_pte) != pa)
1464			panic("pmap_page_protect: bad mapping");
1465#endif
1466		if (pmap_remove_mapping(pmap, pv->pv_va, pv->pv_pte,
1467		    false, cpu_id) == true) {
1468			if (pmap == pmap_kernel())
1469				needkisync |= true;
1470			else
1471				PMAP_SYNC_ISTREAM_USER(pmap);
1472		}
1473		PMAP_UNLOCK(pmap);
1474	}
1475
1476	if (needkisync)
1477		PMAP_SYNC_ISTREAM_KERNEL();
1478
1479	mutex_exit(lock);
1480	PMAP_HEAD_TO_MAP_UNLOCK();
1481}
1482
1483/*
1484 * pmap_protect:		[ INTERFACE ]
1485 *
1486 *	Set the physical protection on the specified range of this map
1487 *	as requested.
1488 */
1489void
1490pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1491{
1492	pt_entry_t *l1pte, *l2pte, *l3pte, bits;
1493	bool isactive;
1494	bool hadasm;
1495	vaddr_t l1eva, l2eva;
1496	long cpu_id = cpu_number();
1497	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
1498
1499#ifdef DEBUG
1500	if (pmapdebug & (PDB_FOLLOW|PDB_PROTECT))
1501		printf("pmap_protect(%p, %lx, %lx, %x)\n",
1502		    pmap, sva, eva, prot);
1503#endif
1504
1505	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1506		pmap_remove(pmap, sva, eva);
1507		return;
1508	}
1509
1510	PMAP_LOCK(pmap);
1511
1512	bits = pte_prot(pmap, prot);
1513	isactive = PMAP_ISACTIVE(pmap, cpu_id);
1514
1515	l1pte = pmap_l1pte(pmap, sva);
1516	for (; sva < eva; sva = l1eva, l1pte++) {
1517		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1518		if (pmap_pte_v(l1pte)) {
1519			l2pte = pmap_l2pte(pmap, sva, l1pte);
1520			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1521				l2eva =
1522				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1523				if (pmap_pte_v(l2pte)) {
1524					l3pte = pmap_l3pte(pmap, sva, l2pte);
1525					for (; sva < l2eva && sva < eva;
1526					     sva += PAGE_SIZE, l3pte++) {
1527						if (pmap_pte_v(l3pte) &&
1528						    pmap_pte_prot_chg(l3pte,
1529						    bits)) {
1530							hadasm =
1531							   (pmap_pte_asm(l3pte)
1532							    != 0);
1533							pmap_pte_set_prot(l3pte,
1534							   bits);
1535							PMAP_INVALIDATE_TLB(
1536							   pmap, sva, hadasm,
1537							   isactive, cpu_id);
1538							PMAP_TLB_SHOOTDOWN(
1539							   pmap, sva,
1540							   hadasm ? PG_ASM : 0);
1541						}
1542					}
1543				}
1544			}
1545		}
1546	}
1547
1548	PMAP_TLB_SHOOTNOW();
1549
1550	if (prot & VM_PROT_EXECUTE)
1551		PMAP_SYNC_ISTREAM(pmap);
1552
1553	PMAP_UNLOCK(pmap);
1554}
1555
1556/*
1557 * pmap_enter:			[ INTERFACE ]
1558 *
1559 *	Insert the given physical page (p) at
1560 *	the specified virtual address (v) in the
1561 *	target physical map with the protection requested.
1562 *
1563 *	If specified, the page will be wired down, meaning
1564 *	that the related pte can not be reclaimed.
1565 *
1566 *	Note:  This is the only routine which MAY NOT lazy-evaluate
1567 *	or lose information.  That is, this routine must actually
1568 *	insert this page into the given map NOW.
1569 */
1570int
1571pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1572{
1573	struct vm_page *pg;			/* if != NULL, managed page */
1574	pt_entry_t *pte, npte, opte;
1575	paddr_t opa;
1576	bool tflush = true;
1577	bool hadasm = false;	/* XXX gcc -Wuninitialized */
1578	bool needisync = false;
1579	bool setisync = false;
1580	bool isactive;
1581	bool wired;
1582	long cpu_id = cpu_number();
1583	int error = 0;
1584	kmutex_t *lock;
1585	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
1586
1587#ifdef DEBUG
1588	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
1589		printf("pmap_enter(%p, %lx, %lx, %x, %x)\n",
1590		       pmap, va, pa, prot, flags);
1591#endif
1592	pg = PHYS_TO_VM_PAGE(pa);
1593	isactive = PMAP_ISACTIVE(pmap, cpu_id);
1594	wired = (flags & PMAP_WIRED) != 0;
1595
1596	/*
1597	 * Determine what we need to do about the I-stream.  If
1598	 * VM_PROT_EXECUTE is set, we mark a user pmap as needing
1599	 * an I-sync on the way back out to userspace.  We always
1600	 * need an immediate I-sync for the kernel pmap.
1601	 */
1602	if (prot & VM_PROT_EXECUTE) {
1603		if (pmap == pmap_kernel())
1604			needisync = true;
1605		else {
1606			setisync = true;
1607			needisync = (pmap->pm_cpus != 0);
1608		}
1609	}
1610
1611	PMAP_MAP_TO_HEAD_LOCK();
1612	PMAP_LOCK(pmap);
1613
1614	if (pmap == pmap_kernel()) {
1615#ifdef DIAGNOSTIC
1616		/*
1617		 * Sanity check the virtual address.
1618		 */
1619		if (va < VM_MIN_KERNEL_ADDRESS)
1620			panic("pmap_enter: kernel pmap, invalid va 0x%lx", va);
1621#endif
1622		pte = PMAP_KERNEL_PTE(va);
1623	} else {
1624		pt_entry_t *l1pte, *l2pte;
1625
1626#ifdef DIAGNOSTIC
1627		/*
1628		 * Sanity check the virtual address.
1629		 */
1630		if (va >= VM_MAXUSER_ADDRESS)
1631			panic("pmap_enter: user pmap, invalid va 0x%lx", va);
1632#endif
1633
1634		KASSERT(pmap->pm_lev1map != kernel_lev1map);
1635
1636		/*
1637		 * Check to see if the level 1 PTE is valid, and
1638		 * allocate a new level 2 page table page if it's not.
1639		 * A reference will be added to the level 2 table when
1640		 * the level 3 table is created.
1641		 */
1642		l1pte = pmap_l1pte(pmap, va);
1643		if (pmap_pte_v(l1pte) == 0) {
1644			pmap_physpage_addref(l1pte);
1645			error = pmap_ptpage_alloc(pmap, l1pte, PGU_L2PT);
1646			if (error) {
1647				pmap_l1pt_delref(pmap, l1pte, cpu_id);
1648				if (flags & PMAP_CANFAIL)
1649					goto out;
1650				panic("pmap_enter: unable to create L2 PT "
1651				    "page");
1652			}
1653#ifdef DEBUG
1654			if (pmapdebug & PDB_PTPAGE)
1655				printf("pmap_enter: new level 2 table at "
1656				    "0x%lx\n", pmap_pte_pa(l1pte));
1657#endif
1658		}
1659
1660		/*
1661		 * Check to see if the level 2 PTE is valid, and
1662		 * allocate a new level 3 page table page if it's not.
1663		 * A reference will be added to the level 3 table when
1664		 * the mapping is validated.
1665		 */
1666		l2pte = pmap_l2pte(pmap, va, l1pte);
1667		if (pmap_pte_v(l2pte) == 0) {
1668			pmap_physpage_addref(l2pte);
1669			error = pmap_ptpage_alloc(pmap, l2pte, PGU_L3PT);
1670			if (error) {
1671				pmap_l2pt_delref(pmap, l1pte, l2pte, cpu_id);
1672				if (flags & PMAP_CANFAIL)
1673					goto out;
1674				panic("pmap_enter: unable to create L3 PT "
1675				    "page");
1676			}
1677#ifdef DEBUG
1678			if (pmapdebug & PDB_PTPAGE)
1679				printf("pmap_enter: new level 3 table at "
1680				    "0x%lx\n", pmap_pte_pa(l2pte));
1681#endif
1682		}
1683
1684		/*
1685		 * Get the PTE that will map the page.
1686		 */
1687		pte = pmap_l3pte(pmap, va, l2pte);
1688	}
1689
1690	/* Remember all of the old PTE; used for TBI check later. */
1691	opte = *pte;
1692
1693	/*
1694	 * Check to see if the old mapping is valid.  If not, validate the
1695	 * new one immediately.
1696	 */
1697	if (pmap_pte_v(pte) == 0) {
1698		/*
1699		 * No need to invalidate the TLB in this case; an invalid
1700		 * mapping won't be in the TLB, and a previously valid
1701		 * mapping would have been flushed when it was invalidated.
1702		 */
1703		tflush = false;
1704
1705		/*
1706		 * No need to synchronize the I-stream, either, for basically
1707		 * the same reason.
1708		 */
1709		setisync = needisync = false;
1710
1711		if (pmap != pmap_kernel()) {
1712			/*
1713			 * New mappings gain a reference on the level 3
1714			 * table.
1715			 */
1716			pmap_physpage_addref(pte);
1717		}
1718		goto validate_enterpv;
1719	}
1720
1721	opa = pmap_pte_pa(pte);
1722	hadasm = (pmap_pte_asm(pte) != 0);
1723
1724	if (opa == pa) {
1725		/*
1726		 * Mapping has not changed; must be a protection or
1727		 * wiring change.
1728		 */
1729		if (pmap_pte_w_chg(pte, wired ? PG_WIRED : 0)) {
1730#ifdef DEBUG
1731			if (pmapdebug & PDB_ENTER)
1732				printf("pmap_enter: wiring change -> %d\n",
1733				    wired);
1734#endif
1735			/*
1736			 * Adjust the wiring count.
1737			 */
1738			if (wired)
1739				PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
1740			else
1741				PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
1742		}
1743
1744		/*
1745		 * Set the PTE.
1746		 */
1747		goto validate;
1748	}
1749
1750	/*
1751	 * The mapping has changed.  We need to invalidate the
1752	 * old mapping before creating the new one.
1753	 */
1754#ifdef DEBUG
1755	if (pmapdebug & PDB_ENTER)
1756		printf("pmap_enter: removing old mapping 0x%lx\n", va);
1757#endif
1758	if (pmap != pmap_kernel()) {
1759		/*
1760		 * Gain an extra reference on the level 3 table.
1761		 * pmap_remove_mapping() will delete a reference,
1762		 * and we don't want the table to be erroneously
1763		 * freed.
1764		 */
1765		pmap_physpage_addref(pte);
1766	}
1767	needisync |= pmap_remove_mapping(pmap, va, pte, true, cpu_id);
1768
1769 validate_enterpv:
1770	/*
1771	 * Enter the mapping into the pv_table if appropriate.
1772	 */
1773	if (pg != NULL) {
1774		error = pmap_pv_enter(pmap, pg, va, pte, true);
1775		if (error) {
1776			pmap_l3pt_delref(pmap, va, pte, cpu_id);
1777			if (flags & PMAP_CANFAIL)
1778				goto out;
1779			panic("pmap_enter: unable to enter mapping in PV "
1780			    "table");
1781		}
1782	}
1783
1784	/*
1785	 * Increment counters.
1786	 */
1787	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
1788	if (wired)
1789		PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
1790
1791 validate:
1792	/*
1793	 * Build the new PTE.
1794	 */
1795	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap, prot) | PG_V;
1796	if (pg != NULL) {
1797		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
1798		int attrs;
1799
1800#ifdef DIAGNOSTIC
1801		if ((flags & VM_PROT_ALL) & ~prot)
1802			panic("pmap_enter: access type exceeds prot");
1803#endif
1804		lock = pmap_pvh_lock(pg);
1805		mutex_enter(lock);
1806		if (flags & VM_PROT_WRITE)
1807			md->pvh_attrs |= (PGA_REFERENCED|PGA_MODIFIED);
1808		else if (flags & VM_PROT_ALL)
1809			md->pvh_attrs |= PGA_REFERENCED;
1810		attrs = md->pvh_attrs;
1811		mutex_exit(lock);
1812
1813		/*
1814		 * Set up referenced/modified emulation for new mapping.
1815		 */
1816		if ((attrs & PGA_REFERENCED) == 0)
1817			npte |= PG_FOR | PG_FOW | PG_FOE;
1818		else if ((attrs & PGA_MODIFIED) == 0)
1819			npte |= PG_FOW;
1820
1821		/*
1822		 * Mapping was entered on PV list.
1823		 */
1824		npte |= PG_PVLIST;
1825	}
1826	if (wired)
1827		npte |= PG_WIRED;
1828#ifdef DEBUG
1829	if (pmapdebug & PDB_ENTER)
1830		printf("pmap_enter: new pte = 0x%lx\n", npte);
1831#endif
1832
1833	/*
1834	 * If the PALcode portion of the new PTE is the same as the
1835	 * old PTE, no TBI is necessary.
1836	 */
1837	if (PG_PALCODE(opte) == PG_PALCODE(npte))
1838		tflush = false;
1839
1840	/*
1841	 * Set the new PTE.
1842	 */
1843	PMAP_SET_PTE(pte, npte);
1844
1845	/*
1846	 * Invalidate the TLB entry for this VA and any appropriate
1847	 * caches.
1848	 */
1849	if (tflush) {
1850		PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id);
1851		PMAP_TLB_SHOOTDOWN(pmap, va, hadasm ? PG_ASM : 0);
1852		PMAP_TLB_SHOOTNOW();
1853	}
1854	if (setisync)
1855		PMAP_SET_NEEDISYNC(pmap);
1856	if (needisync)
1857		PMAP_SYNC_ISTREAM(pmap);
1858
1859out:
1860	PMAP_UNLOCK(pmap);
1861	PMAP_MAP_TO_HEAD_UNLOCK();
1862
1863	return error;
1864}
1865
1866/*
1867 * pmap_kenter_pa:		[ INTERFACE ]
1868 *
1869 *	Enter a va -> pa mapping into the kernel pmap without any
1870 *	physical->virtual tracking.
1871 *
1872 *	Note: no locking is necessary in this function.
1873 */
1874void
1875pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1876{
1877	pt_entry_t *pte, npte;
1878	long cpu_id = cpu_number();
1879	bool needisync = false;
1880	pmap_t pmap = pmap_kernel();
1881	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
1882
1883#ifdef DEBUG
1884	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
1885		printf("pmap_kenter_pa(%lx, %lx, %x)\n",
1886		    va, pa, prot);
1887#endif
1888
1889#ifdef DIAGNOSTIC
1890	/*
1891	 * Sanity check the virtual address.
1892	 */
1893	if (va < VM_MIN_KERNEL_ADDRESS)
1894		panic("pmap_kenter_pa: kernel pmap, invalid va 0x%lx", va);
1895#endif
1896
1897	pte = PMAP_KERNEL_PTE(va);
1898
1899	if (pmap_pte_v(pte) == 0)
1900		PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
1901	if (pmap_pte_w(pte) == 0)
1902		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
1903
1904	if ((prot & VM_PROT_EXECUTE) != 0 || pmap_pte_exec(pte))
1905		needisync = true;
1906
1907	/*
1908	 * Build the new PTE.
1909	 */
1910	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
1911	    PG_V | PG_WIRED;
1912
1913	/*
1914	 * Set the new PTE.
1915	 */
1916	PMAP_SET_PTE(pte, npte);
1917#if defined(MULTIPROCESSOR)
1918	alpha_mb();		/* XXX alpha_wmb()? */
1919#endif
1920
1921	/*
1922	 * Invalidate the TLB entry for this VA and any appropriate
1923	 * caches.
1924	 */
1925	PMAP_INVALIDATE_TLB(pmap, va, true, true, cpu_id);
1926	PMAP_TLB_SHOOTDOWN(pmap, va, PG_ASM);
1927	PMAP_TLB_SHOOTNOW();
1928
1929	if (needisync)
1930		PMAP_SYNC_ISTREAM_KERNEL();
1931}
1932
1933/*
1934 * pmap_kremove:		[ INTERFACE ]
1935 *
1936 *	Remove a mapping entered with pmap_kenter_pa() starting at va,
1937 *	for size bytes (assumed to be page rounded).
1938 */
1939void
1940pmap_kremove(vaddr_t va, vsize_t size)
1941{
1942	pt_entry_t *pte;
1943	bool needisync = false;
1944	long cpu_id = cpu_number();
1945	pmap_t pmap = pmap_kernel();
1946	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
1947
1948#ifdef DEBUG
1949	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
1950		printf("pmap_kremove(%lx, %lx)\n",
1951		    va, size);
1952#endif
1953
1954#ifdef DIAGNOSTIC
1955	if (va < VM_MIN_KERNEL_ADDRESS)
1956		panic("pmap_kremove: user address");
1957#endif
1958
1959	for (; size != 0; size -= PAGE_SIZE, va += PAGE_SIZE) {
1960		pte = PMAP_KERNEL_PTE(va);
1961		if (pmap_pte_v(pte)) {
1962#ifdef DIAGNOSTIC
1963			if (pmap_pte_pv(pte))
1964				panic("pmap_kremove: PG_PVLIST mapping for "
1965				    "0x%lx", va);
1966#endif
1967			if (pmap_pte_exec(pte))
1968				needisync = true;
1969
1970			/* Zap the mapping. */
1971			PMAP_SET_PTE(pte, PG_NV);
1972#if defined(MULTIPROCESSOR)
1973			alpha_mb();		/* XXX alpha_wmb()? */
1974#endif
1975			PMAP_INVALIDATE_TLB(pmap, va, true, true, cpu_id);
1976			PMAP_TLB_SHOOTDOWN(pmap, va, PG_ASM);
1977
1978			/* Update stats. */
1979			PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
1980			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
1981		}
1982	}
1983
1984	PMAP_TLB_SHOOTNOW();
1985
1986	if (needisync)
1987		PMAP_SYNC_ISTREAM_KERNEL();
1988}
1989
1990/*
1991 * pmap_unwire:			[ INTERFACE ]
1992 *
1993 *	Clear the wired attribute for a map/virtual-address pair.
1994 *
1995 *	The mapping must already exist in the pmap.
1996 */
1997void
1998pmap_unwire(pmap_t pmap, vaddr_t va)
1999{
2000	pt_entry_t *pte;
2001
2002#ifdef DEBUG
2003	if (pmapdebug & PDB_FOLLOW)
2004		printf("pmap_unwire(%p, %lx)\n", pmap, va);
2005#endif
2006
2007	PMAP_LOCK(pmap);
2008
2009	pte = pmap_l3pte(pmap, va, NULL);
2010#ifdef DIAGNOSTIC
2011	if (pte == NULL || pmap_pte_v(pte) == 0)
2012		panic("pmap_unwire");
2013#endif
2014
2015	/*
2016	 * If wiring actually changed (always?) clear the wire bit and
2017	 * update the wire count.  Note that wiring is not a hardware
2018	 * characteristic so there is no need to invalidate the TLB.
2019	 */
2020	if (pmap_pte_w_chg(pte, 0)) {
2021		pmap_pte_set_w(pte, false);
2022		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2023	}
2024#ifdef DIAGNOSTIC
2025	else {
2026		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
2027		    "didn't change!\n", pmap, va);
2028	}
2029#endif
2030
2031	PMAP_UNLOCK(pmap);
2032}
2033
2034/*
2035 * pmap_extract:		[ INTERFACE ]
2036 *
2037 *	Extract the physical address associated with the given
2038 *	pmap/virtual address pair.
2039 */
2040bool
2041pmap_extract(pmap_t pmap, vaddr_t va, paddr_t *pap)
2042{
2043	pt_entry_t *l1pte, *l2pte, *l3pte;
2044	paddr_t pa;
2045
2046#ifdef DEBUG
2047	if (pmapdebug & PDB_FOLLOW)
2048		printf("pmap_extract(%p, %lx) -> ", pmap, va);
2049#endif
2050
2051	/*
2052	 * Take a faster path for the kernel pmap.  Avoids locking,
2053	 * handles K0SEG.
2054	 */
2055	if (pmap == pmap_kernel()) {
2056		pa = vtophys(va);
2057		if (pap != NULL)
2058			*pap = pa;
2059#ifdef DEBUG
2060		if (pmapdebug & PDB_FOLLOW)
2061			printf("0x%lx (kernel vtophys)\n", pa);
2062#endif
2063		return (pa != 0);	/* XXX */
2064	}
2065
2066	PMAP_LOCK(pmap);
2067
2068	l1pte = pmap_l1pte(pmap, va);
2069	if (pmap_pte_v(l1pte) == 0)
2070		goto out;
2071
2072	l2pte = pmap_l2pte(pmap, va, l1pte);
2073	if (pmap_pte_v(l2pte) == 0)
2074		goto out;
2075
2076	l3pte = pmap_l3pte(pmap, va, l2pte);
2077	if (pmap_pte_v(l3pte) == 0)
2078		goto out;
2079
2080	pa = pmap_pte_pa(l3pte) | (va & PGOFSET);
2081	PMAP_UNLOCK(pmap);
2082	if (pap != NULL)
2083		*pap = pa;
2084#ifdef DEBUG
2085	if (pmapdebug & PDB_FOLLOW)
2086		printf("0x%lx\n", pa);
2087#endif
2088	return (true);
2089
2090 out:
2091	PMAP_UNLOCK(pmap);
2092#ifdef DEBUG
2093	if (pmapdebug & PDB_FOLLOW)
2094		printf("failed\n");
2095#endif
2096	return (false);
2097}
2098
2099/*
2100 * pmap_copy:			[ INTERFACE ]
2101 *
2102 *	Copy the mapping range specified by src_addr/len
2103 *	from the source map to the range dst_addr/len
2104 *	in the destination map.
2105 *
2106 *	This routine is only advisory and need not do anything.
2107 */
2108/* call deleted in <machine/pmap.h> */
2109
2110/*
2111 * pmap_update:			[ INTERFACE ]
2112 *
2113 *	Require that all active physical maps contain no
2114 *	incorrect entries NOW, by processing any deferred
2115 *	pmap operations.
2116 */
2117/* call deleted in <machine/pmap.h> */
2118
2119/*
2120 * pmap_activate:		[ INTERFACE ]
2121 *
2122 *	Activate the pmap used by the specified process.  This includes
2123 *	reloading the MMU context if the current process, and marking
2124 *	the pmap in use by the processor.
2125 */
2126void
2127pmap_activate(struct lwp *l)
2128{
2129	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
2130	long cpu_id = cpu_number();
2131
2132#ifdef DEBUG
2133	if (pmapdebug & PDB_FOLLOW)
2134		printf("pmap_activate(%p)\n", l);
2135#endif
2136
2137	/* Mark the pmap in use by this processor. */
2138	atomic_or_ulong(&pmap->pm_cpus, (1UL << cpu_id));
2139
2140	/* Allocate an ASN. */
2141	pmap_asn_alloc(pmap, cpu_id);
2142
2143	PMAP_ACTIVATE(pmap, l, cpu_id);
2144}
2145
2146/*
2147 * pmap_deactivate:		[ INTERFACE ]
2148 *
2149 *	Mark that the pmap used by the specified process is no longer
2150 *	in use by the processor.
2151 *
2152 *	The comment above pmap_activate() wrt. locking applies here,
2153 *	as well.  Note that we use only a single `atomic' operation,
2154 *	so no locking is necessary.
2155 */
2156void
2157pmap_deactivate(struct lwp *l)
2158{
2159	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
2160
2161#ifdef DEBUG
2162	if (pmapdebug & PDB_FOLLOW)
2163		printf("pmap_deactivate(%p)\n", l);
2164#endif
2165
2166	/*
2167	 * Mark the pmap no longer in use by this processor.
2168	 */
2169	atomic_and_ulong(&pmap->pm_cpus, ~(1UL << cpu_number()));
2170}
2171
2172/*
2173 * pmap_zero_page:		[ INTERFACE ]
2174 *
2175 *	Zero the specified (machine independent) page by mapping the page
2176 *	into virtual memory and clear its contents, one machine dependent
2177 *	page at a time.
2178 *
2179 *	Note: no locking is necessary in this function.
2180 */
2181void
2182pmap_zero_page(paddr_t phys)
2183{
2184	u_long *p0, *p1, *pend;
2185
2186#ifdef DEBUG
2187	if (pmapdebug & PDB_FOLLOW)
2188		printf("pmap_zero_page(%lx)\n", phys);
2189#endif
2190
2191	p0 = (u_long *)ALPHA_PHYS_TO_K0SEG(phys);
2192	p1 = NULL;
2193	pend = (u_long *)((u_long)p0 + PAGE_SIZE);
2194
2195	/*
2196	 * Unroll the loop a bit, doing 16 quadwords per iteration.
2197	 * Do only 8 back-to-back stores, and alternate registers.
2198	 */
2199	do {
2200		__asm volatile(
2201		"# BEGIN loop body\n"
2202		"	addq	%2, (8 * 8), %1		\n"
2203		"	stq	$31, (0 * 8)(%0)	\n"
2204		"	stq	$31, (1 * 8)(%0)	\n"
2205		"	stq	$31, (2 * 8)(%0)	\n"
2206		"	stq	$31, (3 * 8)(%0)	\n"
2207		"	stq	$31, (4 * 8)(%0)	\n"
2208		"	stq	$31, (5 * 8)(%0)	\n"
2209		"	stq	$31, (6 * 8)(%0)	\n"
2210		"	stq	$31, (7 * 8)(%0)	\n"
2211		"					\n"
2212		"	addq	%3, (8 * 8), %0		\n"
2213		"	stq	$31, (0 * 8)(%1)	\n"
2214		"	stq	$31, (1 * 8)(%1)	\n"
2215		"	stq	$31, (2 * 8)(%1)	\n"
2216		"	stq	$31, (3 * 8)(%1)	\n"
2217		"	stq	$31, (4 * 8)(%1)	\n"
2218		"	stq	$31, (5 * 8)(%1)	\n"
2219		"	stq	$31, (6 * 8)(%1)	\n"
2220		"	stq	$31, (7 * 8)(%1)	\n"
2221		"	# END loop body"
2222		: "=r" (p0), "=r" (p1)
2223		: "0" (p0), "1" (p1)
2224		: "memory");
2225	} while (p0 < pend);
2226}
2227
2228/*
2229 * pmap_copy_page:		[ INTERFACE ]
2230 *
2231 *	Copy the specified (machine independent) page by mapping the page
2232 *	into virtual memory and using memcpy to copy the page, one machine
2233 *	dependent page at a time.
2234 *
2235 *	Note: no locking is necessary in this function.
2236 */
2237void
2238pmap_copy_page(paddr_t src, paddr_t dst)
2239{
2240	const void *s;
2241	void *d;
2242
2243#ifdef DEBUG
2244	if (pmapdebug & PDB_FOLLOW)
2245		printf("pmap_copy_page(%lx, %lx)\n", src, dst);
2246#endif
2247	s = (const void *)ALPHA_PHYS_TO_K0SEG(src);
2248	d = (void *)ALPHA_PHYS_TO_K0SEG(dst);
2249	memcpy(d, s, PAGE_SIZE);
2250}
2251
2252/*
2253 * pmap_pageidlezero:		[ INTERFACE ]
2254 *
2255 *	Page zero'er for the idle loop.  Returns true if the
2256 *	page was zero'd, FLASE if we aborted for some reason.
2257 */
2258bool
2259pmap_pageidlezero(paddr_t pa)
2260{
2261	u_long *ptr;
2262	int i, cnt = PAGE_SIZE / sizeof(u_long);
2263
2264	for (i = 0, ptr = (u_long *) ALPHA_PHYS_TO_K0SEG(pa); i < cnt; i++) {
2265		if (sched_curcpu_runnable_p()) {
2266			/*
2267			 * An LWP has become ready.  Abort now,
2268			 * so we don't keep it waiting while we
2269			 * finish zeroing the page.
2270			 */
2271			return (false);
2272		}
2273		*ptr++ = 0;
2274	}
2275
2276	return (true);
2277}
2278
2279/*
2280 * pmap_clear_modify:		[ INTERFACE ]
2281 *
2282 *	Clear the modify bits on the specified physical page.
2283 */
2284bool
2285pmap_clear_modify(struct vm_page *pg)
2286{
2287	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2288	bool rv = false;
2289	long cpu_id = cpu_number();
2290	kmutex_t *lock;
2291
2292#ifdef DEBUG
2293	if (pmapdebug & PDB_FOLLOW)
2294		printf("pmap_clear_modify(%p)\n", pg);
2295#endif
2296
2297	PMAP_HEAD_TO_MAP_LOCK();
2298	lock = pmap_pvh_lock(pg);
2299	mutex_enter(lock);
2300
2301	if (md->pvh_attrs & PGA_MODIFIED) {
2302		rv = true;
2303		pmap_changebit(pg, PG_FOW, ~0, cpu_id);
2304		md->pvh_attrs &= ~PGA_MODIFIED;
2305	}
2306
2307	mutex_exit(lock);
2308	PMAP_HEAD_TO_MAP_UNLOCK();
2309
2310	return (rv);
2311}
2312
2313/*
2314 * pmap_clear_reference:	[ INTERFACE ]
2315 *
2316 *	Clear the reference bit on the specified physical page.
2317 */
2318bool
2319pmap_clear_reference(struct vm_page *pg)
2320{
2321	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2322	bool rv = false;
2323	long cpu_id = cpu_number();
2324	kmutex_t *lock;
2325
2326#ifdef DEBUG
2327	if (pmapdebug & PDB_FOLLOW)
2328		printf("pmap_clear_reference(%p)\n", pg);
2329#endif
2330
2331	PMAP_HEAD_TO_MAP_LOCK();
2332	lock = pmap_pvh_lock(pg);
2333	mutex_enter(lock);
2334
2335	if (md->pvh_attrs & PGA_REFERENCED) {
2336		rv = true;
2337		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0, cpu_id);
2338		md->pvh_attrs &= ~PGA_REFERENCED;
2339	}
2340
2341	mutex_exit(lock);
2342	PMAP_HEAD_TO_MAP_UNLOCK();
2343
2344	return (rv);
2345}
2346
2347/*
2348 * pmap_is_referenced:		[ INTERFACE ]
2349 *
2350 *	Return whether or not the specified physical page is referenced
2351 *	by any physical maps.
2352 */
2353/* See <machine/pmap.h> */
2354
2355/*
2356 * pmap_is_modified:		[ INTERFACE ]
2357 *
2358 *	Return whether or not the specified physical page is modified
2359 *	by any physical maps.
2360 */
2361/* See <machine/pmap.h> */
2362
2363/*
2364 * pmap_phys_address:		[ INTERFACE ]
2365 *
2366 *	Return the physical address corresponding to the specified
2367 *	cookie.  Used by the device pager to decode a device driver's
2368 *	mmap entry point return value.
2369 *
2370 *	Note: no locking is necessary in this function.
2371 */
2372paddr_t
2373pmap_phys_address(paddr_t ppn)
2374{
2375
2376	return (alpha_ptob(ppn));
2377}
2378
2379/*
2380 * Miscellaneous support routines follow
2381 */
2382
2383/*
2384 * alpha_protection_init:
2385 *
2386 *	Initialize Alpha protection code array.
2387 *
2388 *	Note: no locking is necessary in this function.
2389 */
2390static void
2391alpha_protection_init(void)
2392{
2393	int prot, *kp, *up;
2394
2395	kp = protection_codes[0];
2396	up = protection_codes[1];
2397
2398	for (prot = 0; prot < 8; prot++) {
2399		kp[prot] = PG_ASM;
2400		up[prot] = 0;
2401
2402		if (prot & VM_PROT_READ) {
2403			kp[prot] |= PG_KRE;
2404			up[prot] |= PG_KRE | PG_URE;
2405		}
2406		if (prot & VM_PROT_WRITE) {
2407			kp[prot] |= PG_KWE;
2408			up[prot] |= PG_KWE | PG_UWE;
2409		}
2410		if (prot & VM_PROT_EXECUTE) {
2411			kp[prot] |= PG_EXEC | PG_KRE;
2412			up[prot] |= PG_EXEC | PG_KRE | PG_URE;
2413		} else {
2414			kp[prot] |= PG_FOE;
2415			up[prot] |= PG_FOE;
2416		}
2417	}
2418}
2419
2420/*
2421 * pmap_remove_mapping:
2422 *
2423 *	Invalidate a single page denoted by pmap/va.
2424 *
2425 *	If (pte != NULL), it is the already computed PTE for the page.
2426 *
2427 *	Note: locking in this function is complicated by the fact
2428 *	that we can be called when the PV list is already locked.
2429 *	(pmap_page_protect()).  In this case, the caller must be
2430 *	careful to get the next PV entry while we remove this entry
2431 *	from beneath it.  We assume that the pmap itself is already
2432 *	locked; dolock applies only to the PV list.
2433 *
2434 *	Returns true or false, indicating if an I-stream sync needs
2435 *	to be initiated (for this CPU or for other CPUs).
2436 */
2437static bool
2438pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *pte,
2439    bool dolock, long cpu_id)
2440{
2441	paddr_t pa;
2442	struct vm_page *pg;		/* if != NULL, page is managed */
2443	bool onpv;
2444	bool hadasm;
2445	bool isactive;
2446	bool needisync = false;
2447	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
2448
2449#ifdef DEBUG
2450	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
2451		printf("pmap_remove_mapping(%p, %lx, %p, %d, %ld)\n",
2452		       pmap, va, pte, dolock, cpu_id);
2453#endif
2454
2455	/*
2456	 * PTE not provided, compute it from pmap and va.
2457	 */
2458	if (pte == NULL) {
2459		pte = pmap_l3pte(pmap, va, NULL);
2460		if (pmap_pte_v(pte) == 0)
2461			return (false);
2462	}
2463
2464	pa = pmap_pte_pa(pte);
2465	onpv = (pmap_pte_pv(pte) != 0);
2466	hadasm = (pmap_pte_asm(pte) != 0);
2467	isactive = PMAP_ISACTIVE(pmap, cpu_id);
2468
2469	/*
2470	 * Determine what we need to do about the I-stream.  If
2471	 * PG_EXEC was set, we mark a user pmap as needing an
2472	 * I-sync on the way out to userspace.  We always need
2473	 * an immediate I-sync for the kernel pmap.
2474	 */
2475	if (pmap_pte_exec(pte)) {
2476		if (pmap == pmap_kernel())
2477			needisync = true;
2478		else {
2479			PMAP_SET_NEEDISYNC(pmap);
2480			needisync = (pmap->pm_cpus != 0);
2481		}
2482	}
2483
2484	/*
2485	 * Update statistics
2486	 */
2487	if (pmap_pte_w(pte))
2488		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2489	PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2490
2491	/*
2492	 * Invalidate the PTE after saving the reference modify info.
2493	 */
2494#ifdef DEBUG
2495	if (pmapdebug & PDB_REMOVE)
2496		printf("remove: invalidating pte at %p\n", pte);
2497#endif
2498	PMAP_SET_PTE(pte, PG_NV);
2499
2500	PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id);
2501	PMAP_TLB_SHOOTDOWN(pmap, va, hadasm ? PG_ASM : 0);
2502	PMAP_TLB_SHOOTNOW();
2503
2504	/*
2505	 * If we're removing a user mapping, check to see if we
2506	 * can free page table pages.
2507	 */
2508	if (pmap != pmap_kernel()) {
2509		/*
2510		 * Delete the reference on the level 3 table.  It will
2511		 * delete references on the level 2 and 1 tables as
2512		 * appropriate.
2513		 */
2514		pmap_l3pt_delref(pmap, va, pte, cpu_id);
2515	}
2516
2517	/*
2518	 * If the mapping wasn't entered on the PV list, we're all done.
2519	 */
2520	if (onpv == false)
2521		return (needisync);
2522
2523	/*
2524	 * Remove it from the PV table.
2525	 */
2526	pg = PHYS_TO_VM_PAGE(pa);
2527	KASSERT(pg != NULL);
2528	pmap_pv_remove(pmap, pg, va, dolock);
2529
2530	return (needisync);
2531}
2532
2533/*
2534 * pmap_changebit:
2535 *
2536 *	Set or clear the specified PTE bits for all mappings on the
2537 *	specified page.
2538 *
2539 *	Note: we assume that the pv_head is already locked, and that
2540 *	the caller has acquired a PV->pmap mutex so that we can lock
2541 *	the pmaps as we encounter them.
2542 */
2543static void
2544pmap_changebit(struct vm_page *pg, u_long set, u_long mask, long cpu_id)
2545{
2546	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2547	pv_entry_t pv;
2548	pt_entry_t *pte, npte;
2549	vaddr_t va;
2550	bool hadasm, isactive;
2551	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
2552
2553#ifdef DEBUG
2554	if (pmapdebug & PDB_BITS)
2555		printf("pmap_changebit(%p, 0x%lx, 0x%lx)\n",
2556		    pg, set, mask);
2557#endif
2558
2559	/*
2560	 * Loop over all current mappings setting/clearing as apropos.
2561	 */
2562	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
2563		va = pv->pv_va;
2564
2565		PMAP_LOCK(pv->pv_pmap);
2566
2567		pte = pv->pv_pte;
2568		npte = (*pte | set) & mask;
2569		if (*pte != npte) {
2570			hadasm = (pmap_pte_asm(pte) != 0);
2571			isactive = PMAP_ISACTIVE(pv->pv_pmap, cpu_id);
2572			PMAP_SET_PTE(pte, npte);
2573			PMAP_INVALIDATE_TLB(pv->pv_pmap, va, hadasm, isactive,
2574			    cpu_id);
2575			PMAP_TLB_SHOOTDOWN(pv->pv_pmap, va,
2576			    hadasm ? PG_ASM : 0);
2577		}
2578		PMAP_UNLOCK(pv->pv_pmap);
2579	}
2580
2581	PMAP_TLB_SHOOTNOW();
2582}
2583
2584/*
2585 * pmap_emulate_reference:
2586 *
2587 *	Emulate reference and/or modified bit hits.
2588 *	Return 1 if this was an execute fault on a non-exec mapping,
2589 *	otherwise return 0.
2590 */
2591int
2592pmap_emulate_reference(struct lwp *l, vaddr_t v, int user, int type)
2593{
2594	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
2595	pt_entry_t faultoff, *pte;
2596	struct vm_page *pg;
2597	paddr_t pa;
2598	bool didlock = false;
2599	bool exec = false;
2600	long cpu_id = cpu_number();
2601	kmutex_t *lock;
2602
2603#ifdef DEBUG
2604	if (pmapdebug & PDB_FOLLOW)
2605		printf("pmap_emulate_reference: %p, 0x%lx, %d, %d\n",
2606		    l, v, user, type);
2607#endif
2608
2609	/*
2610	 * Convert process and virtual address to physical address.
2611	 */
2612	if (v >= VM_MIN_KERNEL_ADDRESS) {
2613		if (user)
2614			panic("pmap_emulate_reference: user ref to kernel");
2615		/*
2616		 * No need to lock here; kernel PT pages never go away.
2617		 */
2618		pte = PMAP_KERNEL_PTE(v);
2619	} else {
2620#ifdef DIAGNOSTIC
2621		if (l == NULL)
2622			panic("pmap_emulate_reference: bad proc");
2623		if (l->l_proc->p_vmspace == NULL)
2624			panic("pmap_emulate_reference: bad p_vmspace");
2625#endif
2626		PMAP_LOCK(pmap);
2627		didlock = true;
2628		pte = pmap_l3pte(pmap, v, NULL);
2629		/*
2630		 * We'll unlock below where we're done with the PTE.
2631		 */
2632	}
2633	exec = pmap_pte_exec(pte);
2634	if (!exec && type == ALPHA_MMCSR_FOE) {
2635		if (didlock)
2636			PMAP_UNLOCK(pmap);
2637	       return (1);
2638	}
2639#ifdef DEBUG
2640	if (pmapdebug & PDB_FOLLOW) {
2641		printf("\tpte = %p, ", pte);
2642		printf("*pte = 0x%lx\n", *pte);
2643	}
2644#endif
2645#ifdef DEBUG				/* These checks are more expensive */
2646	if (!pmap_pte_v(pte))
2647		panic("pmap_emulate_reference: invalid pte");
2648	if (type == ALPHA_MMCSR_FOW) {
2649		if (!(*pte & (user ? PG_UWE : PG_UWE | PG_KWE)))
2650			panic("pmap_emulate_reference: write but unwritable");
2651		if (!(*pte & PG_FOW))
2652			panic("pmap_emulate_reference: write but not FOW");
2653	} else {
2654		if (!(*pte & (user ? PG_URE : PG_URE | PG_KRE)))
2655			panic("pmap_emulate_reference: !write but unreadable");
2656		if (!(*pte & (PG_FOR | PG_FOE)))
2657			panic("pmap_emulate_reference: !write but not FOR|FOE");
2658	}
2659	/* Other diagnostics? */
2660#endif
2661	pa = pmap_pte_pa(pte);
2662
2663	/*
2664	 * We're now done with the PTE.  If it was a user pmap, unlock
2665	 * it now.
2666	 */
2667	if (didlock)
2668		PMAP_UNLOCK(pmap);
2669
2670#ifdef DEBUG
2671	if (pmapdebug & PDB_FOLLOW)
2672		printf("\tpa = 0x%lx\n", pa);
2673#endif
2674#ifdef DIAGNOSTIC
2675	if (!uvm_pageismanaged(pa))
2676		panic("pmap_emulate_reference(%p, 0x%lx, %d, %d): "
2677		      "pa 0x%lx not managed", l, v, user, type, pa);
2678#endif
2679
2680	/*
2681	 * Twiddle the appropriate bits to reflect the reference
2682	 * and/or modification..
2683	 *
2684	 * The rules:
2685	 * 	(1) always mark page as used, and
2686	 *	(2) if it was a write fault, mark page as modified.
2687	 */
2688	pg = PHYS_TO_VM_PAGE(pa);
2689	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2690
2691	PMAP_HEAD_TO_MAP_LOCK();
2692	lock = pmap_pvh_lock(pg);
2693	mutex_enter(lock);
2694
2695	if (type == ALPHA_MMCSR_FOW) {
2696		md->pvh_attrs |= (PGA_REFERENCED|PGA_MODIFIED);
2697		faultoff = PG_FOR | PG_FOW;
2698	} else {
2699		md->pvh_attrs |= PGA_REFERENCED;
2700		faultoff = PG_FOR;
2701		if (exec) {
2702			faultoff |= PG_FOE;
2703		}
2704	}
2705	pmap_changebit(pg, 0, ~faultoff, cpu_id);
2706
2707	mutex_exit(lock);
2708	PMAP_HEAD_TO_MAP_UNLOCK();
2709	return (0);
2710}
2711
2712#ifdef DEBUG
2713/*
2714 * pmap_pv_dump:
2715 *
2716 *	Dump the physical->virtual data for the specified page.
2717 */
2718void
2719pmap_pv_dump(paddr_t pa)
2720{
2721	struct vm_page *pg;
2722	struct vm_page_md *md;
2723	pv_entry_t pv;
2724	kmutex_t *lock;
2725
2726	pg = PHYS_TO_VM_PAGE(pa);
2727	md = VM_PAGE_TO_MD(pg);
2728
2729	lock = pmap_pvh_lock(pg);
2730	mutex_enter(lock);
2731
2732	printf("pa 0x%lx (attrs = 0x%x):\n", pa, md->pvh_attrs);
2733	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next)
2734		printf("     pmap %p, va 0x%lx\n",
2735		    pv->pv_pmap, pv->pv_va);
2736	printf("\n");
2737
2738	mutex_exit(lock);
2739}
2740#endif
2741
2742/*
2743 * vtophys:
2744 *
2745 *	Return the physical address corresponding to the K0SEG or
2746 *	K1SEG address provided.
2747 *
2748 *	Note: no locking is necessary in this function.
2749 */
2750paddr_t
2751vtophys(vaddr_t vaddr)
2752{
2753	pt_entry_t *pte;
2754	paddr_t paddr = 0;
2755
2756	if (vaddr < ALPHA_K0SEG_BASE)
2757		printf("vtophys: invalid vaddr 0x%lx", vaddr);
2758	else if (vaddr <= ALPHA_K0SEG_END)
2759		paddr = ALPHA_K0SEG_TO_PHYS(vaddr);
2760	else {
2761		pte = PMAP_KERNEL_PTE(vaddr);
2762		if (pmap_pte_v(pte))
2763			paddr = pmap_pte_pa(pte) | (vaddr & PGOFSET);
2764	}
2765
2766#if 0
2767	printf("vtophys(0x%lx) -> 0x%lx\n", vaddr, paddr);
2768#endif
2769
2770	return (paddr);
2771}
2772
2773/******************** pv_entry management ********************/
2774
2775/*
2776 * pmap_pv_enter:
2777 *
2778 *	Add a physical->virtual entry to the pv_table.
2779 */
2780static int
2781pmap_pv_enter(pmap_t pmap, struct vm_page *pg, vaddr_t va, pt_entry_t *pte,
2782    bool dolock)
2783{
2784	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2785	pv_entry_t newpv;
2786	kmutex_t *lock;
2787
2788	/*
2789	 * Allocate and fill in the new pv_entry.
2790	 */
2791	newpv = pmap_pv_alloc();
2792	if (newpv == NULL)
2793		return ENOMEM;
2794	newpv->pv_va = va;
2795	newpv->pv_pmap = pmap;
2796	newpv->pv_pte = pte;
2797
2798	if (dolock) {
2799		lock = pmap_pvh_lock(pg);
2800		mutex_enter(lock);
2801	}
2802
2803#ifdef DEBUG
2804    {
2805	pv_entry_t pv;
2806	/*
2807	 * Make sure the entry doesn't already exist.
2808	 */
2809	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
2810		if (pmap == pv->pv_pmap && va == pv->pv_va) {
2811			printf("pmap = %p, va = 0x%lx\n", pmap, va);
2812			panic("pmap_pv_enter: already in pv table");
2813		}
2814	}
2815    }
2816#endif
2817
2818	/*
2819	 * ...and put it in the list.
2820	 */
2821	newpv->pv_next = md->pvh_list;
2822	md->pvh_list = newpv;
2823
2824	if (dolock) {
2825		mutex_exit(lock);
2826	}
2827
2828	return 0;
2829}
2830
2831/*
2832 * pmap_pv_remove:
2833 *
2834 *	Remove a physical->virtual entry from the pv_table.
2835 */
2836static void
2837pmap_pv_remove(pmap_t pmap, struct vm_page *pg, vaddr_t va, bool dolock)
2838{
2839	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2840	pv_entry_t pv, *pvp;
2841	kmutex_t *lock;
2842
2843	if (dolock) {
2844		lock = pmap_pvh_lock(pg);
2845		mutex_enter(lock);
2846	} else {
2847		lock = NULL; /* XXX stupid gcc */
2848	}
2849
2850	/*
2851	 * Find the entry to remove.
2852	 */
2853	for (pvp = &md->pvh_list, pv = *pvp;
2854	     pv != NULL; pvp = &pv->pv_next, pv = *pvp)
2855		if (pmap == pv->pv_pmap && va == pv->pv_va)
2856			break;
2857
2858#ifdef DEBUG
2859	if (pv == NULL)
2860		panic("pmap_pv_remove: not in pv table");
2861#endif
2862
2863	*pvp = pv->pv_next;
2864
2865	if (dolock) {
2866		mutex_exit(lock);
2867	}
2868
2869	pmap_pv_free(pv);
2870}
2871
2872/*
2873 * pmap_pv_page_alloc:
2874 *
2875 *	Allocate a page for the pv_entry pool.
2876 */
2877static void *
2878pmap_pv_page_alloc(struct pool *pp, int flags)
2879{
2880	paddr_t pg;
2881
2882	if (pmap_physpage_alloc(PGU_PVENT, &pg))
2883		return ((void *)ALPHA_PHYS_TO_K0SEG(pg));
2884	return (NULL);
2885}
2886
2887/*
2888 * pmap_pv_page_free:
2889 *
2890 *	Free a pv_entry pool page.
2891 */
2892static void
2893pmap_pv_page_free(struct pool *pp, void *v)
2894{
2895
2896	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t)v));
2897}
2898
2899/******************** misc. functions ********************/
2900
2901/*
2902 * pmap_physpage_alloc:
2903 *
2904 *	Allocate a single page from the VM system and return the
2905 *	physical address for that page.
2906 */
2907static bool
2908pmap_physpage_alloc(int usage, paddr_t *pap)
2909{
2910	struct vm_page *pg;
2911	paddr_t pa;
2912
2913	/*
2914	 * Don't ask for a zero'd page in the L1PT case -- we will
2915	 * properly initialize it in the constructor.
2916	 */
2917
2918	pg = uvm_pagealloc(NULL, 0, NULL, usage == PGU_L1PT ?
2919	    UVM_PGA_USERESERVE : UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2920	if (pg != NULL) {
2921		pa = VM_PAGE_TO_PHYS(pg);
2922#ifdef DEBUG
2923		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2924		if (md->pvh_refcnt != 0) {
2925			printf("pmap_physpage_alloc: page 0x%lx has "
2926			    "%d references\n", pa, md->pvh_refcnt);
2927			panic("pmap_physpage_alloc");
2928		}
2929#endif
2930		*pap = pa;
2931		return (true);
2932	}
2933	return (false);
2934}
2935
2936/*
2937 * pmap_physpage_free:
2938 *
2939 *	Free the single page table page at the specified physical address.
2940 */
2941static void
2942pmap_physpage_free(paddr_t pa)
2943{
2944	struct vm_page *pg;
2945
2946	if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
2947		panic("pmap_physpage_free: bogus physical page address");
2948
2949#ifdef DEBUG
2950	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2951	if (md->pvh_refcnt != 0)
2952		panic("pmap_physpage_free: page still has references");
2953#endif
2954
2955	uvm_pagefree(pg);
2956}
2957
2958/*
2959 * pmap_physpage_addref:
2960 *
2961 *	Add a reference to the specified special use page.
2962 */
2963static int
2964pmap_physpage_addref(void *kva)
2965{
2966	struct vm_page *pg;
2967	struct vm_page_md *md;
2968	paddr_t pa;
2969
2970	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
2971	pg = PHYS_TO_VM_PAGE(pa);
2972	md = VM_PAGE_TO_MD(pg);
2973
2974	KASSERT((int)md->pvh_refcnt >= 0);
2975
2976	return atomic_inc_uint_nv(&md->pvh_refcnt);
2977}
2978
2979/*
2980 * pmap_physpage_delref:
2981 *
2982 *	Delete a reference to the specified special use page.
2983 */
2984static int
2985pmap_physpage_delref(void *kva)
2986{
2987	struct vm_page *pg;
2988	struct vm_page_md *md;
2989	paddr_t pa;
2990
2991	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
2992	pg = PHYS_TO_VM_PAGE(pa);
2993	md = VM_PAGE_TO_MD(pg);
2994
2995	KASSERT((int)md->pvh_refcnt > 0);
2996
2997	return atomic_dec_uint_nv(&md->pvh_refcnt);
2998}
2999
3000/******************** page table page management ********************/
3001
3002/*
3003 * pmap_growkernel:		[ INTERFACE ]
3004 *
3005 *	Grow the kernel address space.  This is a hint from the
3006 *	upper layer to pre-allocate more kernel PT pages.
3007 */
3008vaddr_t
3009pmap_growkernel(vaddr_t maxkvaddr)
3010{
3011	struct pmap *kpm = pmap_kernel(), *pm;
3012	paddr_t ptaddr;
3013	pt_entry_t *l1pte, *l2pte, pte;
3014	vaddr_t va;
3015	int l1idx;
3016
3017	rw_enter(&pmap_growkernel_lock, RW_WRITER);
3018
3019	if (maxkvaddr <= virtual_end)
3020		goto out;		/* we are OK */
3021
3022	va = virtual_end;
3023
3024	while (va < maxkvaddr) {
3025		/*
3026		 * If there is no valid L1 PTE (i.e. no L2 PT page),
3027		 * allocate a new L2 PT page and insert it into the
3028		 * L1 map.
3029		 */
3030		l1pte = pmap_l1pte(kpm, va);
3031		if (pmap_pte_v(l1pte) == 0) {
3032			/*
3033			 * XXX PGU_NORMAL?  It's not a "traditional" PT page.
3034			 */
3035			if (uvm.page_init_done == false) {
3036				/*
3037				 * We're growing the kernel pmap early (from
3038				 * uvm_pageboot_alloc()).  This case must
3039				 * be handled a little differently.
3040				 */
3041				ptaddr = ALPHA_K0SEG_TO_PHYS(
3042				    pmap_steal_memory(PAGE_SIZE, NULL, NULL));
3043			} else if (pmap_physpage_alloc(PGU_NORMAL,
3044				   &ptaddr) == false)
3045				goto die;
3046			pte = (atop(ptaddr) << PG_SHIFT) |
3047			    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3048			*l1pte = pte;
3049
3050			l1idx = l1pte_index(va);
3051
3052			/* Update all the user pmaps. */
3053			mutex_enter(&pmap_all_pmaps_lock);
3054			for (pm = TAILQ_FIRST(&pmap_all_pmaps);
3055			     pm != NULL; pm = TAILQ_NEXT(pm, pm_list)) {
3056				/* Skip the kernel pmap. */
3057				if (pm == pmap_kernel())
3058					continue;
3059
3060				PMAP_LOCK(pm);
3061				if (pm->pm_lev1map == kernel_lev1map) {
3062					PMAP_UNLOCK(pm);
3063					continue;
3064				}
3065				pm->pm_lev1map[l1idx] = pte;
3066				PMAP_UNLOCK(pm);
3067			}
3068			mutex_exit(&pmap_all_pmaps_lock);
3069		}
3070
3071		/*
3072		 * Have an L2 PT page now, add the L3 PT page.
3073		 */
3074		l2pte = pmap_l2pte(kpm, va, l1pte);
3075		KASSERT(pmap_pte_v(l2pte) == 0);
3076		if (uvm.page_init_done == false) {
3077			/*
3078			 * See above.
3079			 */
3080			ptaddr = ALPHA_K0SEG_TO_PHYS(
3081			    pmap_steal_memory(PAGE_SIZE, NULL, NULL));
3082		} else if (pmap_physpage_alloc(PGU_NORMAL, &ptaddr) == false)
3083			goto die;
3084		*l2pte = (atop(ptaddr) << PG_SHIFT) |
3085		    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3086		va += ALPHA_L2SEG_SIZE;
3087	}
3088
3089	/* Invalidate the L1 PT cache. */
3090	pool_cache_invalidate(&pmap_l1pt_cache);
3091
3092	virtual_end = va;
3093
3094 out:
3095	rw_exit(&pmap_growkernel_lock);
3096
3097	return (virtual_end);
3098
3099 die:
3100	panic("pmap_growkernel: out of memory");
3101}
3102
3103/*
3104 * pmap_lev1map_create:
3105 *
3106 *	Create a new level 1 page table for the specified pmap.
3107 *
3108 *	Note: growkernel must already be held and the pmap either
3109 *	already locked or unreferenced globally.
3110 */
3111static int
3112pmap_lev1map_create(pmap_t pmap, long cpu_id)
3113{
3114	pt_entry_t *l1pt;
3115
3116	KASSERT(pmap != pmap_kernel());
3117
3118	KASSERT(pmap->pm_lev1map == kernel_lev1map);
3119	KASSERT(pmap->pm_asni[cpu_id].pma_asn == PMAP_ASN_RESERVED);
3120
3121	/* Don't sleep -- we're called with locks held. */
3122	l1pt = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
3123	if (l1pt == NULL)
3124		return (ENOMEM);
3125
3126	pmap->pm_lev1map = l1pt;
3127	return (0);
3128}
3129
3130/*
3131 * pmap_lev1map_destroy:
3132 *
3133 *	Destroy the level 1 page table for the specified pmap.
3134 *
3135 *	Note: growkernel must be held and the pmap must already be
3136 *	locked or not globally referenced.
3137 */
3138static void
3139pmap_lev1map_destroy(pmap_t pmap, long cpu_id)
3140{
3141	pt_entry_t *l1pt = pmap->pm_lev1map;
3142
3143	KASSERT(pmap != pmap_kernel());
3144
3145	/*
3146	 * Go back to referencing the global kernel_lev1map.
3147	 */
3148	pmap->pm_lev1map = kernel_lev1map;
3149
3150	/*
3151	 * Free the old level 1 page table page.
3152	 */
3153	pool_cache_put(&pmap_l1pt_cache, l1pt);
3154}
3155
3156/*
3157 * pmap_l1pt_ctor:
3158 *
3159 *	Pool cache constructor for L1 PT pages.
3160 *
3161 *	Note: The growkernel lock is held across allocations
3162 *	from our pool_cache, so we don't need to acquire it
3163 *	ourselves.
3164 */
3165static int
3166pmap_l1pt_ctor(void *arg, void *object, int flags)
3167{
3168	pt_entry_t *l1pt = object, pte;
3169	int i;
3170
3171	/*
3172	 * Initialize the new level 1 table by zeroing the
3173	 * user portion and copying the kernel mappings into
3174	 * the kernel portion.
3175	 */
3176	for (i = 0; i < l1pte_index(VM_MIN_KERNEL_ADDRESS); i++)
3177		l1pt[i] = 0;
3178
3179	for (i = l1pte_index(VM_MIN_KERNEL_ADDRESS);
3180	     i <= l1pte_index(VM_MAX_KERNEL_ADDRESS); i++)
3181		l1pt[i] = kernel_lev1map[i];
3182
3183	/*
3184	 * Now, map the new virtual page table.  NOTE: NO ASM!
3185	 */
3186	pte = ((ALPHA_K0SEG_TO_PHYS((vaddr_t) l1pt) >> PGSHIFT) << PG_SHIFT) |
3187	    PG_V | PG_KRE | PG_KWE;
3188	l1pt[l1pte_index(VPTBASE)] = pte;
3189
3190	return (0);
3191}
3192
3193/*
3194 * pmap_l1pt_alloc:
3195 *
3196 *	Page alloctaor for L1 PT pages.
3197 */
3198static void *
3199pmap_l1pt_alloc(struct pool *pp, int flags)
3200{
3201	paddr_t ptpa;
3202
3203	/*
3204	 * Attempt to allocate a free page.
3205	 */
3206	if (pmap_physpage_alloc(PGU_L1PT, &ptpa) == false)
3207		return (NULL);
3208
3209	return ((void *) ALPHA_PHYS_TO_K0SEG(ptpa));
3210}
3211
3212/*
3213 * pmap_l1pt_free:
3214 *
3215 *	Page freer for L1 PT pages.
3216 */
3217static void
3218pmap_l1pt_free(struct pool *pp, void *v)
3219{
3220
3221	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t) v));
3222}
3223
3224/*
3225 * pmap_ptpage_alloc:
3226 *
3227 *	Allocate a level 2 or level 3 page table page, and
3228 *	initialize the PTE that references it.
3229 *
3230 *	Note: the pmap must already be locked.
3231 */
3232static int
3233pmap_ptpage_alloc(pmap_t pmap, pt_entry_t *pte, int usage)
3234{
3235	paddr_t ptpa;
3236
3237	/*
3238	 * Allocate the page table page.
3239	 */
3240	if (pmap_physpage_alloc(usage, &ptpa) == false)
3241		return (ENOMEM);
3242
3243	/*
3244	 * Initialize the referencing PTE.
3245	 */
3246	PMAP_SET_PTE(pte, ((ptpa >> PGSHIFT) << PG_SHIFT) |
3247	    PG_V | PG_KRE | PG_KWE | PG_WIRED |
3248	    (pmap == pmap_kernel() ? PG_ASM : 0));
3249
3250	return (0);
3251}
3252
3253/*
3254 * pmap_ptpage_free:
3255 *
3256 *	Free the level 2 or level 3 page table page referenced
3257 *	be the provided PTE.
3258 *
3259 *	Note: the pmap must already be locked.
3260 */
3261static void
3262pmap_ptpage_free(pmap_t pmap, pt_entry_t *pte)
3263{
3264	paddr_t ptpa;
3265
3266	/*
3267	 * Extract the physical address of the page from the PTE
3268	 * and clear the entry.
3269	 */
3270	ptpa = pmap_pte_pa(pte);
3271	PMAP_SET_PTE(pte, PG_NV);
3272
3273#ifdef DEBUG
3274	pmap_zero_page(ptpa);
3275#endif
3276	pmap_physpage_free(ptpa);
3277}
3278
3279/*
3280 * pmap_l3pt_delref:
3281 *
3282 *	Delete a reference on a level 3 PT page.  If the reference drops
3283 *	to zero, free it.
3284 *
3285 *	Note: the pmap must already be locked.
3286 */
3287static void
3288pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte, long cpu_id)
3289{
3290	pt_entry_t *l1pte, *l2pte;
3291	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
3292
3293	l1pte = pmap_l1pte(pmap, va);
3294	l2pte = pmap_l2pte(pmap, va, l1pte);
3295
3296#ifdef DIAGNOSTIC
3297	if (pmap == pmap_kernel())
3298		panic("pmap_l3pt_delref: kernel pmap");
3299#endif
3300
3301	if (pmap_physpage_delref(l3pte) == 0) {
3302		/*
3303		 * No more mappings; we can free the level 3 table.
3304		 */
3305#ifdef DEBUG
3306		if (pmapdebug & PDB_PTPAGE)
3307			printf("pmap_l3pt_delref: freeing level 3 table at "
3308			    "0x%lx\n", pmap_pte_pa(l2pte));
3309#endif
3310		pmap_ptpage_free(pmap, l2pte);
3311
3312		/*
3313		 * We've freed a level 3 table, so we must
3314		 * invalidate the TLB entry for that PT page
3315		 * in the Virtual Page Table VA range, because
3316		 * otherwise the PALcode will service a TLB
3317		 * miss using the stale VPT TLB entry it entered
3318		 * behind our back to shortcut to the VA's PTE.
3319		 */
3320		PMAP_INVALIDATE_TLB(pmap,
3321		    (vaddr_t)(&VPT[VPT_INDEX(va)]), false,
3322		    PMAP_ISACTIVE(pmap, cpu_id), cpu_id);
3323		PMAP_TLB_SHOOTDOWN(pmap,
3324		    (vaddr_t)(&VPT[VPT_INDEX(va)]), 0);
3325		PMAP_TLB_SHOOTNOW();
3326
3327		/*
3328		 * We've freed a level 3 table, so delete the reference
3329		 * on the level 2 table.
3330		 */
3331		pmap_l2pt_delref(pmap, l1pte, l2pte, cpu_id);
3332	}
3333}
3334
3335/*
3336 * pmap_l2pt_delref:
3337 *
3338 *	Delete a reference on a level 2 PT page.  If the reference drops
3339 *	to zero, free it.
3340 *
3341 *	Note: the pmap must already be locked.
3342 */
3343static void
3344pmap_l2pt_delref(pmap_t pmap, pt_entry_t *l1pte, pt_entry_t *l2pte,
3345    long cpu_id)
3346{
3347
3348#ifdef DIAGNOSTIC
3349	if (pmap == pmap_kernel())
3350		panic("pmap_l2pt_delref: kernel pmap");
3351#endif
3352
3353	if (pmap_physpage_delref(l2pte) == 0) {
3354		/*
3355		 * No more mappings in this segment; we can free the
3356		 * level 2 table.
3357		 */
3358#ifdef DEBUG
3359		if (pmapdebug & PDB_PTPAGE)
3360			printf("pmap_l2pt_delref: freeing level 2 table at "
3361			    "0x%lx\n", pmap_pte_pa(l1pte));
3362#endif
3363		pmap_ptpage_free(pmap, l1pte);
3364
3365		/*
3366		 * We've freed a level 2 table, so delete the reference
3367		 * on the level 1 table.
3368		 */
3369		pmap_l1pt_delref(pmap, l1pte, cpu_id);
3370	}
3371}
3372
3373/*
3374 * pmap_l1pt_delref:
3375 *
3376 *	Delete a reference on a level 1 PT page.  If the reference drops
3377 *	to zero, free it.
3378 *
3379 *	Note: the pmap must already be locked.
3380 */
3381static void
3382pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte, long cpu_id)
3383{
3384
3385#ifdef DIAGNOSTIC
3386	if (pmap == pmap_kernel())
3387		panic("pmap_l1pt_delref: kernel pmap");
3388#endif
3389
3390	(void)pmap_physpage_delref(l1pte);
3391}
3392
3393/******************** Address Space Number management ********************/
3394
3395/*
3396 * pmap_asn_alloc:
3397 *
3398 *	Allocate and assign an ASN to the specified pmap.
3399 *
3400 *	Note: the pmap must already be locked.  This may be called from
3401 *	an interprocessor interrupt, and in that case, the sender of
3402 *	the IPI has the pmap lock.
3403 */
3404static void
3405pmap_asn_alloc(pmap_t pmap, long cpu_id)
3406{
3407	struct pmap_asn_info *pma = &pmap->pm_asni[cpu_id];
3408	struct pmap_asn_info *cpma = &pmap_asn_info[cpu_id];
3409
3410#ifdef DEBUG
3411	if (pmapdebug & (PDB_FOLLOW|PDB_ASN))
3412		printf("pmap_asn_alloc(%p)\n", pmap);
3413#endif
3414
3415	/*
3416	 * If the pmap is still using the global kernel_lev1map, there
3417	 * is no need to assign an ASN at this time, because only
3418	 * kernel mappings exist in that map, and all kernel mappings
3419	 * have PG_ASM set.  If the pmap eventually gets its own
3420	 * lev1map, an ASN will be allocated at that time.
3421	 *
3422	 * Only the kernel pmap will reference kernel_lev1map.  Do the
3423	 * same old fixups, but note that we no longer need the pmap
3424	 * to be locked if we're in this mode, since pm_lev1map will
3425	 * never change.
3426	 * #endif
3427	 */
3428	if (pmap->pm_lev1map == kernel_lev1map) {
3429#ifdef DEBUG
3430		if (pmapdebug & PDB_ASN)
3431			printf("pmap_asn_alloc: still references "
3432			    "kernel_lev1map\n");
3433#endif
3434#if defined(MULTIPROCESSOR)
3435		/*
3436		 * In a multiprocessor system, it's possible to
3437		 * get here without having PMAP_ASN_RESERVED in
3438		 * pmap->pm_asni[cpu_id].pma_asn; see pmap_lev1map_destroy().
3439		 *
3440		 * So, what we do here, is simply assign the reserved
3441		 * ASN for kernel_lev1map users and let things
3442		 * continue on.  We do, however, let uniprocessor
3443		 * configurations continue to make its assertion.
3444		 */
3445		pma->pma_asn = PMAP_ASN_RESERVED;
3446#else
3447		KASSERT(pma->pma_asn == PMAP_ASN_RESERVED);
3448#endif /* MULTIPROCESSOR */
3449		return;
3450	}
3451
3452	/*
3453	 * On processors which do not implement ASNs, the swpctx PALcode
3454	 * operation will automatically invalidate the TLB and I-cache,
3455	 * so we don't need to do that here.
3456	 */
3457	if (pmap_max_asn == 0) {
3458		/*
3459		 * Refresh the pmap's generation number, to
3460		 * simplify logic elsewhere.
3461		 */
3462		pma->pma_asngen = cpma->pma_asngen;
3463#ifdef DEBUG
3464		if (pmapdebug & PDB_ASN)
3465			printf("pmap_asn_alloc: no ASNs, using asngen %lu\n",
3466			    pma->pma_asngen);
3467#endif
3468		return;
3469	}
3470
3471	/*
3472	 * Hopefully, we can continue using the one we have...
3473	 */
3474	if (pma->pma_asn != PMAP_ASN_RESERVED &&
3475	    pma->pma_asngen == cpma->pma_asngen) {
3476		/*
3477		 * ASN is still in the current generation; keep on using it.
3478		 */
3479#ifdef DEBUG
3480		if (pmapdebug & PDB_ASN)
3481			printf("pmap_asn_alloc: same generation, keeping %u\n",
3482			    pma->pma_asn);
3483#endif
3484		return;
3485	}
3486
3487	/*
3488	 * Need to assign a new ASN.  Grab the next one, incrementing
3489	 * the generation number if we have to.
3490	 */
3491	if (cpma->pma_asn > pmap_max_asn) {
3492		/*
3493		 * Invalidate all non-PG_ASM TLB entries and the
3494		 * I-cache, and bump the generation number.
3495		 */
3496		ALPHA_TBIAP();
3497		alpha_pal_imb();
3498
3499		cpma->pma_asn = 1;
3500		cpma->pma_asngen++;
3501#ifdef DIAGNOSTIC
3502		if (cpma->pma_asngen == 0) {
3503			/*
3504			 * The generation number has wrapped.  We could
3505			 * handle this scenario by traversing all of
3506			 * the pmaps, and invalidating the generation
3507			 * number on those which are not currently
3508			 * in use by this processor.
3509			 *
3510			 * However... considering that we're using
3511			 * an unsigned 64-bit integer for generation
3512			 * numbers, on non-ASN CPUs, we won't wrap
3513			 * for approx. 585 million years, or 75 billion
3514			 * years on a 128-ASN CPU (assuming 1000 switch
3515			 * operations per second).
3516			 *
3517			 * So, we don't bother.
3518			 */
3519			panic("pmap_asn_alloc: too much uptime");
3520		}
3521#endif
3522#ifdef DEBUG
3523		if (pmapdebug & PDB_ASN)
3524			printf("pmap_asn_alloc: generation bumped to %lu\n",
3525			    cpma->pma_asngen);
3526#endif
3527	}
3528
3529	/*
3530	 * Assign the new ASN and validate the generation number.
3531	 */
3532	pma->pma_asn = cpma->pma_asn++;
3533	pma->pma_asngen = cpma->pma_asngen;
3534
3535#ifdef DEBUG
3536	if (pmapdebug & PDB_ASN)
3537		printf("pmap_asn_alloc: assigning %u to pmap %p\n",
3538		    pma->pma_asn, pmap);
3539#endif
3540
3541	/*
3542	 * Have a new ASN, so there's no need to sync the I-stream
3543	 * on the way back out to userspace.
3544	 */
3545	atomic_and_ulong(&pmap->pm_needisync, ~(1UL << cpu_id));
3546}
3547
3548#if defined(MULTIPROCESSOR)
3549/******************** TLB shootdown code ********************/
3550
3551/*
3552 * pmap_tlb_shootdown:
3553 *
3554 *	Cause the TLB entry for pmap/va to be shot down.
3555 *
3556 *	NOTE: The pmap must be locked here.
3557 */
3558void
3559pmap_tlb_shootdown(pmap_t pmap, vaddr_t va, pt_entry_t pte, u_long *cpumaskp)
3560{
3561	struct pmap_tlb_shootdown_q *pq;
3562	struct pmap_tlb_shootdown_job *pj;
3563	struct cpu_info *ci, *self = curcpu();
3564	u_long cpumask;
3565	CPU_INFO_ITERATOR cii;
3566
3567	KASSERT((pmap == pmap_kernel()) || mutex_owned(&pmap->pm_lock));
3568
3569	cpumask = 0;
3570
3571	for (CPU_INFO_FOREACH(cii, ci)) {
3572		if (ci == self)
3573			continue;
3574
3575		/*
3576		 * The pmap must be locked (unless its the kernel
3577		 * pmap, in which case it is okay for it to be
3578		 * unlocked), which prevents it from  becoming
3579		 * active on any additional processors.  This makes
3580		 * it safe to check for activeness.  If it's not
3581		 * active on the processor in question, then just
3582		 * mark it as needing a new ASN the next time it
3583		 * does, saving the IPI.  We always have to send
3584		 * the IPI for the kernel pmap.
3585		 *
3586		 * Note if it's marked active now, and it becomes
3587		 * inactive by the time the processor receives
3588		 * the IPI, that's okay, because it does the right
3589		 * thing with it later.
3590		 */
3591		if (pmap != pmap_kernel() &&
3592		    PMAP_ISACTIVE(pmap, ci->ci_cpuid) == 0) {
3593			PMAP_INVALIDATE_ASN(pmap, ci->ci_cpuid);
3594			continue;
3595		}
3596
3597		cpumask |= 1UL << ci->ci_cpuid;
3598
3599		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
3600		mutex_spin_enter(&pq->pq_lock);
3601
3602		/*
3603		 * Allocate a job.
3604		 */
3605		if (pq->pq_count < PMAP_TLB_SHOOTDOWN_MAXJOBS) {
3606			pj = pool_cache_get(&pmap_tlb_shootdown_job_cache,
3607			    PR_NOWAIT);
3608		} else {
3609			pj = NULL;
3610		}
3611
3612		/*
3613		 * If a global flush is already pending, we
3614		 * don't really have to do anything else.
3615		 */
3616		pq->pq_pte |= pte;
3617		if (pq->pq_tbia) {
3618			mutex_spin_exit(&pq->pq_lock);
3619			if (pj != NULL) {
3620				pool_cache_put(&pmap_tlb_shootdown_job_cache,
3621				    pj);
3622			}
3623			continue;
3624		}
3625		if (pj == NULL) {
3626			/*
3627			 * Couldn't allocate a job entry.  Just
3628			 * tell the processor to kill everything.
3629			 */
3630			pq->pq_tbia = 1;
3631		} else {
3632			pj->pj_pmap = pmap;
3633			pj->pj_va = va;
3634			pj->pj_pte = pte;
3635			pq->pq_count++;
3636			TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
3637		}
3638		mutex_spin_exit(&pq->pq_lock);
3639	}
3640
3641	*cpumaskp |= cpumask;
3642}
3643
3644/*
3645 * pmap_tlb_shootnow:
3646 *
3647 *	Process the TLB shootdowns that we have been accumulating
3648 *	for the specified processor set.
3649 */
3650void
3651pmap_tlb_shootnow(u_long cpumask)
3652{
3653
3654	alpha_multicast_ipi(cpumask, ALPHA_IPI_SHOOTDOWN);
3655}
3656
3657/*
3658 * pmap_do_tlb_shootdown:
3659 *
3660 *	Process pending TLB shootdown operations for this processor.
3661 */
3662void
3663pmap_do_tlb_shootdown(struct cpu_info *ci, struct trapframe *framep)
3664{
3665	u_long cpu_id = ci->ci_cpuid;
3666	u_long cpu_mask = (1UL << cpu_id);
3667	struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id];
3668	struct pmap_tlb_shootdown_job *pj, *next;
3669	TAILQ_HEAD(, pmap_tlb_shootdown_job) jobs;
3670
3671	TAILQ_INIT(&jobs);
3672
3673	mutex_spin_enter(&pq->pq_lock);
3674	TAILQ_CONCAT(&jobs, &pq->pq_head, pj_list);
3675	if (pq->pq_tbia) {
3676		if (pq->pq_pte & PG_ASM)
3677			ALPHA_TBIA();
3678		else
3679			ALPHA_TBIAP();
3680		pq->pq_tbia = 0;
3681		pq->pq_pte = 0;
3682	} else {
3683		TAILQ_FOREACH(pj, &jobs, pj_list) {
3684			PMAP_INVALIDATE_TLB(pj->pj_pmap, pj->pj_va,
3685			    pj->pj_pte & PG_ASM,
3686			    pj->pj_pmap->pm_cpus & cpu_mask, cpu_id);
3687		}
3688		pq->pq_pte = 0;
3689	}
3690	pq->pq_count = 0;
3691	mutex_spin_exit(&pq->pq_lock);
3692
3693	/* Free jobs back to the cache. */
3694	for (pj = TAILQ_FIRST(&jobs); pj != NULL; pj = next) {
3695		next = TAILQ_NEXT(pj, pj_list);
3696		pool_cache_put(&pmap_tlb_shootdown_job_cache, pj);
3697	}
3698}
3699#endif /* MULTIPROCESSOR */
3700