1/*	$OpenBSD: pmap.c,v 1.184 2024/05/22 05:51:49 jsg Exp $ */
2
3/*
4 * Copyright (c) 2015 Martin Pieuchot
5 * Copyright (c) 2001, 2002, 2007 Dale Rahn.
6 * All rights reserved.
7 *
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * Effort sponsored in part by the Defense Advanced Research Projects
30 * Agency (DARPA) and Air Force Research Laboratory, Air Force
31 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
32 */
33
34/*
35 * powerpc lazy icache management.
 * The icache does not snoop dcache accesses. The icache also will not load
 * modified data from the dcache; it only sees the unmodified copy in ram.
38 * Before the icache is loaded, the dcache must be synced to ram to prevent
39 * the icache from loading stale data.
40 * pg->pg_flags PG_PMAP_EXE bit is used to track if the dcache is clean
41 * and the icache may have valid data in it.
42 * if the PG_PMAP_EXE bit is set (and the page is not currently RWX)
43 * the icache will only have valid code in it. If the bit is clear
44 * memory may not match the dcache contents or the icache may contain
45 * data from a previous page.
46 *
47 * pmap enter
48 * !E  NONE 	-> R	no action
49 * !E  NONE|R 	-> RW	no action
50 * !E  NONE|R 	-> RX	flush dcache, inval icache (that page only), set E
51 * !E  NONE|R 	-> RWX	flush dcache, inval icache (that page only), set E
52 * !E  NONE|RW 	-> RWX	flush dcache, inval icache (that page only), set E
53 *  E  NONE 	-> R	no action
54 *  E  NONE|R 	-> RW	clear PG_PMAP_EXE bit
55 *  E  NONE|R 	-> RX	no action
56 *  E  NONE|R 	-> RWX	no action
57 *  E  NONE|RW 	-> RWX	-invalid source state
58 *
 * pmap_protect
60 *  E RW -> R	- invalid source state
61 * !E RW -> R	- no action
62 *  * RX -> R	- no action
63 *  * RWX -> R	- sync dcache, inval icache
64 *  * RWX -> RW	- clear PG_PMAP_EXE
65 *  * RWX -> RX	- sync dcache, inval icache
66 *  * * -> NONE	- no action
67 *
68 * pmap_page_protect (called with arg PROT_NONE if page is to be reused)
69 *  * RW -> R	- as pmap_protect
70 *  * RX -> R	- as pmap_protect
71 *  * RWX -> R	- as pmap_protect
72 *  * RWX -> RW	- as pmap_protect
73 *  * RWX -> RX	- as pmap_protect
74 *  * * -> NONE - clear PG_PMAP_EXE
75 *
76 */
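
/*
 * For reference, "flush dcache, inval icache (that page only)" in the
 * table above is the usual PowerPC sequence; a hedged sketch of what
 * syncicache() (implemented elsewhere, not in this file) boils down to:
 *
 *	for (ea = va; ea < va + PAGE_SIZE; ea += CACHELINESIZE)
 *		__asm volatile ("dcbst 0,%0" :: "r"(ea));    push dcache to ram
 *	__asm volatile ("sync");
 *	for (ea = va; ea < va + PAGE_SIZE; ea += CACHELINESIZE)
 *		__asm volatile ("icbi 0,%0" :: "r"(ea));     toss stale icache
 *	__asm volatile ("sync; isync");
 */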
77
78#include <sys/param.h>
79#include <sys/systm.h>
80#include <sys/proc.h>
81#include <sys/queue.h>
82#include <sys/pool.h>
83#include <sys/atomic.h>
84#include <sys/user.h>
85
86#include <uvm/uvm_extern.h>
87
88#include <machine/pcb.h>
89#include <powerpc/powerpc.h>
90#include <powerpc/bat.h>
91#include <machine/pmap.h>
92
93struct bat battable[16];
94
95struct dumpmem dumpmem[VM_PHYSSEG_MAX];
96u_int ndumpmem;
97
98struct pmap kernel_pmap_;
99static struct mem_region *pmap_mem, *pmap_avail;
100struct mem_region pmap_allocated[10];
101int pmap_cnt_avail;
102int pmap_cnt_allocated;
103
104struct pte_64  *pmap_ptable64;
105struct pte_32  *pmap_ptable32;
106int	pmap_ptab_cnt;
107u_int	pmap_ptab_mask;
108
109#define HTABSIZE_32	(pmap_ptab_cnt * 64)
110#define HTABMEMSZ_64	(pmap_ptab_cnt * 8 * sizeof(struct pte_64))
111#define HTABSIZE_64	(ffs(pmap_ptab_cnt) - 12)
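
/*
 * Worked example of the sizing macros (assuming pmap_ptab_cnt is a power
 * of two, which the bootstrap code below guarantees, and the usual
 * two-doubleword struct pte_64): with pmap_ptab_cnt = 2048 PTE groups,
 *	HTABSIZE_32  = 2048 * 64      = 128KB of hash table,
 *	HTABMEMSZ_64 = 2048 * 8 * 16  = 256KB of hash table,
 *	HTABSIZE_64  = ffs(2048) - 12 = 0, the size field that
 *	pmap_enable_mmu() ORs into SDR1 on 64-bit CPUs.
 */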
112
113static u_int usedsr[NPMAPS / sizeof(u_int) / 8];
114
115struct pte_desc {
116	/* Linked list of phys -> virt entries */
117	LIST_ENTRY(pte_desc) pted_pv_list;
118	union {
119		struct pte_32 pted_pte32;
120		struct pte_64 pted_pte64;
121	} p;
122	pmap_t pted_pmap;
123	vaddr_t pted_va;
124};
125
126void pmap_attr_save(paddr_t pa, u_int32_t bits);
127void pmap_pted_ro(struct pte_desc *, vm_prot_t);
128void pmap_pted_ro64(struct pte_desc *, vm_prot_t);
129void pmap_pted_ro32(struct pte_desc *, vm_prot_t);
130
131/*
132 * Some functions are called in real mode and cannot be profiled.
133 */
134#define __noprof __attribute__((__no_instrument_function__))
135
136/* VP routines */
137int pmap_vp_enter(pmap_t pm, vaddr_t va, struct pte_desc *pted, int flags);
138struct pte_desc *pmap_vp_remove(pmap_t pm, vaddr_t va);
139void pmap_vp_destroy(pmap_t pm);
140struct pte_desc *pmap_vp_lookup(pmap_t pm, vaddr_t va) __noprof;
141
142/* PV routines */
143void pmap_enter_pv(struct pte_desc *pted, struct vm_page *);
144void pmap_remove_pv(struct pte_desc *pted);
145
146
147/* pte hash table routines */
148static inline void *pmap_ptedinhash(struct pte_desc *);
149void pte_insert32(struct pte_desc *) __noprof;
150void pte_insert64(struct pte_desc *) __noprof;
151void pmap_fill_pte64(pmap_t, vaddr_t, paddr_t, struct pte_desc *, vm_prot_t,
152    int) __noprof;
153void pmap_fill_pte32(pmap_t, vaddr_t, paddr_t, struct pte_desc *, vm_prot_t,
154    int) __noprof;
155
156void pmap_syncicache_user_virt(pmap_t pm, vaddr_t va);
157
158void pmap_remove_pted(pmap_t, struct pte_desc *);
159
160/* setup/initialization functions */
161void pmap_avail_setup(void);
162void pmap_avail_fixup(void);
163void pmap_remove_avail(paddr_t base, paddr_t end);
164void *pmap_steal_avail(size_t size, int align);
165
166/* asm interface */
167int pte_spill_r(u_int32_t, u_int32_t, u_int32_t, int) __noprof;
168int pte_spill_v(pmap_t, u_int32_t, u_int32_t, int) __noprof;
169
170u_int32_t pmap_setusr(pmap_t pm, vaddr_t va);
171void pmap_popusr(u_int32_t oldsr);
172
173/* pte invalidation */
174void pte_del(void *, vaddr_t);
175void pte_zap(void *, struct pte_desc *);
176
177/* XXX - panic on pool get failures? */
178struct pool pmap_pmap_pool;
179struct pool pmap_vp_pool;
180struct pool pmap_pted_pool;
181
182int pmap_initialized = 0;
183int physmem;
184int physmaxaddr;
185
186#ifdef MULTIPROCESSOR
187struct __ppc_lock pmap_hash_lock = PPC_LOCK_INITIALIZER;
188
189#define	PMAP_HASH_LOCK(s)						\
190do {									\
191	s = ppc_intr_disable();						\
192	__ppc_lock(&pmap_hash_lock);					\
193} while (0)
194
195#define	PMAP_HASH_UNLOCK(s)						\
196do {									\
197	__ppc_unlock(&pmap_hash_lock);					\
198	ppc_intr_enable(s);						\
199} while (0)
200
201#define	PMAP_VP_LOCK_INIT(pm)		mtx_init(&pm->pm_mtx, IPL_VM)
202
203#define	PMAP_VP_LOCK(pm)						\
204do {									\
205	if (pm != pmap_kernel())					\
206		mtx_enter(&pm->pm_mtx);					\
207} while (0)
208
209#define	PMAP_VP_UNLOCK(pm)						\
210do {									\
211	if (pm != pmap_kernel())					\
212		mtx_leave(&pm->pm_mtx);					\
213} while (0)
214
215#define PMAP_VP_ASSERT_LOCKED(pm)					\
216do {									\
217	if (pm != pmap_kernel())					\
218		MUTEX_ASSERT_LOCKED(&pm->pm_mtx);			\
219} while (0)
220
221#else /* ! MULTIPROCESSOR */
222
223#define	PMAP_HASH_LOCK(s)		(void)s
224#define	PMAP_HASH_UNLOCK(s)		/* nothing */
225
226#define	PMAP_VP_LOCK_INIT(pm)		/* nothing */
227#define	PMAP_VP_LOCK(pm)		/* nothing */
228#define	PMAP_VP_UNLOCK(pm)		/* nothing */
229#define	PMAP_VP_ASSERT_LOCKED(pm)	/* nothing */
230#endif /* MULTIPROCESSOR */
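
/*
 * Typical lock nesting used below (a sketch based on pmap_remove_pted()):
 * the per-pmap VP lock is taken first, the global hash lock only around
 * the actual hash table update.
 *
 *	PMAP_VP_LOCK(pm);
 *	...
 *	PMAP_HASH_LOCK(s);
 *	if ((pte = pmap_ptedinhash(pted)) != NULL)
 *		pte_zap(pte, pted);
 *	PMAP_HASH_UNLOCK(s);
 *	...
 *	PMAP_VP_UNLOCK(pm);
 */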
231
232/* virtual to physical helpers */
233static inline int
234VP_SR(vaddr_t va)
235{
	return (va >> VP_SR_POS) & VP_SR_MASK;
237}
238
239static inline int
240VP_IDX1(vaddr_t va)
241{
242	return (va >> VP_IDX1_POS) & VP_IDX1_MASK;
243}
244
245static inline int
246VP_IDX2(vaddr_t va)
247{
248	return (va >> VP_IDX2_POS) & VP_IDX2_MASK;
249}
250
251#if VP_IDX1_SIZE != VP_IDX2_SIZE
252#error pmap allocation code expects IDX1 and IDX2 size to be same
253#endif
254struct pmapvp {
255	void *vp[VP_IDX1_SIZE];
256};
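
/*
 * Illustrative lookup path (this is what pmap_vp_lookup() below does,
 * shown without the NULL checks): a virtual address splits into
 * [ VP_SR | VP_IDX1 | VP_IDX2 | page offset ] and resolves as
 *
 *	struct pmapvp *vp1 = pm->pm_vp[VP_SR(va)];
 *	struct pmapvp *vp2 = vp1->vp[VP_IDX1(va)];
 *	struct pte_desc *pted = vp2->vp[VP_IDX2(va)];
 */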
257
258
259/*
260 * VP routines, virtual to physical translation information.
261 * These data structures are based off of the pmap, per process.
262 */
263
264/*
 * This is used for pmap_kernel() mappings; they are not to be removed
 * from the vp table because they were statically initialized during
 * the initial pmap initialization. This way no memory allocation
 * is necessary for pmap_kernel() mappings, which would otherwise
 * open the door to bad race conditions.
270 */
271struct pte_desc *
272pmap_vp_lookup(pmap_t pm, vaddr_t va)
273{
274	struct pmapvp *vp1;
275	struct pmapvp *vp2;
276	struct pte_desc *pted;
277
278	PMAP_VP_ASSERT_LOCKED(pm);
279
280	vp1 = pm->pm_vp[VP_SR(va)];
281	if (vp1 == NULL) {
282		return NULL;
283	}
284
285	vp2 = vp1->vp[VP_IDX1(va)];
286	if (vp2 == NULL) {
287		return NULL;
288	}
289
290	pted = vp2->vp[VP_IDX2(va)];
291
292	return pted;
293}
294
295/*
296 * Remove, and return, pted at specified address, NULL if not present
297 */
298struct pte_desc *
299pmap_vp_remove(pmap_t pm, vaddr_t va)
300{
301	struct pmapvp *vp1;
302	struct pmapvp *vp2;
303	struct pte_desc *pted;
304
305	PMAP_VP_ASSERT_LOCKED(pm);
306
307	vp1 = pm->pm_vp[VP_SR(va)];
308	if (vp1 == NULL) {
309		return NULL;
310	}
311
312	vp2 = vp1->vp[VP_IDX1(va)];
313	if (vp2 == NULL) {
314		return NULL;
315	}
316
317	pted = vp2->vp[VP_IDX2(va)];
318	vp2->vp[VP_IDX2(va)] = NULL;
319
320	return pted;
321}
322
323/*
324 * Create a V -> P mapping for the given pmap and virtual address
325 * with reference to the pte descriptor that is used to map the page.
 * This code should track vp table allocations so they can be
 * freed efficiently.
328 */
329int
330pmap_vp_enter(pmap_t pm, vaddr_t va, struct pte_desc *pted, int flags)
331{
332	struct pmapvp *vp1;
333	struct pmapvp *vp2;
334
335	PMAP_VP_ASSERT_LOCKED(pm);
336
337	vp1 = pm->pm_vp[VP_SR(va)];
338	if (vp1 == NULL) {
339		vp1 = pool_get(&pmap_vp_pool, PR_NOWAIT | PR_ZERO);
340		if (vp1 == NULL) {
341			if ((flags & PMAP_CANFAIL) == 0)
342				panic("pmap_vp_enter: failed to allocate vp1");
343			return ENOMEM;
344		}
345		pm->pm_vp[VP_SR(va)] = vp1;
346	}
347
348	vp2 = vp1->vp[VP_IDX1(va)];
349	if (vp2 == NULL) {
350		vp2 = pool_get(&pmap_vp_pool, PR_NOWAIT | PR_ZERO);
351		if (vp2 == NULL) {
352			if ((flags & PMAP_CANFAIL) == 0)
353				panic("pmap_vp_enter: failed to allocate vp2");
354			return ENOMEM;
355		}
356		vp1->vp[VP_IDX1(va)] = vp2;
357	}
358
359	vp2->vp[VP_IDX2(va)] = pted;
360
361	return 0;
362}
363
364static inline void
365tlbie(vaddr_t va)
366{
367	asm volatile ("tlbie %0" :: "r"(va & ~PAGE_MASK));
368}
369
370static inline void
371tlbsync(void)
372{
373	asm volatile ("tlbsync");
374}
375static inline void
376eieio(void)
377{
378	asm volatile ("eieio");
379}
380
381static inline void
382sync(void)
383{
384	asm volatile ("sync");
385}
386
387static inline void
388tlbia(void)
389{
390	vaddr_t va;
391
392	sync();
393	for (va = 0; va < 0x00040000; va += 0x00001000)
394		tlbie(va);
395	eieio();
396	tlbsync();
397	sync();
398}
399
400static inline int
401ptesr(sr_t *sr, vaddr_t va)
402{
403	return sr[(u_int)va >> ADDR_SR_SHIFT];
404}
405
406static inline int
407pteidx(sr_t sr, vaddr_t va)
408{
409	int hash;
410	hash = (sr & SR_VSID) ^ (((u_int)va & ADDR_PIDX) >> ADDR_PIDX_SHIFT);
411	return hash & pmap_ptab_mask;
412}
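
/*
 * Each VA hashes to two candidate PTE groups of 8 entries: the primary
 * group at pteidx(sr, va) and the secondary group at the same index
 * XORed with pmap_ptab_mask.  A sketch of how a group pointer is formed
 * (this mirrors pmap_ptedinhash() below):
 *
 *	idx = pteidx(ptesr(pm->pm_sr, va), va);
 *	if (secondary hash is in use)
 *		idx ^= pmap_ptab_mask;
 *	ptegroup = pmap_ptable32 + idx * 8;
 */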
413
414#define PTED_VA_PTEGIDX_M	0x07
415#define PTED_VA_HID_M		0x08
416#define PTED_VA_MANAGED_M	0x10
417#define PTED_VA_WIRED_M		0x20
418#define PTED_VA_EXEC_M		0x40
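
/*
 * The page-aligned VA lives in the upper bits of pted_va; the unused
 * page-offset bits double as flag storage.  For example, after
 * pmap_fill_pte32() an executable mapping ends up with
 *
 *	pted->pted_va = va & ~PAGE_MASK;	(the VA, page aligned)
 *	pted->pted_va |= PTED_VA_EXEC_M;	(plus flag bits)
 *
 * and pmap_ptedinhash() later reads PTED_VA_PTEGIDX_M/PTED_VA_HID_M
 * back to find the hash table slot the insert path recorded.
 */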
419
420static inline u_int32_t
421PTED_HID(struct pte_desc *pted)
422{
423	return (pted->pted_va & PTED_VA_HID_M);
424}
425
426static inline u_int32_t
427PTED_PTEGIDX(struct pte_desc *pted)
428{
429	return (pted->pted_va & PTED_VA_PTEGIDX_M);
430}
431
432static inline u_int32_t
433PTED_MANAGED(struct pte_desc *pted)
434{
435	return (pted->pted_va & PTED_VA_MANAGED_M);
436}
437
438static inline u_int32_t
439PTED_VALID(struct pte_desc *pted)
440{
441	if (ppc_proc_is_64b)
442		return (pted->p.pted_pte64.pte_hi & PTE_VALID_64);
443	else
444		return (pted->p.pted_pte32.pte_hi & PTE_VALID_32);
445}
446
447/*
448 * PV entries -
449 * manipulate the physical to virtual translations for the entire system.
450 *
 * QUESTION: should all mapped memory be stored in PV tables? Or
 * is it alright to only store "ram" memory? Currently device mappings
 * are not stored.
 * It makes sense to pre-allocate mappings for all of "ram" memory, since
 * it is likely that it will be mapped at some point, but would it also
 * make sense to use a tree/table like the one used for the pmap to store
 * device mappings?
 * Further notes: It seems that the PV table is only used for pmap_protect
 * and other paging related operations. Given this, it is not necessary
 * to store any pmap_kernel() entries in PV tables and it does not make
 * sense to store device mappings in PV either.
 *
 * Note: unlike other powerpc pmap designs, the array is only an array
 * of pointers. The same structure is used for holding information in
 * the VP table, the PV table, and for kernel mappings (the wired
 * entries), so one data structure holds all of the info instead of
 * replicating it multiple times.
 *
 * One issue with making this a single data structure is that two pointers
 * are wasted for every page which does not map ram (device mappings).
 * Such pages should be a small percentage of the mapped pages in the
 * system, so the unnecessary ram consumption should not be too noticeable.
473 */
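
/*
 * Concretely, the PV list hangs off the vm_page and is what lets
 * operations keyed by physical page (pmap_page_protect(), the
 * referenced/modified queries below) find every pted mapping that page:
 *
 *	LIST_FOREACH(pted, &pg->mdpage.pv_list, pted_pv_list)
 *		... act on pted->pted_pmap and pted->pted_va ...
 */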
474
475void
476pmap_enter_pv(struct pte_desc *pted, struct vm_page *pg)
477{
478	if (__predict_false(!pmap_initialized)) {
479		return;
480	}
481
482	mtx_enter(&pg->mdpage.pv_mtx);
483	LIST_INSERT_HEAD(&(pg->mdpage.pv_list), pted, pted_pv_list);
484	pted->pted_va |= PTED_VA_MANAGED_M;
485	mtx_leave(&pg->mdpage.pv_mtx);
486}
487
488void
489pmap_remove_pv(struct pte_desc *pted)
490{
491	struct vm_page *pg;
492
493	if (ppc_proc_is_64b)
494		pg = PHYS_TO_VM_PAGE(pted->p.pted_pte64.pte_lo & PTE_RPGN_64);
495	else
496		pg = PHYS_TO_VM_PAGE(pted->p.pted_pte32.pte_lo & PTE_RPGN_32);
497
498	mtx_enter(&pg->mdpage.pv_mtx);
499	pted->pted_va &= ~PTED_VA_MANAGED_M;
500	LIST_REMOVE(pted, pted_pv_list);
501	mtx_leave(&pg->mdpage.pv_mtx);
502}
503
504
505/* PTE_CHG_32 == PTE_CHG_64 */
506/* PTE_REF_32 == PTE_REF_64 */
507static __inline u_int
508pmap_pte2flags(u_int32_t pte)
509{
510	return (((pte & PTE_REF_32) ? PG_PMAP_REF : 0) |
511	    ((pte & PTE_CHG_32) ? PG_PMAP_MOD : 0));
512}
513
514static __inline u_int
515pmap_flags2pte(u_int32_t flags)
516{
517	return (((flags & PG_PMAP_REF) ? PTE_REF_32 : 0) |
518	    ((flags & PG_PMAP_MOD) ? PTE_CHG_32 : 0));
519}
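
/*
 * Example: pmap_pte2flags(PTE_REF_32 | PTE_CHG_32) yields
 * (PG_PMAP_REF | PG_PMAP_MOD) and pmap_flags2pte() is the inverse;
 * because the 32-bit and 64-bit REF/CHG bits are identical (see above),
 * the same helpers serve both PTE formats.
 */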
520
521void
522pmap_attr_save(paddr_t pa, u_int32_t bits)
523{
524	struct vm_page *pg;
525
526	pg = PHYS_TO_VM_PAGE(pa);
527	if (pg == NULL)
528		return;
529
530	atomic_setbits_int(&pg->pg_flags,  pmap_pte2flags(bits));
531}
532
533int
534pmap_enter(pmap_t pm, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
535{
536	struct pte_desc *pted;
537	struct vm_page *pg;
538	boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
539	boolean_t wt = (pa & PMAP_WT) != 0;
540	int need_sync = 0;
541	int cache, error = 0;
542
543	KASSERT(!(wt && nocache));
544	pa &= PMAP_PA_MASK;
545
546	PMAP_VP_LOCK(pm);
547	pted = pmap_vp_lookup(pm, va);
548	if (pted && PTED_VALID(pted)) {
549		pmap_remove_pted(pm, pted);
550		/* we lost our pted if it was user */
551		if (pm != pmap_kernel())
552			pted = pmap_vp_lookup(pm, va);
553	}
554
555	pm->pm_stats.resident_count++;
556
557	/* Do not have pted for this, get one and put it in VP */
558	if (pted == NULL) {
559		pted = pool_get(&pmap_pted_pool, PR_NOWAIT | PR_ZERO);
560		if (pted == NULL) {
			if ((flags & PMAP_CANFAIL) == 0)
				panic("pmap_enter: failed to allocate pted");
			error = ENOMEM;
			goto out;
566		}
567		error = pmap_vp_enter(pm, va, pted, flags);
568		if (error) {
569			pool_put(&pmap_pted_pool, pted);
570			goto out;
571		}
572	}
573
574	pg = PHYS_TO_VM_PAGE(pa);
575	if (pg != NULL && (pg->pg_flags & PG_PMAP_UC))
576		nocache = TRUE;
577	if (wt)
578		cache = PMAP_CACHE_WT;
579	else if (pg != NULL && !(pg->pg_flags & PG_DEV) && !nocache)
580		cache = PMAP_CACHE_WB;
581	else
582		cache = PMAP_CACHE_CI;
583
584	/* Calculate PTE */
585	if (ppc_proc_is_64b)
586		pmap_fill_pte64(pm, va, pa, pted, prot, cache);
587	else
588		pmap_fill_pte32(pm, va, pa, pted, prot, cache);
589
590	if (pg != NULL) {
591		pmap_enter_pv(pted, pg); /* only managed mem */
592	}
593
594	/*
595	 * Insert into HTAB
596	 * We were told to map the page, probably called from vm_fault,
597	 * so map the page!
598	 */
599	if (ppc_proc_is_64b)
600		pte_insert64(pted);
601	else
602		pte_insert32(pted);
603
	if (prot & PROT_EXEC) {
		u_int sn = VP_SR(va);

		pm->pm_exec[sn]++;
608		if (pm->pm_sr[sn] & SR_NOEXEC)
609			pm->pm_sr[sn] &= ~SR_NOEXEC;
610
611		if (pg != NULL) {
612			need_sync = ((pg->pg_flags & PG_PMAP_EXE) == 0);
613			if (prot & PROT_WRITE)
614				atomic_clearbits_int(&pg->pg_flags,
615				    PG_PMAP_EXE);
616			else
617				atomic_setbits_int(&pg->pg_flags,
618				    PG_PMAP_EXE);
619		} else
620			need_sync = 1;
621	} else {
622		/*
		 * Should we be paranoid about writable non-exec
		 * mappings?  If so, clear the exec tag.
625		 */
626		if ((prot & PROT_WRITE) && (pg != NULL))
627			atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
628	}
629
630	/* only instruction sync executable pages */
631	if (need_sync)
632		pmap_syncicache_user_virt(pm, va);
633
634out:
635	PMAP_VP_UNLOCK(pm);
636	return (error);
637}
638
639/*
640 * Remove the given range of mapping entries.
641 */
642void
643pmap_remove(pmap_t pm, vaddr_t sva, vaddr_t eva)
644{
645	struct pte_desc *pted;
646	vaddr_t va;
647
648	PMAP_VP_LOCK(pm);
649	for (va = sva; va < eva; va += PAGE_SIZE) {
650		pted = pmap_vp_lookup(pm, va);
651		if (pted && PTED_VALID(pted))
652			pmap_remove_pted(pm, pted);
653	}
654	PMAP_VP_UNLOCK(pm);
655}
656
657/*
658 * remove a single mapping, notice that this code is O(1)
659 */
660void
661pmap_remove_pted(pmap_t pm, struct pte_desc *pted)
662{
663	void *pte;
664	int s;
665
666	KASSERT(pm == pted->pted_pmap);
667	PMAP_VP_ASSERT_LOCKED(pm);
668
669	pm->pm_stats.resident_count--;
670
671	PMAP_HASH_LOCK(s);
672	if ((pte = pmap_ptedinhash(pted)) != NULL)
673		pte_zap(pte, pted);
674	PMAP_HASH_UNLOCK(s);
675
676	if (pted->pted_va & PTED_VA_EXEC_M) {
677		u_int sn = VP_SR(pted->pted_va);
678
679		pted->pted_va &= ~PTED_VA_EXEC_M;
680		pm->pm_exec[sn]--;
681		if (pm->pm_exec[sn] == 0)
682			pm->pm_sr[sn] |= SR_NOEXEC;
683	}
684
685	if (ppc_proc_is_64b)
686		pted->p.pted_pte64.pte_hi &= ~PTE_VALID_64;
687	else
688		pted->p.pted_pte32.pte_hi &= ~PTE_VALID_32;
689
690	if (PTED_MANAGED(pted))
691		pmap_remove_pv(pted);
692
693	if (pm != pmap_kernel()) {
694		(void)pmap_vp_remove(pm, pted->pted_va);
695		pool_put(&pmap_pted_pool, pted);
696	}
697}
698
699/*
700 * Enter a kernel mapping for the given page.
701 * kernel mappings have a larger set of prerequisites than normal mappings.
702 *
703 * 1. no memory should be allocated to create a kernel mapping.
704 * 2. a vp mapping should already exist, even if invalid. (see 1)
705 * 3. all vp tree mappings should already exist (see 1)
706 *
707 */
708void
709pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
710{
711	struct pte_desc *pted;
712	struct vm_page *pg;
713	boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
714	boolean_t wt = (pa & PMAP_WT) != 0;
715	pmap_t pm;
716	int cache;
717
718	KASSERT(!(wt && nocache));
719	pa &= PMAP_PA_MASK;
720
721	pm = pmap_kernel();
722
723	pted = pmap_vp_lookup(pm, va);
724	if (pted && PTED_VALID(pted))
725		pmap_remove_pted(pm, pted); /* pted is reused */
726
727	pm->pm_stats.resident_count++;
728
729	if (prot & PROT_WRITE) {
730		pg = PHYS_TO_VM_PAGE(pa);
731		if (pg != NULL)
732			atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
733	}
734
735	/* Do not have pted for this, get one and put it in VP */
736	if (pted == NULL) {
737		panic("pted not preallocated in pmap_kernel() va %lx pa %lx",
738		    va, pa);
739	}
740
741	pg = PHYS_TO_VM_PAGE(pa);
742	if (wt)
743		cache = PMAP_CACHE_WT;
744	else if (pg != NULL && !(pg->pg_flags & PG_DEV) && !nocache)
745		cache = PMAP_CACHE_WB;
746	else
747		cache = PMAP_CACHE_CI;
748
749	/* Calculate PTE */
750	if (ppc_proc_is_64b)
751		pmap_fill_pte64(pm, va, pa, pted, prot, cache);
752	else
753		pmap_fill_pte32(pm, va, pa, pted, prot, cache);
754
755	/*
756	 * Insert into HTAB
757	 * We were told to map the page, probably called from vm_fault,
758	 * so map the page!
759	 */
760	if (ppc_proc_is_64b)
761		pte_insert64(pted);
762	else
763		pte_insert32(pted);
764
765	pted->pted_va |= PTED_VA_WIRED_M;
766
	if (prot & PROT_EXEC) {
		u_int sn = VP_SR(va);

		pm->pm_exec[sn]++;
771		if (pm->pm_sr[sn] & SR_NOEXEC)
772			pm->pm_sr[sn] &= ~SR_NOEXEC;
773	}
774}
775
776/*
777 * remove kernel (pmap_kernel()) mappings
778 */
779void
780pmap_kremove(vaddr_t va, vsize_t len)
781{
782	struct pte_desc *pted;
783
784	for (len >>= PAGE_SHIFT; len > 0; len--, va += PAGE_SIZE) {
785		pted = pmap_vp_lookup(pmap_kernel(), va);
786		if (pted && PTED_VALID(pted))
787			pmap_remove_pted(pmap_kernel(), pted);
788	}
789}
790
791static inline void *
792pmap_ptedinhash(struct pte_desc *pted)
793{
794	vaddr_t va = pted->pted_va & ~PAGE_MASK;
795	pmap_t pm = pted->pted_pmap;
796	int sr, idx;
797
798	sr = ptesr(pm->pm_sr, va);
799	idx = pteidx(sr, va);
800
801	if (ppc_proc_is_64b) {
802		struct pte_64 *pte = pmap_ptable64;
803
804		pte += (idx ^ (PTED_HID(pted) ? pmap_ptab_mask : 0)) * 8;
805		pte += PTED_PTEGIDX(pted);
806
807		/*
808		 * We now have the pointer to where it will be, if it is
809		 * currently mapped. If the mapping was thrown away in
810		 * exchange for another page mapping, then this page is
811		 * not currently in the HASH.
812		 */
813		if ((pted->p.pted_pte64.pte_hi |
814		    (PTED_HID(pted) ? PTE_HID_64 : 0)) == pte->pte_hi)
815			return (pte);
816	} else {
817		struct pte_32 *pte = pmap_ptable32;
818
819		pte += (idx ^ (PTED_HID(pted) ? pmap_ptab_mask : 0)) * 8;
820		pte += PTED_PTEGIDX(pted);
821
822		/*
823		 * We now have the pointer to where it will be, if it is
824		 * currently mapped. If the mapping was thrown away in
825		 * exchange for another page mapping, then this page is
826		 * not currently in the HASH.
827		 */
828		if ((pted->p.pted_pte32.pte_hi |
829		    (PTED_HID(pted) ? PTE_HID_32 : 0)) == pte->pte_hi)
830			return (pte);
831	}
832
833	return (NULL);
834}
835
836/*
837 * Delete a Page Table Entry, section 7.6.3.3.
838 *
839 * Note: pte must be locked.
840 */
841void
842pte_del(void *pte, vaddr_t va)
843{
844	if (ppc_proc_is_64b)
845		((struct pte_64 *)pte)->pte_hi &= ~PTE_VALID_64;
846	else
847		((struct pte_32 *)pte)->pte_hi &= ~PTE_VALID_32;
848
849	sync();		/* Ensure update completed. */
850	tlbie(va);	/* Invalidate old translation. */
851	eieio();	/* Order tlbie before tlbsync. */
852	tlbsync();	/* Ensure tlbie completed on all processors. */
853	sync();		/* Ensure tlbsync and update completed. */
854}
855
856void
857pte_zap(void *pte, struct pte_desc *pted)
858{
859	pte_del(pte, pted->pted_va);
860
861	if (!PTED_MANAGED(pted))
862		return;
863
864	if (ppc_proc_is_64b) {
865		pmap_attr_save(pted->p.pted_pte64.pte_lo & PTE_RPGN_64,
866		    ((struct pte_64 *)pte)->pte_lo & (PTE_REF_64|PTE_CHG_64));
867	} else {
868		pmap_attr_save(pted->p.pted_pte32.pte_lo & PTE_RPGN_32,
869		    ((struct pte_32 *)pte)->pte_lo & (PTE_REF_32|PTE_CHG_32));
870	}
871}
872
873/*
874 * What about execution control? Even at only a segment granularity.
875 */
876void
877pmap_fill_pte64(pmap_t pm, vaddr_t va, paddr_t pa, struct pte_desc *pted,
878	vm_prot_t prot, int cache)
879{
880	sr_t sr;
881	struct pte_64 *pte64;
882
883	sr = ptesr(pm->pm_sr, va);
884	pte64 = &pted->p.pted_pte64;
885
886	pte64->pte_hi = (((u_int64_t)sr & SR_VSID) <<
887	   PTE_VSID_SHIFT_64) |
888	    ((va >> ADDR_API_SHIFT_64) & PTE_API_64) | PTE_VALID_64;
889	pte64->pte_lo = (pa & PTE_RPGN_64);
890
891
892	if (cache == PMAP_CACHE_WB)
893		pte64->pte_lo |= PTE_M_64;
894	else if (cache == PMAP_CACHE_WT)
895		pte64->pte_lo |= (PTE_W_64 | PTE_M_64);
896	else
897		pte64->pte_lo |= (PTE_M_64 | PTE_I_64 | PTE_G_64);
898
899	if ((prot & (PROT_READ | PROT_WRITE)) == 0)
900		pte64->pte_lo |= PTE_AC_64;
901
902	if (prot & PROT_WRITE)
903		pte64->pte_lo |= PTE_RW_64;
904	else
905		pte64->pte_lo |= PTE_RO_64;
906
907	pted->pted_va = va & ~PAGE_MASK;
908
909	if (prot & PROT_EXEC)
910		pted->pted_va  |= PTED_VA_EXEC_M;
911	else
912		pte64->pte_lo |= PTE_N_64;
913
914	pted->pted_pmap = pm;
915}
916
917/*
918 * What about execution control? Even at only a segment granularity.
919 */
920void
921pmap_fill_pte32(pmap_t pm, vaddr_t va, paddr_t pa, struct pte_desc *pted,
922	vm_prot_t prot, int cache)
923{
924	sr_t sr;
925	struct pte_32 *pte32;
926
927	sr = ptesr(pm->pm_sr, va);
928	pte32 = &pted->p.pted_pte32;
929
930	pte32->pte_hi = ((sr & SR_VSID) << PTE_VSID_SHIFT_32) |
931	    ((va >> ADDR_API_SHIFT_32) & PTE_API_32) | PTE_VALID_32;
932	pte32->pte_lo = (pa & PTE_RPGN_32);
933
934	if (cache == PMAP_CACHE_WB)
935		pte32->pte_lo |= PTE_M_32;
936	else if (cache == PMAP_CACHE_WT)
937		pte32->pte_lo |= (PTE_W_32 | PTE_M_32);
938	else
939		pte32->pte_lo |= (PTE_M_32 | PTE_I_32 | PTE_G_32);
940
941	if (prot & PROT_WRITE)
942		pte32->pte_lo |= PTE_RW_32;
943	else
944		pte32->pte_lo |= PTE_RO_32;
945
946	pted->pted_va = va & ~PAGE_MASK;
947
948	/* XXX Per-page execution control. */
949	if (prot & PROT_EXEC)
950		pted->pted_va  |= PTED_VA_EXEC_M;
951
952	pted->pted_pmap = pm;
953}
954
955int
956pmap_test_attrs(struct vm_page *pg, u_int flagbit)
957{
958	u_int bits;
959	struct pte_desc *pted;
960	u_int ptebit = pmap_flags2pte(flagbit);
961	int s;
962
963	/* PTE_CHG_32 == PTE_CHG_64 */
964	/* PTE_REF_32 == PTE_REF_64 */
965
966	bits = pg->pg_flags & flagbit;
967	if (bits == flagbit)
968		return bits;
969
970	mtx_enter(&pg->mdpage.pv_mtx);
971	LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list) {
972		void *pte;
973
974		PMAP_HASH_LOCK(s);
975		if ((pte = pmap_ptedinhash(pted)) != NULL) {
976			if (ppc_proc_is_64b) {
977				struct pte_64 *ptp64 = pte;
978				bits |=	pmap_pte2flags(ptp64->pte_lo & ptebit);
979			} else {
980				struct pte_32 *ptp32 = pte;
981				bits |=	pmap_pte2flags(ptp32->pte_lo & ptebit);
982			}
983		}
984		PMAP_HASH_UNLOCK(s);
985
986		if (bits == flagbit)
987			break;
988	}
989	mtx_leave(&pg->mdpage.pv_mtx);
990
991	atomic_setbits_int(&pg->pg_flags,  bits);
992
993	return bits;
994}
995
996int
997pmap_clear_attrs(struct vm_page *pg, u_int flagbit)
998{
999	u_int bits;
1000	struct pte_desc *pted;
1001	u_int ptebit = pmap_flags2pte(flagbit);
1002	int s;
1003
1004	/* PTE_CHG_32 == PTE_CHG_64 */
1005	/* PTE_REF_32 == PTE_REF_64 */
1006
1007	bits = pg->pg_flags & flagbit;
1008
1009	mtx_enter(&pg->mdpage.pv_mtx);
1010	LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list) {
1011		void *pte;
1012
1013		PMAP_HASH_LOCK(s);
1014		if ((pte = pmap_ptedinhash(pted)) != NULL) {
1015			if (ppc_proc_is_64b) {
1016				struct pte_64 *ptp64 = pte;
1017
1018				bits |=	pmap_pte2flags(ptp64->pte_lo & ptebit);
1019
1020				pte_del(ptp64, pted->pted_va);
1021
1022				ptp64->pte_lo &= ~ptebit;
1023				eieio();
1024				ptp64->pte_hi |= PTE_VALID_64;
1025				sync();
1026			} else {
1027				struct pte_32 *ptp32 = pte;
1028
1029				bits |=	pmap_pte2flags(ptp32->pte_lo & ptebit);
1030
1031				pte_del(ptp32, pted->pted_va);
1032
1033				ptp32->pte_lo &= ~ptebit;
1034				eieio();
1035				ptp32->pte_hi |= PTE_VALID_32;
1036				sync();
1037			}
1038		}
1039		PMAP_HASH_UNLOCK(s);
1040	}
1041	mtx_leave(&pg->mdpage.pv_mtx);
1042
1043	/*
1044	 * this is done a second time, because while walking the list
1045	 * a bit could have been promoted via pmap_attr_save()
1046	 */
1047	bits |= pg->pg_flags & flagbit;
1048	atomic_clearbits_int(&pg->pg_flags,  flagbit);
1049
1050	return bits;
1051}
1052
1053/*
1054 * Fill the given physical page with zeros.
1055 */
1056void
1057pmap_zero_page(struct vm_page *pg)
1058{
1059	vaddr_t va = pmap_map_direct(pg);
1060	int i;
1061
1062	/*
1063	 * Loop over & zero cache lines.  This code assumes that 64-bit
1064	 * CPUs have 128-byte cache lines.  We explicitly use ``dcbzl''
1065	 * here because we do not clear the DCBZ_SIZE bit of the HID5
1066	 * register in order to be compatible with code using ``dcbz''
1067	 * and assuming that cache line size is 32.
1068	 */
1069	if (ppc_proc_is_64b) {
1070		for (i = 0; i < PAGE_SIZE; i += 128)
1071			asm volatile ("dcbzl 0,%0" :: "r"(va + i));
1072		return;
1073	}
1074
1075	for (i = 0; i < PAGE_SIZE; i += CACHELINESIZE)
1076		asm volatile ("dcbz 0,%0" :: "r"(va + i));
1077}
1078
1079/*
1080 * Copy a page.
1081 */
1082void
1083pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1084{
1085	vaddr_t srcva = pmap_map_direct(srcpg);
1086	vaddr_t dstva = pmap_map_direct(dstpg);
1087
1088	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1089}
1090
1091int pmap_id_avail = 0;
1092
1093pmap_t
1094pmap_create(void)
1095{
1096	u_int bits;
1097	int first, i, k, try, tblidx, tbloff;
1098	int seg;
1099	pmap_t pm;
1100
1101	pm = pool_get(&pmap_pmap_pool, PR_WAITOK|PR_ZERO);
1102
1103	pmap_reference(pm);
1104	PMAP_VP_LOCK_INIT(pm);
1105
1106	/*
1107	 * Allocate segment registers for this pmap.
1108	 * Try not to reuse pmap ids, to spread the hash table usage.
1109	 */
1110	first = pmap_id_avail;
1111again:
1112	for (i = 0; i < NPMAPS; i++) {
1113		try = first + i;
1114		try = try % NPMAPS; /* truncate back into bounds */
1115		tblidx = try / (8 * sizeof usedsr[0]);
1116		tbloff = try % (8 * sizeof usedsr[0]);
1117		bits = usedsr[tblidx];
1118		if ((bits & (1U << tbloff)) == 0) {
1119			if (atomic_cas_uint(&usedsr[tblidx], bits,
1120			    bits | (1U << tbloff)) != bits) {
1121				first = try;
1122				goto again;
1123			}
1124			pmap_id_avail = try + 1;
1125
1126			seg = try << 4;
1127			for (k = 0; k < 16; k++)
1128				pm->pm_sr[k] = (seg + k) | SR_NOEXEC;
1129			return (pm);
1130		}
1131	}
1132	panic("out of pmap slots");
1133}
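
/*
 * Worked example of the VSID allocation above: if pmap_create() claims
 * slot "try" = 5, the pmap owns VSIDs 0x50-0x5f and its segment
 * registers start out as
 *
 *	pm->pm_sr[0]  = 0x50 | SR_NOEXEC;
 *	...
 *	pm->pm_sr[15] = 0x5f | SR_NOEXEC;
 *
 * pmap_release() later recovers the slot as (pm_sr[0] & SR_VSID) >> 4
 * to clear the corresponding usedsr bit.
 */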
1134
1135/*
1136 * Add a reference to a given pmap.
1137 */
1138void
1139pmap_reference(pmap_t pm)
1140{
1141	atomic_inc_int(&pm->pm_refs);
1142}
1143
1144/*
1145 * Retire the given pmap from service.
1146 * Should only be called if the map contains no valid mappings.
1147 */
1148void
1149pmap_destroy(pmap_t pm)
1150{
1151	int refs;
1152
1153	refs = atomic_dec_int_nv(&pm->pm_refs);
1154	if (refs == -1)
1155		panic("re-entering pmap_destroy");
1156	if (refs > 0)
1157		return;
1158
1159	/*
1160	 * reference count is zero, free pmap resources and free pmap.
1161	 */
1162	pmap_release(pm);
1163	pool_put(&pmap_pmap_pool, pm);
1164}
1165
1166/*
1167 * Release any resources held by the given physical map.
1168 * Called when a pmap initialized by pmap_pinit is being released.
1169 */
1170void
1171pmap_release(pmap_t pm)
1172{
1173	int i, tblidx, tbloff;
1174
1175	pmap_vp_destroy(pm);
1176	i = (pm->pm_sr[0] & SR_VSID) >> 4;
1177	tblidx = i / (8  * sizeof usedsr[0]);
1178	tbloff = i % (8  * sizeof usedsr[0]);
1179
1180	/* powerpc can do atomic cas, clearbits on same word. */
1181	atomic_clearbits_int(&usedsr[tblidx], 1U << tbloff);
1182}
1183
1184void
1185pmap_vp_destroy(pmap_t pm)
1186{
1187	int i, j;
1188	struct pmapvp *vp1;
1189	struct pmapvp *vp2;
1190
1191	for (i = 0; i < VP_SR_SIZE; i++) {
1192		vp1 = pm->pm_vp[i];
1193		if (vp1 == NULL)
1194			continue;
1195
1196		for (j = 0; j < VP_IDX1_SIZE; j++) {
1197			vp2 = vp1->vp[j];
1198			if (vp2 == NULL)
1199				continue;
1200
1201			pool_put(&pmap_vp_pool, vp2);
1202		}
1203		pm->pm_vp[i] = NULL;
1204		pool_put(&pmap_vp_pool, vp1);
1205	}
1206}
1207
1208void
1209pmap_avail_setup(void)
1210{
1211	struct mem_region *mp;
1212
1213	ppc_mem_regions(&pmap_mem, &pmap_avail);
1214
1215	for (mp = pmap_mem; mp->size !=0; mp++, ndumpmem++) {
1216		physmem += atop(mp->size);
1217		dumpmem[ndumpmem].start = atop(mp->start);
1218		dumpmem[ndumpmem].end = atop(mp->start + mp->size);
1219	}
1220
1221	for (mp = pmap_avail; mp->size !=0 ; mp++) {
1222		if (physmaxaddr <  mp->start + mp->size)
1223			physmaxaddr = mp->start + mp->size;
1224	}
1225
1226	for (mp = pmap_avail; mp->size !=0; mp++)
1227		pmap_cnt_avail += 1;
1228}
1229
1230void
1231pmap_avail_fixup(void)
1232{
1233	struct mem_region *mp;
1234	u_int32_t align;
1235	u_int32_t end;
1236
1237	mp = pmap_avail;
1238	while(mp->size !=0) {
1239		align = round_page(mp->start);
1240		if (mp->start != align) {
1241			pmap_remove_avail(mp->start, align);
1242			mp = pmap_avail;
1243			continue;
1244		}
1245		end = mp->start+mp->size;
1246		align = trunc_page(end);
1247		if (end != align) {
1248			pmap_remove_avail(align, end);
1249			mp = pmap_avail;
1250			continue;
1251		}
1252		mp++;
1253	}
1254}
1255
1256/* remove a given region from avail memory */
1257void
1258pmap_remove_avail(paddr_t base, paddr_t end)
1259{
1260	struct mem_region *mp;
1261	int i;
1262	int mpend;
1263
1264	/* remove given region from available */
1265	for (mp = pmap_avail; mp->size; mp++) {
1266		/*
1267		 * Check if this region holds all of the region
1268		 */
1269		mpend = mp->start + mp->size;
1270		if (base > mpend) {
1271			continue;
1272		}
1273		if (base <= mp->start) {
1274			if (end <= mp->start)
1275				break; /* region not present -??? */
1276
1277			if (end >= mpend) {
1278				/* covers whole region */
1279				/* shorten */
1280				for (i = mp - pmap_avail;
1281				    i < pmap_cnt_avail;
1282				    i++) {
1283					pmap_avail[i] = pmap_avail[i+1];
1284				}
1285				pmap_cnt_avail--;
1286				pmap_avail[pmap_cnt_avail].size = 0;
1287			} else {
1288				mp->start = end;
1289				mp->size = mpend - end;
1290			}
1291		} else {
1292			/* start after the beginning */
1293			if (end >= mpend) {
1294				/* just truncate */
1295				mp->size = base - mp->start;
1296			} else {
1297				/* split */
1298				for (i = pmap_cnt_avail;
1299				    i > (mp - pmap_avail);
1300				    i--) {
1301					pmap_avail[i] = pmap_avail[i - 1];
1302				}
1303				pmap_cnt_avail++;
1304				mp->size = base - mp->start;
1305				mp++;
1306				mp->start = end;
1307				mp->size = mpend - end;
1308			}
1309		}
1310	}
1311	for (mp = pmap_allocated; mp->size != 0; mp++) {
1312		if (base < mp->start) {
1313			if (end == mp->start) {
1314				mp->start = base;
1315				mp->size += end - base;
1316				break;
1317			}
1318			/* lengthen */
1319			for (i = pmap_cnt_allocated; i > (mp - pmap_allocated);
1320			    i--) {
1321				pmap_allocated[i] = pmap_allocated[i - 1];
1322			}
1323			pmap_cnt_allocated++;
1324			mp->start = base;
1325			mp->size = end - base;
1326			return;
1327		}
1328		if (base == (mp->start + mp->size)) {
1329			mp->size += end - base;
1330			return;
1331		}
1332	}
1333	if (mp->size == 0) {
1334		mp->start = base;
1335		mp->size  = end - base;
1336		pmap_cnt_allocated++;
1337	}
1338}
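
/*
 * Worked example (assuming a single avail region and an empty
 * pmap_allocated table): with pmap_avail = { [0x0000, 0x8000) },
 * pmap_remove_avail(0x2000, 0x3000) leaves pmap_avail as
 * { [0x0000, 0x2000), [0x3000, 0x8000) } and records [0x2000, 0x3000)
 * in pmap_allocated.
 */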
1339
1340void *
1341pmap_steal_avail(size_t size, int align)
1342{
1343	struct mem_region *mp;
1344	int start;
1345	int remsize;
1346
1347	for (mp = pmap_avail; mp->size; mp++) {
1348		if (mp->size > size) {
1349			start = (mp->start + (align -1)) & ~(align -1);
1350			remsize = mp->size - (start - mp->start);
1351			if (remsize >= 0) {
1352				pmap_remove_avail(start, start+size);
1353				return (void *)start;
1354			}
1355		}
1356	}
1357	panic ("unable to allocate region with size %zx align %x",
1358	    size, align);
1359}
1360
1361/*
1362 * Similar to pmap_steal_avail, but operating on vm_physmem since
1363 * uvm_page_physload() has been called.
1364 */
1365vaddr_t
1366pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
1367{
1368	int segno;
1369	u_int npg;
1370	vaddr_t va;
1371	paddr_t pa;
1372	struct vm_physseg *seg;
1373
1374	size = round_page(size);
1375	npg = atop(size);
1376
1377	for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
1378		if (seg->avail_end - seg->avail_start < npg)
1379			continue;
1380		/*
1381		 * We can only steal at an ``unused'' segment boundary,
1382		 * i.e. either at the start or at the end.
1383		 */
1384		if (seg->avail_start == seg->start ||
1385		    seg->avail_end == seg->end)
1386			break;
1387	}
1388	if (segno == vm_nphysseg)
1389		va = 0;
1390	else {
1391		if (seg->avail_start == seg->start) {
1392			pa = ptoa(seg->avail_start);
1393			seg->avail_start += npg;
1394			seg->start += npg;
1395		} else {
1396			pa = ptoa(seg->avail_end) - size;
1397			seg->avail_end -= npg;
1398			seg->end -= npg;
1399		}
1400		/*
1401		 * If all the segment has been consumed now, remove it.
1402		 * Note that the crash dump code still knows about it
1403		 * and will dump it correctly.
1404		 */
1405		if (seg->start == seg->end) {
1406			if (vm_nphysseg-- == 1)
1407				panic("pmap_steal_memory: out of memory");
1408			while (segno < vm_nphysseg) {
1409				seg[0] = seg[1]; /* struct copy */
1410				seg++;
1411				segno++;
1412			}
1413		}
1414
1415		va = (vaddr_t)pa;	/* 1:1 mapping */
1416		bzero((void *)va, size);
1417	}
1418
1419	if (start != NULL)
1420		*start = VM_MIN_KERNEL_ADDRESS;
1421	if (end != NULL)
1422		*end = VM_MAX_KERNEL_ADDRESS;
1423
1424	return (va);
1425}
1426
1427void *msgbuf_addr;
1428
1429/*
1430 * Initialize pmap setup.
 * ALL of the code which deals with avail needs to be rewritten as an actual
1432 * memory allocation.
1433 */
1434void
1435pmap_bootstrap(u_int kernelstart, u_int kernelend)
1436{
1437	struct mem_region *mp;
1438	int i, k;
1439	struct pmapvp *vp1;
1440	struct pmapvp *vp2;
1441	extern vaddr_t ppc_kvm_stolen;
1442
1443	/*
1444	 * set the page size (default value is 4K which is ok)
1445	 */
1446	uvm_setpagesize();
1447
1448	/*
1449	 * Get memory.
1450	 */
1451	pmap_avail_setup();
1452
1453	/*
1454	 * Page align all regions.
1455	 * Non-page memory isn't very interesting to us.
1456	 * Also, sort the entries for ascending addresses.
1457	 */
1458	kernelstart = trunc_page(kernelstart);
1459	kernelend = round_page(kernelend);
1460	pmap_remove_avail(kernelstart, kernelend);
1461
1462	msgbuf_addr = pmap_steal_avail(MSGBUFSIZE,4);
1463
1464#ifdef DEBUG
1465	for (mp = pmap_avail; mp->size; mp++) {
1466		bzero((void *)mp->start, mp->size);
1467	}
1468#endif
1469
1470#define HTABENTS_32 1024
1471#define HTABENTS_64 2048
1472
1473	if (ppc_proc_is_64b) {
1474		pmap_ptab_cnt = HTABENTS_64;
1475		while (pmap_ptab_cnt * 2 < physmem)
1476			pmap_ptab_cnt <<= 1;
1477	} else {
1478		pmap_ptab_cnt = HTABENTS_32;
1479		while (HTABSIZE_32 < (ptoa(physmem) >> 7))
1480			pmap_ptab_cnt <<= 1;
1481	}
1482	/*
1483	 * allocate suitably aligned memory for HTAB
1484	 */
1485	if (ppc_proc_is_64b) {
1486		pmap_ptable64 = pmap_steal_avail(HTABMEMSZ_64, HTABMEMSZ_64);
1487		bzero((void *)pmap_ptable64, HTABMEMSZ_64);
1488		pmap_ptab_mask = pmap_ptab_cnt - 1;
1489	} else {
1490		pmap_ptable32 = pmap_steal_avail(HTABSIZE_32, HTABSIZE_32);
1491		bzero((void *)pmap_ptable32, HTABSIZE_32);
1492		pmap_ptab_mask = pmap_ptab_cnt - 1;
1493	}
1494
1495	/* allocate v->p mappings for pmap_kernel() */
1496	for (i = 0; i < VP_SR_SIZE; i++) {
1497		pmap_kernel()->pm_vp[i] = NULL;
1498	}
1499	vp1 = pmap_steal_avail(sizeof (struct pmapvp), 4);
1500	bzero (vp1, sizeof(struct pmapvp));
1501	pmap_kernel()->pm_vp[PPC_KERNEL_SR] = vp1;
1502	for (i = 0; i < VP_IDX1_SIZE; i++) {
1503		vp2 = vp1->vp[i] = pmap_steal_avail(sizeof (struct pmapvp), 4);
1504		bzero (vp2, sizeof(struct pmapvp));
1505		for (k = 0; k < VP_IDX2_SIZE; k++) {
1506			struct pte_desc *pted;
1507			pted = pmap_steal_avail(sizeof (struct pte_desc), 4);
1508			bzero (pted, sizeof (struct pte_desc));
1509			vp2->vp[k] = pted;
1510		}
1511	}
1512
1513	/*
1514	 * Initialize kernel pmap and hardware.
1515	 */
1516#if NPMAPS >= PPC_KERNEL_SEGMENT / 16
1517	usedsr[PPC_KERNEL_SEGMENT / 16 / (sizeof usedsr[0] * 8)]
1518		|= 1 << ((PPC_KERNEL_SEGMENT / 16) % (sizeof usedsr[0] * 8));
1519#endif
1520	for (i = 0; i < 16; i++)
1521		pmap_kernel()->pm_sr[i] = (PPC_KERNEL_SEG0 + i) | SR_NOEXEC;
1522
1523	if (ppc_nobat) {
1524		vp1 = pmap_steal_avail(sizeof (struct pmapvp), 4);
1525		bzero (vp1, sizeof(struct pmapvp));
1526		pmap_kernel()->pm_vp[0] = vp1;
1527		for (i = 0; i < VP_IDX1_SIZE; i++) {
1528			vp2 = vp1->vp[i] =
1529			    pmap_steal_avail(sizeof (struct pmapvp), 4);
1530			bzero (vp2, sizeof(struct pmapvp));
1531			for (k = 0; k < VP_IDX2_SIZE; k++) {
1532				struct pte_desc *pted;
1533				pted = pmap_steal_avail(sizeof (struct pte_desc), 4);
1534				bzero (pted, sizeof (struct pte_desc));
1535				vp2->vp[k] = pted;
1536			}
1537		}
1538
1539		/* first segment contains executable pages */
1540		pmap_kernel()->pm_exec[0]++;
1541		pmap_kernel()->pm_sr[0] &= ~SR_NOEXEC;
1542	} else {
1543		/*
1544		 * Setup fixed BAT registers.
1545		 *
1546		 * Note that we still run in real mode, and the BAT
1547		 * registers were cleared in cpu_bootstrap().
1548		 */
1549		battable[0].batl = BATL(0x00000000, BAT_M);
1550		if (physmem > atop(0x08000000))
1551			battable[0].batu = BATU(0x00000000, BAT_BL_256M);
1552		else
1553			battable[0].batu = BATU(0x00000000, BAT_BL_128M);
1554
1555		/* Map physical memory with BATs. */
1556		if (physmem > atop(0x10000000)) {
1557			battable[0x1].batl = BATL(0x10000000, BAT_M);
1558			battable[0x1].batu = BATU(0x10000000, BAT_BL_256M);
1559		}
1560		if (physmem > atop(0x20000000)) {
1561			battable[0x2].batl = BATL(0x20000000, BAT_M);
1562			battable[0x2].batu = BATU(0x20000000, BAT_BL_256M);
1563		}
1564		if (physmem > atop(0x30000000)) {
1565			battable[0x3].batl = BATL(0x30000000, BAT_M);
1566			battable[0x3].batu = BATU(0x30000000, BAT_BL_256M);
1567		}
1568		if (physmem > atop(0x40000000)) {
1569			battable[0x4].batl = BATL(0x40000000, BAT_M);
1570			battable[0x4].batu = BATU(0x40000000, BAT_BL_256M);
1571		}
1572		if (physmem > atop(0x50000000)) {
1573			battable[0x5].batl = BATL(0x50000000, BAT_M);
1574			battable[0x5].batu = BATU(0x50000000, BAT_BL_256M);
1575		}
1576		if (physmem > atop(0x60000000)) {
1577			battable[0x6].batl = BATL(0x60000000, BAT_M);
1578			battable[0x6].batu = BATU(0x60000000, BAT_BL_256M);
1579		}
1580		if (physmem > atop(0x70000000)) {
1581			battable[0x7].batl = BATL(0x70000000, BAT_M);
1582			battable[0x7].batu = BATU(0x70000000, BAT_BL_256M);
1583		}
1584	}
1585
1586	ppc_kvm_stolen += reserve_dumppages( (caddr_t)(VM_MIN_KERNEL_ADDRESS +
1587	    ppc_kvm_stolen));
1588
1589	pmap_avail_fixup();
1590	for (mp = pmap_avail; mp->size; mp++) {
1591		if (mp->start > 0x80000000)
1592			continue;
1593		if (mp->start + mp->size > 0x80000000)
1594			mp->size = 0x80000000 - mp->start;
1595		uvm_page_physload(atop(mp->start), atop(mp->start+mp->size),
1596		    atop(mp->start), atop(mp->start+mp->size), 0);
1597	}
1598}
1599
1600void
1601pmap_enable_mmu(void)
1602{
1603	uint32_t scratch, sdr1;
1604	int i;
1605
1606	/*
1607	 * For the PowerPC 970, ACCR = 3 inhibits loads and stores to
1608	 * pages with PTE_AC_64.  This is for execute-only mappings.
1609	 */
1610	if (ppc_proc_is_64b)
1611		asm volatile ("mtspr 29, %0" :: "r" (3));
1612
1613	if (!ppc_nobat) {
1614		extern caddr_t etext;
1615
1616		/* DBAT0 used for initial segment */
1617		ppc_mtdbat0l(battable[0].batl);
1618		ppc_mtdbat0u(battable[0].batu);
1619
1620		/* IBAT0 only covering the kernel .text */
1621		ppc_mtibat0l(battable[0].batl);
1622		if (round_page((vaddr_t)&etext) < 8*1024*1024)
1623			ppc_mtibat0u(BATU(0x00000000, BAT_BL_8M));
1624		else
1625			ppc_mtibat0u(BATU(0x00000000, BAT_BL_16M));
1626	}
1627
1628	for (i = 0; i < 16; i++)
1629		ppc_mtsrin(PPC_KERNEL_SEG0 + i, i << ADDR_SR_SHIFT);
1630
1631	if (ppc_proc_is_64b)
1632		sdr1 = (uint32_t)pmap_ptable64 | HTABSIZE_64;
1633	else
1634		sdr1 = (uint32_t)pmap_ptable32 | (pmap_ptab_mask >> 10);
1635
1636	asm volatile ("sync; mtsdr1 %0; isync" :: "r"(sdr1));
1637	tlbia();
1638
1639	asm volatile ("eieio; mfmsr %0; ori %0,%0,%1; mtmsr %0; sync; isync"
1640	    : "=r"(scratch) : "K"(PSL_IR|PSL_DR|PSL_ME|PSL_RI));
1641}
1642
1643/*
1644 * activate a pmap entry
1645 * All PTE entries exist in the same hash table.
1646 * Segment registers are filled on exit to user mode.
1647 */
1648void
1649pmap_activate(struct proc *p)
1650{
1651	struct pcb *pcb = &p->p_addr->u_pcb;
1652
1653	/* Set the current pmap. */
1654	pcb->pcb_pm = p->p_vmspace->vm_map.pmap;
1655	pmap_extract(pmap_kernel(),
1656	    (vaddr_t)pcb->pcb_pm, (paddr_t *)&pcb->pcb_pmreal);
1657	curcpu()->ci_curpm = pcb->pcb_pmreal;
1658}
1659
1660/*
1661 * deactivate a pmap entry
1662 * NOOP on powerpc
1663 */
1664void
1665pmap_deactivate(struct proc *p)
1666{
1667}
1668
1669/*
1670 * pmap_extract: extract a PA for the given VA
1671 */
1672
1673boolean_t
1674pmap_extract(pmap_t pm, vaddr_t va, paddr_t *pa)
1675{
1676	struct pte_desc *pted;
1677
1678	if (pm == pmap_kernel() && va < physmaxaddr) {
1679		*pa = va;
1680		return TRUE;
1681	}
1682
1683	PMAP_VP_LOCK(pm);
1684	pted = pmap_vp_lookup(pm, va);
1685	if (pted == NULL || !PTED_VALID(pted)) {
1686		PMAP_VP_UNLOCK(pm);
1687		return FALSE;
1688	}
1689
1690	if (ppc_proc_is_64b)
1691		*pa = (pted->p.pted_pte64.pte_lo & PTE_RPGN_64) |
1692		    (va & ~PTE_RPGN_64);
1693	else
1694		*pa = (pted->p.pted_pte32.pte_lo & PTE_RPGN_32) |
1695		    (va & ~PTE_RPGN_32);
1696
1697	PMAP_VP_UNLOCK(pm);
1698	return TRUE;
1699}
1700
1701#ifdef ALTIVEC
1702/*
1703 * Read an instruction from a given virtual memory address.
1704 * Execute-only protection is bypassed.
1705 */
1706int
1707pmap_copyinsn(pmap_t pm, vaddr_t va, uint32_t *insn)
1708{
1709	struct pte_desc *pted;
1710	paddr_t pa;
1711
1712	/* Assume pm != pmap_kernel(). */
1713	if (ppc_proc_is_64b) {
1714		/* inline pmap_extract */
1715		PMAP_VP_LOCK(pm);
1716		pted = pmap_vp_lookup(pm, va);
1717		if (pted == NULL || !PTED_VALID(pted)) {
1718			PMAP_VP_UNLOCK(pm);
1719			return EFAULT;
1720		}
1721		pa = (pted->p.pted_pte64.pte_lo & PTE_RPGN_64) |
1722		    (va & ~PTE_RPGN_64);
1723		PMAP_VP_UNLOCK(pm);
1724
1725		if (pa > physmaxaddr - sizeof(*insn))
1726			return EFAULT;
1727		*insn = *(uint32_t *)pa;
1728		return 0;
1729	} else
1730		return copyin32((void *)va, insn);
1731}
1732#endif
1733
1734u_int32_t
1735pmap_setusr(pmap_t pm, vaddr_t va)
1736{
1737	u_int32_t sr;
1738	u_int32_t oldsr;
1739
1740	sr = ptesr(pm->pm_sr, va);
1741
1742	/* user address range lock?? */
1743	asm volatile ("mfsr %0,%1" : "=r" (oldsr): "n"(PPC_USER_SR));
1744	asm volatile ("isync; mtsr %0,%1; isync" :: "n"(PPC_USER_SR), "r"(sr));
1745	return oldsr;
1746}
1747
1748void
1749pmap_popusr(u_int32_t sr)
1750{
1751	asm volatile ("isync; mtsr %0,%1; isync"
1752	    :: "n"(PPC_USER_SR), "r"(sr));
1753}
1754
1755int
1756_copyin(const void *udaddr, void *kaddr, size_t len)
1757{
1758	void *p;
1759	size_t l;
1760	u_int32_t oldsr;
1761	faultbuf env;
1762	void *oldh = curpcb->pcb_onfault;
1763
1764	while (len > 0) {
1765		p = PPC_USER_ADDR + ((u_int)udaddr & ~PPC_SEGMENT_MASK);
1766		l = (PPC_USER_ADDR + PPC_SEGMENT_LENGTH) - p;
1767		if (l > len)
1768			l = len;
1769		oldsr = pmap_setusr(curpcb->pcb_pm, (vaddr_t)udaddr);
1770		if (setfault(&env)) {
1771			pmap_popusr(oldsr);
1772			curpcb->pcb_onfault = oldh;
1773			return EFAULT;
1774		}
1775		bcopy(p, kaddr, l);
1776		pmap_popusr(oldsr);
1777		udaddr += l;
1778		kaddr += l;
1779		len -= l;
1780	}
1781	curpcb->pcb_onfault = oldh;
1782	return 0;
1783}
1784
1785int
1786copyout(const void *kaddr, void *udaddr, size_t len)
1787{
1788	void *p;
1789	size_t l;
1790	u_int32_t oldsr;
1791	faultbuf env;
1792	void *oldh = curpcb->pcb_onfault;
1793
1794	while (len > 0) {
1795		p = PPC_USER_ADDR + ((u_int)udaddr & ~PPC_SEGMENT_MASK);
1796		l = (PPC_USER_ADDR + PPC_SEGMENT_LENGTH) - p;
1797		if (l > len)
1798			l = len;
1799		oldsr = pmap_setusr(curpcb->pcb_pm, (vaddr_t)udaddr);
1800		if (setfault(&env)) {
1801			pmap_popusr(oldsr);
1802			curpcb->pcb_onfault = oldh;
1803			return EFAULT;
1804		}
1805
1806		bcopy(kaddr, p, l);
1807		pmap_popusr(oldsr);
1808		udaddr += l;
1809		kaddr += l;
1810		len -= l;
1811	}
1812	curpcb->pcb_onfault = oldh;
1813	return 0;
1814}
1815
1816int
1817copyin32(const uint32_t *udaddr, uint32_t *kaddr)
1818{
1819	volatile uint32_t *p;
1820	u_int32_t oldsr;
1821	faultbuf env;
1822	void *oldh = curpcb->pcb_onfault;
1823
1824	if ((u_int)udaddr & 0x3)
1825		return EFAULT;
1826
1827	p = PPC_USER_ADDR + ((u_int)udaddr & ~PPC_SEGMENT_MASK);
1828	oldsr = pmap_setusr(curpcb->pcb_pm, (vaddr_t)udaddr);
1829	if (setfault(&env)) {
1830		pmap_popusr(oldsr);
1831		curpcb->pcb_onfault = oldh;
1832		return EFAULT;
1833	}
1834	*kaddr = *p;
1835	pmap_popusr(oldsr);
1836	curpcb->pcb_onfault = oldh;
1837	return 0;
1838}
1839
1840int
1841_copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *done)
1842{
1843	const u_char *uaddr = udaddr;
1844	u_char *kp    = kaddr;
1845	u_char *up;
1846	u_char c;
1847	void   *p;
1848	size_t	 l;
1849	u_int32_t oldsr;
1850	int cnt = 0;
1851	faultbuf env;
1852	void *oldh = curpcb->pcb_onfault;
1853
1854	while (len > 0) {
1855		p = PPC_USER_ADDR + ((u_int)uaddr & ~PPC_SEGMENT_MASK);
1856		l = (PPC_USER_ADDR + PPC_SEGMENT_LENGTH) - p;
1857		up = p;
1858		if (l > len)
1859			l = len;
1860		len -= l;
1861		oldsr = pmap_setusr(curpcb->pcb_pm, (vaddr_t)uaddr);
1862		if (setfault(&env)) {
1863			if (done != NULL)
1864				*done =  cnt;
1865
1866			curpcb->pcb_onfault = oldh;
1867			pmap_popusr(oldsr);
1868			return EFAULT;
1869		}
1870		while (l > 0) {
1871			c = *up;
1872			*kp = c;
1873			if (c == 0) {
1874				if (done != NULL)
1875					*done = cnt + 1;
1876
1877				curpcb->pcb_onfault = oldh;
1878				pmap_popusr(oldsr);
1879				return 0;
1880			}
1881			up++;
1882			kp++;
1883			l--;
1884			cnt++;
1885			uaddr++;
1886		}
1887		pmap_popusr(oldsr);
1888	}
1889	curpcb->pcb_onfault = oldh;
1890	if (done != NULL)
1891		*done = cnt;
1892
1893	return ENAMETOOLONG;
1894}
1895
1896int
1897copyoutstr(const void *kaddr, void *udaddr, size_t len, size_t *done)
1898{
1899	u_char *uaddr = (void *)udaddr;
1900	const u_char *kp    = kaddr;
1901	u_char *up;
1902	u_char c;
1903	void   *p;
1904	size_t	 l;
1905	u_int32_t oldsr;
1906	int cnt = 0;
1907	faultbuf env;
1908	void *oldh = curpcb->pcb_onfault;
1909
1910	while (len > 0) {
1911		p = PPC_USER_ADDR + ((u_int)uaddr & ~PPC_SEGMENT_MASK);
1912		l = (PPC_USER_ADDR + PPC_SEGMENT_LENGTH) - p;
1913		up = p;
1914		if (l > len)
1915			l = len;
1916		len -= l;
1917		oldsr = pmap_setusr(curpcb->pcb_pm, (vaddr_t)uaddr);
1918		if (setfault(&env)) {
1919			if (done != NULL)
1920				*done =  cnt;
1921
1922			curpcb->pcb_onfault = oldh;
1923			pmap_popusr(oldsr);
1924			return EFAULT;
1925		}
1926		while (l > 0) {
1927			c = *kp;
1928			*up = c;
1929			if (c == 0) {
1930				if (done != NULL)
1931					*done = cnt + 1;
1932
1933				curpcb->pcb_onfault = oldh;
1934				pmap_popusr(oldsr);
1935				return 0;
1936			}
1937			up++;
1938			kp++;
1939			l--;
1940			cnt++;
1941			uaddr++;
1942		}
1943		pmap_popusr(oldsr);
1944	}
1945	curpcb->pcb_onfault = oldh;
1946	if (done != NULL)
1947		*done = cnt;
1948
1949	return ENAMETOOLONG;
1950}
1951
1952/*
1953 * sync instruction cache for user virtual address.
1954 * The address WAS JUST MAPPED, so we have a VALID USERSPACE mapping
1955 */
1956void
1957pmap_syncicache_user_virt(pmap_t pm, vaddr_t va)
1958{
1959	vaddr_t start;
1960	int oldsr;
1961
1962	if (pm != pmap_kernel()) {
1963		start = ((u_int)PPC_USER_ADDR + ((u_int)va &
1964		    ~PPC_SEGMENT_MASK));
1965		/* will only ever be page size, will not cross segments */
1966
1967		/* USER SEGMENT LOCK - MPXXX */
1968		oldsr = pmap_setusr(pm, va);
1969	} else {
1970		start = va; /* flush mapped page */
1971	}
1972
1973	syncicache((void *)start, PAGE_SIZE);
1974
1975	if (pm != pmap_kernel()) {
1976		pmap_popusr(oldsr);
1977		/* USER SEGMENT UNLOCK -MPXXX */
1978	}
1979}
1980
1981void
1982pmap_pted_ro(struct pte_desc *pted, vm_prot_t prot)
1983{
1984	if (ppc_proc_is_64b)
1985		pmap_pted_ro64(pted, prot);
1986	else
1987		pmap_pted_ro32(pted, prot);
1988}
1989
1990void
1991pmap_pted_ro64(struct pte_desc *pted, vm_prot_t prot)
1992{
1993	pmap_t pm = pted->pted_pmap;
1994	vaddr_t va = pted->pted_va & ~PAGE_MASK;
1995	struct vm_page *pg;
1996	void *pte;
1997	int s;
1998
1999	pg = PHYS_TO_VM_PAGE(pted->p.pted_pte64.pte_lo & PTE_RPGN_64);
2000	if (pg->pg_flags & PG_PMAP_EXE) {
2001		if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_WRITE) {
2002			atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
2003		} else {
2004			pmap_syncicache_user_virt(pm, va);
2005		}
2006	}
2007
2008	pted->p.pted_pte64.pte_lo &= ~PTE_PP_64;
2009	pted->p.pted_pte64.pte_lo |= PTE_RO_64;
2010
2011	if ((prot & PROT_EXEC) == 0)
2012		pted->p.pted_pte64.pte_lo |= PTE_N_64;
2013
2014	if ((prot & (PROT_READ | PROT_WRITE)) == 0)
2015		pted->p.pted_pte64.pte_lo |= PTE_AC_64;
2016
2017	PMAP_HASH_LOCK(s);
2018	if ((pte = pmap_ptedinhash(pted)) != NULL) {
2019		struct pte_64 *ptp64 = pte;
2020
2021		pte_del(ptp64, va);
2022
2023		if (PTED_MANAGED(pted)) { /* XXX */
2024			pmap_attr_save(ptp64->pte_lo & PTE_RPGN_64,
2025			    ptp64->pte_lo & (PTE_REF_64|PTE_CHG_64));
2026		}
2027
2028		/* Add a Page Table Entry, section 7.6.3.1. */
2029		ptp64->pte_lo = pted->p.pted_pte64.pte_lo;
2030		eieio();	/* Order 1st PTE update before 2nd. */
2031		ptp64->pte_hi |= PTE_VALID_64;
2032		sync();		/* Ensure updates completed. */
2033	}
2034	PMAP_HASH_UNLOCK(s);
2035}
2036
2037void
2038pmap_pted_ro32(struct pte_desc *pted, vm_prot_t prot)
2039{
2040	pmap_t pm = pted->pted_pmap;
2041	vaddr_t va = pted->pted_va & ~PAGE_MASK;
2042	struct vm_page *pg;
2043	void *pte;
2044	int s;
2045
2046	pg = PHYS_TO_VM_PAGE(pted->p.pted_pte32.pte_lo & PTE_RPGN_32);
2047	if (pg->pg_flags & PG_PMAP_EXE) {
2048		if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_WRITE) {
2049			atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
2050		} else {
2051			pmap_syncicache_user_virt(pm, va);
2052		}
2053	}
2054
2055	pted->p.pted_pte32.pte_lo &= ~PTE_PP_32;
2056	pted->p.pted_pte32.pte_lo |= PTE_RO_32;
2057
2058	PMAP_HASH_LOCK(s);
2059	if ((pte = pmap_ptedinhash(pted)) != NULL) {
2060		struct pte_32 *ptp32 = pte;
2061
2062		pte_del(ptp32, va);
2063
2064		if (PTED_MANAGED(pted)) { /* XXX */
2065			pmap_attr_save(ptp32->pte_lo & PTE_RPGN_32,
2066			    ptp32->pte_lo & (PTE_REF_32|PTE_CHG_32));
2067		}
2068
2069		/* Add a Page Table Entry, section 7.6.3.1. */
2070		ptp32->pte_lo &= ~(PTE_CHG_32|PTE_PP_32);
2071		ptp32->pte_lo |= PTE_RO_32;
2072		eieio();	/* Order 1st PTE update before 2nd. */
2073		ptp32->pte_hi |= PTE_VALID_32;
2074		sync();		/* Ensure updates completed. */
2075	}
2076	PMAP_HASH_UNLOCK(s);
2077}
2078
2079/*
2080 * Lower the protection on the specified physical page.
2081 *
2082 * There are only two cases, either the protection is going to 0,
2083 * or it is going to read-only.
2084 */
2085void
2086pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
2087{
2088	struct pte_desc *pted;
2089	void *pte;
2090	pmap_t pm;
2091	int s;
2092
2093	if (prot == PROT_NONE) {
2094		mtx_enter(&pg->mdpage.pv_mtx);
2095		while ((pted = LIST_FIRST(&(pg->mdpage.pv_list))) != NULL) {
2096			pmap_reference(pted->pted_pmap);
2097			pm = pted->pted_pmap;
2098			mtx_leave(&pg->mdpage.pv_mtx);
2099
2100			PMAP_VP_LOCK(pm);
2101
2102			/*
2103			 * We dropped the pvlist lock before grabbing
2104			 * the pmap lock to avoid lock ordering
2105			 * problems.  This means we have to check the
2106			 * pvlist again since somebody else might have
2107			 * modified it.  All we care about is that the
2108			 * pvlist entry matches the pmap we just
2109			 * locked.  If it doesn't, unlock the pmap and
2110			 * try again.
2111			 */
2112			mtx_enter(&pg->mdpage.pv_mtx);
2113			if ((pted = LIST_FIRST(&(pg->mdpage.pv_list))) == NULL ||
2114			    pted->pted_pmap != pm) {
2115				mtx_leave(&pg->mdpage.pv_mtx);
2116				PMAP_VP_UNLOCK(pm);
2117				pmap_destroy(pm);
2118				mtx_enter(&pg->mdpage.pv_mtx);
2119				continue;
2120			}
2121
2122			PMAP_HASH_LOCK(s);
2123			if ((pte = pmap_ptedinhash(pted)) != NULL)
2124				pte_zap(pte, pted);
2125			PMAP_HASH_UNLOCK(s);
2126
2127			pted->pted_va &= ~PTED_VA_MANAGED_M;
2128			LIST_REMOVE(pted, pted_pv_list);
2129			mtx_leave(&pg->mdpage.pv_mtx);
2130
2131			pmap_remove_pted(pm, pted);
2132
2133			PMAP_VP_UNLOCK(pm);
2134			pmap_destroy(pm);
2135			mtx_enter(&pg->mdpage.pv_mtx);
2136		}
2137		mtx_leave(&pg->mdpage.pv_mtx);
2138		/* page is being reclaimed, sync icache next use */
2139		atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
2140		return;
2141	}
2142
2143	mtx_enter(&pg->mdpage.pv_mtx);
2144	LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list)
2145		pmap_pted_ro(pted, prot);
2146	mtx_leave(&pg->mdpage.pv_mtx);
2147}
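
/*
 * Illustrative sketch only (not part of the pmap): the two ways the
 * routine above is typically driven.  The helper name and its single
 * argument are hypothetical.
 */
#if 0
static void
example_page_protect(struct vm_page *pg)
{
	/* Downgrade every mapping of the page to read-only. */
	pmap_page_protect(pg, PROT_READ);

	/* Revoke all mappings, e.g. before the page is reused. */
	pmap_page_protect(pg, PROT_NONE);
}
#endif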
2148
2149void
2150pmap_protect(pmap_t pm, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2151{
2152	if (prot & (PROT_READ | PROT_EXEC)) {
2153		struct pte_desc *pted;
2154
2155		PMAP_VP_LOCK(pm);
2156		while (sva < eva) {
2157			pted = pmap_vp_lookup(pm, sva);
2158			if (pted && PTED_VALID(pted))
2159				pmap_pted_ro(pted, prot);
2160			sva += PAGE_SIZE;
2161		}
2162		PMAP_VP_UNLOCK(pm);
2163		return;
2164	}
2165	pmap_remove(pm, sva, eva);
2166}
2167
2168/*
2169 * Restrict given range to physical memory
2170 */
2171void
2172pmap_real_memory(paddr_t *start, vsize_t *size)
2173{
2174	struct mem_region *mp;
2175
2176	for (mp = pmap_mem; mp->size; mp++) {
		if ((*start + *size) > mp->start &&
		    *start < (mp->start + mp->size)) {
2180			if (*start < mp->start) {
2181				*size -= mp->start - *start;
2182				*start = mp->start;
2183			}
2184			if ((*start + *size) > (mp->start + mp->size))
2185				*size = mp->start + mp->size - *start;
2186			return;
2187		}
2188	}
2189	*size = 0;
2190}
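
/*
 * Worked example (hypothetical values): with a single region
 * { start = 0x1000, size = 0x4000 }, a request of *start = 0x0800,
 * *size = 0x2000 overlaps the region, so *size is reduced by 0x800
 * and *start is raised to 0x1000; the end address 0x2800 already
 * lies inside the region, so the clamped range is [0x1000, 0x2800)
 * (*size = 0x1800).  A request that misses every region comes back
 * with *size = 0.
 */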
2191
2192void
pmap_init(void)
2194{
2195	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0,
2196	    "pmap", NULL);
2197	pool_setlowat(&pmap_pmap_pool, 2);
2198	pool_init(&pmap_vp_pool, sizeof(struct pmapvp), 0, IPL_VM, 0,
2199	    "vp", &pool_allocator_single);
2200	pool_setlowat(&pmap_vp_pool, 10);
2201	pool_init(&pmap_pted_pool, sizeof(struct pte_desc), 0, IPL_VM, 0,
2202	    "pted", NULL);
2203	pool_setlowat(&pmap_pted_pool, 20);
2204
2205	pmap_initialized = 1;
2206}
2207
2208void
2209pmap_proc_iflush(struct process *pr, vaddr_t va, vsize_t len)
2210{
2211	paddr_t pa;
2212	vsize_t clen;
2213
2214	while (len > 0) {
		/*
		 * Round va up to the next page boundary; the "+ 1"
		 * ensures a page-aligned va still advances by a full
		 * page rather than producing a zero-length chunk.
		 */
2216		clen = round_page(va + 1) - va;
2217		if (clen > len)
2218			clen = len;
2219
2220		if (pmap_extract(pr->ps_vmspace->vm_map.pmap, va, &pa)) {
2221			syncicache((void *)pa, clen);
2222		}
2223
2224		len -= clen;
2225		va += clen;
2226	}
2227}
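
/*
 * Note: this is used after instructions have been stored into another
 * process' address space (for example a debugger writing breakpoints
 * with ptrace(2)), so that the icache never executes stale bytes.
 */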
2228
2229/*
 * There are two routines, pte_spill_r and pte_spill_v.
 * The _r version handles only kernel faults that are not user
 * accesses.  The _v version handles all user faults as well as kernel
 * copyin/copyout "user" accesses.
2234 */
2235int
2236pte_spill_r(u_int32_t va, u_int32_t msr, u_int32_t dsisr, int exec_fault)
2237{
2238	pmap_t pm;
2239	struct pte_desc *pted;
2240	struct pte_desc pted_store;
2241
	/* The lookup is done physically to avoid taking further faults. */
2243
2244	/*
	 * This function only handles faults taken in kernel (supervisor)
	 * mode; faults taken in user mode (PSL_PR set) go to the full
	 * exception handler.
2246	 */
2247	if (msr & PSL_PR)
2248		return 0;
2249
	/* if copyin, throw to the full exception handler */
2251	if (VP_SR(va) == PPC_USER_SR)
2252		return 0;
2253
2254	pm = pmap_kernel();
2255
2256	/* 0 - physmaxaddr mapped 1-1 */
2257	if (va < physmaxaddr) {
2258		u_int32_t aligned_va;
2259		vm_prot_t prot = PROT_READ | PROT_WRITE;
2260		extern caddr_t kernel_text;
2261		extern caddr_t etext;
2262
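		/*
		 * The 1:1 region has no permanent pte_desc, so fill a
		 * temporary one on the stack and insert a hash PTE
		 * from it.
		 */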
2263		pted = &pted_store;
2264
2265		if (va >= trunc_page((vaddr_t)&kernel_text) &&
2266		    va < round_page((vaddr_t)&etext)) {
2267			prot |= PROT_EXEC;
2268		}
2269
2270		aligned_va = trunc_page(va);
2271		if (ppc_proc_is_64b) {
2272			pmap_fill_pte64(pm, aligned_va, aligned_va,
2273			    pted, prot, PMAP_CACHE_WB);
2274			pte_insert64(pted);
2275		} else {
2276			pmap_fill_pte32(pm, aligned_va, aligned_va,
2277			    pted, prot, PMAP_CACHE_WB);
2278			pte_insert32(pted);
2279		}
2280		return 1;
2281	}
2282
2283	return pte_spill_v(pm, va, dsisr, exec_fault);
2284}
2285
2286int
2287pte_spill_v(pmap_t pm, u_int32_t va, u_int32_t dsisr, int exec_fault)
2288{
2289	struct pte_desc *pted;
2290	int inserted = 0;
2291
2292	/*
2293	 * DSISR_DABR is set if the PowerPC 970 attempted to read or
2294	 * write an execute-only page.
2295	 */
2296	if (dsisr & DSISR_DABR)
2297		return 0;
2298
2299	/*
	 * If the current mapping is read-only and the access was a
	 * write, return 0.
2302	 */
2303	PMAP_VP_LOCK(pm);
2304	pted = pmap_vp_lookup(pm, va);
2305	if (pted == NULL || !PTED_VALID(pted))
2306		goto out;
2307
2308	/* Attempted to write a read-only page. */
2309	if (dsisr & DSISR_STORE) {
2310		if (ppc_proc_is_64b) {
2311			if ((pted->p.pted_pte64.pte_lo & PTE_PP_64) ==
2312			    PTE_RO_64)
2313				goto out;
2314		} else {
2315			if ((pted->p.pted_pte32.pte_lo & PTE_PP_32) ==
2316			    PTE_RO_32)
2317				goto out;
2318		}
2319	}
2320
	/* Attempted to execute a non-executable page. */
2322	if ((exec_fault != 0) && ((pted->pted_va & PTED_VA_EXEC_M) == 0))
2323		goto out;
2324
2325	inserted = 1;
2326	if (ppc_proc_is_64b)
2327		pte_insert64(pted);
2328	else
2329		pte_insert32(pted);
2330
2331out:
2332	PMAP_VP_UNLOCK(pm);
2333	return (inserted);
2334}
2335
2337/*
2338 * should pte_insert code avoid wired mappings?
2339 * is the stack safe?
2340 * is the pted safe? (physical)
2341 * -ugh
2342 */
2343void
2344pte_insert64(struct pte_desc *pted)
2345{
2346	struct pte_64 *ptp64;
2347	int off, secondary;
2348	int sr, idx, i;
2349	void *pte;
2350	int s;
2351
2352	PMAP_HASH_LOCK(s);
2353	if ((pte = pmap_ptedinhash(pted)) != NULL)
2354		pte_zap(pte, pted);
2355
2356	pted->pted_va &= ~(PTED_VA_HID_M|PTED_VA_PTEGIDX_M);
2357
2358	sr = ptesr(pted->pted_pmap->pm_sr, pted->pted_va);
2359	idx = pteidx(sr, pted->pted_va);
2360
	/*
	 * Instead of starting at the beginning of each PTEG, the code
	 * should pick a random location within the primary group and
	 * search all of its entries; if no free slot is found, do the
	 * same for the secondary group.  This would reduce the
	 * front-loading of the PTEG.  (A sketch of such a probe follows
	 * pte_insert64() below.)
	 */
2368
2369	/* first just try fill of primary hash */
2370	ptp64 = pmap_ptable64 + (idx) * 8;
2371	for (i = 0; i < 8; i++) {
2372		if (ptp64[i].pte_hi & PTE_VALID_64)
2373			continue;
2374
2375		pted->pted_va |= i;
2376
2377		/* Add a Page Table Entry, section 7.6.3.1. */
2378		ptp64[i].pte_hi = pted->p.pted_pte64.pte_hi & ~PTE_VALID_64;
2379		ptp64[i].pte_lo = pted->p.pted_pte64.pte_lo;
2380		eieio();	/* Order 1st PTE update before 2nd. */
2381		ptp64[i].pte_hi |= PTE_VALID_64;
2382		sync();		/* Ensure updates completed. */
2383
2384		goto out;
2385	}
2386
2387	/* try fill of secondary hash */
2388	ptp64 = pmap_ptable64 + (idx ^ pmap_ptab_mask) * 8;
2389	for (i = 0; i < 8; i++) {
2390		if (ptp64[i].pte_hi & PTE_VALID_64)
2391			continue;
2392
2393		pted->pted_va |= (i | PTED_VA_HID_M);
2394
2395		/* Add a Page Table Entry, section 7.6.3.1. */
2396		ptp64[i].pte_hi = pted->p.pted_pte64.pte_hi & ~PTE_VALID_64;
2397		ptp64[i].pte_lo = pted->p.pted_pte64.pte_lo;
2398		eieio();	/* Order 1st PTE update before 2nd. */
2399		ptp64[i].pte_hi |= (PTE_HID_64|PTE_VALID_64);
2400		sync();		/* Ensure updates completed. */
2401
2402		goto out;
2403	}
2404
	/*
	 * No free slot in either group; a decent replacement algorithm
	 * is still needed.  For now pick a pseudo-random victim using
	 * the timebase.
	 */
2406	off = ppc_mftb();
2407	secondary = off & 8;
2408
2410	pted->pted_va |= off & (PTED_VA_PTEGIDX_M|PTED_VA_HID_M);
2411
2412	idx = (idx ^ (PTED_HID(pted) ? pmap_ptab_mask : 0));
2413
2414	ptp64 = pmap_ptable64 + (idx * 8);
2415	ptp64 += PTED_PTEGIDX(pted); /* increment by index into pteg */
2416
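	/*
	 * If the chosen slot is already valid, reconstruct the victim's
	 * virtual address from its pte_hi: pte_del() needs the VA to
	 * invalidate the old translation, and the REF/CHG bits are
	 * saved before the entry is overwritten.
	 */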
2417	if (ptp64->pte_hi & PTE_VALID_64) {
2418		vaddr_t va;
2419
2420		/* Bits 9-19 */
2421		idx = (idx ^ ((ptp64->pte_hi & PTE_HID_64) ?
2422		    pmap_ptab_mask : 0));
2423		va = (ptp64->pte_hi >> PTE_VSID_SHIFT_64) ^ idx;
2424		va <<= ADDR_PIDX_SHIFT;
2425		/* Bits 4-8 */
2426		va |= (ptp64->pte_hi & PTE_API_64) << ADDR_API_SHIFT_32;
2427		/* Bits 0-3 */
2428		va |= (ptp64->pte_hi >> PTE_VSID_SHIFT_64)
2429		    << ADDR_SR_SHIFT;
2430
2431		pte_del(ptp64, va);
2432
2433		pmap_attr_save(ptp64->pte_lo & PTE_RPGN_64,
2434		    ptp64->pte_lo & (PTE_REF_64|PTE_CHG_64));
2435	}
2436
2437	/* Add a Page Table Entry, section 7.6.3.1. */
2438	ptp64->pte_hi = pted->p.pted_pte64.pte_hi & ~PTE_VALID_64;
2439	if (secondary)
2440		ptp64->pte_hi |= PTE_HID_64;
2441	ptp64->pte_lo = pted->p.pted_pte64.pte_lo;
2442	eieio();	/* Order 1st PTE update before 2nd. */
2443	ptp64->pte_hi |= PTE_VALID_64;
2444	sync();		/* Ensure updates completed. */
2445
2446out:
2447	PMAP_HASH_UNLOCK(s);
2448}
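
/*
 * Sketch only (not compiled): the randomized PTEG probe suggested in
 * the comments inside pte_insert64()/pte_insert32().  Every slot is
 * still visited, but the scan starts at a pseudo-random offset so new
 * entries spread across the group instead of piling up at the front.
 * The helper name and the use of ppc_mftb() as a cheap random source
 * are assumptions, not existing code.
 */
#if 0
static int
pteg_find_free_slot64(struct pte_64 *pteg)
{
	int start = ppc_mftb() & 7;	/* pseudo-random starting slot */
	int i, slot;

	for (i = 0; i < 8; i++) {
		slot = (start + i) & 7;
		if ((pteg[slot].pte_hi & PTE_VALID_64) == 0)
			return slot;	/* free slot found */
	}
	return -1;	/* group is full; caller must evict */
}
#endif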
2449
2450void
2451pte_insert32(struct pte_desc *pted)
2452{
2453	struct pte_32 *ptp32;
2454	int off, secondary;
2455	int sr, idx, i;
2456	void *pte;
2457	int s;
2458
2459	PMAP_HASH_LOCK(s);
2460	if ((pte = pmap_ptedinhash(pted)) != NULL)
2461		pte_zap(pte, pted);
2462
2463	pted->pted_va &= ~(PTED_VA_HID_M|PTED_VA_PTEGIDX_M);
2464
2465	sr = ptesr(pted->pted_pmap->pm_sr, pted->pted_va);
2466	idx = pteidx(sr, pted->pted_va);
2467
	/*
	 * Instead of starting at the beginning of each PTEG, the code
	 * should pick a random location within the primary group and
	 * search all of its entries; if no free slot is found, do the
	 * same for the secondary group.  This would reduce the
	 * front-loading of the PTEG (see the sketch after
	 * pte_insert64() above).
	 */
2475
2476	/* first just try fill of primary hash */
2477	ptp32 = pmap_ptable32 + (idx) * 8;
2478	for (i = 0; i < 8; i++) {
2479		if (ptp32[i].pte_hi & PTE_VALID_32)
2480			continue;
2481
2482		pted->pted_va |= i;
2483
2484		/* Add a Page Table Entry, section 7.6.3.1. */
2485		ptp32[i].pte_hi = pted->p.pted_pte32.pte_hi & ~PTE_VALID_32;
2486		ptp32[i].pte_lo = pted->p.pted_pte32.pte_lo;
2487		eieio();	/* Order 1st PTE update before 2nd. */
2488		ptp32[i].pte_hi |= PTE_VALID_32;
2489		sync();		/* Ensure updates completed. */
2490
2491		goto out;
2492	}
2493
2494	/* try fill of secondary hash */
2495	ptp32 = pmap_ptable32 + (idx ^ pmap_ptab_mask) * 8;
2496	for (i = 0; i < 8; i++) {
2497		if (ptp32[i].pte_hi & PTE_VALID_32)
2498			continue;
2499
2500		pted->pted_va |= (i | PTED_VA_HID_M);
2501
2502		/* Add a Page Table Entry, section 7.6.3.1. */
2503		ptp32[i].pte_hi = pted->p.pted_pte32.pte_hi & ~PTE_VALID_32;
2504		ptp32[i].pte_lo = pted->p.pted_pte32.pte_lo;
2505		eieio();	/* Order 1st PTE update before 2nd. */
2506		ptp32[i].pte_hi |= (PTE_HID_32|PTE_VALID_32);
2507		sync();		/* Ensure updates completed. */
2508
2509		goto out;
2510	}
2511
	/*
	 * No free slot in either group; a decent replacement algorithm
	 * is still needed.  For now pick a pseudo-random victim using
	 * the timebase.
	 */
2513	off = ppc_mftb();
2514	secondary = off & 8;
2515
2516	pted->pted_va |= off & (PTED_VA_PTEGIDX_M|PTED_VA_HID_M);
2517
2518	idx = (idx ^ (PTED_HID(pted) ? pmap_ptab_mask : 0));
2519
2520	ptp32 = pmap_ptable32 + (idx * 8);
2521	ptp32 += PTED_PTEGIDX(pted); /* increment by index into pteg */
2522
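	/*
	 * As in pte_insert64(): if the victim slot is valid, rebuild
	 * enough of its virtual address (the API bits plus the page
	 * index recovered from the VSID and hash index) for pte_del()
	 * to invalidate it, then save its REF/CHG bits.
	 */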
2523	if (ptp32->pte_hi & PTE_VALID_32) {
2524		vaddr_t va;
2525
2526		va = ((ptp32->pte_hi & PTE_API_32) << ADDR_API_SHIFT_32) |
2527		     ((((ptp32->pte_hi >> PTE_VSID_SHIFT_32) & SR_VSID)
2528			^(idx ^ ((ptp32->pte_hi & PTE_HID_32) ? 0x3ff : 0)))
2529			    & 0x3ff) << PAGE_SHIFT;
2530
2531		pte_del(ptp32, va);
2532
2533		pmap_attr_save(ptp32->pte_lo & PTE_RPGN_32,
2534		    ptp32->pte_lo & (PTE_REF_32|PTE_CHG_32));
2535	}
2536
2537	/* Add a Page Table Entry, section 7.6.3.1. */
2538	ptp32->pte_hi = pted->p.pted_pte32.pte_hi & ~PTE_VALID_32;
2539	if (secondary)
2540		ptp32->pte_hi |= PTE_HID_32;
2541	ptp32->pte_lo = pted->p.pted_pte32.pte_lo;
2542	eieio();	/* Order 1st PTE update before 2nd. */
2543	ptp32->pte_hi |= PTE_VALID_32;
2544	sync();		/* Ensure updates completed. */
2545
2546out:
2547	PMAP_HASH_UNLOCK(s);
2548}
2549