mmu_oea.c revision 183094
1/*-
2 * Copyright (c) 2001 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *        This product includes software developed by the NetBSD
19 *        Foundation, Inc. and its contributors.
20 * 4. Neither the name of The NetBSD Foundation nor the names of its
21 *    contributors may be used to endorse or promote products derived
22 *    from this software without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36/*-
37 * Copyright (C) 1995, 1996 Wolfgang Solfrank.
38 * Copyright (C) 1995, 1996 TooLs GmbH.
39 * All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. All advertising materials mentioning features or use of this software
50 *    must display the following acknowledgement:
51 *	This product includes software developed by TooLs GmbH.
52 * 4. The name of TooLs GmbH may not be used to endorse or promote products
53 *    derived from this software without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
56 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
59 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
61 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
63 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
64 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $
67 */
68/*-
69 * Copyright (C) 2001 Benno Rice.
70 * All rights reserved.
71 *
72 * Redistribution and use in source and binary forms, with or without
73 * modification, are permitted provided that the following conditions
74 * are met:
75 * 1. Redistributions of source code must retain the above copyright
76 *    notice, this list of conditions and the following disclaimer.
77 * 2. Redistributions in binary form must reproduce the above copyright
78 *    notice, this list of conditions and the following disclaimer in the
79 *    documentation and/or other materials provided with the distribution.
80 *
81 * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
82 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
83 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
84 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
85 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
86 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
87 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
88 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
89 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
90 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
91 */
92
93#include <sys/cdefs.h>
94__FBSDID("$FreeBSD: head/sys/powerpc/aim/mmu_oea.c 183094 2008-09-16 19:16:33Z marcel $");
95
96/*
97 * Manages physical address maps.
98 *
99 * In addition to hardware address maps, this module is called upon to
100 * provide software-use-only maps which may or may not be stored in the
101 * same form as hardware maps.  These pseudo-maps are used to store
102 * intermediate results from copy operations to and from address spaces.
103 *
104 * Since the information managed by this module is also stored by the
105 * logical address mapping module, this module may throw away valid virtual
106 * to physical mappings at almost any time.  However, invalidations of
107 * mappings must be done as requested.
108 *
109 * In order to cope with hardware architectures which make virtual to
110 * physical map invalidates expensive, this module may delay invalidate
111 * reduced protection operations until such time as they are actually
112 * necessary.  This module is given full information as to which processors
113 * are currently using which maps, and to when physical maps must be made
114 * correct.
115 */
116
117#include "opt_kstack_pages.h"
118
119#include <sys/param.h>
120#include <sys/kernel.h>
121#include <sys/ktr.h>
122#include <sys/lock.h>
123#include <sys/msgbuf.h>
124#include <sys/mutex.h>
125#include <sys/proc.h>
126#include <sys/sysctl.h>
127#include <sys/systm.h>
128#include <sys/vmmeter.h>
129
130#include <dev/ofw/openfirm.h>
131
132#include <vm/vm.h>
133#include <vm/vm_param.h>
134#include <vm/vm_kern.h>
135#include <vm/vm_page.h>
136#include <vm/vm_map.h>
137#include <vm/vm_object.h>
138#include <vm/vm_extern.h>
139#include <vm/vm_pageout.h>
140#include <vm/vm_pager.h>
141#include <vm/uma.h>
142
143#include <machine/cpu.h>
144#include <machine/powerpc.h>
145#include <machine/bat.h>
146#include <machine/frame.h>
147#include <machine/md_var.h>
148#include <machine/psl.h>
149#include <machine/pte.h>
150#include <machine/smp.h>
151#include <machine/sr.h>
152#include <machine/mmuvar.h>
153
154#include "mmu_if.h"
155
156#define	MOEA_DEBUG
157
158#define TODO	panic("%s: not implemented", __func__);
159
160#define	VSID_MAKE(sr, hash)	((sr) | (((hash) & 0xfffff) << 4))
161#define	VSID_TO_SR(vsid)	((vsid) & 0xf)
162#define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0xfffff)
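/*
 * Illustrative note (added; not part of the original source): with the
 * macros above, VSID_MAKE(0x3, 0x12345) yields 0x123453, and VSID_TO_SR()
 * and VSID_TO_HASH() recover 0x3 and 0x12345 from that value -- the segment
 * register index occupies the low 4 bits and the 20-bit hash sits above it.
 */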
163
164#define	PVO_PTEGIDX_MASK	0x007		/* which PTEG slot */
165#define	PVO_PTEGIDX_VALID	0x008		/* slot is valid */
166#define	PVO_WIRED		0x010		/* PVO entry is wired */
167#define	PVO_MANAGED		0x020		/* PVO entry is managed */
168#define	PVO_EXECUTABLE		0x040		/* PVO entry is executable */
169#define	PVO_BOOTSTRAP		0x080		/* PVO entry allocated during
170						   bootstrap */
171#define PVO_FAKE		0x100		/* fictitious phys page */
172#define	PVO_VADDR(pvo)		((pvo)->pvo_vaddr & ~ADDR_POFF)
173#define	PVO_ISEXECUTABLE(pvo)	((pvo)->pvo_vaddr & PVO_EXECUTABLE)
174#define PVO_ISFAKE(pvo)		((pvo)->pvo_vaddr & PVO_FAKE)
175#define	PVO_PTEGIDX_GET(pvo)	((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK)
176#define	PVO_PTEGIDX_ISSET(pvo)	((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID)
177#define	PVO_PTEGIDX_CLR(pvo)	\
178	((void)((pvo)->pvo_vaddr &= ~(PVO_PTEGIDX_VALID|PVO_PTEGIDX_MASK)))
179#define	PVO_PTEGIDX_SET(pvo, i)	\
180	((void)((pvo)->pvo_vaddr |= (i)|PVO_PTEGIDX_VALID))
181
182#define	MOEA_PVO_CHECK(pvo)
183
184struct ofw_map {
185	vm_offset_t	om_va;
186	vm_size_t	om_len;
187	vm_offset_t	om_pa;
188	u_int		om_mode;
189};
190
191/*
192 * Map of physical memory regions.
193 */
194static struct	mem_region *regions;
195static struct	mem_region *pregions;
196u_int           phys_avail_count;
197int		regions_sz, pregions_sz;
198static struct	ofw_map *translations;
199
200extern struct pmap ofw_pmap;
201
202/*
203 * Lock for the pteg and pvo tables.
204 */
205struct mtx	moea_table_mutex;
206
207/* tlbie instruction synchronization */
208static struct mtx tlbie_mtx;
209
210/*
211 * PTEG data.
212 */
213static struct	pteg *moea_pteg_table;
214u_int		moea_pteg_count;
215u_int		moea_pteg_mask;
216
217/*
218 * PVO data.
219 */
220struct	pvo_head *moea_pvo_table;		/* pvo entries by pteg index */
221struct	pvo_head moea_pvo_kunmanaged =
222    LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged);	/* list of unmanaged pages */
223struct	pvo_head moea_pvo_unmanaged =
224    LIST_HEAD_INITIALIZER(moea_pvo_unmanaged);	/* list of unmanaged pages */
225
226uma_zone_t	moea_upvo_zone;	/* zone for pvo entries for unmanaged pages */
227uma_zone_t	moea_mpvo_zone;	/* zone for pvo entries for managed pages */
228
229#define	BPVO_POOL_SIZE	32768
230static struct	pvo_entry *moea_bpvo_pool;
231static int	moea_bpvo_pool_index = 0;
232
233#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
234static u_int	moea_vsid_bitmap[NPMAPS / VSID_NBPW];
235
236static boolean_t moea_initialized = FALSE;
237
238/*
239 * Statistics.
240 */
241u_int	moea_pte_valid = 0;
242u_int	moea_pte_overflow = 0;
243u_int	moea_pte_replacements = 0;
244u_int	moea_pvo_entries = 0;
245u_int	moea_pvo_enter_calls = 0;
246u_int	moea_pvo_remove_calls = 0;
247u_int	moea_pte_spills = 0;
248SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid,
249    0, "");
250SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD,
251    &moea_pte_overflow, 0, "");
252SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD,
253    &moea_pte_replacements, 0, "");
254SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries,
255    0, "");
256SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD,
257    &moea_pvo_enter_calls, 0, "");
258SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD,
259    &moea_pvo_remove_calls, 0, "");
260SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD,
261    &moea_pte_spills, 0, "");
262
263/*
264 * Allocate physical memory for use in moea_bootstrap.
265 */
266static vm_offset_t	moea_bootstrap_alloc(vm_size_t, u_int);
267
268/*
269 * PTE calls.
270 */
271static int		moea_pte_insert(u_int, struct pte *);
272
273/*
274 * PVO calls.
275 */
276static int	moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *,
277		    vm_offset_t, vm_offset_t, u_int, int);
278static void	moea_pvo_remove(struct pvo_entry *, int);
279static struct	pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *);
280static struct	pte *moea_pvo_to_pte(const struct pvo_entry *, int);
281
282/*
283 * Utility routines.
284 */
285static void		moea_enter_locked(pmap_t, vm_offset_t, vm_page_t,
286			    vm_prot_t, boolean_t);
287static void		moea_syncicache(vm_offset_t, vm_size_t);
288static boolean_t	moea_query_bit(vm_page_t, int);
289static u_int		moea_clear_bit(vm_page_t, int, int *);
290static void		moea_kremove(mmu_t, vm_offset_t);
291int		moea_pte_spill(vm_offset_t);
292
293/*
294 * Kernel MMU interface
295 */
296void moea_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t);
297void moea_clear_modify(mmu_t, vm_page_t);
298void moea_clear_reference(mmu_t, vm_page_t);
299void moea_copy_page(mmu_t, vm_page_t, vm_page_t);
300void moea_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t);
301void moea_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
302    vm_prot_t);
303void moea_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
304vm_paddr_t moea_extract(mmu_t, pmap_t, vm_offset_t);
305vm_page_t moea_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t);
306void moea_init(mmu_t);
307boolean_t moea_is_modified(mmu_t, vm_page_t);
308boolean_t moea_ts_referenced(mmu_t, vm_page_t);
309vm_offset_t moea_map(mmu_t, vm_offset_t *, vm_offset_t, vm_offset_t, int);
310boolean_t moea_page_exists_quick(mmu_t, pmap_t, vm_page_t);
311int moea_page_wired_mappings(mmu_t, vm_page_t);
312void moea_pinit(mmu_t, pmap_t);
313void moea_pinit0(mmu_t, pmap_t);
314void moea_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
315void moea_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
316void moea_qremove(mmu_t, vm_offset_t, int);
317void moea_release(mmu_t, pmap_t);
318void moea_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
319void moea_remove_all(mmu_t, vm_page_t);
320void moea_remove_write(mmu_t, vm_page_t);
321void moea_zero_page(mmu_t, vm_page_t);
322void moea_zero_page_area(mmu_t, vm_page_t, int, int);
323void moea_zero_page_idle(mmu_t, vm_page_t);
324void moea_activate(mmu_t, struct thread *);
325void moea_deactivate(mmu_t, struct thread *);
326void moea_bootstrap(mmu_t, vm_offset_t, vm_offset_t);
327void *moea_mapdev(mmu_t, vm_offset_t, vm_size_t);
328void moea_unmapdev(mmu_t, vm_offset_t, vm_size_t);
329vm_offset_t moea_kextract(mmu_t, vm_offset_t);
330void moea_kenter(mmu_t, vm_offset_t, vm_offset_t);
331boolean_t moea_dev_direct_mapped(mmu_t, vm_offset_t, vm_size_t);
332boolean_t moea_page_executable(mmu_t, vm_page_t);
333
334static mmu_method_t moea_methods[] = {
335	MMUMETHOD(mmu_change_wiring,	moea_change_wiring),
336	MMUMETHOD(mmu_clear_modify,	moea_clear_modify),
337	MMUMETHOD(mmu_clear_reference,	moea_clear_reference),
338	MMUMETHOD(mmu_copy_page,	moea_copy_page),
339	MMUMETHOD(mmu_enter,		moea_enter),
340	MMUMETHOD(mmu_enter_object,	moea_enter_object),
341	MMUMETHOD(mmu_enter_quick,	moea_enter_quick),
342	MMUMETHOD(mmu_extract,		moea_extract),
343	MMUMETHOD(mmu_extract_and_hold,	moea_extract_and_hold),
344	MMUMETHOD(mmu_init,		moea_init),
345	MMUMETHOD(mmu_is_modified,	moea_is_modified),
346	MMUMETHOD(mmu_ts_referenced,	moea_ts_referenced),
347	MMUMETHOD(mmu_map,     		moea_map),
348	MMUMETHOD(mmu_page_exists_quick,moea_page_exists_quick),
349	MMUMETHOD(mmu_page_wired_mappings,moea_page_wired_mappings),
350	MMUMETHOD(mmu_pinit,		moea_pinit),
351	MMUMETHOD(mmu_pinit0,		moea_pinit0),
352	MMUMETHOD(mmu_protect,		moea_protect),
353	MMUMETHOD(mmu_qenter,		moea_qenter),
354	MMUMETHOD(mmu_qremove,		moea_qremove),
355	MMUMETHOD(mmu_release,		moea_release),
356	MMUMETHOD(mmu_remove,		moea_remove),
357	MMUMETHOD(mmu_remove_all,      	moea_remove_all),
358	MMUMETHOD(mmu_remove_write,	moea_remove_write),
359	MMUMETHOD(mmu_zero_page,       	moea_zero_page),
360	MMUMETHOD(mmu_zero_page_area,	moea_zero_page_area),
361	MMUMETHOD(mmu_zero_page_idle,	moea_zero_page_idle),
362	MMUMETHOD(mmu_activate,		moea_activate),
363	MMUMETHOD(mmu_deactivate,      	moea_deactivate),
364
365	/* Internal interfaces */
366	MMUMETHOD(mmu_bootstrap,       	moea_bootstrap),
367	MMUMETHOD(mmu_mapdev,		moea_mapdev),
368	MMUMETHOD(mmu_unmapdev,		moea_unmapdev),
369	MMUMETHOD(mmu_kextract,		moea_kextract),
370	MMUMETHOD(mmu_kenter,		moea_kenter),
371	MMUMETHOD(mmu_dev_direct_mapped,moea_dev_direct_mapped),
372	MMUMETHOD(mmu_page_executable,	moea_page_executable),
373
374	{ 0, 0 }
375};
376
377static mmu_def_t oea_mmu = {
378	MMU_TYPE_OEA,
379	moea_methods,
380	0
381};
382MMU_DEF(oea_mmu);
383
384static void
385tlbie(vm_offset_t va)
386{
387
388	mtx_lock_spin(&tlbie_mtx);
389	__asm __volatile("tlbie %0" :: "r"(va));
390	__asm __volatile("tlbsync");
391	powerpc_sync();
392	mtx_unlock_spin(&tlbie_mtx);
393}
394
395static void
396tlbia(void)
397{
398	vm_offset_t va;
399
400	for (va = 0; va < 0x00040000; va += 0x00001000) {
401		__asm __volatile("tlbie %0" :: "r"(va));
402		powerpc_sync();
403	}
404	__asm __volatile("tlbsync");
405	powerpc_sync();
406}
407
408static __inline int
409va_to_sr(u_int *sr, vm_offset_t va)
410{
411	return (sr[(uintptr_t)va >> ADDR_SR_SHFT]);
412}
413
414static __inline u_int
415va_to_pteg(u_int sr, vm_offset_t addr)
416{
417	u_int hash;
418
419	hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >>
420	    ADDR_PIDX_SHFT);
421	return (hash & moea_pteg_mask);
422}
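/*
 * Added note (not part of the original source): va_to_pteg() above is the
 * primary PTEG hash of the PowerPC OEA, i.e. the segment's VSID XORed with
 * the page index taken from the effective address, then truncated with
 * moea_pteg_mask.  For example, assuming a VSID of 0x123456 and a page index
 * of 0xabc, the hash is 0x123456 ^ 0xabc = 0x123eea before masking.  The
 * secondary hash used elsewhere in this file is its ones' complement within
 * the table, which is why moea_pvo_pte_index() flips the index with
 * "pteidx ^= moea_pteg_mask * 8" when PTE_HID is set.
 */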
423
424static __inline struct pvo_head *
425pa_to_pvoh(vm_offset_t pa, vm_page_t *pg_p)
426{
427	struct	vm_page *pg;
428
429	pg = PHYS_TO_VM_PAGE(pa);
430
431	if (pg_p != NULL)
432		*pg_p = pg;
433
434	if (pg == NULL)
435		return (&moea_pvo_unmanaged);
436
437	return (&pg->md.mdpg_pvoh);
438}
439
440static __inline struct pvo_head *
441vm_page_to_pvoh(vm_page_t m)
442{
443
444	return (&m->md.mdpg_pvoh);
445}
446
447static __inline void
448moea_attr_clear(vm_page_t m, int ptebit)
449{
450
451	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
452	m->md.mdpg_attrs &= ~ptebit;
453}
454
455static __inline int
456moea_attr_fetch(vm_page_t m)
457{
458
459	return (m->md.mdpg_attrs);
460}
461
462static __inline void
463moea_attr_save(vm_page_t m, int ptebit)
464{
465
466	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
467	m->md.mdpg_attrs |= ptebit;
468}
469
470static __inline int
471moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt)
472{
473	if (pt->pte_hi == pvo_pt->pte_hi)
474		return (1);
475
476	return (0);
477}
478
479static __inline int
480moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which)
481{
482	return (pt->pte_hi & ~PTE_VALID) ==
483	    (((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
484	    ((va >> ADDR_API_SHFT) & PTE_API) | which);
485}
486
487static __inline void
488moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo)
489{
490
491	mtx_assert(&moea_table_mutex, MA_OWNED);
492
493	/*
494	 * Construct a PTE.  Default to IMB initially.  Valid bit only gets
495	 * set when the real pte is set in memory.
496	 *
497	 * Note: Don't set the valid bit for correct operation of tlb update.
498	 */
499	pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
500	    (((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API);
501	pt->pte_lo = pte_lo;
502}
503
504static __inline void
505moea_pte_synch(struct pte *pt, struct pte *pvo_pt)
506{
507
508	mtx_assert(&moea_table_mutex, MA_OWNED);
509	pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG);
510}
511
512static __inline void
513moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit)
514{
515
516	mtx_assert(&moea_table_mutex, MA_OWNED);
517
518	/*
519	 * As shown in Section 7.6.3.2.3
520	 */
521	pt->pte_lo &= ~ptebit;
522	tlbie(va);
523}
524
525static __inline void
526moea_pte_set(struct pte *pt, struct pte *pvo_pt)
527{
528
529	mtx_assert(&moea_table_mutex, MA_OWNED);
530	pvo_pt->pte_hi |= PTE_VALID;
531
532	/*
533	 * Update the PTE as defined in section 7.6.3.1.
534	 * Note that the REF/CHG bits are from pvo_pt and thus should have
535	 * been saved so this routine can restore them (if desired).
536	 */
537	pt->pte_lo = pvo_pt->pte_lo;
538	powerpc_sync();
539	pt->pte_hi = pvo_pt->pte_hi;
540	powerpc_sync();
541	moea_pte_valid++;
542}
543
544static __inline void
545moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
546{
547
548	mtx_assert(&moea_table_mutex, MA_OWNED);
549	pvo_pt->pte_hi &= ~PTE_VALID;
550
551	/*
552	 * Force the ref & chg bits back into the PTEs.
553	 */
554	powerpc_sync();
555
556	/*
557	 * Invalidate the pte.
558	 */
559	pt->pte_hi &= ~PTE_VALID;
560
561	tlbie(va);
562
563	/*
564	 * Save the ref & chg bits.
565	 */
566	moea_pte_synch(pt, pvo_pt);
567	moea_pte_valid--;
568}
569
570static __inline void
571moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
572{
573
574	/*
575	 * Invalidate the PTE
576	 */
577	moea_pte_unset(pt, pvo_pt, va);
578	moea_pte_set(pt, pvo_pt);
579}
580
581/*
582 * Quick sort callout for comparing memory regions.
583 */
584static int	mr_cmp(const void *a, const void *b);
585static int	om_cmp(const void *a, const void *b);
586
587static int
588mr_cmp(const void *a, const void *b)
589{
590	const struct	mem_region *regiona;
591	const struct	mem_region *regionb;
592
593	regiona = a;
594	regionb = b;
595	if (regiona->mr_start < regionb->mr_start)
596		return (-1);
597	else if (regiona->mr_start > regionb->mr_start)
598		return (1);
599	else
600		return (0);
601}
602
603static int
604om_cmp(const void *a, const void *b)
605{
606	const struct	ofw_map *mapa;
607	const struct	ofw_map *mapb;
608
609	mapa = a;
610	mapb = b;
611	if (mapa->om_pa < mapb->om_pa)
612		return (-1);
613	else if (mapa->om_pa > mapb->om_pa)
614		return (1);
615	else
616		return (0);
617}
618
619void
620pmap_cpu_bootstrap(int ap)
621{
622	u_int sdr;
623	int i;
624
625	if (ap) {
626		powerpc_sync();
627		__asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu));
628		__asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl));
629		isync();
630		__asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu));
631		__asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl));
632		isync();
633	}
634
635	__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
636	__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
637	isync();
638
639	__asm __volatile("mtibatu 1,%0" :: "r"(0));
640	__asm __volatile("mtdbatu 2,%0" :: "r"(0));
641	__asm __volatile("mtibatu 2,%0" :: "r"(0));
642	__asm __volatile("mtdbatu 3,%0" :: "r"(0));
643	__asm __volatile("mtibatu 3,%0" :: "r"(0));
644	isync();
645
646	for (i = 0; i < 16; i++)
647		mtsrin(i << ADDR_SR_SHFT, EMPTY_SEGMENT);
648
649	__asm __volatile("mtsr %0,%1" :: "n"(KERNEL_SR), "r"(KERNEL_SEGMENT));
650	__asm __volatile("mtsr %0,%1" :: "n"(KERNEL2_SR), "r"(KERNEL2_SEGMENT));
651	powerpc_sync();
652
653	sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10);
654	__asm __volatile("mtsdr1 %0" :: "r"(sdr));
655	isync();
656
657	tlbia();
658}
659
660void
661moea_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
662{
663	ihandle_t	mmui;
664	phandle_t	chosen, mmu;
665	int		sz;
666	int		i, j;
667	int		ofw_mappings;
668	vm_size_t	size, physsz, hwphyssz;
669	vm_offset_t	pa, va, off;
670
671        /*
672         * Set up BAT0 to map the lowest 256 MB area
673         */
674        battable[0x0].batl = BATL(0x00000000, BAT_M, BAT_PP_RW);
675        battable[0x0].batu = BATU(0x00000000, BAT_BL_256M, BAT_Vs);
676
677        /*
678         * Map PCI memory space.
679         */
680        battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW);
681        battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs);
682
683        battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW);
684        battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs);
685
686        battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW);
687        battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs);
688
689        battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW);
690        battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs);
691
692        /*
693         * Map obio devices.
694         */
695        battable[0xf].batl = BATL(0xf0000000, BAT_I|BAT_G, BAT_PP_RW);
696        battable[0xf].batu = BATU(0xf0000000, BAT_BL_256M, BAT_Vs);
697
698	/*
699	 * Use an IBAT and a DBAT to map the bottom segment of memory
700	 * where we are.
701	 */
702	__asm (".balign 32; \n"
703	       "mtibatu 0,%0; mtibatl 0,%1; isync; \n"
704	       "mtdbatu 0,%0; mtdbatl 0,%1; isync"
705	    :: "r"(battable[0].batu), "r"(battable[0].batl));
706
707	/* map pci space */
708	__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
709	__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
710	isync();
711
712	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
713	CTR0(KTR_PMAP, "moea_bootstrap: physical memory");
714
715	qsort(pregions, pregions_sz, sizeof(*pregions), mr_cmp);
716	for (i = 0; i < pregions_sz; i++) {
717		vm_offset_t pa;
718		vm_offset_t end;
719
720		CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)",
721			pregions[i].mr_start,
722			pregions[i].mr_start + pregions[i].mr_size,
723			pregions[i].mr_size);
724		/*
725		 * Install entries into the BAT table to allow all
726		 * of physmem to be covered by on-demand BAT entries.
727		 * The loop will sometimes set the same battable element
728		 * twice, but that's fine since they won't be used for
729		 * a while yet.
730		 */
731		pa = pregions[i].mr_start & 0xf0000000;
732		end = pregions[i].mr_start + pregions[i].mr_size;
733		do {
734                        u_int n = pa >> ADDR_SR_SHFT;
735
736			battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW);
737			battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs);
738			pa += SEGMENT_LENGTH;
739		} while (pa < end);
740	}
741
742	if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz)
743		panic("moea_bootstrap: phys_avail too small");
744	qsort(regions, regions_sz, sizeof(*regions), mr_cmp);
745	phys_avail_count = 0;
746	physsz = 0;
747	hwphyssz = 0;
748	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
749	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
750		CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start,
751		    regions[i].mr_start + regions[i].mr_size,
752		    regions[i].mr_size);
753		if (hwphyssz != 0 &&
754		    (physsz + regions[i].mr_size) >= hwphyssz) {
755			if (physsz < hwphyssz) {
756				phys_avail[j] = regions[i].mr_start;
757				phys_avail[j + 1] = regions[i].mr_start +
758				    hwphyssz - physsz;
759				physsz = hwphyssz;
760				phys_avail_count++;
761			}
762			break;
763		}
764		phys_avail[j] = regions[i].mr_start;
765		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
766		phys_avail_count++;
767		physsz += regions[i].mr_size;
768	}
769	physmem = btoc(physsz);
770
771	/*
772	 * Allocate PTEG table.
773	 */
774#ifdef PTEGCOUNT
775	moea_pteg_count = PTEGCOUNT;
776#else
777	moea_pteg_count = 0x1000;
778
779	while (moea_pteg_count < physmem)
780		moea_pteg_count <<= 1;
781
782	moea_pteg_count >>= 1;
783#endif /* PTEGCOUNT */
784
785	size = moea_pteg_count * sizeof(struct pteg);
786	CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count,
787	    size);
788	moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size);
789	CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table);
790	bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg));
791	moea_pteg_mask = moea_pteg_count - 1;
792
793	/*
794	 * Allocate pv/overflow lists.
795	 */
796	size = sizeof(struct pvo_head) * moea_pteg_count;
797	moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size,
798	    PAGE_SIZE);
799	CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table);
800	for (i = 0; i < moea_pteg_count; i++)
801		LIST_INIT(&moea_pvo_table[i]);
802
803	/*
804	 * Initialize the lock that synchronizes access to the pteg and pvo
805	 * tables.
806	 */
807	mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF |
808	    MTX_RECURSE);
809
810	mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN);
811
812	/*
813	 * Initialise the unmanaged pvo pool.
814	 */
815	moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc(
816		BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0);
817	moea_bpvo_pool_index = 0;
818
819	/*
820	 * Make sure kernel vsid is allocated as well as VSID 0.
821	 */
822	moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW]
823		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
824	moea_vsid_bitmap[0] |= 1;
825
826	/*
827	 * Set up the Open Firmware pmap and add its mappings.
828	 */
829	moea_pinit(mmup, &ofw_pmap);
830	ofw_pmap.pm_sr[KERNEL_SR] = KERNEL_SEGMENT;
831	ofw_pmap.pm_sr[KERNEL2_SR] = KERNEL2_SEGMENT;
832	if ((chosen = OF_finddevice("/chosen")) == -1)
833		panic("moea_bootstrap: can't find /chosen");
834	OF_getprop(chosen, "mmu", &mmui, 4);
835	if ((mmu = OF_instance_to_package(mmui)) == -1)
836		panic("moea_bootstrap: can't get mmu package");
837	if ((sz = OF_getproplen(mmu, "translations")) == -1)
838		panic("moea_bootstrap: can't get ofw translation count");
839	translations = NULL;
840	for (i = 0; phys_avail[i] != 0; i += 2) {
841		if (phys_avail[i + 1] >= sz) {
842			translations = (struct ofw_map *)phys_avail[i];
843			break;
844		}
845	}
846	if (translations == NULL)
847		panic("moea_bootstrap: no space to copy translations");
848	bzero(translations, sz);
849	if (OF_getprop(mmu, "translations", translations, sz) == -1)
850		panic("moea_bootstrap: can't get ofw translations");
851	CTR0(KTR_PMAP, "moea_bootstrap: translations");
852	sz /= sizeof(*translations);
853	qsort(translations, sz, sizeof (*translations), om_cmp);
854	for (i = 0, ofw_mappings = 0; i < sz; i++) {
855		CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x",
856		    translations[i].om_pa, translations[i].om_va,
857		    translations[i].om_len);
858
859		/*
860		 * If the mapping is 1:1, let the RAM and device on-demand
861		 * BAT tables take care of the translation.
862		 */
863		if (translations[i].om_va == translations[i].om_pa)
864			continue;
865
866		/* Enter the pages */
867		for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
868			struct	vm_page m;
869
870			m.phys_addr = translations[i].om_pa + off;
871			PMAP_LOCK(&ofw_pmap);
872			moea_enter_locked(&ofw_pmap,
873				   translations[i].om_va + off, &m,
874				   VM_PROT_ALL, 1);
875			PMAP_UNLOCK(&ofw_pmap);
876			ofw_mappings++;
877		}
878	}
879
880	/*
881	 * Calculate the last available physical address.
882	 */
883	for (i = 0; phys_avail[i + 2] != 0; i += 2)
884		;
885	Maxmem = powerpc_btop(phys_avail[i + 1]);
886
887	/*
888	 * Initialize the kernel pmap (which is statically allocated).
889	 */
890	PMAP_LOCK_INIT(kernel_pmap);
891	for (i = 0; i < 16; i++) {
892		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT;
893	}
894	kernel_pmap->pm_sr[KERNEL_SR] = KERNEL_SEGMENT;
895	kernel_pmap->pm_sr[KERNEL2_SR] = KERNEL2_SEGMENT;
896	kernel_pmap->pm_active = ~0;
897
898	pmap_cpu_bootstrap(0);
899
900	pmap_bootstrapped++;
901
902	/*
903	 * Set the start and end of kva.
904	 */
905	virtual_avail = VM_MIN_KERNEL_ADDRESS;
906	virtual_end = VM_MAX_KERNEL_ADDRESS;
907
908	/*
909	 * Allocate a kernel stack with a guard page for thread0 and map it
910	 * into the kernel page map.
911	 */
912	pa = moea_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
913	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
914	virtual_avail = va + KSTACK_PAGES * PAGE_SIZE;
915	CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va);
916	thread0.td_kstack = va;
917	thread0.td_kstack_pages = KSTACK_PAGES;
918	for (i = 0; i < KSTACK_PAGES; i++) {
919		moea_kenter(mmup, va, pa);
920		pa += PAGE_SIZE;
921		va += PAGE_SIZE;
922	}
923
924	/*
925	 * Allocate virtual address space for the message buffer.
926	 */
927	pa = msgbuf_phys = moea_bootstrap_alloc(MSGBUF_SIZE, PAGE_SIZE);
928	msgbufp = (struct msgbuf *)virtual_avail;
929	va = virtual_avail;
930	virtual_avail += round_page(MSGBUF_SIZE);
931	while (va < virtual_avail) {
932		moea_kenter(mmup, va, pa);
933		pa += PAGE_SIZE;
934		va += PAGE_SIZE;
935	}
936}
937
938/*
939 * Activate a user pmap.  The pmap must be activated before its address
940 * space can be accessed in any way.
941 */
942void
943moea_activate(mmu_t mmu, struct thread *td)
944{
945	pmap_t	pm, pmr;
946
947	/*
948	 * Load all the data we need up front to encourage the compiler to
949	 * not issue any loads while we have interrupts disabled below.
950	 */
951	pm = &td->td_proc->p_vmspace->vm_pmap;
952
953	if ((pmr = (pmap_t)moea_kextract(mmu, (vm_offset_t)pm)) == NULL)
954		pmr = pm;
955
956	pm->pm_active |= PCPU_GET(cpumask);
957	PCPU_SET(curpmap, pmr);
958}
959
960void
961moea_deactivate(mmu_t mmu, struct thread *td)
962{
963	pmap_t	pm;
964
965	pm = &td->td_proc->p_vmspace->vm_pmap;
966	pm->pm_active &= ~PCPU_GET(cpumask);
967	PCPU_SET(curpmap, NULL);
968}
969
970void
971moea_change_wiring(mmu_t mmu, pmap_t pm, vm_offset_t va, boolean_t wired)
972{
973	struct	pvo_entry *pvo;
974
975	PMAP_LOCK(pm);
976	pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
977
978	if (pvo != NULL) {
979		if (wired) {
980			if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
981				pm->pm_stats.wired_count++;
982			pvo->pvo_vaddr |= PVO_WIRED;
983		} else {
984			if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
985				pm->pm_stats.wired_count--;
986			pvo->pvo_vaddr &= ~PVO_WIRED;
987		}
988	}
989	PMAP_UNLOCK(pm);
990}
991
992void
993moea_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst)
994{
995	vm_offset_t	dst;
996	vm_offset_t	src;
997
998	dst = VM_PAGE_TO_PHYS(mdst);
999	src = VM_PAGE_TO_PHYS(msrc);
1000
1001	kcopy((void *)src, (void *)dst, PAGE_SIZE);
1002}
1003
1004/*
1005 * Zero a page of physical memory by temporarily mapping it into the tlb.
1006 */
1007void
1008moea_zero_page(mmu_t mmu, vm_page_t m)
1009{
1010	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1011	void *va = (void *)pa;
1012
1013	bzero(va, PAGE_SIZE);
1014}
1015
1016void
1017moea_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
1018{
1019	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1020	void *va = (void *)(pa + off);
1021
1022	bzero(va, size);
1023}
1024
1025void
1026moea_zero_page_idle(mmu_t mmu, vm_page_t m)
1027{
1028	vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1029	void *va = (void *)pa;
1030
1031	bzero(va, PAGE_SIZE);
1032}
1033
1034/*
1035 * Map the given physical page at the specified virtual address in the
1036 * target pmap with the protection requested.  If specified the page
1037 * will be wired down.
1038 */
1039void
1040moea_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1041	   boolean_t wired)
1042{
1043
1044	vm_page_lock_queues();
1045	PMAP_LOCK(pmap);
1046	moea_enter_locked(pmap, va, m, prot, wired);
1047	vm_page_unlock_queues();
1048	PMAP_UNLOCK(pmap);
1049}
1050
1051/*
1052 * Map the given physical page at the specified virtual address in the
1053 * target pmap with the protection requested.  If specified the page
1054 * will be wired down.
1055 *
1056 * The page queues and pmap must be locked.
1057 */
1058static void
1059moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1060    boolean_t wired)
1061{
1062	struct		pvo_head *pvo_head;
1063	uma_zone_t	zone;
1064	vm_page_t	pg;
1065	u_int		pte_lo, pvo_flags, was_exec, i;
1066	int		error;
1067
1068	if (!moea_initialized) {
1069		pvo_head = &moea_pvo_kunmanaged;
1070		zone = moea_upvo_zone;
1071		pvo_flags = 0;
1072		pg = NULL;
1073		was_exec = PTE_EXEC;
1074	} else {
1075		pvo_head = vm_page_to_pvoh(m);
1076		pg = m;
1077		zone = moea_mpvo_zone;
1078		pvo_flags = PVO_MANAGED;
1079		was_exec = 0;
1080	}
1081	if (pmap_bootstrapped)
1082		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1083	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1084
1085	/* XXX change the pvo head for fake pages */
1086	if ((m->flags & PG_FICTITIOUS) == PG_FICTITIOUS)
1087		pvo_head = &moea_pvo_kunmanaged;
1088
1089	/*
1090	 * If this is a managed page, and it's the first reference to the page,
1091	 * clear the page's executable attribute.  Otherwise fetch it.
1092	 */
1093	if ((pg != NULL) && ((m->flags & PG_FICTITIOUS) == 0)) {
1094		if (LIST_EMPTY(pvo_head)) {
1095			moea_attr_clear(pg, PTE_EXEC);
1096		} else {
1097			was_exec = moea_attr_fetch(pg) & PTE_EXEC;
1098		}
1099	}
1100
1101	/*
1102	 * Assume the page is cache inhibited and access is guarded unless
1103	 * it's in our available memory array.
1104	 */
1105	pte_lo = PTE_I | PTE_G;
1106	for (i = 0; i < pregions_sz; i++) {
1107		if ((VM_PAGE_TO_PHYS(m) >= pregions[i].mr_start) &&
1108		    (VM_PAGE_TO_PHYS(m) <
1109			(pregions[i].mr_start + pregions[i].mr_size))) {
1110			pte_lo = PTE_M;
1111			break;
1112		}
1113	}
1114
1115	if (prot & VM_PROT_WRITE) {
1116		pte_lo |= PTE_BW;
1117		if (pmap_bootstrapped)
1118			vm_page_flag_set(m, PG_WRITEABLE);
1119	} else
1120		pte_lo |= PTE_BR;
1121
1122	if (prot & VM_PROT_EXECUTE)
1123		pvo_flags |= PVO_EXECUTABLE;
1124
1125	if (wired)
1126		pvo_flags |= PVO_WIRED;
1127
1128	if ((m->flags & PG_FICTITIOUS) != 0)
1129		pvo_flags |= PVO_FAKE;
1130
1131	error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m),
1132	    pte_lo, pvo_flags);
1133
1134	/*
1135	 * Flush the real page from the instruction cache if this page is
1136	 * mapped executable and cacheable and was not previously mapped (or
1137	 * was not mapped executable).
1138	 */
1139	if (error == 0 && (pvo_flags & PVO_EXECUTABLE) &&
1140	    (pte_lo & PTE_I) == 0 && was_exec == 0) {
1141		/*
1142		 * Flush the real memory from the cache.
1143		 */
1144		moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1145		if (pg != NULL)
1146			moea_attr_save(pg, PTE_EXEC);
1147	}
1148
1149	/* XXX syncicache always until problems are sorted */
1150	moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1151}
1152
1153/*
1154 * Maps a sequence of resident pages belonging to the same object.
1155 * The sequence begins with the given page m_start.  This page is
1156 * mapped at the given virtual address start.  Each subsequent page is
1157 * mapped at a virtual address that is offset from start by the same
1158 * amount as the page is offset from m_start within the object.  The
1159 * last page in the sequence is the page with the largest offset from
1160 * m_start that can be mapped at a virtual address less than the given
1161 * virtual address end.  Not every virtual page between start and end
1162 * is mapped; only those for which a resident page exists with the
1163 * corresponding offset from m_start are mapped.
1164 */
1165void
1166moea_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end,
1167    vm_page_t m_start, vm_prot_t prot)
1168{
1169	vm_page_t m;
1170	vm_pindex_t diff, psize;
1171
1172	psize = atop(end - start);
1173	m = m_start;
1174	PMAP_LOCK(pm);
1175	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1176		moea_enter_locked(pm, start + ptoa(diff), m, prot &
1177		    (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
1178		m = TAILQ_NEXT(m, listq);
1179	}
1180	PMAP_UNLOCK(pm);
1181}
1182
1183void
1184moea_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m,
1185    vm_prot_t prot)
1186{
1187
1188	PMAP_LOCK(pm);
1189	moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1190	    FALSE);
1191	PMAP_UNLOCK(pm);
1192
1193}
1194
1195vm_paddr_t
1196moea_extract(mmu_t mmu, pmap_t pm, vm_offset_t va)
1197{
1198	struct	pvo_entry *pvo;
1199	vm_paddr_t pa;
1200
1201	PMAP_LOCK(pm);
1202	pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
1203	if (pvo == NULL)
1204		pa = 0;
1205	else
1206		pa = (pvo->pvo_pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF);
1207	PMAP_UNLOCK(pm);
1208	return (pa);
1209}
1210
1211/*
1212 * Atomically extract and hold the physical page with the given
1213 * pmap and virtual address pair if that mapping permits the given
1214 * protection.
1215 */
1216vm_page_t
1217moea_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1218{
1219	struct	pvo_entry *pvo;
1220	vm_page_t m;
1221
1222	m = NULL;
1223	vm_page_lock_queues();
1224	PMAP_LOCK(pmap);
1225	pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
1226	if (pvo != NULL && (pvo->pvo_pte.pte_hi & PTE_VALID) &&
1227	    ((pvo->pvo_pte.pte_lo & PTE_PP) == PTE_RW ||
1228	     (prot & VM_PROT_WRITE) == 0)) {
1229		m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte_lo & PTE_RPGN);
1230		vm_page_hold(m);
1231	}
1232	vm_page_unlock_queues();
1233	PMAP_UNLOCK(pmap);
1234	return (m);
1235}
1236
1237void
1238moea_init(mmu_t mmu)
1239{
1240
1241	moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1242	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1243	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1244	moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry),
1245	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1246	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1247	moea_initialized = TRUE;
1248}
1249
1250boolean_t
1251moea_is_modified(mmu_t mmu, vm_page_t m)
1252{
1253
1254	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1255		return (FALSE);
1256
1257	return (moea_query_bit(m, PTE_CHG));
1258}
1259
1260void
1261moea_clear_reference(mmu_t mmu, vm_page_t m)
1262{
1263
1264	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1265		return;
1266	moea_clear_bit(m, PTE_REF, NULL);
1267}
1268
1269void
1270moea_clear_modify(mmu_t mmu, vm_page_t m)
1271{
1272
1273	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1274		return;
1275	moea_clear_bit(m, PTE_CHG, NULL);
1276}
1277
1278/*
1279 * Clear the write and modified bits in each of the given page's mappings.
1280 */
1281void
1282moea_remove_write(mmu_t mmu, vm_page_t m)
1283{
1284	struct	pvo_entry *pvo;
1285	struct	pte *pt;
1286	pmap_t	pmap;
1287	u_int	lo;
1288
1289	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1290	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 ||
1291	    (m->flags & PG_WRITEABLE) == 0)
1292		return;
1293	lo = moea_attr_fetch(m);
1294	powerpc_sync();
1295	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1296		pmap = pvo->pvo_pmap;
1297		PMAP_LOCK(pmap);
1298		if ((pvo->pvo_pte.pte_lo & PTE_PP) != PTE_BR) {
1299			pt = moea_pvo_to_pte(pvo, -1);
1300			pvo->pvo_pte.pte_lo &= ~PTE_PP;
1301			pvo->pvo_pte.pte_lo |= PTE_BR;
1302			if (pt != NULL) {
1303				moea_pte_synch(pt, &pvo->pvo_pte);
1304				lo |= pvo->pvo_pte.pte_lo;
1305				pvo->pvo_pte.pte_lo &= ~PTE_CHG;
1306				moea_pte_change(pt, &pvo->pvo_pte,
1307				    pvo->pvo_vaddr);
1308				mtx_unlock(&moea_table_mutex);
1309			}
1310		}
1311		PMAP_UNLOCK(pmap);
1312	}
1313	if ((lo & PTE_CHG) != 0) {
1314		moea_attr_clear(m, PTE_CHG);
1315		vm_page_dirty(m);
1316	}
1317	vm_page_flag_clear(m, PG_WRITEABLE);
1318}
1319
1320/*
1321 *	moea_ts_referenced:
1322 *
1323 *	Return a count of reference bits for a page, clearing those bits.
1324 *	It is not necessary for every reference bit to be cleared, but it
1325 *	is necessary that 0 only be returned when there are truly no
1326 *	reference bits set.
1327 *
1328 *	XXX: The exact number of bits to check and clear is a matter that
1329 *	should be tested and standardized at some point in the future for
1330 *	optimal aging of shared pages.
1331 */
1332boolean_t
1333moea_ts_referenced(mmu_t mmu, vm_page_t m)
1334{
1335	int count;
1336
1337	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
1338		return (0);
1339
1340	count = moea_clear_bit(m, PTE_REF, NULL);
1341
1342	return (count);
1343}
1344
1345/*
1346 * Map a wired page into kernel virtual address space.
1347 */
1348void
1349moea_kenter(mmu_t mmu, vm_offset_t va, vm_offset_t pa)
1350{
1351	u_int		pte_lo;
1352	int		error;
1353	int		i;
1354
1355#if 0
1356	if (va < VM_MIN_KERNEL_ADDRESS)
1357		panic("moea_kenter: attempt to enter non-kernel address %#x",
1358		    va);
1359#endif
1360
1361	pte_lo = PTE_I | PTE_G;
1362	for (i = 0; i < pregions_sz; i++) {
1363		if ((pa >= pregions[i].mr_start) &&
1364		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
1365			pte_lo = PTE_M;
1366			break;
1367		}
1368	}
1369
1370	PMAP_LOCK(kernel_pmap);
1371	error = moea_pvo_enter(kernel_pmap, moea_upvo_zone,
1372	    &moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED);
1373
1374	if (error != 0 && error != ENOENT)
1375		panic("moea_kenter: failed to enter va %#x pa %#x: %d", va,
1376		    pa, error);
1377
1378	/*
1379	 * Flush the real memory from the instruction cache.
1380	 */
1381	if ((pte_lo & (PTE_I | PTE_G)) == 0) {
1382		moea_syncicache(pa, PAGE_SIZE);
1383	}
1384	PMAP_UNLOCK(kernel_pmap);
1385}
1386
1387/*
1388 * Extract the physical page address associated with the given kernel virtual
1389 * address.
1390 */
1391vm_offset_t
1392moea_kextract(mmu_t mmu, vm_offset_t va)
1393{
1394	struct		pvo_entry *pvo;
1395	vm_paddr_t pa;
1396
1397#ifdef UMA_MD_SMALL_ALLOC
1398	/*
1399	 * Allow direct mappings
1400	 */
1401	if (va < VM_MIN_KERNEL_ADDRESS) {
1402		return (va);
1403	}
1404#endif
1405
1406	PMAP_LOCK(kernel_pmap);
1407	pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
1408	KASSERT(pvo != NULL, ("moea_kextract: no addr found"));
1409	pa = (pvo->pvo_pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF);
1410	PMAP_UNLOCK(kernel_pmap);
1411	return (pa);
1412}
1413
1414/*
1415 * Remove a wired page from kernel virtual address space.
1416 */
1417void
1418moea_kremove(mmu_t mmu, vm_offset_t va)
1419{
1420
1421	moea_remove(mmu, kernel_pmap, va, va + PAGE_SIZE);
1422}
1423
1424/*
1425 * Map a range of physical addresses into kernel virtual address space.
1426 *
1427 * The value passed in *virt is a suggested virtual address for the mapping.
1428 * Architectures which can support a direct-mapped physical to virtual region
1429 * can return the appropriate address within that region, leaving '*virt'
1430 * unchanged.  We cannot and therefore do not; *virt is updated with the
1431 * first usable address after the mapped region.
1432 */
1433vm_offset_t
1434moea_map(mmu_t mmu, vm_offset_t *virt, vm_offset_t pa_start,
1435    vm_offset_t pa_end, int prot)
1436{
1437	vm_offset_t	sva, va;
1438
1439	sva = *virt;
1440	va = sva;
1441	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
1442		moea_kenter(mmu, va, pa_start);
1443	*virt = va;
1444	return (sva);
1445}
1446
1447/*
1448 * Returns true if the pmap's pv is one of the first
1449 * 16 pvs linked to from this page.  This count may
1450 * be changed upwards or downwards in the future; it
1451 * is only necessary that true be returned for a small
1452 * subset of pmaps for proper page aging.
1453 */
1454boolean_t
1455moea_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
1456{
1457        int loops;
1458	struct pvo_entry *pvo;
1459
1460        if (!moea_initialized || (m->flags & PG_FICTITIOUS))
1461                return (FALSE);
1462
1463	loops = 0;
1464	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1465		if (pvo->pvo_pmap == pmap)
1466			return (TRUE);
1467		if (++loops >= 16)
1468			break;
1469	}
1470
1471	return (FALSE);
1472}
1473
1474/*
1475 * Return the number of managed mappings to the given physical page
1476 * that are wired.
1477 */
1478int
1479moea_page_wired_mappings(mmu_t mmu, vm_page_t m)
1480{
1481	struct pvo_entry *pvo;
1482	int count;
1483
1484	count = 0;
1485	if (!moea_initialized || (m->flags & PG_FICTITIOUS) != 0)
1486		return (count);
1487	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1488	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
1489		if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1490			count++;
1491	return (count);
1492}
1493
1494static u_int	moea_vsidcontext;
1495
1496void
1497moea_pinit(mmu_t mmu, pmap_t pmap)
1498{
1499	int	i, mask;
1500	u_int	entropy;
1501
1502	KASSERT((int)pmap < VM_MIN_KERNEL_ADDRESS, ("moea_pinit: virt pmap"));
1503	PMAP_LOCK_INIT(pmap);
1504
1505	entropy = 0;
1506	__asm __volatile("mftb %0" : "=r"(entropy));
1507
1508	/*
1509	 * Allocate some segment registers for this pmap.
1510	 */
1511	for (i = 0; i < NPMAPS; i += VSID_NBPW) {
1512		u_int	hash, n;
1513
1514		/*
1515		 * Create a new value by multiplying by a prime and adding in
1516		 * entropy from the timebase register.  This is to make the
1517		 * VSID more random so that the PT hash function collides
1518		 * less often.  (Note that the prime causes gcc to do shifts
1519		 * instead of a multiply.)
1520		 */
1521		moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy;
1522		hash = moea_vsidcontext & (NPMAPS - 1);
1523		if (hash == 0)		/* 0 is special, avoid it */
1524			continue;
1525		n = hash >> 5;
1526		mask = 1 << (hash & (VSID_NBPW - 1));
1527		hash = (moea_vsidcontext & 0xfffff);
1528		if (moea_vsid_bitmap[n] & mask) {	/* collision? */
1529			/* anything free in this bucket? */
1530			if (moea_vsid_bitmap[n] == 0xffffffff) {
1531				entropy = (moea_vsidcontext >> 20);
1532				continue;
1533			}
1534			i = ffs(~moea_vsid_bitmap[n]) - 1;
1535			mask = 1 << i;
1536			hash &= 0xfffff & ~(VSID_NBPW - 1);
1537			hash |= i;
1538		}
1539		moea_vsid_bitmap[n] |= mask;
1540		for (i = 0; i < 16; i++)
1541			pmap->pm_sr[i] = VSID_MAKE(i, hash);
1542		return;
1543	}
1544
1545	panic("moea_pinit: out of segments");
1546}
1547
1548/*
1549 * Initialize the pmap associated with process 0.
1550 */
1551void
1552moea_pinit0(mmu_t mmu, pmap_t pm)
1553{
1554
1555	moea_pinit(mmu, pm);
1556	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
1557}
1558
1559/*
1560 * Set the physical protection on the specified range of this map as requested.
1561 */
1562void
1563moea_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva,
1564    vm_prot_t prot)
1565{
1566	struct	pvo_entry *pvo;
1567	struct	pte *pt;
1568	int	pteidx;
1569
1570	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
1571	    ("moea_protect: non current pmap"));
1572
1573	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1574		moea_remove(mmu, pm, sva, eva);
1575		return;
1576	}
1577
1578	vm_page_lock_queues();
1579	PMAP_LOCK(pm);
1580	for (; sva < eva; sva += PAGE_SIZE) {
1581		pvo = moea_pvo_find_va(pm, sva, &pteidx);
1582		if (pvo == NULL)
1583			continue;
1584
1585		if ((prot & VM_PROT_EXECUTE) == 0)
1586			pvo->pvo_vaddr &= ~PVO_EXECUTABLE;
1587
1588		/*
1589		 * Grab the PTE pointer before we diddle with the cached PTE
1590		 * copy.
1591		 */
1592		pt = moea_pvo_to_pte(pvo, pteidx);
1593		/*
1594		 * Change the protection of the page.
1595		 */
1596		pvo->pvo_pte.pte_lo &= ~PTE_PP;
1597		pvo->pvo_pte.pte_lo |= PTE_BR;
1598
1599		/*
1600		 * If the PVO is in the page table, update that pte as well.
1601		 */
1602		if (pt != NULL) {
1603			moea_pte_change(pt, &pvo->pvo_pte, pvo->pvo_vaddr);
1604			mtx_unlock(&moea_table_mutex);
1605		}
1606	}
1607	vm_page_unlock_queues();
1608	PMAP_UNLOCK(pm);
1609}
1610
1611/*
1612 * Map a list of wired pages into kernel virtual address space.  This is
1613 * intended for temporary mappings which do not need page modification or
1614 * references recorded.  Existing mappings in the region are overwritten.
1615 */
1616void
1617moea_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count)
1618{
1619	vm_offset_t va;
1620
1621	va = sva;
1622	while (count-- > 0) {
1623		moea_kenter(mmu, va, VM_PAGE_TO_PHYS(*m));
1624		va += PAGE_SIZE;
1625		m++;
1626	}
1627}
1628
1629/*
1630 * Remove page mappings from kernel virtual address space.  Intended for
1631 * temporary mappings entered by moea_qenter.
1632 */
1633void
1634moea_qremove(mmu_t mmu, vm_offset_t sva, int count)
1635{
1636	vm_offset_t va;
1637
1638	va = sva;
1639	while (count-- > 0) {
1640		moea_kremove(mmu, va);
1641		va += PAGE_SIZE;
1642	}
1643}
1644
1645void
1646moea_release(mmu_t mmu, pmap_t pmap)
1647{
1648        int idx, mask;
1649
1650	/*
1651	 * Free segment register's VSID
1652	 */
1653        if (pmap->pm_sr[0] == 0)
1654                panic("moea_release");
1655
1656        idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1);
1657        mask = 1 << (idx % VSID_NBPW);
1658        idx /= VSID_NBPW;
1659        moea_vsid_bitmap[idx] &= ~mask;
1660	PMAP_LOCK_DESTROY(pmap);
1661}
1662
1663/*
1664 * Remove the given range of addresses from the specified map.
1665 */
1666void
1667moea_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1668{
1669	struct	pvo_entry *pvo;
1670	int	pteidx;
1671
1672	vm_page_lock_queues();
1673	PMAP_LOCK(pm);
1674	for (; sva < eva; sva += PAGE_SIZE) {
1675		pvo = moea_pvo_find_va(pm, sva, &pteidx);
1676		if (pvo != NULL) {
1677			moea_pvo_remove(pvo, pteidx);
1678		}
1679	}
1680	PMAP_UNLOCK(pm);
1681	vm_page_unlock_queues();
1682}
1683
1684/*
1685 * Remove physical page from all pmaps in which it resides. moea_pvo_remove()
1686 * will reflect changes in pte's back to the vm_page.
1687 */
1688void
1689moea_remove_all(mmu_t mmu, vm_page_t m)
1690{
1691	struct  pvo_head *pvo_head;
1692	struct	pvo_entry *pvo, *next_pvo;
1693	pmap_t	pmap;
1694
1695	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1696
1697	pvo_head = vm_page_to_pvoh(m);
1698	for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) {
1699		next_pvo = LIST_NEXT(pvo, pvo_vlink);
1700
1701		MOEA_PVO_CHECK(pvo);	/* sanity check */
1702		pmap = pvo->pvo_pmap;
1703		PMAP_LOCK(pmap);
1704		moea_pvo_remove(pvo, -1);
1705		PMAP_UNLOCK(pmap);
1706	}
1707	vm_page_flag_clear(m, PG_WRITEABLE);
1708}
1709
1710/*
1711 * Allocate a physical page of memory directly from the phys_avail map.
1712 * Can only be called from moea_bootstrap before avail start and end are
1713 * calculated.
1714 */
1715static vm_offset_t
1716moea_bootstrap_alloc(vm_size_t size, u_int align)
1717{
1718	vm_offset_t	s, e;
1719	int		i, j;
1720
1721	size = round_page(size);
1722	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1723		if (align != 0)
1724			s = (phys_avail[i] + align - 1) & ~(align - 1);
1725		else
1726			s = phys_avail[i];
1727		e = s + size;
1728
1729		if (s < phys_avail[i] || e > phys_avail[i + 1])
1730			continue;
1731
1732		if (s == phys_avail[i]) {
1733			phys_avail[i] += size;
1734		} else if (e == phys_avail[i + 1]) {
1735			phys_avail[i + 1] -= size;
1736		} else {
1737			for (j = phys_avail_count * 2; j > i; j -= 2) {
1738				phys_avail[j] = phys_avail[j - 2];
1739				phys_avail[j + 1] = phys_avail[j - 1];
1740			}
1741
1742			phys_avail[i + 3] = phys_avail[i + 1];
1743			phys_avail[i + 1] = s;
1744			phys_avail[i + 2] = e;
1745			phys_avail_count++;
1746		}
1747
1748		return (s);
1749	}
1750	panic("moea_bootstrap_alloc: could not allocate memory");
1751}
1752
1753static void
1754moea_syncicache(vm_offset_t pa, vm_size_t len)
1755{
1756	__syncicache((void *)pa, len);
1757}
1758
1759static int
1760moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head,
1761    vm_offset_t va, vm_offset_t pa, u_int pte_lo, int flags)
1762{
1763	struct	pvo_entry *pvo;
1764	u_int	sr;
1765	int	first;
1766	u_int	ptegidx;
1767	int	i;
1768	int     bootstrap;
1769
1770	moea_pvo_enter_calls++;
1771	first = 0;
1772	bootstrap = 0;
1773
1774	/*
1775	 * Compute the PTE Group index.
1776	 */
1777	va &= ~ADDR_POFF;
1778	sr = va_to_sr(pm->pm_sr, va);
1779	ptegidx = va_to_pteg(sr, va);
1780
1781	/*
1782	 * Remove any existing mapping for this page.  Reuse the pvo entry if
1783	 * there is a mapping.
1784	 */
1785	mtx_lock(&moea_table_mutex);
1786	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
1787		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
1788			if ((pvo->pvo_pte.pte_lo & PTE_RPGN) == pa &&
1789			    (pvo->pvo_pte.pte_lo & PTE_PP) ==
1790			    (pte_lo & PTE_PP)) {
1791				mtx_unlock(&moea_table_mutex);
1792				return (0);
1793			}
1794			moea_pvo_remove(pvo, -1);
1795			break;
1796		}
1797	}
1798
1799	/*
1800	 * If we aren't overwriting a mapping, try to allocate.
1801	 */
1802	if (moea_initialized) {
1803		pvo = uma_zalloc(zone, M_NOWAIT);
1804	} else {
1805		if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) {
1806			panic("moea_enter: bpvo pool exhausted, %d, %d, %d",
1807			      moea_bpvo_pool_index, BPVO_POOL_SIZE,
1808			      BPVO_POOL_SIZE * sizeof(struct pvo_entry));
1809		}
1810		pvo = &moea_bpvo_pool[moea_bpvo_pool_index];
1811		moea_bpvo_pool_index++;
1812		bootstrap = 1;
1813	}
1814
1815	if (pvo == NULL) {
1816		mtx_unlock(&moea_table_mutex);
1817		return (ENOMEM);
1818	}
1819
1820	moea_pvo_entries++;
1821	pvo->pvo_vaddr = va;
1822	pvo->pvo_pmap = pm;
1823	LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink);
1824	pvo->pvo_vaddr &= ~ADDR_POFF;
1825	if (flags & VM_PROT_EXECUTE)
1826		pvo->pvo_vaddr |= PVO_EXECUTABLE;
1827	if (flags & PVO_WIRED)
1828		pvo->pvo_vaddr |= PVO_WIRED;
1829	if (pvo_head != &moea_pvo_kunmanaged)
1830		pvo->pvo_vaddr |= PVO_MANAGED;
1831	if (bootstrap)
1832		pvo->pvo_vaddr |= PVO_BOOTSTRAP;
1833	if (flags & PVO_FAKE)
1834		pvo->pvo_vaddr |= PVO_FAKE;
1835
1836	moea_pte_create(&pvo->pvo_pte, sr, va, pa | pte_lo);
1837
1838	/*
1839	 * Remember if the list was empty and therefore will be the first
1840	 * item.
1841	 */
1842	if (LIST_FIRST(pvo_head) == NULL)
1843		first = 1;
1844	LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
1845
1846	if (pvo->pvo_vaddr & PVO_WIRED)
1847		pm->pm_stats.wired_count++;
1848	pm->pm_stats.resident_count++;
1849
1850	/*
1851	 * We hope this succeeds but it isn't required.
1852	 */
1853	i = moea_pte_insert(ptegidx, &pvo->pvo_pte);
1854	if (i >= 0) {
1855		PVO_PTEGIDX_SET(pvo, i);
1856	} else {
1857		panic("moea_pvo_enter: overflow");
1858		moea_pte_overflow++;
1859	}
1860	mtx_unlock(&moea_table_mutex);
1861
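	/*
	 * Added note (not part of the original source): returning ENOENT
	 * here reports that this was the first PVO placed on the page's
	 * list rather than a failure; callers such as moea_kenter() above
	 * treat ENOENT as success.
	 */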
1862	return (first ? ENOENT : 0);
1863}
1864
1865static void
1866moea_pvo_remove(struct pvo_entry *pvo, int pteidx)
1867{
1868	struct	pte *pt;
1869
1870	/*
1871	 * If there is an active pte entry, we need to deactivate it (and
1872	 * save the ref & chg bits).
1873	 */
1874	pt = moea_pvo_to_pte(pvo, pteidx);
1875	if (pt != NULL) {
1876		moea_pte_unset(pt, &pvo->pvo_pte, pvo->pvo_vaddr);
1877		mtx_unlock(&moea_table_mutex);
1878		PVO_PTEGIDX_CLR(pvo);
1879	} else {
1880		moea_pte_overflow--;
1881	}
1882
1883	/*
1884	 * Update our statistics.
1885	 */
1886	pvo->pvo_pmap->pm_stats.resident_count--;
1887	if (pvo->pvo_vaddr & PVO_WIRED)
1888		pvo->pvo_pmap->pm_stats.wired_count--;
1889
1890	/*
1891	 * Save the REF/CHG bits into their cache if the page is managed.
1892	 */
1893	if ((pvo->pvo_vaddr & (PVO_MANAGED|PVO_FAKE)) == PVO_MANAGED) {
1894		struct	vm_page *pg;
1895
1896		pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte_lo & PTE_RPGN);
1897		if (pg != NULL) {
1898			moea_attr_save(pg, pvo->pvo_pte.pte_lo &
1899			    (PTE_REF | PTE_CHG));
1900		}
1901	}
1902
1903	/*
1904	 * Remove this PVO from the PV list.
1905	 */
1906	LIST_REMOVE(pvo, pvo_vlink);
1907
1908	/*
1909	 * Remove this from the overflow list and return it to the pool
1910	 * if we aren't going to reuse it.
1911	 */
1912	LIST_REMOVE(pvo, pvo_olink);
1913	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
1914		uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? moea_mpvo_zone :
1915		    moea_upvo_zone, pvo);
1916	moea_pvo_entries--;
1917	moea_pvo_remove_calls++;
1918}
1919
1920static __inline int
1921moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx)
1922{
1923	int	pteidx;
1924
1925	/*
1926	 * We can find the actual pte entry without searching by grabbing
1927	 * the PTEG index from 3 unused bits in pte_lo[11:9] and by
1928	 * noticing the HID bit.
1929	 */
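	/*
	 * For example, slot 3 of PTEG 5 gives pteidx 5 * 8 + 3 = 43; a
	 * mapping that uses the secondary hash (PTE_HID) has the PTEG
	 * portion of its index flipped with moea_pteg_mask so that it
	 * refers to the secondary group.
	 */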
1930	pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo);
1931	if (pvo->pvo_pte.pte_hi & PTE_HID)
1932		pteidx ^= moea_pteg_mask * 8;
1933
1934	return (pteidx);
1935}
1936
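/*
 * Find the pvo entry for va in pmap pm, optionally returning the index of
 * its page table slot via pteidx_p.
 */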
1937static struct pvo_entry *
1938moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p)
1939{
1940	struct	pvo_entry *pvo;
1941	int	ptegidx;
1942	u_int	sr;
1943
1944	va &= ~ADDR_POFF;
1945	sr = va_to_sr(pm->pm_sr, va);
1946	ptegidx = va_to_pteg(sr, va);
1947
1948	mtx_lock(&moea_table_mutex);
1949	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
1950		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
1951			if (pteidx_p)
1952				*pteidx_p = moea_pvo_pte_index(pvo, ptegidx);
1953			break;
1954		}
1955	}
1956	mtx_unlock(&moea_table_mutex);
1957
1958	return (pvo);
1959}
1960
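/*
 * Return a pointer to the hardware PTE backing a pvo, or NULL if none is
 * valid.  On a non-NULL return, moea_table_mutex is held when this
 * function returns; on a NULL return it has been released.  The cross
 * checks panic if the pvo and the page table disagree.
 */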
1961static struct pte *
1962moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx)
1963{
1964	struct	pte *pt;
1965
1966	/*
1967	 * If we haven't been supplied the ptegidx, calculate it.
1968	 */
1969	if (pteidx == -1) {
1970		int	ptegidx;
1971		u_int	sr;
1972
1973		sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr);
1974		ptegidx = va_to_pteg(sr, pvo->pvo_vaddr);
1975		pteidx = moea_pvo_pte_index(pvo, ptegidx);
1976	}
1977
1978	pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7];
1979	mtx_lock(&moea_table_mutex);
1980
1981	if ((pvo->pvo_pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) {
1982		panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no "
1983		    "valid pte index", pvo);
1984	}
1985
1986	if ((pvo->pvo_pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) {
1987		panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo "
1988		    "but no valid pte", pvo);
1989	}
1990
1991	if ((pt->pte_hi ^ (pvo->pvo_pte.pte_hi & ~PTE_VALID)) == PTE_VALID) {
1992		if ((pvo->pvo_pte.pte_hi & PTE_VALID) == 0) {
1993			panic("moea_pvo_to_pte: pvo %p has valid pte in "
1994			    "moea_pteg_table %p but invalid in pvo", pvo, pt);
1995		}
1996
1997		if (((pt->pte_lo ^ pvo->pvo_pte.pte_lo) & ~(PTE_CHG|PTE_REF))
1998		    != 0) {
1999			panic("moea_pvo_to_pte: pvo %p pte does not match "
2000			    "pte %p in moea_pteg_table", pvo, pt);
2001		}
2002
2003		mtx_assert(&moea_table_mutex, MA_OWNED);
2004		return (pt);
2005	}
2006
2007	if (pvo->pvo_pte.pte_hi & PTE_VALID) {
2008		panic("moea_pvo_to_pte: pvo %p has invalid pte %p in "
2009		    "moea_pteg_table but valid in pvo", pvo, pt);
2010	}
2011
2012	mtx_unlock(&moea_table_mutex);
2013	return (NULL);
2014}
2015
2016/*
2017 * XXX: THIS STUFF SHOULD BE IN pte.c?
2018 */
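/*
 * Handle a PTEG overflow for addr (presumably reached from the fault path
 * when no resident PTE matches): pick a random slot in the primary PTEG
 * using the low bits of the timebase, evict the victim PTE while saving
 * its REF/CHG bits in its pvo, and install the PTE for addr from its pvo.
 * Returns 1 if a mapping was installed, 0 if no pvo exists for addr.
 */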
2019int
2020moea_pte_spill(vm_offset_t addr)
2021{
2022	struct	pvo_entry *source_pvo, *victim_pvo;
2023	struct	pvo_entry *pvo;
2024	int	ptegidx, i, j;
2025	u_int	sr;
2026	struct	pteg *pteg;
2027	struct	pte *pt;
2028
2029	moea_pte_spills++;
2030
2031	sr = mfsrin(addr);
2032	ptegidx = va_to_pteg(sr, addr);
2033
2034	/*
2035	 * Have to substitute some entry.  Use the primary hash for this.
2036	 * Use low bits of timebase as random generator.
2037	 */
2038	pteg = &moea_pteg_table[ptegidx];
2039	mtx_lock(&moea_table_mutex);
2040	__asm __volatile("mftb %0" : "=r"(i));
2041	i &= 7;
2042	pt = &pteg->pt[i];
2043
2044	source_pvo = NULL;
2045	victim_pvo = NULL;
2046	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
2047		/*
2048		 * We need to find a pvo entry for this address.
2049		 */
2050		MOEA_PVO_CHECK(pvo);
2051		if (source_pvo == NULL &&
2052		    moea_pte_match(&pvo->pvo_pte, sr, addr,
2053		    pvo->pvo_pte.pte_hi & PTE_HID)) {
2054			/*
2055			 * We found an entry to be spilled into the pteg.
2056			 * The PTE is now valid, so we know it's active.
2057			 */
2058			j = moea_pte_insert(ptegidx, &pvo->pvo_pte);
2059
2060			if (j >= 0) {
2061				PVO_PTEGIDX_SET(pvo, j);
2062				moea_pte_overflow--;
2063				MOEA_PVO_CHECK(pvo);
2064				mtx_unlock(&moea_table_mutex);
2065				return (1);
2066			}
2067
2068			source_pvo = pvo;
2069
2070			if (victim_pvo != NULL)
2071				break;
2072		}
2073
2074		/*
2075		 * We also need the pvo entry of the victim we are replacing
2076		 * so save the R & C bits of the PTE.
2077		 */
2078		if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL &&
2079		    moea_pte_compare(pt, &pvo->pvo_pte)) {
2080			victim_pvo = pvo;
2081			if (source_pvo != NULL)
2082				break;
2083		}
2084	}
2085
2086	if (source_pvo == NULL) {
2087		mtx_unlock(&moea_table_mutex);
2088		return (0);
2089	}
2090
2091	if (victim_pvo == NULL) {
2092		if ((pt->pte_hi & PTE_HID) == 0)
2093			panic("moea_pte_spill: victim p-pte (%p) has no pvo "
2094			    "entry", pt);
2095
2096		/*
2097		 * If this is a secondary PTE, we need to search its primary
2098		 * pvo bucket for the matching PVO.
2099		 */
2100		LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask],
2101		    pvo_olink) {
2102			MOEA_PVO_CHECK(pvo);
2103			/*
2104			 * We also need the pvo entry of the victim we are
2105			 * replacing so save the R & C bits of the PTE.
2106			 */
2107			if (moea_pte_compare(pt, &pvo->pvo_pte)) {
2108				victim_pvo = pvo;
2109				break;
2110			}
2111		}
2112
2113		if (victim_pvo == NULL)
2114			panic("moea_pte_spill: victim s-pte (%p) has no pvo "
2115			    "entry", pt);
2116	}
2117
2118	/*
2119	 * We are invalidating the TLB entry for the EA we are replacing even
2120	 * though it's valid.  If we don't, we lose any ref/chg bit changes
2121	 * contained in the TLB entry.
2122	 */
2123	source_pvo->pvo_pte.pte_hi &= ~PTE_HID;
2124
2125	moea_pte_unset(pt, &victim_pvo->pvo_pte, victim_pvo->pvo_vaddr);
2126	moea_pte_set(pt, &source_pvo->pvo_pte);
2127
2128	PVO_PTEGIDX_CLR(victim_pvo);
2129	PVO_PTEGIDX_SET(source_pvo, i);
2130	moea_pte_replacements++;
2131
2132	MOEA_PVO_CHECK(victim_pvo);
2133	MOEA_PVO_CHECK(source_pvo);
2134
2135	mtx_unlock(&moea_table_mutex);
2136	return (1);
2137}
2138
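/*
 * Insert pvo_pt into the page table, trying the primary PTEG for ptegidx
 * first and then the secondary PTEG (with PTE_HID set).  Returns the slot
 * index used; panics if both PTEGs are full.
 */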
2139static int
2140moea_pte_insert(u_int ptegidx, struct pte *pvo_pt)
2141{
2142	struct	pte *pt;
2143	int	i;
2144
2145	mtx_assert(&moea_table_mutex, MA_OWNED);
2146
2147	/*
2148	 * First try primary hash.
2149	 */
2150	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2151		if ((pt->pte_hi & PTE_VALID) == 0) {
2152			pvo_pt->pte_hi &= ~PTE_HID;
2153			moea_pte_set(pt, pvo_pt);
2154			return (i);
2155		}
2156	}
2157
2158	/*
2159	 * Now try secondary hash.
2160	 */
2161	ptegidx ^= moea_pteg_mask;
2162
2163	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
2164		if ((pt->pte_hi & PTE_VALID) == 0) {
2165			pvo_pt->pte_hi |= PTE_HID;
2166			moea_pte_set(pt, pvo_pt);
2167			return (i);
2168		}
2169	}
2170
2171	panic("moea_pte_insert: overflow");
2172	return (-1);
2173}
2174
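/*
 * Check whether any mapping of page m has ptebit (a REF/CHG bit) set,
 * consulting the per-pvo cached copies first and then the hardware PTEs.
 * The bit is cached on the page when it is found set.
 */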
2175static boolean_t
2176moea_query_bit(vm_page_t m, int ptebit)
2177{
2178	struct	pvo_entry *pvo;
2179	struct	pte *pt;
2180
2181#if 0
2182	if (moea_attr_fetch(m) & ptebit)
2183		return (TRUE);
2184#endif
2185
2186	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2187		MOEA_PVO_CHECK(pvo);	/* sanity check */
2188
2189		/*
2190		 * See if we saved the bit off.  If so, cache it and return
2191		 * success.
2192		 */
2193		if (pvo->pvo_pte.pte_lo & ptebit) {
2194			moea_attr_save(m, ptebit);
2195			MOEA_PVO_CHECK(pvo);	/* sanity check */
2196			return (TRUE);
2197		}
2198	}
2199
2200	/*
2201	 * No luck, now go through the hard part of looking at the PTEs
2202	 * themselves.  Sync so that any pending REF/CHG bits are flushed to
2203	 * the PTEs.
2204	 */
2205	powerpc_sync();
2206	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2207		MOEA_PVO_CHECK(pvo);	/* sanity check */
2208
2209		/*
2210		 * See if this pvo has a valid PTE.  If so, fetch the
2211		 * REF/CHG bits from the valid PTE.  If the appropriate
2212		 * ptebit is set, cache it and return success.
2213		 */
2214		pt = moea_pvo_to_pte(pvo, -1);
2215		if (pt != NULL) {
2216			moea_pte_synch(pt, &pvo->pvo_pte);
2217			mtx_unlock(&moea_table_mutex);
2218			if (pvo->pvo_pte.pte_lo & ptebit) {
2219				moea_attr_save(m, ptebit);
2220				MOEA_PVO_CHECK(pvo);	/* sanity check */
2221				return (TRUE);
2222			}
2223		}
2224	}
2225
2226	return (FALSE);
2227}
2228
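/*
 * Clear ptebit (a REF/CHG bit) in every mapping of page m and in the
 * page's cached attributes, returning the number of mappings in which it
 * was found set.  The combined pre-clear bits are reported through
 * origbit when it is non-NULL.
 */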
2229static u_int
2230moea_clear_bit(vm_page_t m, int ptebit, int *origbit)
2231{
2232	u_int	count;
2233	struct	pvo_entry *pvo;
2234	struct	pte *pt;
2235	int	rv;
2236
2237	/*
2238	 * Clear the cached value.
2239	 */
2240	rv = moea_attr_fetch(m);
2241	moea_attr_clear(m, ptebit);
2242
2243	/*
2244	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2245	 * we can reset the right ones).  Note that since the pvo entries and
2246	 * list heads are accessed via BAT0 and are never placed in the page
2247	 * table, we don't have to worry about further accesses setting the
2248	 * REF/CHG bits.
2249	 */
2250	powerpc_sync();
2251
2252	/*
2253	 * For each pvo entry, clear the pvo's ptebit.  If this pvo has a
2254	 * valid pte clear the ptebit from the valid pte.
2255	 */
2256	count = 0;
2257	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2258		MOEA_PVO_CHECK(pvo);	/* sanity check */
2259		pt = moea_pvo_to_pte(pvo, -1);
2260		if (pt != NULL) {
2261			moea_pte_synch(pt, &pvo->pvo_pte);
2262			if (pvo->pvo_pte.pte_lo & ptebit) {
2263				count++;
2264				moea_pte_clear(pt, PVO_VADDR(pvo), ptebit);
2265			}
2266			mtx_unlock(&moea_table_mutex);
2267		}
2268		rv |= pvo->pvo_pte.pte_lo;
2269		pvo->pvo_pte.pte_lo &= ~ptebit;
2270		MOEA_PVO_CHECK(pvo);	/* sanity check */
2271	}
2272
2273	if (origbit != NULL) {
2274		*origbit = rv;
2275	}
2276
2277	return (count);
2278}
2279
2280/*
2281 * Return 0 if the physical range is covered by battable[idx], else an errno.
2282 */
2283static int
2284moea_bat_mapped(int idx, vm_offset_t pa, vm_size_t size)
2285{
2286	u_int prot;
2287	u_int32_t start;
2288	u_int32_t end;
2289	u_int32_t bat_ble;
2290
2291	/*
2292	 * Return immediately if not a valid mapping
2293	 */
2294	if (!(battable[idx].batu & BAT_Vs))
2295		return (EINVAL);
2296
2297	/*
2298	 * The BAT entry must be cache-inhibited, guarded, and r/w
2299	 * so it can function as an i/o page
2300	 */
2301	prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW);
2302	if (prot != (BAT_I|BAT_G|BAT_PP_RW))
2303		return (EPERM);
2304
2305	/*
2306	 * The address should be within the BAT range. Assume that the
2307	 * start address in the BAT has the correct alignment (thus
2308	 * not requiring masking)
2309	 */
2310	start = battable[idx].batl & BAT_PBS;
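	/*
	 * The BAT block-length field counts 128KB units; assuming the
	 * standard OEA BAT layout, a 256MB BAT (BL field all ones) makes
	 * the computation below yield end = start | 0x0fffffff.
	 */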
2311	bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03;
2312	end = start | (bat_ble << 15) | 0x7fff;
2313
2314	if ((pa < start) || ((pa + size) > end))
2315		return (ERANGE);
2316
2317	return (0);
2318}
2319
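/*
 * Check whether the physical range [pa, pa + size) is covered 1:1 by one
 * of the 16 BAT entries.  Despite the boolean_t return type, this returns
 * 0 when the range is direct-mapped and EFAULT when it is not.
 */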
2320boolean_t
2321moea_dev_direct_mapped(mmu_t mmu, vm_offset_t pa, vm_size_t size)
2322{
2323	int i;
2324
2325	/*
2326	 * This currently does not work for entries that
2327	 * overlap 256M BAT segments.
2328	 */
2329
2330	for (i = 0; i < 16; i++)
2331		if (moea_bat_mapped(i, pa, size) == 0)
2332			return (0);
2333
2334	return (EFAULT);
2335}
2336
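/*
 * Report whether the page has been marked executable, based on its cached
 * attribute bits (PTE_EXEC).
 */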
2337boolean_t
2338moea_page_executable(mmu_t mmu, vm_page_t pg)
2339{
2340	return ((moea_attr_fetch(pg) & PTE_EXEC) == PTE_EXEC);
2341}
2342
2343/*
2344 * Map a set of physical memory pages into the kernel virtual
2345 * address space. Return a pointer to where it is mapped. This
2346 * routine is intended to be used for mapping device memory,
2347 * NOT real memory.
2348 */
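/*
 * A minimal usage sketch (hedged; the physical address below is made up,
 * and drivers normally reach this through the machine-independent
 * pmap_mapdev()/pmap_unmapdev() interface rather than calling the moea
 * routines directly):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(0xf4000000, PAGE_SIZE);
 *	... access the device registers through regs ...
 *	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 */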
2349void *
2350moea_mapdev(mmu_t mmu, vm_offset_t pa, vm_size_t size)
2351{
2352	vm_offset_t va, tmpva, ppa, offset;
2353	int i;
2354
2355	ppa = trunc_page(pa);
2356	offset = pa & PAGE_MASK;
2357	size = roundup(offset + size, PAGE_SIZE);
2358
2359	GIANT_REQUIRED;
2360
2361	/*
2362	 * If the physical address lies within a valid BAT table entry,
2363	 * return the 1:1 mapping. This currently doesn't work
2364	 * for regions that overlap 256M BAT segments.
2365	 */
2366	for (i = 0; i < 16; i++) {
2367		if (moea_bat_mapped(i, pa, size) == 0)
2368			return ((void *) pa);
2369	}
2370
2371	va = kmem_alloc_nofault(kernel_map, size);
2372	if (!va)
2373		panic("moea_mapdev: Couldn't alloc kernel virtual memory");
2374
2375	for (tmpva = va; size > 0;) {
2376		moea_kenter(mmu, tmpva, ppa);
2377		tlbie(tmpva);
2378		size -= PAGE_SIZE;
2379		tmpva += PAGE_SIZE;
2380		ppa += PAGE_SIZE;
2381	}
2382
2383	return ((void *)(va + offset));
2384}
2385
2386void
2387moea_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
2388{
2389	vm_offset_t base, offset;
2390
2391	/*
2392	 * If this is outside kernel virtual space, then it's a
2393	 * battable entry and doesn't require unmapping
2394	 */
2395	if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)) {
2396		base = trunc_page(va);
2397		offset = va & PAGE_MASK;
2398		size = roundup(offset + size, PAGE_SIZE);
2399		kmem_free(kernel_map, base, size);
2400	}
2401}
2402