1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski <raj@semihalf.com>
5 * Copyright (C) 2006 Semihalf, Marian Balakowicz <m8@semihalf.com>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
20 * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
 * Some hw specific parts of this pmap were derived from or influenced
29 * by NetBSD's ibm4xx pmap module. More generic code is shared with
30 * a few other pmap modules from the FreeBSD tree.
31 */
32
33 /*
34  * VM layout notes:
35  *
36  * Kernel and user threads run within one common virtual address space
37  * defined by AS=0.
38  *
39  * 32-bit pmap:
40  * Virtual address space layout:
41  * -----------------------------
42  * 0x0000_0000 - 0x7fff_ffff	: user process
43  * 0x8000_0000 - 0xbfff_ffff	: pmap_mapdev()-ed area (PCI/PCIE etc.)
44  * 0xc000_0000 - 0xc0ff_ffff	: kernel reserved
45  *   0xc000_0000 - data_end	: kernel code+data, env, metadata etc.
46  * 0xc100_0000 - 0xffff_ffff	: KVA
47  *   0xc100_0000 - 0xc100_3fff : reserved for page zero/copy
48  *   0xc100_4000 - 0xc200_3fff : reserved for ptbl bufs
49  *   0xc200_4000 - 0xc200_8fff : guard page + kstack0
50  *   0xc200_9000 - 0xfeef_ffff	: actual free KVA space
51  *
52  * 64-bit pmap:
53  * Virtual address space layout:
54  * -----------------------------
55  * 0x0000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff      : user process
56  *   0x0000_0000_0000_0000 - 0x8fff_ffff_ffff_ffff    : text, data, heap, maps, libraries
57  *   0x9000_0000_0000_0000 - 0xafff_ffff_ffff_ffff    : mmio region
58  *   0xb000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff    : stack
59  * 0xc000_0000_0000_0000 - 0xcfff_ffff_ffff_ffff      : kernel reserved
60  *   0xc000_0000_0000_0000 - endkernel-1              : kernel code & data
61  *               endkernel - msgbufp-1                : flat device tree
62  *                 msgbufp - kernel_pdir-1            : message buffer
63  *             kernel_pdir - kernel_pp2d-1            : kernel page directory
64  *             kernel_pp2d - .                        : kernel pointers to page directory
65  *      pmap_zero_copy_min - crashdumpmap-1           : reserved for page zero/copy
66  *            crashdumpmap - ptbl_buf_pool_vabase-1   : reserved for ptbl bufs
67  *    ptbl_buf_pool_vabase - virtual_avail-1          : user page directories and page tables
68  *           virtual_avail - 0xcfff_ffff_ffff_ffff    : actual free KVA space
69  * 0xd000_0000_0000_0000 - 0xdfff_ffff_ffff_ffff      : coprocessor region
70  * 0xe000_0000_0000_0000 - 0xefff_ffff_ffff_ffff      : mmio region
71  * 0xf000_0000_0000_0000 - 0xffff_ffff_ffff_ffff      : direct map
72  *   0xf000_0000_0000_0000 - +Maxmem                  : physmem map
73  *                         - 0xffff_ffff_ffff_ffff    : device direct map
74  */
75
76#include <sys/cdefs.h>
77#include "opt_ddb.h"
78#include "opt_kstack_pages.h"
79
80#include <sys/param.h>
81#include <sys/conf.h>
82#include <sys/malloc.h>
83#include <sys/ktr.h>
84#include <sys/proc.h>
85#include <sys/user.h>
86#include <sys/queue.h>
87#include <sys/systm.h>
88#include <sys/kernel.h>
89#include <sys/kerneldump.h>
90#include <sys/linker.h>
91#include <sys/msgbuf.h>
92#include <sys/lock.h>
93#include <sys/mutex.h>
94#include <sys/rwlock.h>
95#include <sys/sched.h>
96#include <sys/smp.h>
97#include <sys/vmmeter.h>
98
99#include <vm/vm.h>
100#include <vm/vm_param.h>
101#include <vm/vm_page.h>
102#include <vm/vm_kern.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_extern.h>
105#include <vm/vm_object.h>
106#include <vm/vm_map.h>
107#include <vm/vm_pager.h>
108#include <vm/vm_phys.h>
109#include <vm/vm_pagequeue.h>
110#include <vm/vm_dumpset.h>
111#include <vm/uma.h>
112
113#include <machine/_inttypes.h>
114#include <machine/cpu.h>
115#include <machine/pcb.h>
116#include <machine/platform.h>
117
118#include <machine/tlb.h>
119#include <machine/spr.h>
120#include <machine/md_var.h>
121#include <machine/mmuvar.h>
122#include <machine/pmap.h>
123#include <machine/pte.h>
124
125#include <ddb/ddb.h>
126
127#define	SPARSE_MAPDEV
128
129/* Use power-of-two mappings in mmu_booke_mapdev(), to save entries. */
130#define	POW2_MAPPINGS
131
132#ifdef  DEBUG
133#define debugf(fmt, args...) printf(fmt, ##args)
134#define	__debug_used
135#else
136#define debugf(fmt, args...)
137#define	__debug_used	__unused
138#endif
139
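/*
 * printf(9) conversion (without the leading '%') used by debugf() and the
 * CTR/KASSERT messages below to print a zero-padded, pointer-width hex value.
 */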
140#ifdef __powerpc64__
141#define	PRI0ptrX	"016lx"
142#else
143#define	PRI0ptrX	"08x"
144#endif
145
146#define TODO			panic("%s: not implemented", __func__);
147
148extern unsigned char _etext[];
149extern unsigned char _end[];
150
151extern uint32_t *bootinfo;
152
153vm_paddr_t kernload;
154vm_offset_t kernstart;
155vm_size_t kernsize;
156
157/* Message buffer and tables. */
158static vm_offset_t data_start;
159static vm_size_t data_end;
160
161/* Phys/avail memory regions. */
162static struct mem_region *availmem_regions;
163static int availmem_regions_sz;
164static struct mem_region *physmem_regions;
165static int physmem_regions_sz;
166
167#ifndef __powerpc64__
168/* Reserved KVA space and mutex for mmu_booke_zero_page. */
169static vm_offset_t zero_page_va;
170static struct mtx zero_page_mutex;
171
172/* Reserved KVA space and mutex for mmu_booke_copy_page. */
173static vm_offset_t copy_page_src_va;
174static vm_offset_t copy_page_dst_va;
175static struct mtx copy_page_mutex;
176#endif
177
178static struct mtx tlbivax_mutex;
179
180/**************************************************************************/
181/* PMAP */
182/**************************************************************************/
183
184static int mmu_booke_enter_locked(pmap_t, vm_offset_t, vm_page_t,
185    vm_prot_t, u_int flags, int8_t psind);
186
187unsigned int kptbl_min;		/* Index of the first kernel ptbl. */
188static uma_zone_t ptbl_root_zone;
189
190/*
191 * If user pmap is processed with mmu_booke_remove and the resident count
192 * drops to 0, there are no more pages to remove, so we need not continue.
193 */
194#define PMAP_REMOVE_DONE(pmap) \
195	((pmap) != kernel_pmap && (pmap)->pm_stats.resident_count == 0)
196
197#if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__)
198extern int elf32_nxstack;
199#endif
200
201/**************************************************************************/
202/* TLB and TID handling */
203/**************************************************************************/
204
/*
 * Translation ID busy table: tidbusy[cpu][tid] records which pmap currently
 * owns translation ID 'tid' on CPU 'cpu'.
 */
206static volatile pmap_t tidbusy[MAXCPU][TID_MAX + 1];
207
208/*
209 * TLB0 capabilities (entry, way numbers etc.). These can vary between e500
210 * core revisions and should be read from h/w registers during early config.
211 */
212uint32_t tlb0_entries;
213uint32_t tlb0_ways;
214uint32_t tlb0_entries_per_way;
215uint32_t tlb1_entries;
216
217#define TLB0_ENTRIES		(tlb0_entries)
218#define TLB0_WAYS		(tlb0_ways)
219#define TLB0_ENTRIES_PER_WAY	(tlb0_entries_per_way)
220
221#define TLB1_ENTRIES (tlb1_entries)
222
223static tlbtid_t tid_alloc(struct pmap *);
224
225#ifdef DDB
226#ifdef __powerpc64__
227static void tlb_print_entry(int, uint32_t, uint64_t, uint32_t, uint32_t);
228#else
229static void tlb_print_entry(int, uint32_t, uint32_t, uint32_t, uint32_t);
230#endif
231#endif
232
233static void tlb1_read_entry(tlb_entry_t *, unsigned int);
234static void tlb1_write_entry(tlb_entry_t *, unsigned int);
235static int tlb1_iomapped(int, vm_paddr_t, vm_size_t, vm_offset_t *);
236static vm_size_t tlb1_mapin_region(vm_offset_t, vm_paddr_t, vm_size_t, int);
237
238static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma);
239
240static vm_size_t tsize2size(unsigned int);
241static unsigned int size2tsize(vm_size_t);
242
243static void set_mas4_defaults(void);
244
245static inline void tlb0_flush_entry(vm_offset_t);
246static inline unsigned int tlb0_tableidx(vm_offset_t, unsigned int);
247
248/**************************************************************************/
249/* Page table management */
250/**************************************************************************/
251
252static struct rwlock_padalign pvh_global_lock;
253
254/* Data for the pv entry allocation mechanism */
255static uma_zone_t pvzone;
256static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
257
258#define PV_ENTRY_ZONE_MIN	2048	/* min pv entries in uma zone */
259
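/*
 * PMAP_SHPGPERPROC is the assumed number of shared pages per process; it is
 * used together with maxproc to size the pv entry zone in mmu_booke_init()
 * and may be overridden with the vm.pmap.shpgperproc tunable.
 */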
260#ifndef PMAP_SHPGPERPROC
261#define PMAP_SHPGPERPROC	200
262#endif
263
264static vm_paddr_t pte_vatopa(pmap_t, vm_offset_t);
265static int pte_enter(pmap_t, vm_page_t, vm_offset_t, uint32_t, bool);
266static int pte_remove(pmap_t, vm_offset_t, uint8_t);
267static pte_t *pte_find(pmap_t, vm_offset_t);
268static void kernel_pte_alloc(vm_offset_t, vm_offset_t);
269
270static pv_entry_t pv_alloc(void);
271static void pv_free(pv_entry_t);
272static void pv_insert(pmap_t, vm_offset_t, vm_page_t);
273static void pv_remove(pmap_t, vm_offset_t, vm_page_t);
274
275static void booke_pmap_init_qpages(void);
276
277static inline void tlb_miss_lock(void);
278static inline void tlb_miss_unlock(void);
279
280#ifdef SMP
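/*
 * __boot_tlb1[] holds a snapshot of the BSP's shared TLB1 entries, built by
 * tlb1_ap_prep() and replayed on each AP by pmap_bootstrap_ap() below.
 */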
281extern tlb_entry_t __boot_tlb1[];
282void pmap_bootstrap_ap(volatile uint32_t *);
283#endif
284
285/*
286 * Kernel MMU interface
287 */
288static void		mmu_booke_clear_modify(vm_page_t);
289static void		mmu_booke_copy(pmap_t, pmap_t, vm_offset_t,
290    vm_size_t, vm_offset_t);
291static void		mmu_booke_copy_page(vm_page_t, vm_page_t);
292static void		mmu_booke_copy_pages(vm_page_t *,
293    vm_offset_t, vm_page_t *, vm_offset_t, int);
294static int		mmu_booke_enter(pmap_t, vm_offset_t, vm_page_t,
295    vm_prot_t, u_int flags, int8_t psind);
296static void		mmu_booke_enter_object(pmap_t, vm_offset_t, vm_offset_t,
297    vm_page_t, vm_prot_t);
298static void		mmu_booke_enter_quick(pmap_t, vm_offset_t, vm_page_t,
299    vm_prot_t);
300static vm_paddr_t	mmu_booke_extract(pmap_t, vm_offset_t);
301static vm_page_t	mmu_booke_extract_and_hold(pmap_t, vm_offset_t,
302    vm_prot_t);
303static void		mmu_booke_init(void);
304static bool		mmu_booke_is_modified(vm_page_t);
305static bool		mmu_booke_is_prefaultable(pmap_t, vm_offset_t);
306static bool		mmu_booke_is_referenced(vm_page_t);
307static int		mmu_booke_ts_referenced(vm_page_t);
308static vm_offset_t	mmu_booke_map(vm_offset_t *, vm_paddr_t, vm_paddr_t,
309    int);
310static int		mmu_booke_mincore(pmap_t, vm_offset_t,
311    vm_paddr_t *);
312static void		mmu_booke_object_init_pt(pmap_t, vm_offset_t,
313    vm_object_t, vm_pindex_t, vm_size_t);
314static bool		mmu_booke_page_exists_quick(pmap_t, vm_page_t);
315static void		mmu_booke_page_init(vm_page_t);
316static int		mmu_booke_page_wired_mappings(vm_page_t);
317static int		mmu_booke_pinit(pmap_t);
318static void		mmu_booke_pinit0(pmap_t);
319static void		mmu_booke_protect(pmap_t, vm_offset_t, vm_offset_t,
320    vm_prot_t);
321static void		mmu_booke_qenter(vm_offset_t, vm_page_t *, int);
322static void		mmu_booke_qremove(vm_offset_t, int);
323static void		mmu_booke_release(pmap_t);
324static void		mmu_booke_remove(pmap_t, vm_offset_t, vm_offset_t);
325static void		mmu_booke_remove_all(vm_page_t);
326static void		mmu_booke_remove_write(vm_page_t);
327static void		mmu_booke_unwire(pmap_t, vm_offset_t, vm_offset_t);
328static void		mmu_booke_zero_page(vm_page_t);
329static void		mmu_booke_zero_page_area(vm_page_t, int, int);
330static void		mmu_booke_activate(struct thread *);
331static void		mmu_booke_deactivate(struct thread *);
332static void		mmu_booke_bootstrap(vm_offset_t, vm_offset_t);
333static void		*mmu_booke_mapdev(vm_paddr_t, vm_size_t);
334static void		*mmu_booke_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
335static void		mmu_booke_unmapdev(void *, vm_size_t);
336static vm_paddr_t	mmu_booke_kextract(vm_offset_t);
337static void		mmu_booke_kenter(vm_offset_t, vm_paddr_t);
338static void		mmu_booke_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t);
339static void		mmu_booke_kremove(vm_offset_t);
340static int		mmu_booke_dev_direct_mapped(vm_paddr_t, vm_size_t);
341static void		mmu_booke_sync_icache(pmap_t, vm_offset_t,
342    vm_size_t);
343static void		mmu_booke_dumpsys_map(vm_paddr_t pa, size_t,
344    void **);
345static void		mmu_booke_dumpsys_unmap(vm_paddr_t pa, size_t,
346    void *);
347static void		mmu_booke_scan_init(void);
348static vm_offset_t	mmu_booke_quick_enter_page(vm_page_t m);
349static void		mmu_booke_quick_remove_page(vm_offset_t addr);
350static int		mmu_booke_change_attr(vm_offset_t addr,
351    vm_size_t sz, vm_memattr_t mode);
352static int		mmu_booke_decode_kernel_ptr(vm_offset_t addr,
353    int *is_user, vm_offset_t *decoded_addr);
354static void		mmu_booke_page_array_startup(long);
static bool		mmu_booke_page_is_mapped(vm_page_t m);
static bool		mmu_booke_ps_enabled(pmap_t pmap);
357
358static struct pmap_funcs mmu_booke_methods = {
359	/* pmap dispatcher interface */
360	.clear_modify = mmu_booke_clear_modify,
361	.copy = mmu_booke_copy,
362	.copy_page = mmu_booke_copy_page,
363	.copy_pages = mmu_booke_copy_pages,
364	.enter = mmu_booke_enter,
365	.enter_object = mmu_booke_enter_object,
366	.enter_quick = mmu_booke_enter_quick,
367	.extract = mmu_booke_extract,
368	.extract_and_hold = mmu_booke_extract_and_hold,
369	.init = mmu_booke_init,
370	.is_modified = mmu_booke_is_modified,
371	.is_prefaultable = mmu_booke_is_prefaultable,
372	.is_referenced = mmu_booke_is_referenced,
373	.ts_referenced = mmu_booke_ts_referenced,
374	.map = mmu_booke_map,
375	.mincore = mmu_booke_mincore,
376	.object_init_pt = mmu_booke_object_init_pt,
377	.page_exists_quick = mmu_booke_page_exists_quick,
378	.page_init = mmu_booke_page_init,
379	.page_wired_mappings =  mmu_booke_page_wired_mappings,
380	.pinit = mmu_booke_pinit,
381	.pinit0 = mmu_booke_pinit0,
382	.protect = mmu_booke_protect,
383	.qenter = mmu_booke_qenter,
384	.qremove = mmu_booke_qremove,
385	.release = mmu_booke_release,
386	.remove = mmu_booke_remove,
387	.remove_all = mmu_booke_remove_all,
388	.remove_write = mmu_booke_remove_write,
389	.sync_icache = mmu_booke_sync_icache,
390	.unwire = mmu_booke_unwire,
391	.zero_page = mmu_booke_zero_page,
392	.zero_page_area = mmu_booke_zero_page_area,
393	.activate = mmu_booke_activate,
394	.deactivate = mmu_booke_deactivate,
395	.quick_enter_page =  mmu_booke_quick_enter_page,
396	.quick_remove_page =  mmu_booke_quick_remove_page,
397	.page_array_startup = mmu_booke_page_array_startup,
398	.page_is_mapped = mmu_booke_page_is_mapped,
399	.ps_enabled = mmu_booke_ps_enabled,
400
401	/* Internal interfaces */
402	.bootstrap = mmu_booke_bootstrap,
403	.dev_direct_mapped = mmu_booke_dev_direct_mapped,
404	.mapdev = mmu_booke_mapdev,
405	.mapdev_attr = mmu_booke_mapdev_attr,
406	.kenter = mmu_booke_kenter,
407	.kenter_attr = mmu_booke_kenter_attr,
408	.kextract = mmu_booke_kextract,
409	.kremove = mmu_booke_kremove,
410	.unmapdev = mmu_booke_unmapdev,
411	.change_attr = mmu_booke_change_attr,
412	.decode_kernel_ptr =  mmu_booke_decode_kernel_ptr,
413
414	/* dumpsys() support */
415	.dumpsys_map_chunk = mmu_booke_dumpsys_map,
416	.dumpsys_unmap_chunk = mmu_booke_dumpsys_unmap,
417	.dumpsys_pa_init = mmu_booke_scan_init,
418};
419
420MMU_DEF(booke_mmu, MMU_TYPE_BOOKE, mmu_booke_methods);
421
422#ifdef __powerpc64__
423#include "pmap_64.c"
424#else
425#include "pmap_32.c"
426#endif
427
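/*
 * Base of the KVA region used for pmap_mapdev()-ed TLB1 mappings (see the
 * VM layout notes above).
 */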
428static vm_offset_t tlb1_map_base = VM_MAPDEV_BASE;
429
430static __inline uint32_t
431tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
432{
433	uint32_t attrib;
434	int i;
435
436	if (ma != VM_MEMATTR_DEFAULT) {
437		switch (ma) {
438		case VM_MEMATTR_UNCACHEABLE:
439			return (MAS2_I | MAS2_G);
440		case VM_MEMATTR_WRITE_COMBINING:
441		case VM_MEMATTR_WRITE_BACK:
442		case VM_MEMATTR_PREFETCHABLE:
443			return (MAS2_I);
444		case VM_MEMATTR_WRITE_THROUGH:
445			return (MAS2_W | MAS2_M);
446		case VM_MEMATTR_CACHEABLE:
447			return (MAS2_M);
448		}
449	}
450
451	/*
452	 * Assume the page is cache inhibited and access is guarded unless
453	 * it's in our available memory array.
454	 */
455	attrib = _TLB_ENTRY_IO;
456	for (i = 0; i < physmem_regions_sz; i++) {
457		if ((pa >= physmem_regions[i].mr_start) &&
458		    (pa < (physmem_regions[i].mr_start +
459		     physmem_regions[i].mr_size))) {
460			attrib = _TLB_ENTRY_MEM;
461			break;
462		}
463	}
464
465	return (attrib);
466}
467
468static inline void
469tlb_miss_lock(void)
470{
471#ifdef SMP
472	struct pcpu *pc;
473
474	if (!smp_started)
475		return;
476
477	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
478		if (pc != pcpup) {
479			CTR3(KTR_PMAP, "%s: tlb miss LOCK of CPU=%d, "
480			    "tlb_lock=%p", __func__, pc->pc_cpuid, pc->pc_booke.tlb_lock);
481
482			KASSERT((pc->pc_cpuid != PCPU_GET(cpuid)),
483			    ("tlb_miss_lock: tried to lock self"));
484
485			tlb_lock(pc->pc_booke.tlb_lock);
486
487			CTR1(KTR_PMAP, "%s: locked", __func__);
488		}
489	}
490#endif
491}
492
493static inline void
494tlb_miss_unlock(void)
495{
496#ifdef SMP
497	struct pcpu *pc;
498
499	if (!smp_started)
500		return;
501
502	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
503		if (pc != pcpup) {
504			CTR2(KTR_PMAP, "%s: tlb miss UNLOCK of CPU=%d",
505			    __func__, pc->pc_cpuid);
506
507			tlb_unlock(pc->pc_booke.tlb_lock);
508
509			CTR1(KTR_PMAP, "%s: unlocked", __func__);
510		}
511	}
512#endif
513}
514
/* Read the number of entries and the associativity of TLB0. */
516static __inline void
517tlb0_get_tlbconf(void)
518{
519	uint32_t tlb0_cfg;
520
521	tlb0_cfg = mfspr(SPR_TLB0CFG);
522	tlb0_entries = tlb0_cfg & TLBCFG_NENTRY_MASK;
523	tlb0_ways = (tlb0_cfg & TLBCFG_ASSOC_MASK) >> TLBCFG_ASSOC_SHIFT;
524	tlb0_entries_per_way = tlb0_entries / tlb0_ways;
525}
526
/* Read the number of entries in TLB1. */
528static __inline void
529tlb1_get_tlbconf(void)
530{
531	uint32_t tlb1_cfg;
532
533	tlb1_cfg = mfspr(SPR_TLB1CFG);
534	tlb1_entries = tlb1_cfg & TLBCFG_NENTRY_MASK;
535}
536
537/**************************************************************************/
538/* Page table related */
539/**************************************************************************/
540
541/* Allocate pv_entry structure. */
static pv_entry_t
543pv_alloc(void)
544{
545	pv_entry_t pv;
546
547	pv_entry_count++;
548	if (pv_entry_count > pv_entry_high_water)
549		pagedaemon_wakeup(0); /* XXX powerpc NUMA */
550	pv = uma_zalloc(pvzone, M_NOWAIT);
551
552	return (pv);
553}
554
555/* Free pv_entry structure. */
556static __inline void
557pv_free(pv_entry_t pve)
558{
559
560	pv_entry_count--;
561	uma_zfree(pvzone, pve);
562}
563
/* Allocate a pv_entry and link it onto the page's pv list. */
565static void
566pv_insert(pmap_t pmap, vm_offset_t va, vm_page_t m)
567{
568	pv_entry_t pve;
569
570	//int su = (pmap == kernel_pmap);
571	//debugf("pv_insert: s (su = %d pmap = 0x%08x va = 0x%08x m = 0x%08x)\n", su,
572	//	(u_int32_t)pmap, va, (u_int32_t)m);
573
574	pve = pv_alloc();
575	if (pve == NULL)
576		panic("pv_insert: no pv entries!");
577
578	pve->pv_pmap = pmap;
579	pve->pv_va = va;
580
581	/* add to pv_list */
582	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
583	rw_assert(&pvh_global_lock, RA_WLOCKED);
584
585	TAILQ_INSERT_TAIL(&m->md.pv_list, pve, pv_link);
586
587	//debugf("pv_insert: e\n");
588}
589
590/* Destroy pv entry. */
591static void
592pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m)
593{
594	pv_entry_t pve;
595
596	//int su = (pmap == kernel_pmap);
597	//debugf("pv_remove: s (su = %d pmap = 0x%08x va = 0x%08x)\n", su, (u_int32_t)pmap, va);
598
599	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
600	rw_assert(&pvh_global_lock, RA_WLOCKED);
601
602	/* find pv entry */
603	TAILQ_FOREACH(pve, &m->md.pv_list, pv_link) {
604		if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) {
605			/* remove from pv_list */
606			TAILQ_REMOVE(&m->md.pv_list, pve, pv_link);
607			if (TAILQ_EMPTY(&m->md.pv_list))
608				vm_page_aflag_clear(m, PGA_WRITEABLE);
609
610			/* free pv entry struct */
611			pv_free(pve);
612			break;
613		}
614	}
615
616	//debugf("pv_remove: e\n");
617}
618
619/**************************************************************************/
620/* PMAP related */
621/**************************************************************************/
622
623/*
624 * This is called during booke_init, before the system is really initialized.
625 */
626static void
627mmu_booke_bootstrap(vm_offset_t start, vm_offset_t kernelend)
628{
629	vm_paddr_t phys_kernelend;
630	struct mem_region *mp, *mp1;
631	int cnt, i, j;
632	vm_paddr_t s, e, sz;
633	vm_paddr_t physsz, hwphyssz;
634	u_int phys_avail_count __debug_used;
635	vm_size_t kstack0_sz;
636	vm_paddr_t kstack0_phys;
637	vm_offset_t kstack0;
638	void *dpcpu;
639
640	debugf("mmu_booke_bootstrap: entered\n");
641
642	/* Set interesting system properties */
643#ifdef __powerpc64__
644	hw_direct_map = 1;
645#else
646	hw_direct_map = 0;
647#endif
648#if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__)
649	elf32_nxstack = 1;
650#endif
651
652	/* Initialize invalidation mutex */
653	mtx_init(&tlbivax_mutex, "tlbivax", NULL, MTX_SPIN);
654
655	/* Read TLB0 size and associativity. */
656	tlb0_get_tlbconf();
657
658	/*
659	 * Align kernel start and end address (kernel image).
	 * Note that the kernel end does not necessarily equal kernstart +
	 * kernsize; kernsize is the size of the kernel that is actually mapped.
662	 */
663	data_start = round_page(kernelend);
664	data_end = data_start;
665
666	/* Allocate the dynamic per-cpu area. */
667	dpcpu = (void *)data_end;
668	data_end += DPCPU_SIZE;
669
670	/* Allocate space for the message buffer. */
671	msgbufp = (struct msgbuf *)data_end;
672	data_end += msgbufsize;
673	debugf(" msgbufp at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n",
674	    (uintptr_t)msgbufp, data_end);
675
676	data_end = round_page(data_end);
677	data_end = round_page(mmu_booke_alloc_kernel_pgtables(data_end));
678
679	/* Retrieve phys/avail mem regions */
680	mem_regions(&physmem_regions, &physmem_regions_sz,
681	    &availmem_regions, &availmem_regions_sz);
682
683	if (PHYS_AVAIL_ENTRIES < availmem_regions_sz)
684		panic("mmu_booke_bootstrap: phys_avail too small");
685
686	data_end = round_page(data_end);
687	vm_page_array = (vm_page_t)data_end;
	/*
	 * Get a rough upper bound on the size of the page array.  The
	 * vm_page_array will not handle any more pages than we have in the
	 * availmem_regions array, and most likely many fewer.  Each page of
	 * physical memory costs PAGE_SIZE bytes plus one struct vm_page of
	 * bookkeeping, hence the divisor below.
	 */
693	sz = 0;
694	for (mp = availmem_regions; mp->mr_size; mp++) {
695		sz += mp->mr_size;
696	}
697	sz = (round_page(sz) / (PAGE_SIZE + sizeof(struct vm_page)));
698	data_end += round_page(sz * sizeof(struct vm_page));
699
	/* Round up to 1MB.  This wastes some space but saves TLB entries. */
701	data_end = roundup2(data_end, 1 << 20);
702
703	debugf(" data_end: 0x%"PRI0ptrX"\n", data_end);
704	debugf(" kernstart: %#zx\n", kernstart);
705	debugf(" kernsize: %#zx\n", kernsize);
706
707	if (data_end - kernstart > kernsize) {
708		kernsize += tlb1_mapin_region(kernstart + kernsize,
709		    kernload + kernsize, (data_end - kernstart) - kernsize,
710		    _TLB_ENTRY_MEM);
711	}
712	data_end = kernstart + kernsize;
713	debugf(" updated data_end: 0x%"PRI0ptrX"\n", data_end);
714
	/*
	 * Clear the structures.  Note that this can only be done safely after
	 * any additional TLB1 translations are in place (above), so that the
	 * whole range up to the currently calculated 'data_end' is covered.
	 */
720	bzero((void *)data_start, data_end - data_start);
721	dpcpu_init(dpcpu, 0);
722
723	/*******************************************************/
724	/* Set the start and end of kva. */
725	/*******************************************************/
726	virtual_avail = round_page(data_end);
727	virtual_end = VM_MAX_KERNEL_ADDRESS;
728
729#ifndef __powerpc64__
730	/* Allocate KVA space for page zero/copy operations. */
731	zero_page_va = virtual_avail;
732	virtual_avail += PAGE_SIZE;
733	copy_page_src_va = virtual_avail;
734	virtual_avail += PAGE_SIZE;
735	copy_page_dst_va = virtual_avail;
736	virtual_avail += PAGE_SIZE;
737	debugf("zero_page_va = 0x%"PRI0ptrX"\n", zero_page_va);
738	debugf("copy_page_src_va = 0x%"PRI0ptrX"\n", copy_page_src_va);
739	debugf("copy_page_dst_va = 0x%"PRI0ptrX"\n", copy_page_dst_va);
740
741	/* Initialize page zero/copy mutexes. */
742	mtx_init(&zero_page_mutex, "mmu_booke_zero_page", NULL, MTX_DEF);
743	mtx_init(&copy_page_mutex, "mmu_booke_copy_page", NULL, MTX_DEF);
744
745	/* Allocate KVA space for ptbl bufs. */
746	ptbl_buf_pool_vabase = virtual_avail;
747	virtual_avail += PTBL_BUFS * PTBL_PAGES * PAGE_SIZE;
748	debugf("ptbl_buf_pool_vabase = 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n",
749	    ptbl_buf_pool_vabase, virtual_avail);
750#endif
751#ifdef	__powerpc64__
752	/* Allocate KVA space for crashdumpmap. */
753	crashdumpmap = (caddr_t)virtual_avail;
754	virtual_avail += MAXDUMPPGS * PAGE_SIZE;
755#endif
756
757	/* Calculate corresponding physical addresses for the kernel region. */
758	phys_kernelend = kernload + kernsize;
759	debugf("kernel image and allocated data:\n");
760	debugf(" kernload    = 0x%09jx\n", (uintmax_t)kernload);
761	debugf(" kernstart   = 0x%"PRI0ptrX"\n", kernstart);
762	debugf(" kernsize    = 0x%"PRI0ptrX"\n", kernsize);
763
764	/*
765	 * Remove kernel physical address range from avail regions list. Page
766	 * align all regions.  Non-page aligned memory isn't very interesting
	 * to us.  Also, sort the entries by ascending address.
768	 */
769
770	sz = 0;
771	cnt = availmem_regions_sz;
772	debugf("processing avail regions:\n");
773	for (mp = availmem_regions; mp->mr_size; mp++) {
774		s = mp->mr_start;
775		e = mp->mr_start + mp->mr_size;
776		debugf(" %09jx-%09jx -> ", (uintmax_t)s, (uintmax_t)e);
777		/* Check whether this region holds all of the kernel. */
778		if (s < kernload && e > phys_kernelend) {
779			availmem_regions[cnt].mr_start = phys_kernelend;
780			availmem_regions[cnt++].mr_size = e - phys_kernelend;
781			e = kernload;
782		}
		/* Check whether this region starts within the kernel. */
784		if (s >= kernload && s < phys_kernelend) {
785			if (e <= phys_kernelend)
786				goto empty;
787			s = phys_kernelend;
788		}
		/* Now check whether this region ends within the kernel. */
790		if (e > kernload && e <= phys_kernelend) {
791			if (s >= kernload)
792				goto empty;
793			e = kernload;
794		}
795		/* Now page align the start and size of the region. */
796		s = round_page(s);
797		e = trunc_page(e);
798		if (e < s)
799			e = s;
800		sz = e - s;
801		debugf("%09jx-%09jx = %jx\n",
802		    (uintmax_t)s, (uintmax_t)e, (uintmax_t)sz);
803
804		/* Check whether some memory is left here. */
805		if (sz == 0) {
806		empty:
807			memmove(mp, mp + 1,
808			    (cnt - (mp - availmem_regions)) * sizeof(*mp));
809			cnt--;
810			mp--;
811			continue;
812		}
813
814		/* Do an insertion sort. */
815		for (mp1 = availmem_regions; mp1 < mp; mp1++)
816			if (s < mp1->mr_start)
817				break;
818		if (mp1 < mp) {
819			memmove(mp1 + 1, mp1, (char *)mp - (char *)mp1);
820			mp1->mr_start = s;
821			mp1->mr_size = sz;
822		} else {
823			mp->mr_start = s;
824			mp->mr_size = sz;
825		}
826	}
827	availmem_regions_sz = cnt;
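	/*
	 * At this point availmem_regions[] is page aligned, sorted by
	 * ascending start address, and no longer overlaps the kernel image.
	 */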
828
829	/*******************************************************/
830	/* Steal physical memory for kernel stack from the end */
831	/* of the first avail region                           */
832	/*******************************************************/
833	kstack0_sz = kstack_pages * PAGE_SIZE;
834	kstack0_phys = availmem_regions[0].mr_start +
835	    availmem_regions[0].mr_size;
836	kstack0_phys -= kstack0_sz;
837	availmem_regions[0].mr_size -= kstack0_sz;
838
839	/*******************************************************/
840	/* Fill in phys_avail table, based on availmem_regions */
841	/*******************************************************/
842	phys_avail_count = 0;
843	physsz = 0;
844	hwphyssz = 0;
845	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
846
847	debugf("fill in phys_avail:\n");
848	for (i = 0, j = 0; i < availmem_regions_sz; i++, j += 2) {
849		debugf(" region: 0x%jx - 0x%jx (0x%jx)\n",
850		    (uintmax_t)availmem_regions[i].mr_start,
851		    (uintmax_t)availmem_regions[i].mr_start +
852		        availmem_regions[i].mr_size,
853		    (uintmax_t)availmem_regions[i].mr_size);
854
855		if (hwphyssz != 0 &&
856		    (physsz + availmem_regions[i].mr_size) >= hwphyssz) {
857			debugf(" hw.physmem adjust\n");
858			if (physsz < hwphyssz) {
859				phys_avail[j] = availmem_regions[i].mr_start;
860				phys_avail[j + 1] =
861				    availmem_regions[i].mr_start +
862				    hwphyssz - physsz;
863				physsz = hwphyssz;
864				phys_avail_count++;
865				dump_avail[j] = phys_avail[j];
866				dump_avail[j + 1] = phys_avail[j + 1];
867			}
868			break;
869		}
870
871		phys_avail[j] = availmem_regions[i].mr_start;
872		phys_avail[j + 1] = availmem_regions[i].mr_start +
873		    availmem_regions[i].mr_size;
874		phys_avail_count++;
875		physsz += availmem_regions[i].mr_size;
876		dump_avail[j] = phys_avail[j];
877		dump_avail[j + 1] = phys_avail[j + 1];
878	}
879	physmem = btoc(physsz);
880
881	/* Calculate the last available physical address. */
882	for (i = 0; phys_avail[i + 2] != 0; i += 2)
883		;
884	Maxmem = powerpc_btop(phys_avail[i + 1]);
885
886	debugf("Maxmem = 0x%08lx\n", Maxmem);
887	debugf("phys_avail_count = %d\n", phys_avail_count);
888	debugf("physsz = 0x%09jx physmem = %jd (0x%09jx)\n",
889	    (uintmax_t)physsz, (uintmax_t)physmem, (uintmax_t)physmem);
890
891#ifdef __powerpc64__
892	/*
893	 * Map the physical memory contiguously in TLB1.
894	 * Round so it fits into a single mapping.
895	 */
896	tlb1_mapin_region(DMAP_BASE_ADDRESS, 0,
897	    phys_avail[i + 1], _TLB_ENTRY_MEM);
898#endif
899
900	/*******************************************************/
901	/* Initialize (statically allocated) kernel pmap. */
902	/*******************************************************/
903	PMAP_LOCK_INIT(kernel_pmap);
904
905	debugf("kernel_pmap = 0x%"PRI0ptrX"\n", (uintptr_t)kernel_pmap);
906	kernel_pte_alloc(virtual_avail, kernstart);
907	for (i = 0; i < MAXCPU; i++) {
908		kernel_pmap->pm_tid[i] = TID_KERNEL;
909
910		/* Initialize each CPU's tidbusy entry 0 with kernel_pmap */
911		tidbusy[i][TID_KERNEL] = kernel_pmap;
912	}
913
914	/* Mark kernel_pmap active on all CPUs */
915	CPU_FILL(&kernel_pmap->pm_active);
916
917 	/*
918	 * Initialize the global pv list lock.
919	 */
920	rw_init(&pvh_global_lock, "pmap pv global");
921
922	/*******************************************************/
923	/* Final setup */
924	/*******************************************************/
925
926	/* Enter kstack0 into kernel map, provide guard page */
927	kstack0 = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
928	thread0.td_kstack = kstack0;
929	thread0.td_kstack_pages = kstack_pages;
930
931	debugf("kstack_sz = 0x%08jx\n", (uintmax_t)kstack0_sz);
932	debugf("kstack0_phys at 0x%09jx - 0x%09jx\n",
933	    (uintmax_t)kstack0_phys, (uintmax_t)kstack0_phys + kstack0_sz);
934	debugf("kstack0 at 0x%"PRI0ptrX" - 0x%"PRI0ptrX"\n",
935	    kstack0, kstack0 + kstack0_sz);
936
937	virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE + kstack0_sz;
938	for (i = 0; i < kstack_pages; i++) {
939		mmu_booke_kenter(kstack0, kstack0_phys);
940		kstack0 += PAGE_SIZE;
941		kstack0_phys += PAGE_SIZE;
942	}
943
944	pmap_bootstrapped = 1;
945
946	debugf("virtual_avail = %"PRI0ptrX"\n", virtual_avail);
947	debugf("virtual_end   = %"PRI0ptrX"\n", virtual_end);
948
949	debugf("mmu_booke_bootstrap: exit\n");
950}
951
952#ifdef SMP
953void
954tlb1_ap_prep(void)
955{
956	tlb_entry_t *e, tmp;
957	unsigned int i;
958
959	/* Prepare TLB1 image for AP processors */
960	e = __boot_tlb1;
961	for (i = 0; i < TLB1_ENTRIES; i++) {
962		tlb1_read_entry(&tmp, i);
963
964		if ((tmp.mas1 & MAS1_VALID) && (tmp.mas2 & _TLB_ENTRY_SHARED))
965			memcpy(e++, &tmp, sizeof(tmp));
966	}
967}
968
969void
970pmap_bootstrap_ap(volatile uint32_t *trcp __unused)
971{
972	int i;
973
974	/*
975	 * Finish TLB1 configuration: the BSP already set up its TLB1 and we
976	 * have the snapshot of its contents in the s/w __boot_tlb1[] table
977	 * created by tlb1_ap_prep(), so use these values directly to
978	 * (re)program AP's TLB1 hardware.
979	 *
980	 * Start at index 1 because index 0 has the kernel map.
981	 */
982	for (i = 1; i < TLB1_ENTRIES; i++) {
983		if (__boot_tlb1[i].mas1 & MAS1_VALID)
984			tlb1_write_entry(&__boot_tlb1[i], i);
985	}
986
987	set_mas4_defaults();
988}
989#endif
990
991static void
992booke_pmap_init_qpages(void)
993{
994	struct pcpu *pc;
995	int i;
996
997	CPU_FOREACH(i) {
998		pc = pcpu_find(i);
999		pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
1000		if (pc->pc_qmap_addr == 0)
1001			panic("pmap_init_qpages: unable to allocate KVA");
1002	}
1003}
1004
1005SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, booke_pmap_init_qpages, NULL);
1006
1007/*
1008 * Get the physical page address for the given pmap/virtual address.
1009 */
1010static vm_paddr_t
1011mmu_booke_extract(pmap_t pmap, vm_offset_t va)
1012{
1013	vm_paddr_t pa;
1014
1015	PMAP_LOCK(pmap);
1016	pa = pte_vatopa(pmap, va);
1017	PMAP_UNLOCK(pmap);
1018
1019	return (pa);
1020}
1021
1022/*
1023 * Extract the physical page address associated with the given
1024 * kernel virtual address.
1025 */
1026static vm_paddr_t
1027mmu_booke_kextract(vm_offset_t va)
1028{
1029	tlb_entry_t e;
1030	vm_paddr_t p = 0;
1031	int i;
1032
1033#ifdef __powerpc64__
1034	if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS)
1035		return (DMAP_TO_PHYS(va));
1036#endif
1037
1038	if (va >= VM_MIN_KERNEL_ADDRESS && va <= VM_MAX_KERNEL_ADDRESS)
1039		p = pte_vatopa(kernel_pmap, va);
1040
1041	if (p == 0) {
1042		/* Check TLB1 mappings */
1043		for (i = 0; i < TLB1_ENTRIES; i++) {
1044			tlb1_read_entry(&e, i);
1045			if (!(e.mas1 & MAS1_VALID))
1046				continue;
1047			if (va >= e.virt && va < e.virt + e.size)
1048				return (e.phys + (va - e.virt));
1049		}
1050	}
1051
1052	return (p);
1053}
1054
1055/*
1056 * Initialize the pmap module.
1057 *
 * Called by vm_mem_init() to initialize any structures that the pmap system
1059 * needs to map virtual memory.
1060 */
1061static void
1062mmu_booke_init(void)
1063{
1064	int shpgperproc = PMAP_SHPGPERPROC;
1065
1066	/*
1067	 * Initialize the address space (zone) for the pv entries.  Set a
1068	 * high water mark so that the system can recover from excessive
1069	 * numbers of pv entries.
1070	 */
1071	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
1072	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
1073
1074	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1075	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
1076
1077	TUNABLE_INT_FETCH("vm.pmap.pv_entry_max", &pv_entry_max);
1078	pv_entry_high_water = 9 * (pv_entry_max / 10);
1079
1080	uma_zone_reserve_kva(pvzone, pv_entry_max);
1081
1082	/* Pre-fill pvzone with initial number of pv entries. */
1083	uma_prealloc(pvzone, PV_ENTRY_ZONE_MIN);
1084
1085	/* Create a UMA zone for page table roots. */
1086	ptbl_root_zone = uma_zcreate("pmap root", PMAP_ROOT_SIZE,
1087	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_VM);
1088
1089	/* Initialize ptbl allocation. */
1090	ptbl_init();
1091}
1092
1093/*
1094 * Map a list of wired pages into kernel virtual address space.  This is
1095 * intended for temporary mappings which do not need page modification or
1096 * references recorded.  Existing mappings in the region are overwritten.
1097 */
1098static void
1099mmu_booke_qenter(vm_offset_t sva, vm_page_t *m, int count)
1100{
1101	vm_offset_t va;
1102
1103	va = sva;
1104	while (count-- > 0) {
1105		mmu_booke_kenter(va, VM_PAGE_TO_PHYS(*m));
1106		va += PAGE_SIZE;
1107		m++;
1108	}
1109}
1110
1111/*
1112 * Remove page mappings from kernel virtual address space.  Intended for
1113 * temporary mappings entered by mmu_booke_qenter.
1114 */
1115static void
1116mmu_booke_qremove(vm_offset_t sva, int count)
1117{
1118	vm_offset_t va;
1119
1120	va = sva;
1121	while (count-- > 0) {
1122		mmu_booke_kremove(va);
1123		va += PAGE_SIZE;
1124	}
1125}
1126
1127/*
1128 * Map a wired page into kernel virtual address space.
1129 */
1130static void
1131mmu_booke_kenter(vm_offset_t va, vm_paddr_t pa)
1132{
1133
1134	mmu_booke_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
1135}
1136
1137static void
1138mmu_booke_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
1139{
1140	uint32_t flags;
1141	pte_t *pte;
1142
1143	KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) &&
1144	    (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kenter: invalid va"));
1145
1146	flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID;
1147	flags |= tlb_calc_wimg(pa, ma) << PTE_MAS2_SHIFT;
1148	flags |= PTE_PS_4KB;
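	/*
	 * PTE_SR/SW/SX grant supervisor read/write/execute, PTE_WIRED marks
	 * the mapping as wired, and the WIMG attributes computed by
	 * tlb_calc_wimg() are stored in the PTE's MAS2 field.
	 */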
1149
1150	pte = pte_find(kernel_pmap, va);
1151	KASSERT((pte != NULL), ("mmu_booke_kenter: invalid va.  NULL PTE"));
1152
1153	mtx_lock_spin(&tlbivax_mutex);
1154	tlb_miss_lock();
1155
1156	if (PTE_ISVALID(pte)) {
1157		CTR1(KTR_PMAP, "%s: replacing entry!", __func__);
1158
1159		/* Flush entry from TLB0 */
1160		tlb0_flush_entry(va);
1161	}
1162
1163	*pte = PTE_RPN_FROM_PA(pa) | flags;
1164
1165	//debugf("mmu_booke_kenter: pdir_idx = %d ptbl_idx = %d va=0x%08x "
1166	//		"pa=0x%08x rpn=0x%08x flags=0x%08x\n",
1167	//		pdir_idx, ptbl_idx, va, pa, pte->rpn, pte->flags);
1168
1169	/* Flush the real memory from the instruction cache. */
1170	if ((flags & (PTE_I | PTE_G)) == 0)
1171		__syncicache((void *)va, PAGE_SIZE);
1172
1173	tlb_miss_unlock();
1174	mtx_unlock_spin(&tlbivax_mutex);
1175}
1176
1177/*
1178 * Remove a page from kernel page table.
1179 */
1180static void
1181mmu_booke_kremove(vm_offset_t va)
1182{
1183	pte_t *pte;
1184
1185	CTR2(KTR_PMAP,"%s: s (va = 0x%"PRI0ptrX")\n", __func__, va);
1186
1187	KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) &&
1188	    (va <= VM_MAX_KERNEL_ADDRESS)),
1189	    ("mmu_booke_kremove: invalid va"));
1190
1191	pte = pte_find(kernel_pmap, va);
1192
1193	if (!PTE_ISVALID(pte)) {
1194		CTR1(KTR_PMAP, "%s: invalid pte", __func__);
1195
1196		return;
1197	}
1198
1199	mtx_lock_spin(&tlbivax_mutex);
1200	tlb_miss_lock();
1201
1202	/* Invalidate entry in TLB0, update PTE. */
1203	tlb0_flush_entry(va);
1204	*pte = 0;
1205
1206	tlb_miss_unlock();
1207	mtx_unlock_spin(&tlbivax_mutex);
1208}
1209
1210/*
1211 * Figure out where a given kernel pointer (usually in a fault) points
1212 * to from the VM's perspective, potentially remapping into userland's
1213 * address space.
1214 */
1215static int
1216mmu_booke_decode_kernel_ptr(vm_offset_t addr, int *is_user,
1217    vm_offset_t *decoded_addr)
1218{
1219
1220	if (trunc_page(addr) <= VM_MAXUSER_ADDRESS)
1221		*is_user = 1;
1222	else
1223		*is_user = 0;
1224
1225	*decoded_addr = addr;
1226	return (0);
1227}
1228
1229static bool
1230mmu_booke_page_is_mapped(vm_page_t m)
1231{
1232
1233	return (!TAILQ_EMPTY(&(m)->md.pv_list));
1234}
1235
1236static bool
1237mmu_booke_ps_enabled(pmap_t pmap __unused)
1238{
1239	return (false);
1240}
1241
1242/*
1243 * Initialize pmap associated with process 0.
1244 */
1245static void
1246mmu_booke_pinit0(pmap_t pmap)
1247{
1248
1249	PMAP_LOCK_INIT(pmap);
1250	mmu_booke_pinit(pmap);
1251	PCPU_SET(curpmap, pmap);
1252}
1253
1254/*
1255 * Insert the given physical page at the specified virtual address in the
 * target physical map with the protection requested.  If specified, the page
1257 * will be wired down.
1258 */
1259static int
1260mmu_booke_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
1261    vm_prot_t prot, u_int flags, int8_t psind)
1262{
1263	int error;
1264
1265	rw_wlock(&pvh_global_lock);
1266	PMAP_LOCK(pmap);
1267	error = mmu_booke_enter_locked(pmap, va, m, prot, flags, psind);
1268	PMAP_UNLOCK(pmap);
1269	rw_wunlock(&pvh_global_lock);
1270	return (error);
1271}
1272
1273static int
1274mmu_booke_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
1275    vm_prot_t prot, u_int pmap_flags, int8_t psind __unused)
1276{
1277	pte_t *pte;
1278	vm_paddr_t pa;
1279	pte_t flags;
1280	int error, su, sync;
1281
1282	pa = VM_PAGE_TO_PHYS(m);
1283	su = (pmap == kernel_pmap);
1284	sync = 0;
1285
1286	//debugf("mmu_booke_enter_locked: s (pmap=0x%08x su=%d tid=%d m=0x%08x va=0x%08x "
1287	//		"pa=0x%08x prot=0x%08x flags=%#x)\n",
1288	//		(u_int32_t)pmap, su, pmap->pm_tid,
1289	//		(u_int32_t)m, va, pa, prot, flags);
1290
1291	if (su) {
1292		KASSERT(((va >= virtual_avail) &&
1293		    (va <= VM_MAX_KERNEL_ADDRESS)),
1294		    ("mmu_booke_enter_locked: kernel pmap, non kernel va"));
1295	} else {
1296		KASSERT((va <= VM_MAXUSER_ADDRESS),
1297		    ("mmu_booke_enter_locked: user pmap, non user va"));
1298	}
1299	if ((m->oflags & VPO_UNMANAGED) == 0) {
1300		if ((pmap_flags & PMAP_ENTER_QUICK_LOCKED) == 0)
1301			VM_PAGE_OBJECT_BUSY_ASSERT(m);
1302		else
1303			VM_OBJECT_ASSERT_LOCKED(m->object);
1304	}
1305
1306	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1307
1308	/*
1309	 * If there is an existing mapping, and the physical address has not
	 * changed, this must be a protection or wiring change.
1311	 */
1312	if (((pte = pte_find(pmap, va)) != NULL) &&
1313	    (PTE_ISVALID(pte)) && (PTE_PA(pte) == pa)) {
1314
1315		/*
1316		 * Before actually updating pte->flags we calculate and
1317		 * prepare its new value in a helper var.
1318		 */
1319		flags = *pte;
1320		flags &= ~(PTE_UW | PTE_UX | PTE_SW | PTE_SX | PTE_MODIFIED);
1321
1322		/* Wiring change, just update stats. */
1323		if ((pmap_flags & PMAP_ENTER_WIRED) != 0) {
1324			if (!PTE_ISWIRED(pte)) {
1325				flags |= PTE_WIRED;
1326				pmap->pm_stats.wired_count++;
1327			}
1328		} else {
1329			if (PTE_ISWIRED(pte)) {
1330				flags &= ~PTE_WIRED;
1331				pmap->pm_stats.wired_count--;
1332			}
1333		}
1334
1335		if (prot & VM_PROT_WRITE) {
1336			/* Add write permissions. */
1337			flags |= PTE_SW;
1338			if (!su)
1339				flags |= PTE_UW;
1340
1341			if ((flags & PTE_MANAGED) != 0)
1342				vm_page_aflag_set(m, PGA_WRITEABLE);
1343		} else {
			/* Handle modified pages: propagate the modify status. */
1345
			/*
			 * PTE_MODIFIED may have been set by a TLB miss handler
			 * (possibly on another CPU) since the PTE was read
			 * above, so check the PTE directly rather than relying
			 * on the saved local 'flags' copy.
			 */
1353			if (PTE_ISMODIFIED(pte))
1354				vm_page_dirty(m);
1355		}
1356
1357		if (prot & VM_PROT_EXECUTE) {
1358			flags |= PTE_SX;
1359			if (!su)
1360				flags |= PTE_UX;
1361
1362			/*
1363			 * Check existing flags for execute permissions: if we
1364			 * are turning execute permissions on, icache should
1365			 * be flushed.
1366			 */
1367			if ((*pte & (PTE_UX | PTE_SX)) == 0)
1368				sync++;
1369		}
1370
1371		flags &= ~PTE_REFERENCED;
1372
1373		/*
1374		 * The new flags value is all calculated -- only now actually
1375		 * update the PTE.
1376		 */
1377		mtx_lock_spin(&tlbivax_mutex);
1378		tlb_miss_lock();
1379
1380		tlb0_flush_entry(va);
1381		*pte &= ~PTE_FLAGS_MASK;
1382		*pte |= flags;
1383
1384		tlb_miss_unlock();
1385		mtx_unlock_spin(&tlbivax_mutex);
1386
1387	} else {
1388		/*
1389		 * If there is an existing mapping, but it's for a different
1390		 * physical address, pte_enter() will delete the old mapping.
1391		 */
1392		//if ((pte != NULL) && PTE_ISVALID(pte))
1393		//	debugf("mmu_booke_enter_locked: replace\n");
1394		//else
1395		//	debugf("mmu_booke_enter_locked: new\n");
1396
1397		/* Now set up the flags and install the new mapping. */
1398		flags = (PTE_SR | PTE_VALID);
1399		flags |= PTE_M;
1400
1401		if (!su)
1402			flags |= PTE_UR;
1403
1404		if (prot & VM_PROT_WRITE) {
1405			flags |= PTE_SW;
1406			if (!su)
1407				flags |= PTE_UW;
1408
1409			if ((m->oflags & VPO_UNMANAGED) == 0)
1410				vm_page_aflag_set(m, PGA_WRITEABLE);
1411		}
1412
1413		if (prot & VM_PROT_EXECUTE) {
1414			flags |= PTE_SX;
1415			if (!su)
1416				flags |= PTE_UX;
1417		}
1418
		/* If it's wired, update stats. */
1420		if ((pmap_flags & PMAP_ENTER_WIRED) != 0)
1421			flags |= PTE_WIRED;
1422
1423		error = pte_enter(pmap, m, va, flags,
1424		    (pmap_flags & PMAP_ENTER_NOSLEEP) != 0);
1425		if (error != 0)
1426			return (KERN_RESOURCE_SHORTAGE);
1427
		if ((pmap_flags & PMAP_ENTER_WIRED) != 0)
1429			pmap->pm_stats.wired_count++;
1430
1431		/* Flush the real memory from the instruction cache. */
1432		if (prot & VM_PROT_EXECUTE)
1433			sync++;
1434	}
1435
1436	if (sync && (su || pmap == PCPU_GET(curpmap))) {
1437		__syncicache((void *)va, PAGE_SIZE);
1438		sync = 0;
1439	}
1440
1441	return (KERN_SUCCESS);
1442}
1443
1444/*
1445 * Maps a sequence of resident pages belonging to the same object.
1446 * The sequence begins with the given page m_start.  This page is
1447 * mapped at the given virtual address start.  Each subsequent page is
1448 * mapped at a virtual address that is offset from start by the same
1449 * amount as the page is offset from m_start within the object.  The
1450 * last page in the sequence is the page with the largest offset from
1451 * m_start that can be mapped at a virtual address less than the given
1452 * virtual address end.  Not every virtual page between start and end
1453 * is mapped; only those for which a resident page exists with the
1454 * corresponding offset from m_start are mapped.
1455 */
1456static void
1457mmu_booke_enter_object(pmap_t pmap, vm_offset_t start,
1458    vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
1459{
1460	vm_page_t m;
1461	vm_pindex_t diff, psize;
1462
1463	VM_OBJECT_ASSERT_LOCKED(m_start->object);
1464
1465	psize = atop(end - start);
1466	m = m_start;
1467	rw_wlock(&pvh_global_lock);
1468	PMAP_LOCK(pmap);
1469	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1470		mmu_booke_enter_locked(pmap, start + ptoa(diff), m,
1471		    prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1472		    PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0);
1473		m = TAILQ_NEXT(m, listq);
1474	}
1475	PMAP_UNLOCK(pmap);
1476	rw_wunlock(&pvh_global_lock);
1477}
1478
1479static void
1480mmu_booke_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
1481    vm_prot_t prot)
1482{
1483
1484	rw_wlock(&pvh_global_lock);
1485	PMAP_LOCK(pmap);
1486	mmu_booke_enter_locked(pmap, va, m,
1487	    prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP |
1488	    PMAP_ENTER_QUICK_LOCKED, 0);
1489	PMAP_UNLOCK(pmap);
1490	rw_wunlock(&pvh_global_lock);
1491}
1492
1493/*
1494 * Remove the given range of addresses from the specified map.
1495 *
1496 * It is assumed that the start and end are properly rounded to the page size.
1497 */
1498static void
1499mmu_booke_remove(pmap_t pmap, vm_offset_t va, vm_offset_t endva)
1500{
1501	pte_t *pte;
1502	uint8_t hold_flag;
1503
1504	int su = (pmap == kernel_pmap);
1505
1506	//debugf("mmu_booke_remove: s (su = %d pmap=0x%08x tid=%d va=0x%08x endva=0x%08x)\n",
1507	//		su, (u_int32_t)pmap, pmap->pm_tid, va, endva);
1508
1509	if (su) {
1510		KASSERT(((va >= virtual_avail) &&
1511		    (va <= VM_MAX_KERNEL_ADDRESS)),
1512		    ("mmu_booke_remove: kernel pmap, non kernel va"));
1513	} else {
1514		KASSERT((va <= VM_MAXUSER_ADDRESS),
1515		    ("mmu_booke_remove: user pmap, non user va"));
1516	}
1517
1518	if (PMAP_REMOVE_DONE(pmap)) {
1519		//debugf("mmu_booke_remove: e (empty)\n");
1520		return;
1521	}
1522
1523	hold_flag = PTBL_HOLD_FLAG(pmap);
1524	//debugf("mmu_booke_remove: hold_flag = %d\n", hold_flag);
1525
1526	rw_wlock(&pvh_global_lock);
1527	PMAP_LOCK(pmap);
1528	for (; va < endva; va += PAGE_SIZE) {
1529		pte = pte_find_next(pmap, &va);
1530		if ((pte == NULL) || !PTE_ISVALID(pte))
1531			break;
1532		if (va >= endva)
1533			break;
1534		pte_remove(pmap, va, hold_flag);
1535	}
1536	PMAP_UNLOCK(pmap);
1537	rw_wunlock(&pvh_global_lock);
1538
1539	//debugf("mmu_booke_remove: e\n");
1540}
1541
1542/*
1543 * Remove physical page from all pmaps in which it resides.
1544 */
1545static void
1546mmu_booke_remove_all(vm_page_t m)
1547{
1548	pv_entry_t pv, pvn;
1549	uint8_t hold_flag;
1550
1551	rw_wlock(&pvh_global_lock);
1552	TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_link, pvn) {
1553		PMAP_LOCK(pv->pv_pmap);
1554		hold_flag = PTBL_HOLD_FLAG(pv->pv_pmap);
1555		pte_remove(pv->pv_pmap, pv->pv_va, hold_flag);
1556		PMAP_UNLOCK(pv->pv_pmap);
1557	}
1558	vm_page_aflag_clear(m, PGA_WRITEABLE);
1559	rw_wunlock(&pvh_global_lock);
1560}
1561
1562/*
1563 * Map a range of physical addresses into kernel virtual address space.
1564 */
1565static vm_offset_t
1566mmu_booke_map(vm_offset_t *virt, vm_paddr_t pa_start,
1567    vm_paddr_t pa_end, int prot)
1568{
1569	vm_offset_t sva = *virt;
1570	vm_offset_t va = sva;
1571
1572#ifdef __powerpc64__
1573	/* XXX: Handle memory not starting at 0x0. */
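	/*
	 * Physical memory below ctob(Maxmem) is already covered by the
	 * direct map set up in mmu_booke_bootstrap(), so return the
	 * corresponding DMAP address instead of consuming KVA.
	 */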
1574	if (pa_end < ctob(Maxmem))
1575		return (PHYS_TO_DMAP(pa_start));
1576#endif
1577
1578	while (pa_start < pa_end) {
1579		mmu_booke_kenter(va, pa_start);
1580		va += PAGE_SIZE;
1581		pa_start += PAGE_SIZE;
1582	}
1583	*virt = va;
1584
1585	return (sva);
1586}
1587
1588/*
 * The pmap must be activated before its address space can be accessed in any
1590 * way.
1591 */
1592static void
1593mmu_booke_activate(struct thread *td)
1594{
1595	pmap_t pmap;
1596	u_int cpuid;
1597
1598	pmap = &td->td_proc->p_vmspace->vm_pmap;
1599
1600	CTR5(KTR_PMAP, "%s: s (td = %p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX")",
1601	    __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap);
1602
1603	KASSERT((pmap != kernel_pmap), ("mmu_booke_activate: kernel_pmap!"));
1604
1605	sched_pin();
1606
1607	cpuid = PCPU_GET(cpuid);
1608	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
1609	PCPU_SET(curpmap, pmap);
1610
1611	if (pmap->pm_tid[cpuid] == TID_NONE)
1612		tid_alloc(pmap);
1613
1614	/* Load PID0 register with pmap tid value. */
1615	mtspr(SPR_PID0, pmap->pm_tid[cpuid]);
1616	__asm __volatile("isync");
1617
1618	mtspr(SPR_DBCR0, td->td_pcb->pcb_cpu.booke.dbcr0);
1619
1620	sched_unpin();
1621
1622	CTR3(KTR_PMAP, "%s: e (tid = %d for '%s')", __func__,
1623	    pmap->pm_tid[PCPU_GET(cpuid)], td->td_proc->p_comm);
1624}
1625
1626/*
1627 * Deactivate the specified process's address space.
1628 */
1629static void
1630mmu_booke_deactivate(struct thread *td)
1631{
1632	pmap_t pmap;
1633
1634	pmap = &td->td_proc->p_vmspace->vm_pmap;
1635
1636	CTR5(KTR_PMAP, "%s: td=%p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX,
1637	    __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap);
1638
1639	td->td_pcb->pcb_cpu.booke.dbcr0 = mfspr(SPR_DBCR0);
1640
1641	CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmap->pm_active);
1642	PCPU_SET(curpmap, NULL);
1643}
1644
1645/*
1646 * Copy the range specified by src_addr/len
1647 * from the source map to the range dst_addr/len
1648 * in the destination map.
1649 *
1650 * This routine is only advisory and need not do anything.
1651 */
1652static void
1653mmu_booke_copy(pmap_t dst_pmap, pmap_t src_pmap,
1654    vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr)
1655{
1656
1657}
1658
1659/*
1660 * Set the physical protection on the specified range of this map as requested.
1661 */
1662static void
1663mmu_booke_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1664    vm_prot_t prot)
1665{
1666	vm_offset_t va;
1667	vm_page_t m;
1668	pte_t *pte;
1669
1670	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1671		mmu_booke_remove(pmap, sva, eva);
1672		return;
1673	}
1674
1675	if (prot & VM_PROT_WRITE)
1676		return;
1677
1678	PMAP_LOCK(pmap);
1679	for (va = sva; va < eva; va += PAGE_SIZE) {
1680		if ((pte = pte_find(pmap, va)) != NULL) {
1681			if (PTE_ISVALID(pte)) {
1682				m = PHYS_TO_VM_PAGE(PTE_PA(pte));
1683
1684				mtx_lock_spin(&tlbivax_mutex);
1685				tlb_miss_lock();
1686
1687				/* Handle modified pages. */
1688				if (PTE_ISMODIFIED(pte) && PTE_ISMANAGED(pte))
1689					vm_page_dirty(m);
1690
1691				tlb0_flush_entry(va);
1692				*pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED);
1693
1694				tlb_miss_unlock();
1695				mtx_unlock_spin(&tlbivax_mutex);
1696			}
1697		}
1698	}
1699	PMAP_UNLOCK(pmap);
1700}
1701
1702/*
1703 * Clear the write and modified bits in each of the given page's mappings.
1704 */
1705static void
1706mmu_booke_remove_write(vm_page_t m)
1707{
1708	pv_entry_t pv;
1709	pte_t *pte;
1710
1711	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1712	    ("mmu_booke_remove_write: page %p is not managed", m));
1713	vm_page_assert_busied(m);
1714
1715	if (!pmap_page_is_write_mapped(m))
1716	        return;
1717	rw_wlock(&pvh_global_lock);
1718	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
1719		PMAP_LOCK(pv->pv_pmap);
1720		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL) {
1721			if (PTE_ISVALID(pte)) {
1722				m = PHYS_TO_VM_PAGE(PTE_PA(pte));
1723
1724				mtx_lock_spin(&tlbivax_mutex);
1725				tlb_miss_lock();
1726
1727				/* Handle modified pages. */
1728				if (PTE_ISMODIFIED(pte))
1729					vm_page_dirty(m);
1730
1731				/* Flush mapping from TLB0. */
1732				*pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED);
1733
1734				tlb_miss_unlock();
1735				mtx_unlock_spin(&tlbivax_mutex);
1736			}
1737		}
1738		PMAP_UNLOCK(pv->pv_pmap);
1739	}
1740	vm_page_aflag_clear(m, PGA_WRITEABLE);
1741	rw_wunlock(&pvh_global_lock);
1742}
1743
1744/*
1745 * Atomically extract and hold the physical page with the given
1746 * pmap and virtual address pair if that mapping permits the given
1747 * protection.
1748 */
1749static vm_page_t
1750mmu_booke_extract_and_hold(pmap_t pmap, vm_offset_t va,
1751    vm_prot_t prot)
1752{
1753	pte_t *pte;
1754	vm_page_t m;
1755	uint32_t pte_wbit;
1756
1757	m = NULL;
1758	PMAP_LOCK(pmap);
1759	pte = pte_find(pmap, va);
1760	if ((pte != NULL) && PTE_ISVALID(pte)) {
1761		if (pmap == kernel_pmap)
1762			pte_wbit = PTE_SW;
1763		else
1764			pte_wbit = PTE_UW;
1765
1766		if ((*pte & pte_wbit) != 0 || (prot & VM_PROT_WRITE) == 0) {
1767			m = PHYS_TO_VM_PAGE(PTE_PA(pte));
1768			if (!vm_page_wire_mapped(m))
1769				m = NULL;
1770		}
1771	}
1772	PMAP_UNLOCK(pmap);
1773	return (m);
1774}
1775
1776/*
1777 * Initialize a vm_page's machine-dependent fields.
1778 */
1779static void
1780mmu_booke_page_init(vm_page_t m)
1781{
1782
1783	m->md.pv_tracked = 0;
1784	TAILQ_INIT(&m->md.pv_list);
1785}
1786
1787/*
1788 * Return whether or not the specified physical page was modified
 * in any physical map.
1790 */
1791static bool
1792mmu_booke_is_modified(vm_page_t m)
1793{
1794	pte_t *pte;
1795	pv_entry_t pv;
1796	bool rv;
1797
1798	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1799	    ("mmu_booke_is_modified: page %p is not managed", m));
1800	rv = false;
1801
1802	/*
1803	 * If the page is not busied then this check is racy.
1804	 */
1805	if (!pmap_page_is_write_mapped(m))
1806		return (false);
1807
1808	rw_wlock(&pvh_global_lock);
1809	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
1810		PMAP_LOCK(pv->pv_pmap);
1811		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL &&
1812		    PTE_ISVALID(pte)) {
1813			if (PTE_ISMODIFIED(pte))
1814				rv = true;
1815		}
1816		PMAP_UNLOCK(pv->pv_pmap);
1817		if (rv)
1818			break;
1819	}
1820	rw_wunlock(&pvh_global_lock);
1821	return (rv);
1822}
1823
1824/*
1825 * Return whether or not the specified virtual address is eligible
1826 * for prefault.
1827 */
1828static bool
1829mmu_booke_is_prefaultable(pmap_t pmap, vm_offset_t addr)
1830{
1831
1832	return (false);
1833}
1834
1835/*
1836 * Return whether or not the specified physical page was referenced
 * in any physical map.
1838 */
1839static bool
1840mmu_booke_is_referenced(vm_page_t m)
1841{
1842	pte_t *pte;
1843	pv_entry_t pv;
1844	bool rv;
1845
1846	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1847	    ("mmu_booke_is_referenced: page %p is not managed", m));
1848	rv = false;
1849	rw_wlock(&pvh_global_lock);
1850	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
1851		PMAP_LOCK(pv->pv_pmap);
1852		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL &&
1853		    PTE_ISVALID(pte)) {
1854			if (PTE_ISREFERENCED(pte))
1855				rv = true;
1856		}
1857		PMAP_UNLOCK(pv->pv_pmap);
1858		if (rv)
1859			break;
1860	}
1861	rw_wunlock(&pvh_global_lock);
1862	return (rv);
1863}
1864
1865/*
1866 * Clear the modify bits on the specified physical page.
1867 */
1868static void
1869mmu_booke_clear_modify(vm_page_t m)
1870{
1871	pte_t *pte;
1872	pv_entry_t pv;
1873
1874	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1875	    ("mmu_booke_clear_modify: page %p is not managed", m));
1876	vm_page_assert_busied(m);
1877
1878	if (!pmap_page_is_write_mapped(m))
1879		return;
1880
1881	rw_wlock(&pvh_global_lock);
1882	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
1883		PMAP_LOCK(pv->pv_pmap);
1884		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL &&
1885		    PTE_ISVALID(pte)) {
1886			mtx_lock_spin(&tlbivax_mutex);
1887			tlb_miss_lock();
1888
1889			if (*pte & (PTE_SW | PTE_UW | PTE_MODIFIED)) {
1890				tlb0_flush_entry(pv->pv_va);
1891				*pte &= ~(PTE_SW | PTE_UW | PTE_MODIFIED |
1892				    PTE_REFERENCED);
1893			}
1894
1895			tlb_miss_unlock();
1896			mtx_unlock_spin(&tlbivax_mutex);
1897		}
1898		PMAP_UNLOCK(pv->pv_pmap);
1899	}
1900	rw_wunlock(&pvh_global_lock);
1901}
1902
1903/*
1904 * Return a count of reference bits for a page, clearing those bits.
1905 * It is not necessary for every reference bit to be cleared, but it
1906 * is necessary that 0 only be returned when there are truly no
1907 * reference bits set.
1908 *
1909 * As an optimization, update the page's dirty field if a modified bit is
1910 * found while counting reference bits.  This opportunistic update can be
1911 * performed at low cost and can eliminate the need for some future calls
1912 * to pmap_is_modified().  However, since this function stops after
1913 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
1914 * dirty pages.  Those dirty pages will only be detected by a future call
1915 * to pmap_is_modified().
1916 */
1917static int
1918mmu_booke_ts_referenced(vm_page_t m)
1919{
1920	pte_t *pte;
1921	pv_entry_t pv;
1922	int count;
1923
1924	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1925	    ("mmu_booke_ts_referenced: page %p is not managed", m));
1926	count = 0;
1927	rw_wlock(&pvh_global_lock);
1928	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
1929		PMAP_LOCK(pv->pv_pmap);
1930		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL &&
1931		    PTE_ISVALID(pte)) {
1932			if (PTE_ISMODIFIED(pte))
1933				vm_page_dirty(m);
1934			if (PTE_ISREFERENCED(pte)) {
1935				mtx_lock_spin(&tlbivax_mutex);
1936				tlb_miss_lock();
1937
1938				tlb0_flush_entry(pv->pv_va);
1939				*pte &= ~PTE_REFERENCED;
1940
1941				tlb_miss_unlock();
1942				mtx_unlock_spin(&tlbivax_mutex);
1943
1944				if (++count >= PMAP_TS_REFERENCED_MAX) {
1945					PMAP_UNLOCK(pv->pv_pmap);
1946					break;
1947				}
1948			}
1949		}
1950		PMAP_UNLOCK(pv->pv_pmap);
1951	}
1952	rw_wunlock(&pvh_global_lock);
1953	return (count);
1954}
1955
1956/*
1957 * Clear the wired attribute from the mappings for the specified range of
1958 * addresses in the given pmap.  Every valid mapping within that range must
1959 * have the wired attribute set.  In contrast, invalid mappings cannot have
1960 * the wired attribute set, so they are ignored.
1961 *
1962 * The wired attribute of the page table entry is not a hardware feature, so
1963 * there is no need to invalidate any TLB entries.
1964 */
1965static void
1966mmu_booke_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1967{
1968	vm_offset_t va;
1969	pte_t *pte;
1970
1971	PMAP_LOCK(pmap);
1972	for (va = sva; va < eva; va += PAGE_SIZE) {
1973		if ((pte = pte_find(pmap, va)) != NULL &&
1974		    PTE_ISVALID(pte)) {
1975			if (!PTE_ISWIRED(pte))
1976				panic("mmu_booke_unwire: pte %p isn't wired",
1977				    pte);
1978			*pte &= ~PTE_WIRED;
1979			pmap->pm_stats.wired_count--;
1980		}
1981	}
1982	PMAP_UNLOCK(pmap);
1983
1984}
1985
1986/*
1987 * Return true if the pmap's pv is one of the first 16 pvs linked to from this
1988 * page.  This count may be changed upwards or downwards in the future; it is
1989 * only necessary that true be returned for a small subset of pmaps for proper
1990 * page aging.
1991 */
1992static bool
1993mmu_booke_page_exists_quick(pmap_t pmap, vm_page_t m)
1994{
1995	pv_entry_t pv;
1996	int loops;
1997	bool rv;
1998
1999	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2000	    ("mmu_booke_page_exists_quick: page %p is not managed", m));
2001	loops = 0;
2002	rv = false;
2003	rw_wlock(&pvh_global_lock);
2004	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2005		if (pv->pv_pmap == pmap) {
2006			rv = true;
2007			break;
2008		}
2009		if (++loops >= 16)
2010			break;
2011	}
2012	rw_wunlock(&pvh_global_lock);
2013	return (rv);
2014}
2015
2016/*
2017 * Return the number of managed mappings to the given physical page that are
2018 * wired.
2019 */
2020static int
2021mmu_booke_page_wired_mappings(vm_page_t m)
2022{
2023	pv_entry_t pv;
2024	pte_t *pte;
2025	int count = 0;
2026
2027	if ((m->oflags & VPO_UNMANAGED) != 0)
2028		return (count);
2029	rw_wlock(&pvh_global_lock);
2030	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2031		PMAP_LOCK(pv->pv_pmap);
2032		if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL)
2033			if (PTE_ISVALID(pte) && PTE_ISWIRED(pte))
2034				count++;
2035		PMAP_UNLOCK(pv->pv_pmap);
2036	}
2037	rw_wunlock(&pvh_global_lock);
2038	return (count);
2039}
2040
2041static int
2042mmu_booke_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
2043{
2044	int i;
2045	vm_offset_t va;
2046
2047	/*
2048	 * This currently does not work for entries that
2049	 * overlap TLB1 entries.
2050	 */
2051	for (i = 0; i < TLB1_ENTRIES; i++) {
2052		if (tlb1_iomapped(i, pa, size, &va) == 0)
2053			return (0);
2054	}
2055
2056	return (EFAULT);
2057}
2058
2059void
2060mmu_booke_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
2061{
2062	vm_paddr_t ppa;
2063	vm_offset_t ofs;
2064	vm_size_t gran;
2065
2066	/* Minidumps are based on virtual memory addresses. */
2067	if (do_minidump) {
2068		*va = (void *)(vm_offset_t)pa;
2069		return;
2070	}
2071
2072	/* Raw physical memory dumps don't have a virtual address. */
2073	/* We always map a 256MB page at 256M. */
2074	gran = 256 * 1024 * 1024;
2075	ppa = rounddown2(pa, gran);
2076	ofs = pa - ppa;
2077	*va = (void *)gran;
2078	tlb1_set_entry((vm_offset_t)*va, ppa, gran, _TLB_ENTRY_IO);
2079
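	/* If the range crosses the 256MB granule, map the next granule too. */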
2080	if (sz > (gran - ofs))
2081		tlb1_set_entry((vm_offset_t)*va + gran, ppa + gran, gran,
2082		    _TLB_ENTRY_IO);
2083}
2084
2085void
2086mmu_booke_dumpsys_unmap(vm_paddr_t pa, size_t sz, void *va)
2087{
2088	vm_paddr_t ppa;
2089	vm_offset_t ofs;
2090	vm_size_t gran;
2091	tlb_entry_t e;
2092	int i;
2093
2094	/* Minidumps are based on virtual memory addresses. */
2095	/* Nothing to do... */
2096	if (do_minidump)
2097		return;
2098
2099	for (i = 0; i < TLB1_ENTRIES; i++) {
2100		tlb1_read_entry(&e, i);
2101		if (!(e.mas1 & MAS1_VALID))
2102			break;
2103	}
2104
2105	/* Raw physical memory dumps don't have a virtual address. */
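	/*
	 * Step back to the last valid entry, which dumpsys_map() added
	 * most recently.
	 */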
2106	i--;
2107	e.mas1 = 0;
2108	e.mas2 = 0;
2109	e.mas3 = 0;
2110	tlb1_write_entry(&e, i);
2111
2112	gran = 256 * 1024 * 1024;
2113	ppa = rounddown2(pa, gran);
2114	ofs = pa - ppa;
2115	if (sz > (gran - ofs)) {
2116		i--;
2117		e.mas1 = 0;
2118		e.mas2 = 0;
2119		e.mas3 = 0;
2120		tlb1_write_entry(&e, i);
2121	}
2122}
2123
2124extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
2125
2126void
2127mmu_booke_scan_init(void)
2128{
2129	vm_offset_t va;
2130	pte_t *pte;
2131	int i;
2132
2133	if (!do_minidump) {
2134		/* Initialize phys. segments for dumpsys(). */
2135		memset(&dump_map, 0, sizeof(dump_map));
2136		mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions,
2137		    &availmem_regions_sz);
2138		for (i = 0; i < physmem_regions_sz; i++) {
2139			dump_map[i].pa_start = physmem_regions[i].mr_start;
2140			dump_map[i].pa_size = physmem_regions[i].mr_size;
2141		}
2142		return;
2143	}
2144
2145	/* Virtual segments for minidumps: */
2146	memset(&dump_map, 0, sizeof(dump_map));
2147
2148	/* 1st: kernel .data and .bss. */
2149	dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
2150	dump_map[0].pa_size =
2151	    round_page((uintptr_t)_end) - dump_map[0].pa_start;
2152
2153	/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
2154	dump_map[1].pa_start = data_start;
2155	dump_map[1].pa_size = data_end - data_start;
2156
2157	/* 3rd: kernel VM. */
2158	va = dump_map[1].pa_start + dump_map[1].pa_size;
2159	/* Find start of next chunk (from va). */
2160	while (va < virtual_end) {
2161		/* Don't dump the buffer cache. */
2162		if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
2163			va = kmi.buffer_eva;
2164			continue;
2165		}
2166		pte = pte_find(kernel_pmap, va);
2167		if (pte != NULL && PTE_ISVALID(pte))
2168			break;
2169		va += PAGE_SIZE;
2170	}
2171	if (va < virtual_end) {
2172		dump_map[2].pa_start = va;
2173		va += PAGE_SIZE;
2174		/* Find last page in chunk. */
2175		while (va < virtual_end) {
2176			/* Don't run into the buffer cache. */
2177			if (va == kmi.buffer_sva)
2178				break;
2179			pte = pte_find(kernel_pmap, va);
2180			if (pte == NULL || !PTE_ISVALID(pte))
2181				break;
2182			va += PAGE_SIZE;
2183		}
2184		dump_map[2].pa_size = va - dump_map[2].pa_start;
2185	}
2186}
2187
2188/*
2189 * Map a set of physical memory pages into the kernel virtual address space.
2190 * Return a pointer to where it is mapped. This routine is intended to be used
2191 * for mapping device memory, NOT real memory.
2192 */
2193static void *
2194mmu_booke_mapdev(vm_paddr_t pa, vm_size_t size)
2195{
2196
2197	return (mmu_booke_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
2198}
2199
2200static int
2201tlb1_find_pa(vm_paddr_t pa, tlb_entry_t *e)
2202{
2203	int i;
2204
2205	for (i = 0; i < TLB1_ENTRIES; i++) {
2206		tlb1_read_entry(e, i);
2207		if ((e->mas1 & MAS1_VALID) == 0)
2208			continue;
2209		if (e->phys == pa)
2210			return (i);
2211	}
2212	return (-1);
2213}
2214
2215static void *
2216mmu_booke_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
2217{
2218	tlb_entry_t e;
2219	vm_paddr_t tmppa;
2220#ifndef __powerpc64__
2221	uintptr_t tmpva;
2222#endif
2223	uintptr_t va, retva;
2224	vm_size_t sz;
2225	int i;
2226	int wimge;
2227
2228	/*
2229	 * Check if this is premapped in TLB1.
2230	 */
2231	sz = size;
2232	tmppa = pa;
2233	va = ~0;
2234	wimge = tlb_calc_wimg(pa, ma);
2235	for (i = 0; i < TLB1_ENTRIES; i++) {
2236		tlb1_read_entry(&e, i);
2237		if (!(e.mas1 & MAS1_VALID))
2238			continue;
2239		if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED)))
2240			continue;
2241		if (tmppa >= e.phys && tmppa < e.phys + e.size) {
2242			va = e.virt + (pa - e.phys);
2243			tmppa = e.phys + e.size;
2244			sz -= MIN(sz, e.size - (pa - e.phys));
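			/*
			 * Chase physically contiguous entries until the
			 * whole range is covered.
			 */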
2245			while (sz > 0 && (i = tlb1_find_pa(tmppa, &e)) != -1) {
2246				if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED)))
2247					break;
2248				sz -= MIN(sz, e.size);
2249				tmppa = e.phys + e.size;
2250			}
2251			if (sz != 0)
2252				break;
2253			return ((void *)va);
2254		}
2255	}
2256
2257	size = roundup(size, PAGE_SIZE);
2258
2259#ifdef __powerpc64__
2260	KASSERT(pa < VM_MAPDEV_PA_MAX,
2261	    ("Unsupported physical address! %lx", pa));
2262	va = VM_MAPDEV_BASE + pa;
2263	retva = va;
2264#ifdef POW2_MAPPINGS
2265	/*
2266	 * Align the mapping to a power of 2 size, taking into account that we
2267	 * may need to increase the size multiple times to satisfy the size and
2268	 * alignment requirements.
2269	 *
2270	 * This works in the general case because it's very rare (near never?)
2271	 * to have different access properties (WIMG) within a single
2272	 * power-of-two region.  If a design does call for that, POW2_MAPPINGS
2273	 * can be undefined, and exact mappings will be used instead.
2274	 */
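	/*
	 * Round the size up to a power of two and keep doubling it until a
	 * naturally aligned window of that size covers [va, va + sz).
	 */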
2275	sz = size;
2276	size = roundup2(size, 1 << ilog2(size));
2277	while (rounddown2(va, size) + size < va + sz)
2278		size <<= 1;
2279	va = rounddown2(va, size);
2280	pa = rounddown2(pa, size);
2281#endif
2282#else
2283	/*
2284	 * The device mapping area is between VM_MAXUSER_ADDRESS and
2285	 * VM_MIN_KERNEL_ADDRESS.  This gives 1GB of device addressing.
2286	 */
2287#ifdef SPARSE_MAPDEV
2288	/*
2289	 * With a sparse mapdev, align to the largest starting region.  This
2290	 * could feasibly be optimized for a 'best-fit' alignment, but that
2291	 * calculation could be very costly.
2292	 * Align to the smaller of:
2293	 * - first set bit in overlap of (pa & size mask)
2294	 * - largest size envelope
2295	 *
2296	 * It's possible the device mapping may start at a PA that's not larger
2297	 * than the size mask, so we need to offset in to maximize the TLB entry
2298	 * range and minimize the number of used TLB entries.
2299	 */
2300	do {
2301		tmpva = tlb1_map_base;
2302		sz = ffsl((~((1 << flsl(size - 1)) - 1)) & pa);
2303		sz = sz ? min(roundup(sz + 3, 4), flsl(size) - 1) : flsl(size) - 1;
2304		va = roundup(tlb1_map_base, 1 << sz) | (((1 << sz) - 1) & pa);
2305	} while (!atomic_cmpset_int(&tlb1_map_base, tmpva, va + size));
2306#endif
2307	va = atomic_fetchadd_int(&tlb1_map_base, size);
2308	retva = va;
2309#endif
2310
2311	if (tlb1_mapin_region(va, pa, size, tlb_calc_wimg(pa, ma)) != size)
2312		return (NULL);
2313
2314	return ((void *)retva);
2315}
2316
2317/*
2318 * 'Unmap' a range mapped by mmu_booke_mapdev().
2319 */
2320static void
2321mmu_booke_unmapdev(void *p, vm_size_t size)
2322{
2323#ifdef SUPPORTS_SHRINKING_TLB1
2324	vm_offset_t base, offset, va;
2325
2326	/*
2327	 * Unmap only if this is inside kernel virtual space.
2328	 */
2329	va = (vm_offset_t)p;
2330	if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)) {
2331		base = trunc_page(va);
2332		offset = va & PAGE_MASK;
2333		size = roundup(offset + size, PAGE_SIZE);
2334		mmu_booke_qremove(base, atop(size));
2335		kva_free(base, size);
2336	}
2337#endif
2338}
2339
2340/*
2341 * mmu_booke_object_init_pt preloads the ptes for a given object into the
2342 * specified pmap. This eliminates the blast of soft faults on process startup
2343 * and immediately after an mmap.
2344 */
2345static void
2346mmu_booke_object_init_pt(pmap_t pmap, vm_offset_t addr,
2347    vm_object_t object, vm_pindex_t pindex, vm_size_t size)
2348{
2349
2350	VM_OBJECT_ASSERT_WLOCKED(object);
2351	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
2352	    ("mmu_booke_object_init_pt: non-device object"));
2353}
2354
2355/*
2356 * Perform the pmap work for mincore.
2357 */
2358static int
2359mmu_booke_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
2360{
2361
2362	/* XXX: this should be implemented at some point */
2363	return (0);
2364}
2365
2366static int
2367mmu_booke_change_attr(vm_offset_t addr, vm_size_t sz, vm_memattr_t mode)
2368{
2369	vm_offset_t va;
2370	pte_t *pte;
2371	int i, j;
2372	tlb_entry_t e;
2373
2374	addr = trunc_page(addr);
2375
2376	/* Only allow changes to mapped kernel addresses.  This includes:
2377	 * - KVA
2378	 * - DMAP (powerpc64)
2379	 * - Device mappings
2380	 */
2381	if (addr <= VM_MAXUSER_ADDRESS ||
2382#ifdef __powerpc64__
2383	    (addr >= tlb1_map_base && addr < DMAP_BASE_ADDRESS) ||
2384	    (addr > DMAP_MAX_ADDRESS && addr < VM_MIN_KERNEL_ADDRESS) ||
2385#else
2386	    (addr >= tlb1_map_base && addr < VM_MIN_KERNEL_ADDRESS) ||
2387#endif
2388	    (addr > VM_MAX_KERNEL_ADDRESS))
2389		return (EINVAL);
2390
2391	/* Check TLB1 mappings */
2392	for (i = 0; i < TLB1_ENTRIES; i++) {
2393		tlb1_read_entry(&e, i);
2394		if (!(e.mas1 & MAS1_VALID))
2395			continue;
2396		if (addr >= e.virt && addr < e.virt + e.size)
2397			break;
2398	}
2399	if (i < TLB1_ENTRIES) {
2400		/* Only allow full mappings to be modified for now. */
2401		/* Validate the range. */
2402		for (j = i, va = addr; va < addr + sz; va += e.size, j++) {
2403			tlb1_read_entry(&e, j);
2404			if (va != e.virt || (sz - (va - addr) < e.size))
2405				return (EINVAL);
2406		}
2407		for (va = addr; va < addr + sz; va += e.size, i++) {
2408			tlb1_read_entry(&e, i);
2409			e.mas2 &= ~MAS2_WIMGE_MASK;
2410			e.mas2 |= tlb_calc_wimg(e.phys, mode);
2411
2412			/*
2413			 * Write it out to the TLB.  Should really re-sync with other
2414			 * cores.
2415			 */
2416			tlb1_write_entry(&e, i);
2417		}
2418		return (0);
2419	}
2420
2421	/* Not in TLB1, try through pmap */
2422	/* First validate the range. */
2423	for (va = addr; va < addr + sz; va += PAGE_SIZE) {
2424		pte = pte_find(kernel_pmap, va);
2425		if (pte == NULL || !PTE_ISVALID(pte))
2426			return (EINVAL);
2427	}
2428
2429	mtx_lock_spin(&tlbivax_mutex);
2430	tlb_miss_lock();
2431	for (va = addr; va < addr + sz; va += PAGE_SIZE) {
2432		pte = pte_find(kernel_pmap, va);
2433		*pte &= ~(PTE_MAS2_MASK << PTE_MAS2_SHIFT);
2434		*pte |= tlb_calc_wimg(PTE_PA(pte), mode) << PTE_MAS2_SHIFT;
2435		tlb0_flush_entry(va);
2436	}
2437	tlb_miss_unlock();
2438	mtx_unlock_spin(&tlbivax_mutex);
2439
2440	return (0);
2441}
2442
2443static void
2444mmu_booke_page_array_startup(long pages)
2445{
2446	vm_page_array_size = pages;
2447}
2448
2449/**************************************************************************/
2450/* TID handling */
2451/**************************************************************************/
2452
2453/*
2454 * Allocate a TID. If necessary, steal one from someone else.
2455 * The new TID is flushed from the TLB before returning.
2456 */
2457static tlbtid_t
2458tid_alloc(pmap_t pmap)
2459{
2460	tlbtid_t tid;
2461	int thiscpu;
2462
2463	KASSERT((pmap != kernel_pmap), ("tid_alloc: kernel pmap"));
2464
2465	CTR2(KTR_PMAP, "%s: s (pmap = %p)", __func__, pmap);
2466
2467	thiscpu = PCPU_GET(cpuid);
2468
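	/* Take the next TID from this CPU's ring, wrapping back to TID_MIN. */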
2469	tid = PCPU_GET(booke.tid_next);
2470	if (tid > TID_MAX)
2471		tid = TID_MIN;
2472	PCPU_SET(booke.tid_next, tid + 1);
2473
2474	/* If we are stealing TID then clear the relevant pmap's field */
2475	if (tidbusy[thiscpu][tid] != NULL) {
2476		CTR2(KTR_PMAP, "%s: warning: stealing tid %d", __func__, tid);
2477
2478		tidbusy[thiscpu][tid]->pm_tid[thiscpu] = TID_NONE;
2479
2480		/* Flush all entries from TLB0 matching this TID. */
2481		tid_flush(tid);
2482	}
2483
2484	tidbusy[thiscpu][tid] = pmap;
2485	pmap->pm_tid[thiscpu] = tid;
2486	__asm __volatile("msync; isync");
2487
2488	CTR3(KTR_PMAP, "%s: e (%02d next = %02d)", __func__, tid,
2489	    PCPU_GET(booke.tid_next));
2490
2491	return (tid);
2492}
2493
2494/**************************************************************************/
2495/* TLB0 handling */
2496/**************************************************************************/
2497
2498/* Convert TLB0 va and way number to tlb0[] table index. */
2499static inline unsigned int
2500tlb0_tableidx(vm_offset_t va, unsigned int way)
2501{
2502	unsigned int idx;
2503
2504	idx = (way * TLB0_ENTRIES_PER_WAY);
2505	idx += (va & MAS2_TLB0_ENTRY_IDX_MASK) >> MAS2_TLB0_ENTRY_IDX_SHIFT;
2506	return (idx);
2507}
2508
2509/*
2510 * Invalidate TLB0 entry.
2511 */
2512static inline void
2513tlb0_flush_entry(vm_offset_t va)
2514{
2515
2516	CTR2(KTR_PMAP, "%s: s va=0x%08x", __func__, va);
2517
2518	mtx_assert(&tlbivax_mutex, MA_OWNED);
2519
2520	__asm __volatile("tlbivax 0, %0" :: "r"(va & MAS2_EPN_MASK));
2521	__asm __volatile("isync; msync");
2522	__asm __volatile("tlbsync; msync");
2523
2524	CTR1(KTR_PMAP, "%s: e", __func__);
2525}
2526
2527/**************************************************************************/
2528/* TLB1 handling */
2529/**************************************************************************/
2530
2531/*
2532 * TLB1 mapping notes:
2533 *
2534 * TLB1[0]	Kernel text and data.
2535 * TLB1[1-15]	Additional kernel text and data mappings (if required), PCI
2536 *		windows and other device mappings.
2537 */
2538
2539/*
2540 * Read an entry from the given TLB1 slot.
2541 */
2542void
2543tlb1_read_entry(tlb_entry_t *entry, unsigned int slot)
2544{
2545	register_t msr;
2546	uint32_t mas0;
2547
2548	KASSERT((entry != NULL), ("%s(): Entry is NULL!", __func__));
2549
2550	msr = mfmsr();
2551	__asm __volatile("wrteei 0");
2552
2553	mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(slot);
2554	mtspr(SPR_MAS0, mas0);
2555	__asm __volatile("isync; tlbre");
2556
2557	entry->mas1 = mfspr(SPR_MAS1);
2558	entry->mas2 = mfspr(SPR_MAS2);
2559	entry->mas3 = mfspr(SPR_MAS3);
2560
2561	switch ((mfpvr() >> 16) & 0xFFFF) {
2562	case FSL_E500v2:
2563	case FSL_E500mc:
2564	case FSL_E5500:
2565	case FSL_E6500:
2566		entry->mas7 = mfspr(SPR_MAS7);
2567		break;
2568	default:
2569		entry->mas7 = 0;
2570		break;
2571	}
2572	__asm __volatile("wrtee %0" :: "r"(msr));
2573
2574	entry->virt = entry->mas2 & MAS2_EPN_MASK;
2575	entry->phys = ((vm_paddr_t)(entry->mas7 & MAS7_RPN) << 32) |
2576	    (entry->mas3 & MAS3_RPN);
2577	entry->size =
2578	    tsize2size((entry->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT);
2579}
2580
2581struct tlbwrite_args {
2582	tlb_entry_t *e;
2583	unsigned int idx;
2584};
2585
2586static uint32_t
2587tlb1_find_free(void)
2588{
2589	tlb_entry_t e;
2590	int i;
2591
2592	for (i = 0; i < TLB1_ENTRIES; i++) {
2593		tlb1_read_entry(&e, i);
2594		if ((e.mas1 & MAS1_VALID) == 0)
2595			return (i);
2596	}
2597	return (-1);
2598}
2599
2600static void
2601tlb1_purge_va_range(vm_offset_t va, vm_size_t size)
2602{
2603	tlb_entry_t e;
2604	int i;
2605
2606	for (i = 0; i < TLB1_ENTRIES; i++) {
2607		tlb1_read_entry(&e, i);
2608		if ((e.mas1 & MAS1_VALID) == 0)
2609			continue;
2610		if ((e.mas2 & MAS2_EPN_MASK) >= va &&
2611		    (e.mas2 & MAS2_EPN_MASK) < va + size) {
2612			mtspr(SPR_MAS1, e.mas1 & ~MAS1_VALID);
2613			__asm __volatile("isync; tlbwe; isync; msync");
2614		}
2615	}
2616}
2617
2618static void
2619tlb1_write_entry_int(void *arg)
2620{
2621	struct tlbwrite_args *args = arg;
2622	uint32_t idx, mas0;
2623
2624	idx = args->idx;
2625	if (idx == -1) {
2626		tlb1_purge_va_range(args->e->virt, args->e->size);
2627		idx = tlb1_find_free();
2628		if (idx == -1)
2629			panic("No free TLB1 entries!\n");
2630	}
2631	/* Select entry */
2632	mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(idx);
2633
2634	mtspr(SPR_MAS0, mas0);
2635	mtspr(SPR_MAS1, args->e->mas1);
2636	mtspr(SPR_MAS2, args->e->mas2);
2637	mtspr(SPR_MAS3, args->e->mas3);
2638	switch ((mfpvr() >> 16) & 0xFFFF) {
2639	case FSL_E500mc:
2640	case FSL_E5500:
2641	case FSL_E6500:
2642		mtspr(SPR_MAS8, 0);
2643		/* FALLTHROUGH */
2644	case FSL_E500v2:
2645		mtspr(SPR_MAS7, args->e->mas7);
2646		break;
2647	default:
2648		break;
2649	}
2650
2651	__asm __volatile("isync; tlbwe; isync; msync");
2652
2653}
2654
2655static void
2656tlb1_write_entry_sync(void *arg)
2657{
2658	/* Empty synchronization point for smp_rendezvous(). */
2659}
2660
2661/*
2662 * Write given entry to TLB1 hardware.
2663 */
2664static void
2665tlb1_write_entry(tlb_entry_t *e, unsigned int idx)
2666{
2667	struct tlbwrite_args args;
2668
2669	args.e = e;
2670	args.idx = idx;
2671
2672#ifdef SMP
2673	if ((e->mas2 & _TLB_ENTRY_SHARED) && smp_started) {
2674		mb();
2675		smp_rendezvous(tlb1_write_entry_sync,
2676		    tlb1_write_entry_int,
2677		    tlb1_write_entry_sync, &args);
2678	} else
2679#endif
2680	{
2681		register_t msr;
2682
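		/*
		 * Not a shared mapping, or the other CPUs are not up yet:
		 * update only the local TLB with external interrupts disabled.
		 */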
2683		msr = mfmsr();
2684		__asm __volatile("wrteei 0");
2685		tlb1_write_entry_int(&args);
2686		__asm __volatile("wrtee %0" :: "r"(msr));
2687	}
2688}
2689
2690/*
2691 * Convert TLB TSIZE value to mapped region size.
2692 */
2693static vm_size_t
2694tsize2size(unsigned int tsize)
2695{
2696
2697	/*
2698	 * size = 4^tsize KB
2699	 * size = 4^tsize * 2^10 = 2^(2 * tsize + 10) bytes
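	 * e.g. tsize = 7: 4^7 KB = 16384 KB = 16 MB = 2^24 bytes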
2700	 */
2701
2702	return ((1 << (2 * tsize)) * 1024);
2703}
2704
2705/*
2706 * Convert region size (must be power of 4) to TLB TSIZE value.
2707 */
2708static unsigned int
2709size2tsize(vm_size_t size)
2710{
2711
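	/*
	 * Inverse of tsize2size(): size = 2^(2 * tsize + 10) bytes, so
	 * tsize = (ilog2(size) - 10) / 2 = ilog2(size) / 2 - 5.
	 */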
2712	return (ilog2(size) / 2 - 5);
2713}
2714
2715/*
2716 * Register permanent kernel mapping in TLB1.
2717 *
2718 * A free slot is located with tlb1_find_free() when the entry is
2719 * written; such entries are not supposed to be invalidated.
2720 */
2721int
2722tlb1_set_entry(vm_offset_t va, vm_paddr_t pa, vm_size_t size,
2723    uint32_t flags)
2724{
2725	tlb_entry_t e;
2726	uint32_t ts, tid;
2727	int tsize, index;
2728
2729	/* First try to update an existing entry. */
2730	for (index = 0; index < TLB1_ENTRIES; index++) {
2731		tlb1_read_entry(&e, index);
2732		/* Check if we're just updating the flags, and update them. */
2733		if (e.phys == pa && e.virt == va && e.size == size) {
2734			e.mas2 = (va & MAS2_EPN_MASK) | flags;
2735			tlb1_write_entry(&e, index);
2736			return (0);
2737		}
2738	}
2739
2740	/* Convert size to TSIZE */
2741	tsize = size2tsize(size);
2742
2743	tid = (TID_KERNEL << MAS1_TID_SHIFT) & MAS1_TID_MASK;
2744	/* XXX TS is hard-coded to 0 for now as we only use a single address space. */
2745	ts = (0 << MAS1_TS_SHIFT) & MAS1_TS_MASK;
2746
2747	e.phys = pa;
2748	e.virt = va;
2749	e.size = size;
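	/* MAS1_IPROT protects the entry from invalidation. */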
2750	e.mas1 = MAS1_VALID | MAS1_IPROT | ts | tid;
2751	e.mas1 |= ((tsize << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK);
2752	e.mas2 = (va & MAS2_EPN_MASK) | flags;
2753
2754	/* Set supervisor RWX permission bits */
2755	e.mas3 = (pa & MAS3_RPN) | MAS3_SR | MAS3_SW | MAS3_SX;
2756	e.mas7 = (pa >> 32) & MAS7_RPN;
2757
2758	tlb1_write_entry(&e, -1);
2759
2760	return (0);
2761}
2762
2763/*
2764 * Map in contiguous RAM region into the TLB1.
2765 */
2766static vm_size_t
2767tlb1_mapin_region(vm_offset_t va, vm_paddr_t pa, vm_size_t size, int wimge)
2768{
2769	vm_offset_t base;
2770	vm_size_t mapped, sz, ssize;
2771
2772	mapped = 0;
2773	base = va;
2774	ssize = size;
2775
2776	while (size > 0) {
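		/*
		 * Pick the largest power-of-4 chunk not larger than the
		 * remaining size (TSIZE encodes 4^n KB pages).
		 */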
2777		sz = 1UL << (ilog2(size) & ~1);
2778		/* Align size to PA */
2779		if (pa % sz != 0) {
2780			do {
2781				sz >>= 2;
2782			} while (pa % sz != 0);
2783		}
2784		/* Now align from there to VA */
2785		if (va % sz != 0) {
2786			do {
2787				sz >>= 2;
2788			} while (va % sz != 0);
2789		}
2790#ifdef __powerpc64__
2791		/*
2792		 * Clamp TLB1 entries to 4G.
2793		 *
2794		 * While the e6500 supports up to 1TB mappings, the e5500
2795		 * only supports up to 4G mappings. (0b1011)
2796		 *
2797		 * If any e6500 machines capable of supporting a very
2798		 * large amount of memory appear in the future, we can
2799		 * revisit this.
2800		 *
2801		 * For now, though, since we have plenty of space in TLB1,
2802		 * always avoid creating entries larger than 4GB.
2803		 */
2804		sz = MIN(sz, 1UL << 32);
2805#endif
2806		if (bootverbose)
2807			printf("Wiring VA=%p to PA=%jx (size=%lx)\n",
2808			    (void *)va, (uintmax_t)pa, (long)sz);
2809		if (tlb1_set_entry(va, pa, sz,
2810		    _TLB_ENTRY_SHARED | wimge) < 0)
2811			return (mapped);
2812		size -= sz;
2813		pa += sz;
2814		va += sz;
2815	}
2816
2817	mapped = (va - base);
2818	if (bootverbose)
2819		printf("mapped size 0x%"PRIxPTR" (wasted space 0x%"PRIxPTR")\n",
2820		    mapped, mapped - ssize);
2821
2822	return (mapped);
2823}
2824
2825/*
2826 * TLB1 initialization routine, to be called after the very first
2827 * assembler level setup done in locore.S.
2828 */
2829void
2830tlb1_init(void)
2831{
2832	vm_offset_t mas2;
2833	uint32_t mas0, mas1, mas3, mas7;
2834	uint32_t tsz;
2835
2836	tlb1_get_tlbconf();
2837
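	/*
	 * Read back TLB1 entry 0, set up in locore.S for the kernel image,
	 * to recover the kernel's physical load address, start and size.
	 */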
2838	mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(0);
2839	mtspr(SPR_MAS0, mas0);
2840	__asm __volatile("isync; tlbre");
2841
2842	mas1 = mfspr(SPR_MAS1);
2843	mas2 = mfspr(SPR_MAS2);
2844	mas3 = mfspr(SPR_MAS3);
2845	mas7 = mfspr(SPR_MAS7);
2846
2847	kernload =  ((vm_paddr_t)(mas7 & MAS7_RPN) << 32) |
2848	    (mas3 & MAS3_RPN);
2849
2850	tsz = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
2851	kernsize += (tsz > 0) ? tsize2size(tsz) : 0;
2852	kernstart = trunc_page(mas2);
2853
2854	/* Setup TLB miss defaults */
2855	set_mas4_defaults();
2856}
2857
2858/*
2859 * pmap_early_io_unmap() should be paired closely with
2860 * pmap_early_io_map(), as in the following snippet:
2861 *
2862 * x = pmap_early_io_map(...);
2863 * <do something with x>
2864 * pmap_early_io_unmap(x, size);
2865 *
2866 * with no further allocations made in between.
2867 */
2868void
2869pmap_early_io_unmap(vm_offset_t va, vm_size_t size)
2870{
2871	int i;
2872	tlb_entry_t e;
2873	vm_size_t isize;
2874
2875	size = roundup(size, PAGE_SIZE);
2876	isize = size;
2877	for (i = 0; i < TLB1_ENTRIES && size > 0; i++) {
2878		tlb1_read_entry(&e, i);
2879		if (!(e.mas1 & MAS1_VALID))
2880			continue;
2881		if (va <= e.virt && (va + isize) >= (e.virt + e.size)) {
2882			size -= e.size;
2883			e.mas1 &= ~MAS1_VALID;
2884			tlb1_write_entry(&e, i);
2885		}
2886	}
2887	if (tlb1_map_base == va + isize)
2888		tlb1_map_base -= isize;
2889}
2890
2891vm_offset_t
2892pmap_early_io_map(vm_paddr_t pa, vm_size_t size)
2893{
2894	vm_paddr_t pa_base;
2895	vm_offset_t va, sz;
2896	int i;
2897	tlb_entry_t e;
2898
2899	KASSERT(!pmap_bootstrapped, ("Do not use after PMAP is up!"));
2900
2901	for (i = 0; i < TLB1_ENTRIES; i++) {
2902		tlb1_read_entry(&e, i);
2903		if (!(e.mas1 & MAS1_VALID))
2904			continue;
2905		if (pa >= e.phys && (pa + size) <=
2906		    (e.phys + e.size))
2907			return (e.virt + (pa - e.phys));
2908	}
2909
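	/*
	 * Not covered by an existing entry: create new power-of-4 sized
	 * entries starting at tlb1_map_base.
	 */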
2910	pa_base = rounddown(pa, PAGE_SIZE);
2911	size = roundup(size + (pa - pa_base), PAGE_SIZE);
2912	tlb1_map_base = roundup2(tlb1_map_base, 1 << (ilog2(size) & ~1));
2913	va = tlb1_map_base + (pa - pa_base);
2914
2915	do {
2916		sz = 1 << (ilog2(size) & ~1);
2917		tlb1_set_entry(tlb1_map_base, pa_base, sz,
2918		    _TLB_ENTRY_SHARED | _TLB_ENTRY_IO);
2919		size -= sz;
2920		pa_base += sz;
2921		tlb1_map_base += sz;
2922	} while (size > 0);
2923
2924	return (va);
2925}
2926
2927void
2928pmap_track_page(pmap_t pmap, vm_offset_t va)
2929{
2930	vm_paddr_t pa;
2931	vm_page_t page;
2932	struct pv_entry *pve;
2933
2934	va = trunc_page(va);
2935	pa = pmap_kextract(va);
2936	page = PHYS_TO_VM_PAGE(pa);
2937
2938	rw_wlock(&pvh_global_lock);
2939	PMAP_LOCK(pmap);
2940
2941	TAILQ_FOREACH(pve, &page->md.pv_list, pv_link) {
2942		if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) {
2943			goto out;
2944		}
2945	}
2946	page->md.pv_tracked = true;
2947	pv_insert(pmap, va, page);
2948out:
2949	PMAP_UNLOCK(pmap);
2950	rw_wunlock(&pvh_global_lock);
2951}
2952
2953/*
2954 * Setup MAS4 defaults.
2955 * These values are loaded to MAS0-2 on a TLB miss.
2956 */
2957static void
2958set_mas4_defaults(void)
2959{
2960	uint32_t mas4;
2961
2962	/* Defaults: TLB0, PID0, TSIZED=4K */
2963	mas4 = MAS4_TLBSELD0;
2964	mas4 |= (TLB_SIZE_4K << MAS4_TSIZED_SHIFT) & MAS4_TSIZED_MASK;
2965#ifdef SMP
2966	mas4 |= MAS4_MD;
2967#endif
2968	mtspr(SPR_MAS4, mas4);
2969	__asm __volatile("isync");
2970}
2971
2972/*
2973 * Return 0 if the physical IO range is encompassed by one of the
2974 * TLB1 entries, otherwise return the related error code.
2975 */
2976static int
2977tlb1_iomapped(int i, vm_paddr_t pa, vm_size_t size, vm_offset_t *va)
2978{
2979	uint32_t prot;
2980	vm_paddr_t pa_start;
2981	vm_paddr_t pa_end;
2982	unsigned int entry_tsize;
2983	vm_size_t entry_size;
2984	tlb_entry_t e;
2985
2986	*va = 0;
2987
2988	tlb1_read_entry(&e, i);
2989	/* Skip invalid entries */
2990	if (!(e.mas1 & MAS1_VALID))
2991		return (EINVAL);
2992
2993	/*
2994	 * The entry must be cache-inhibited, guarded, and r/w
2995	 * so it can function as an i/o page
2996	 */
2997	prot = e.mas2 & (MAS2_I | MAS2_G);
2998	if (prot != (MAS2_I | MAS2_G))
2999		return (EPERM);
3000
3001	prot = e.mas3 & (MAS3_SR | MAS3_SW);
3002	if (prot != (MAS3_SR | MAS3_SW))
3003		return (EPERM);
3004
3005	/* The address should be within the entry range. */
3006	entry_tsize = (e.mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
3007	KASSERT((entry_tsize), ("tlb1_iomapped: invalid entry tsize"));
3008
3009	entry_size = tsize2size(entry_tsize);
3010	pa_start = (((vm_paddr_t)e.mas7 & MAS7_RPN) << 32) |
3011	    (e.mas3 & MAS3_RPN);
3012	pa_end = pa_start + entry_size;
3013
3014	if ((pa < pa_start) || ((pa + size) > pa_end))
3015		return (ERANGE);
3016
3017	/* Return virtual address of this mapping. */
3018	*va = (e.mas2 & MAS2_EPN_MASK) + (pa - pa_start);
3019	return (0);
3020}
3021
3022#ifdef DDB
3023/* Print out contents of the MAS registers for each TLB0 entry */
3024static void
3025#ifdef __powerpc64__
3026tlb_print_entry(int i, uint32_t mas1, uint64_t mas2, uint32_t mas3,
3027#else
3028tlb_print_entry(int i, uint32_t mas1, uint32_t mas2, uint32_t mas3,
3029#endif
3030    uint32_t mas7)
3031{
3032	int as;
3033	char desc[3];
3034	tlbtid_t tid;
3035	vm_size_t size;
3036	unsigned int tsize;
3037
3038	desc[2] = '\0';
3039	if (mas1 & MAS1_VALID)
3040		desc[0] = 'V';
3041	else
3042		desc[0] = ' ';
3043
3044	if (mas1 & MAS1_IPROT)
3045		desc[1] = 'P';
3046	else
3047		desc[1] = ' ';
3048
3049	as = (mas1 & MAS1_TS_MASK) ? 1 : 0;
3050	tid = MAS1_GETTID(mas1);
3051
3052	tsize = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
3053	size = 0;
3054	if (tsize)
3055		size = tsize2size(tsize);
3056
3057	printf("%3d: (%s) [AS=%d] "
3058	    "sz = 0x%jx tsz = %d tid = %d mas1 = 0x%08x "
3059	    "mas2(va) = 0x%"PRI0ptrX" mas3(pa) = 0x%08x mas7 = 0x%08x\n",
3060	    i, desc, as, (uintmax_t)size, tsize, tid, mas1, mas2, mas3, mas7);
3061}
3062
3063DB_SHOW_COMMAND(tlb0, tlb0_print_tlbentries)
3064{
3065	uint32_t mas0, mas1, mas3, mas7;
3066#ifdef __powerpc64__
3067	uint64_t mas2;
3068#else
3069	uint32_t mas2;
3070#endif
3071	int entryidx, way, idx;
3072
3073	printf("TLB0 entries:\n");
3074	for (way = 0; way < TLB0_WAYS; way++)
3075		for (entryidx = 0; entryidx < TLB0_ENTRIES_PER_WAY; entryidx++) {
3076			mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way);
3077			mtspr(SPR_MAS0, mas0);
3078
3079			mas2 = entryidx << MAS2_TLB0_ENTRY_IDX_SHIFT;
3080			mtspr(SPR_MAS2, mas2);
3081
3082			__asm __volatile("isync; tlbre");
3083
3084			mas1 = mfspr(SPR_MAS1);
3085			mas2 = mfspr(SPR_MAS2);
3086			mas3 = mfspr(SPR_MAS3);
3087			mas7 = mfspr(SPR_MAS7);
3088
3089			idx = tlb0_tableidx(mas2, way);
3090			tlb_print_entry(idx, mas1, mas2, mas3, mas7);
3091		}
3092}
3093
3094/*
3095 * Print out contents of the MAS registers for each TLB1 entry
3096 */
3097DB_SHOW_COMMAND(tlb1, tlb1_print_tlbentries)
3098{
3099	uint32_t mas0, mas1, mas3, mas7;
3100#ifdef __powerpc64__
3101	uint64_t mas2;
3102#else
3103	uint32_t mas2;
3104#endif
3105	int i;
3106
3107	printf("TLB1 entries:\n");
3108	for (i = 0; i < TLB1_ENTRIES; i++) {
3109		mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i);
3110		mtspr(SPR_MAS0, mas0);
3111
3112		__asm __volatile("isync; tlbre");
3113
3114		mas1 = mfspr(SPR_MAS1);
3115		mas2 = mfspr(SPR_MAS2);
3116		mas3 = mfspr(SPR_MAS3);
3117		mas7 = mfspr(SPR_MAS7);
3118
3119		tlb_print_entry(i, mas1, mas2, mas3, mas7);
3120	}
3121}
3122#endif
3123