/*	$NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $	*/

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>

#include <machine/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#undef	XENDEBUG
/* #define XENDEBUG_SYNC */
/* #define	XENDEBUG_LOW */

#ifdef XENDEBUG
#define	XENPRINTF(x) printf x
#define	XENPRINTK(x) printk x
#define	XENPRINTK2(x) /* printk x */

static char XBUF[256];
#else
#define	XENPRINTF(x)
#define	XENPRINTK(x)
#define	XENPRINTK2(x)
#endif
#define	PRINTF(x) printf x
#define	PRINTK(x) printk x

/* On x86_64 the kernel runs in ring 3, so kernel mappings need PG_u set. */
#ifdef __x86_64__
#define PG_k PG_u
#else
#define PG_k 0
#endif

volatile shared_info_t *HYPERVISOR_shared_info;
/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);
unsigned long *xpmap_phys_to_machine_mapping;

void xen_failsafe_handler(void);

#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)

void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}


void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		XENPRINTF(("xen_set_ldt %#" PRIxVADDR " %d %p\n",
		    base, entries, ptp));
		pmap_pte_clearbits(ptp, PG_RW);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	splx(s);
}

#ifdef XENDEBUG
void xpq_debug_dump(void);
#endif

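/*
 * MMU update queue: under Xen the page tables are read-only to the guest,
 * so page-table writes are not done directly but queued as mmu_update_t
 * entries and handed to the hypervisor in batches via
 * HYPERVISOR_mmu_update_self(), which amortizes the hypercall cost.
 * Each entry carries a machine address in 'ptr' (with the command encoded
 * in its low bits, e.g. MMU_NORMAL_PT_UPDATE) and the new value in 'val'.
 * An illustrative caller (parameter names are hypothetical) typically does:
 *
 *	xpq_queue_pte_update(ma, npte);
 *	...
 *	xpq_flush_queue();
 *
 * The queue is also flushed automatically when it fills up.
 */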
#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue[XPQUEUE_SIZE];
static int xpq_idx = 0;

void
xpq_flush_queue(void)
{
	int i, ok;

	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
	for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: 0x%08" PRIx64 " 0x%08" PRIx64 "\n", i,
		    xpq_queue[i].ptr, xpq_queue[i].val));
	if (xpq_idx != 0 &&
	    HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok) < 0) {
		printf("xpq_flush_queue: %d entries\n", xpq_idx);
		for (i = 0; i < xpq_idx; i++)
			printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
			   xpq_queue[i].ptr, xpq_queue[i].val);
		panic("HYPERVISOR_mmu_update failed\n");
	}
	xpq_idx = 0;
}

static inline void
xpq_increment_idx(void)
{

	xpq_idx++;
	if (__predict_false(xpq_idx == XPQUEUE_SIZE))
		xpq_flush_queue();
}

void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{
	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
	    "\n", (int64_t)ma, (int64_t)pa));
	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{

	KASSERT((ptr & 3) == 0);
	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}
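
/*
 * A note on the two update flavours queued above (a sketch of the Xen
 * interface as used here, not an exhaustive description):
 * MMU_NORMAL_PT_UPDATE writes 'val' into the page-table entry whose
 * machine address is 'ptr', while MMU_MACHPHYS_UPDATE maintains the
 * hypervisor's machine-to-physical table, so that xpmap_mtop() keeps
 * returning the right pseudo-physical frame for a machine frame.
 */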

void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pt_switch");
}

void
xpq_queue_pin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

#if defined(__x86_64__)
	op.cmd = MMUEXT_PIN_L4_TABLE;
#else
	op.cmd = MMUEXT_PIN_L2_TABLE;
#endif
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_table");
}
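
/*
 * A note on pinning (MMUEXT_PIN_Ln_TABLE): this asks Xen to validate the
 * frame as a page table of the given level and to take a type reference
 * on it.  Validation only succeeds if the guest holds no writable mapping
 * of the frame, which is why the bootstrap code below maps every
 * page-table page read-only before pinning.  Once pinned, the tables may
 * only be changed through mmu_update hypercalls.
 */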

#ifdef PAE
static void
xpq_queue_pin_l3_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_l3_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

	op.cmd = MMUEXT_PIN_L3_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_l3_table");
}
#endif

void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_unpin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = MMUEXT_UNPIN_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_unpin_table");
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_set_ldt");
}

void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_tlb_flush");
}

void
xpq_flush_cache(void)
{
	struct mmuext_op op;
	int s = splvm();
	xpq_flush_queue();

	XENPRINTK2(("xpq_flush_cache\n"));
	op.cmd = MMUEXT_FLUSH_CACHE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_flush_cache");
	splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_invlpg %#" PRIxVADDR "\n", va));
	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_invlpg");
}

int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t op;
	int ok;
	xpq_flush_queue();

	op.ptr = ptr;
	op.val = val;
	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
		return EFAULT;
	return (0);
}
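
/*
 * xpq_update_foreign() performs a single, unbatched mmu_update with an
 * explicit target domain.  A privileged domain (e.g. dom0 running a
 * backend driver) can use this to install mappings of frames owned by
 * another domain, something the DOMID_SELF queue above cannot express.
 */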

#ifdef XENDEBUG
void
xpq_debug_dump(void)
{
	int i;

	XENPRINTK2(("idx: %d\n", xpq_idx));
	for (i = 0; i < xpq_idx; i++) {
		snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64,
		    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
	}
}
#endif


extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly (vaddr_t);
static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);

/* How many PDEs? */
#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif
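
/*
 * A rough sizing illustration (using constants from pmap.h, not defined
 * here): each L2 entry maps NBPD_L2 bytes of virtual space (4MB on plain
 * i386, 2MB with PAE or on amd64), so NKL2_KIMG_ENTRIES + 1 entries are
 * enough for the kernel image plus one entry of slack; the count is
 * doubled when the image is also visible through the KERNBASE slots
 * (L2_SLOT_KERNBASE > 0).
 */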

/*
 * Construct and switch to new page tables.
 * Returns first_avail, the first vaddr we can use after
 * we get rid of the Xen bootstrap tables.
 */

vaddr_t xen_pmap_bootstrap (void);

/*
 * Function to get rid of the Xen bootstrap tables.
 */

/* How many PDPs do we need? */
#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
 * all of them mapped by the L3 page.  We also need a shadow page
 * for L3[3].
 */
static const int l2_4_count = 6;
#else
static const int l2_4_count = PTP_LEVELS - 1;
#endif

vaddr_t
xen_pmap_bootstrap(void)
{
	int count, oldcount;
	long mapsize;
	vaddr_t bootstrap_tables, init_tables;

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;
	init_tables = xen_start_info.pt_base;
	__PRINTK(("xen_pmap_bootstrap init_tables=0x%lx\n", init_tables));

	/* Space after the Xen bootstrap tables should be free */
	bootstrap_tables = xen_start_info.pt_base +
		(xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need:
	 * first, everything mapped before the Xen bootstrap tables
	 */
	mapsize = init_tables - KERNTEXTOFF;
	/* after the tables we'll have:
	 *  - UAREA
	 *  - dummy user PGD (x86_64)
	 *  - HYPERVISOR_shared_info
	 *  - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * NBPG;
#ifdef __x86_64__
	mapsize += NBPG;
#endif
	mapsize += NBPG;

#ifdef DOM0OPS
	if (xendomain_is_dom0()) {
		/* space for ISA I/O mem */
		mapsize += IOM_SIZE;
	}
#endif
	/* at this point mapsize doesn't include the table size */

#ifdef __x86_64__
	count = TABLE_L2_ENTRIES;
#else
	count = (mapsize + (NBPD_L2 - 1)) >> L2_SHIFT;
#endif /* __x86_64__ */

	/* now compute how many L2 pages we need exactly */
	XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
	while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
	    ((long)count << L2_SHIFT) + KERNBASE) {
		count++;
	}
#ifndef __x86_64__
	/*
	 * One more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here.  It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	count++;
	nkptp[1] = count;
#endif

	/*
	 * Install the bootstrap page tables.  They may need more L2 pages
	 * than the final tables will, since they are installed above (after)
	 * the final tables.
	 */
	oldcount = count;

bootstrap_again:
	XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables up if necessary.
	 */
	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
		bootstrap_tables = init_tables +
					((count + l2_4_count) * PAGE_SIZE);
	/* make sure we have enough to map the bootstrap_tables */
	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
	    ((long)oldcount << L2_SHIFT) + KERNBASE) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
		xen_start_info.nr_pt_frames, oldcount, 0);

	/* Create final tables */
	xen_bootstrap_tables(bootstrap_tables, init_tables,
	    oldcount + l2_4_count, count, 1);

	/* zero out free space after tables */
	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * NBPG);
	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
}


/*
 * Build a new set of page tables and switch to them.
 * old_count is the number of old tables (including the PGD, PDTPE and PDE);
 * new_count is the number of new tables (PTEs only).
 * We assume the areas don't overlap.
 */


static void
xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
	int old_count, int new_count, int final)
{
	pd_entry_t *pdtpe, *pde, *pte;
	pd_entry_t *cur_pgd, *bt_pgd;
	paddr_t addr;
	vaddr_t page, avail, text_end, map_end;
	int i;
	extern char __data_start;

	__PRINTK(("xen_bootstrap_tables(%#" PRIxVADDR ", %#" PRIxVADDR ","
	    " %d, %d)\n",
	    old_pgd, new_pgd, old_count, new_count));
	text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
	/*
	 * Size of the R/W area after the kernel text:
	 *  xencons_interface (if present)
	 *  xenstore_interface (if present)
	 *  table pages (new_count + l2_4_count entries)
	 * extra mappings (only when final is true):
	 *  UAREA
	 *  dummy user PGD (x86_64 only)/gdt page (i386 only)
	 *  HYPERVISOR_shared_info
	 *  ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
	if (final) {
		map_end += (UPAGES + 1) * NBPG;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += NBPG;
	}
	/*
	 * We always set atdevbase, as it's used by init386 to find the first
	 * available VA.  map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final)
		atdevbase = map_end;
#ifdef DOM0OPS
	if (final && xendomain_is_dom0()) {
		/* ISA I/O mem */
		map_end += IOM_SIZE;
	}
#endif /* DOM0OPS */

	__PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
	    text_end, map_end));
	__PRINTK(("console %#lx ", xen_start_info.console_mfn));
	__PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));

	/*
	 * Create the bootstrap page tables.  What we need:
	 * - a PGD (level 4)
	 * - a PDTPE (level 3)
	 * - a PDE (level 2)
	 * - some PTEs (level 1)
	 */
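
	/*
	 * Illustrative note (amd64 naming, following the macros used below):
	 * a virtual address is resolved through the four levels as
	 *
	 *	bt_pgd[pl4_pi(va)] -> pdtpe[pl3_pi(va)] ->
	 *	    pde[pl2_pi(va)] -> pte[pl1_pi(va)]
	 *
	 * so one page is carved out of 'avail' for each level and linked
	 * into its parent; on i386 the upper levels collapse onto bt_pgd.
	 */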

	cur_pgd = (pd_entry_t *) old_pgd;
	bt_pgd = (pd_entry_t *) new_pgd;
	memset (bt_pgd, 0, PAGE_SIZE);
	avail = new_pgd + PAGE_SIZE;
#if PTP_LEVELS > 3
	/* Install level 3 */
	pdtpe = (pd_entry_t *) avail;
	memset (pdtpe, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pdtpe) - KERNBASE;
	bt_pgd[pl4_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;

	__PRINTK(("L3 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L4[%#x]\n",
	    pdtpe, addr, bt_pgd[pl4_pi(KERNTEXTOFF)], pl4_pi(KERNTEXTOFF)));
#else
	pdtpe = bt_pgd;
#endif /* PTP_LEVELS > 3 */

#if PTP_LEVELS > 2
	/* Level 2 */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pde) - KERNBASE;
	pdtpe[pl3_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L3[%#x]\n",
	    pde, addr, pdtpe[pl3_pi(KERNTEXTOFF)], pl3_pi(KERNTEXTOFF)));
#elif defined(PAE)
	/* our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow) */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;
	addr = ((u_long) pde) - KERNBASE;
	/*
	 * Enter the L2 pages in the L3.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries; it'll add
		 * them itself.
		 */
		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
		    " -> L3[%#x]\n",
		    (vaddr_t)pde + PAGE_SIZE * i, addr, pdtpe[i], i));
	}
	addr += PAGE_SIZE;
	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L3[%#x]\n",
	    (vaddr_t)pde + PAGE_SIZE * 4, addr, pdtpe[3], 3));

#else /* PAE */
	pde = bt_pgd;
#endif /* PTP_LEVELS > 2 */

	/* Level 1 */
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *) avail;
		avail += PAGE_SIZE;

		memset(pte, 0, PAGE_SIZE);
		while (pl2_pi(page) == pl2_pi (cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
				__PRINTK(("HYPERVISOR_shared_info "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    HYPERVISOR_shared_info, pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console.domU.mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xencons_interface "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    xencons_interface, pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xenstore_interface "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    xenstore_interface, pte[pl1_pi(page)]));
			}
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
			}
#endif
			pte[pl1_pi(page)] |= PG_k | PG_V;
			if (page < text_end) {
				/* map kernel text RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= old_pgd
			    && page < old_pgd + (old_count * PAGE_SIZE)) {
				/* map old page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
				/* map new page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else {
				/* map page RW */
				pte[pl1_pi(page)] |= PG_RW;
			}

			if ((page >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE))
			    || page >= new_pgd) {
				__PRINTK(("va %#lx pa %#lx "
				    "entry 0x%" PRIxPADDR " -> L1[%#x]\n",
				    page, page - KERNBASE,
				    pte[pl1_pi(page)], pl1_pi(page)));
			}
			page += PAGE_SIZE;
		}

		addr = ((u_long) pte) - KERNBASE;
		pde[pl2_pi(cur_page)] =
		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
		__PRINTK(("L1 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
		    " -> L2[%#x]\n",
		    pte, addr, pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t) pte);
	}

	/* Install recursive page tables mapping */
#ifdef PAE
	/*
	 * We need a shadow page for the kernel's L2 page.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
	pmap_kl2pd = &pde[L2_SLOT_KERN + NPDPG];
	pmap_kl2paddr = (u_long)pmap_kl2pd - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD. Instead,
	 * we enter the first 4 L2 pages, which include the kernel's L2
	 * shadow. But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTEs with the wrong type.
	 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("pde[%d] va %#" PRIxVADDR " pa %#" PRIxPADDR
		    " entry %#" PRIxPADDR "\n",
		    (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i,
		    addr, pde[PDIR_SLOT_PTE + i]));
	}
#if 0
	addr += PAGE_SIZE; /* point to shadow L2 */
	pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
	    (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
	    (int64_t)pde[PDIR_SLOT_PTE + 3]));
#endif
	/* Mark tables RO, and pin the kernel's shadow as L2 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
		if (i == 2 || i == 3)
			continue;
#if 0
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
	}
	if (final) {
		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
		__PRINTK(("pin L2 %d addr %#" PRIxPADDR "\n", 2, addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
	}
#if 0
	addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
	__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
	xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
#else /* PAE */
	/* recursive entry in higher-level PD */
	bt_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked(new_pgd - KERNBASE) | PG_k | PG_V;
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] va %#" PRIxVADDR " pa %#" PRIxPADDR
	    " entry %#" PRIxPADDR "\n", new_pgd, (paddr_t)new_pgd - KERNBASE,
	    bt_pgd[PDIR_SLOT_PTE]));
	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t) pde);
#endif
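	/*
	 * For reference: the PDIR_SLOT_PTE entry above is the classic
	 * self-referencing ("recursive") page-directory trick: pointing one
	 * top-level slot back at the directory itself makes every page-table
	 * page visible at a fixed virtual window, which the pmap uses to
	 * read and update PTEs.  Under Xen the entry is installed without
	 * PG_RW, since writable mappings of page-table pages are refused.
	 */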
#if PTP_LEVELS > 2 || defined(PAE)
	xen_bt_set_readonly((vaddr_t) pdtpe);
#endif
#if PTP_LEVELS > 3
	xen_bt_set_readonly(new_pgd);
#endif
	/* Pin the PGD */
	__PRINTK(("pin PGD\n"));
#ifdef PAE
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif
#ifdef __i386__
	/* Save phys. addr of PDP, for libkvm. */
	PDPpaddr = (long)pde - KERNBASE;
#ifdef PAE
	/* also save the address of the L3 page */
	pmap_l3pd = pdtpe;
	pmap_l3paddr = (new_pgd - KERNBASE);
#endif /* PAE */
#endif /* i386 */
	/* Switch to new tables */
	__PRINTK(("switch to PGD\n"));
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry %#" PRIxPADDR "\n",
	    bt_pgd[PDIR_SLOT_PTE]));
#ifdef PAE
	if (final) {
		/* now enter the kernel's PTE mappings */
		addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PG_k | PG_V);
		xpq_flush_queue();
	}
#endif

	/* Now we can safely reclaim space taken by old tables */

	__PRINTK(("unpin old PGD\n"));
	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
	/* Mark old tables RW */
	page = old_pgd;
	addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
	addr = xpmap_mtop(addr);
	pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	__PRINTK(("*pde %#" PRIxPADDR " addr %#" PRIxPADDR " pte %#lx\n",
	    pde[pl2_pi(page)], addr, (long)pte));
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long) pte) - KERNBASE);
		XENPRINTK(("addr %#" PRIxPADDR " pte %#lx "
		   "*pte %#" PRIxPADDR "\n",
		   addr, (long)pte, *pte));
		xpq_queue_pte_update(addr, *pte | PG_RW);
		page += PAGE_SIZE;
		/*
		 * Our ptes are contiguous
		 * so it's safe to just "++" here
		 */
		pte++;
	}
	xpq_flush_queue();
}


/*
 * Bootstrap helper functions
 */

/*
 * Mark a page readonly
 * XXX: assuming vaddr = paddr + KERNBASE
 */

static void
xen_bt_set_readonly (vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PG_k | PG_V;

	HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
}
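
/*
 * A brief note on the helper above: HYPERVISOR_update_va_mapping() changes
 * the PTE that maps 'page' in the current address space and, with
 * UVMF_INVLPG, flushes just that TLB entry.  The rebuilt entry carries
 * only PG_k | PG_V (no PG_RW), which is what allows Xen to later validate
 * and pin these frames as page tables.
 */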

#ifdef __x86_64__
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;
	int s = splvm();

	xpq_flush_queue();
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = xpmap_phys_to_machine_mapping[page >> PAGE_SHIFT];
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
			" directory %#" PRIxPADDR, page);
	splx(s);
}
#endif /* __x86_64__ */
