/*	$NetBSD: x86_xpmap.c,v 1.8 2008/04/14 13:38:03 cegger Exp $	*/

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.8 2008/04/14 13:38:03 cegger Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>

#include <machine/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#undef	XENDEBUG
/* #define XENDEBUG_SYNC */
/* #define	XENDEBUG_LOW */

#ifdef XENDEBUG
#define	XENPRINTF(x) printf x
#define	XENPRINTK(x) printk x
#define	XENPRINTK2(x) /* printk x */

static char XBUF[256];
#else
#define	XENPRINTF(x)
#define	XENPRINTK(x)
#define	XENPRINTK2(x)
#endif
#define	PRINTF(x) printf x
#define	PRINTK(x) printk x

/* on x86_64 the kernel runs in ring 3 */
#ifdef __x86_64__
#define PG_k PG_u
#else
#define PG_k 0
#endif
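/*
 * Because the amd64 PV kernel runs in ring 3, its mappings must be usable
 * at user privilege, so PG_k expands to PG_u there and to 0 on i386.
 */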

volatile shared_info_t *HYPERVISOR_shared_info;
union start_info_union start_info_union;
unsigned long *xpmap_phys_to_machine_mapping;

void xen_failsafe_handler(void);

#ifdef XEN3
#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)
#else
#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count))
#endif

void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}


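/*
 * Point the vcpu's LDT at a range of kernel VA.  The pages backing the LDT
 * must not be writable by the guest, so PG_RW is cleared on each of them
 * before the MMUEXT_SET_LDT operation is queued.
 */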
void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		XENPRINTF(("xen_set_ldt %p %d %p\n", (void *)base,
			      entries, ptp));
		pmap_pte_clearbits(ptp, PG_RW);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	xpq_flush_queue();
	splx(s);
}

#ifdef XENDEBUG
void xpq_debug_dump(void);
#endif

#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue[XPQUEUE_SIZE];
static int xpq_idx = 0;

void
xpq_flush_queue(void)
{
	int i, ok;

	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
	for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: %" PRIx64 " %08" PRIx64 "\n", i,
		    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val));
	if (xpq_idx != 0 &&
	    HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok) < 0) {
		printf("xpq_flush_queue: %d entries\n", xpq_idx);
		for (i = 0; i < xpq_idx; i++)
			printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
			   (uint64_t)xpq_queue[i].ptr,
			   (uint64_t)xpq_queue[i].val);
		panic("HYPERVISOR_mmu_update failed\n");
	}
	xpq_idx = 0;
}

static inline void
xpq_increment_idx(void)
{

	xpq_idx++;
	if (__predict_false(xpq_idx == XPQUEUE_SIZE))
		xpq_flush_queue();
}

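/*
 * The xpq_queue_* functions below accumulate MMU requests in xpq_queue and
 * hand them to the hypervisor in one batch.  A typical caller (sketch, not
 * code from this file) queues a few updates at raised spl and flushes once:
 *
 *	int s = splvm();
 *	xpq_queue_pte_update(ma, npte);	(machine address of the PTE, new value)
 *	xpq_queue_invlpg(va);		(discard the stale TLB entry)
 *	xpq_flush_queue();
 *	splx(s);
 *
 * The queue is also flushed automatically by xpq_increment_idx() when it
 * fills up.
 */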
void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{
	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
	    "\n", (int64_t)ma, (int64_t)pa));
	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{

	KASSERT((ptr & 3) == 0);
	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

#ifdef XEN3
void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pt_switch");
}

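/*
 * Pinning asks Xen to validate a frame as a page table of the given level
 * and keep it typed that way; the top-level table has to be pinned before
 * it can be installed with MMUEXT_NEW_BASEPTR.
 */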
void
xpq_queue_pin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

#if defined(__x86_64__)
	op.cmd = MMUEXT_PIN_L4_TABLE;
#else
	op.cmd = MMUEXT_PIN_L2_TABLE;
#endif
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_table");
}

#ifdef PAE
static void
xpq_queue_pin_l3_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_l3_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

	op.cmd = MMUEXT_PIN_L3_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_l3_table");
}
#endif

void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_unpin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = MMUEXT_UNPIN_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_unpin_table");
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_set_ldt");
}

void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_tlb_flush");
}

void
xpq_flush_cache(void)
{
	struct mmuext_op op;
	int s = splvm();
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_flush_cache\n"));
	op.cmd = MMUEXT_FLUSH_CACHE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_flush_cache");
	splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_invlpg");
}

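/*
 * Immediate, unbatched update of a PTE on behalf of another domain (the
 * privileged-domain path, e.g. privcmd mappings); the local queue is
 * flushed first so ordering with our own pending updates is preserved.
 */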
int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t op;
	int ok;
	xpq_flush_queue();

	op.ptr = ptr;
	op.val = val;
	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
		return EFAULT;
	return (0);
}
#else /* XEN3 */
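/*
 * Pre-Xen3 interface: there is no separate mmuext_op hypercall, so the
 * extended commands are carried through the normal update queue with
 * MMU_EXTENDED_COMMAND encoded in ptr and the command in val.
 */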
void
xpq_queue_pt_switch(paddr_t pa)
{

	XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa));
	xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_NEW_BASEPTR;
	xpq_increment_idx();
}

void
xpq_queue_pin_table(paddr_t pa)
{

	XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa));
	xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_PIN_L2_TABLE;
	xpq_increment_idx();
}

void
xpq_queue_unpin_table(paddr_t pa)
{

	XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa));
	xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_UNPIN_TABLE;
	xpq_increment_idx();
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND | va;
	xpq_queue[xpq_idx].val = MMUEXT_SET_LDT | (entries << MMUEXT_CMD_SHIFT);
	xpq_increment_idx();
}

void
xpq_queue_tlb_flush(void)
{

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_TLB_FLUSH;
	xpq_increment_idx();
}

void
xpq_flush_cache(void)
{
	int s = splvm();

	XENPRINTK2(("xpq_queue_flush_cache\n"));
	xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_FLUSH_CACHE;
	xpq_increment_idx();
	xpq_flush_queue();
	splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{

	XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
	xpq_queue[xpq_idx].ptr = (va & ~PAGE_MASK) | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_INVLPG;
	xpq_increment_idx();
}

int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t xpq_up[3];

	xpq_up[0].ptr = MMU_EXTENDED_COMMAND;
	xpq_up[0].val = MMUEXT_SET_FOREIGNDOM | (dom << 16);
	xpq_up[1].ptr = ptr;
	xpq_up[1].val = val;
	if (HYPERVISOR_mmu_update_self(xpq_up, 2, NULL) < 0)
		return EFAULT;
	return (0);
}
#endif /* XEN3 */

#ifdef XENDEBUG
void
xpq_debug_dump(void)
{
	int i;

	XENPRINTK2(("idx: %d\n", xpq_idx));
	for (i = 0; i < xpq_idx; i++) {
		sprintf(XBUF, "%" PRIx64 " %08" PRIx64,
		    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			sprintf(XBUF + strlen(XBUF), " %" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			sprintf(XBUF + strlen(XBUF), " %" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			sprintf(XBUF + strlen(XBUF), " %" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
	}
}
#endif


extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly (vaddr_t);
static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);

/* How many PDEs ? */
#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

/*
 * Construct and switch to new pagetables.
 * The return value is the first vaddr we can use once the Xen bootstrap
 * pagetables have been reclaimed.
 */

vaddr_t xen_pmap_bootstrap (void);

/*
 * Function to get rid of Xen bootstrap tables
 */

/* How many PDP do we need: */
#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
 * all of them mapped by the L3 page. We also need a shadow page
 * for L3[3].
 */
static const int l2_4_count = 6;
#else
static const int l2_4_count = PTP_LEVELS - 1;
#endif
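/*
 * In both cases l2_4_count is the number of page-table pages above the L1
 * level in the bootstrap tables built below: for PAE the L3 page, the four
 * L2 pages and the L2 shadow; otherwise one page per level above L1.
 */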

vaddr_t
xen_pmap_bootstrap(void)
{
	int count, oldcount;
	long mapsize;
	vaddr_t bootstrap_tables, init_tables;

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;
	init_tables = xen_start_info.pt_base;
	__PRINTK(("xen_pmap_bootstrap init_tables=0x%lx\n", init_tables));

	/* Space after Xen bootstrap tables should be free */
	bootstrap_tables = xen_start_info.pt_base +
		(xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need:
	 * first, everything mapped before the Xen bootstrap tables
	 */
	mapsize = init_tables - KERNTEXTOFF;
	/* after the tables we'll have:
	 *  - UAREA
	 *  - dummy user PGD (x86_64)
	 *  - HYPERVISOR_shared_info
	 *  - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * NBPG;
#ifdef __x86_64__
	mapsize += NBPG;
#endif
	mapsize += NBPG;

#ifdef DOM0OPS
	if (xen_start_info.flags & SIF_INITDOMAIN) {
		/* space for ISA I/O mem */
		mapsize += IOM_SIZE;
	}
#endif
	/* at this point mapsize doesn't include the table size */

#ifdef __x86_64__
	count = TABLE_L2_ENTRIES;
#else
	count = (mapsize + (NBPD_L2 -1)) >> L2_SHIFT;
#endif /* __x86_64__ */

	/* now compute how many L2 pages we need exactly */
	XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
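	/*
	 * Each additional L2 entry maps NBPD_L2 more VA but also costs one
	 * more page of table backing, so iterate until the VA covered by
	 * 'count' L2 entries starting at KERNBASE is large enough for the
	 * kernel mappings plus the page-table pages themselves.
	 */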
	while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
	    ((long)count << L2_SHIFT) + KERNBASE) {
		count++;
	}
#ifndef __x86_64__
	/*
	 * one more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here. It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	count++;
	nkptp[1] = count;
#endif

	/*
	 * Install the bootstrap pages. They may need more L2 pages than the
	 * final table will, as they are installed above (after) the final
	 * tables in memory.
	 */
	oldcount = count;

bootstrap_again:
	XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * Xen space we'll reclaim may not be enough for our new page tables,
	 * move bootstrap tables if necessary
	 */
	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
		bootstrap_tables = init_tables +
					((count + l2_4_count) * PAGE_SIZE);
	/* make sure we have enough to map the bootstrap_tables */
	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
	    ((long)oldcount << L2_SHIFT) + KERNBASE) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
		xen_start_info.nr_pt_frames, oldcount, 0);

	/* Create final tables */
	xen_bootstrap_tables(bootstrap_tables, init_tables,
	    oldcount + l2_4_count, count, 1);

	/* zero out free space after tables */
	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * NBPG);
	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
}


/*
 * Build a new table and switch to it
 * old_count is # of old tables (including PGD, PDTPE and PDE)
 * new_count is # of new tables (PTE only)
 * we assume areas don't overlap
 */


static void
xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
	int old_count, int new_count, int final)
{
	pd_entry_t *pdtpe, *pde, *pte;
	pd_entry_t *cur_pgd, *bt_pgd;
	paddr_t addr;
	vaddr_t page, avail, text_end, map_end;
	int i;
	extern char __data_start;

	__PRINTK(("xen_bootstrap_tables(0x%lx, 0x%lx, %d, %d)\n",
	    old_pgd, new_pgd, old_count, new_count));
	text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
	/*
	 * size of R/W area after kernel text:
	 *  xencons_interface (if present)
	 *  xenstore_interface (if present)
	 *  table pages (new_count + l2_4_count entries)
	 * extra mappings (only when final is true):
	 *  UAREA
	 *  dummy user PGD (x86_64 only)/gdt page (i386 only)
	 *  HYPERVISOR_shared_info
	 *  ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
	if (final) {
		map_end += (UPAGES + 1) * NBPG;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += NBPG;
	}
	/*
	 * we always set atdevbase, as it's used by init386 to find the first
	 * available VA. map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final)
		atdevbase = map_end;
#ifdef DOM0OPS
	if (final && (xen_start_info.flags & SIF_INITDOMAIN)) {
		/* ISA I/O mem */
		map_end += IOM_SIZE;
	}
#endif /* DOM0OPS */

	__PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
	    text_end, map_end));
	__PRINTK(("console 0x%lx ", xen_start_info.console_mfn));
	__PRINTK(("xenstore 0x%lx\n", xen_start_info.store_mfn));

	/*
	 * Create bootstrap page tables
	 * What we need:
	 * - a PGD (level 4)
	 * - a PDTPE (level 3)
	 * - a PDE (level2)
	 * - some PTEs (level 1)
	 */

	cur_pgd = (pd_entry_t *) old_pgd;
	bt_pgd = (pd_entry_t *) new_pgd;
	memset (bt_pgd, 0, PAGE_SIZE);
	avail = new_pgd + PAGE_SIZE;
#if PTP_LEVELS > 3
	/* Install level 3 */
	pdtpe = (pd_entry_t *) avail;
	memset (pdtpe, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pdtpe) - KERNBASE;
	bt_pgd[pl4_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;

	__PRINTK(("L3 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L4[0x%x]\n",
	    pdtpe, (uint64_t)addr, (uint64_t)bt_pgd[pl4_pi(KERNTEXTOFF)],
	    pl4_pi(KERNTEXTOFF)));
#else
	pdtpe = bt_pgd;
#endif /* PTP_LEVELS > 3 */

#if PTP_LEVELS > 2
	/* Level 2 */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pde) - KERNBASE;
	pdtpe[pl3_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
	__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L3[0x%x]\n",
	    pde, (int64_t)addr, (int64_t)pdtpe[pl3_pi(KERNTEXTOFF)],
	    pl3_pi(KERNTEXTOFF)));
#elif defined(PAE)
	/* our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow) */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;
	addr = ((u_long) pde) - KERNBASE;
	/*
	 * enter the L2 pages in the L3.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries, it'll add
		 * them itself.
		 */
		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
		    " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * i,
		    (int64_t)addr, (int64_t)pdtpe[i], i));
	}
	addr += PAGE_SIZE;
	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
	    " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * 4,
	    (int64_t)addr, (int64_t)pdtpe[3], 3));

#else /* PAE */
	pde = bt_pgd;
#endif /* PTP_LEVELS > 2 */

	/* Level 1 */
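	/*
	 * For each L2 slot needed, take a fresh PTE page from 'avail' and
	 * populate it for KERNTEXTOFF..map_end: kernel text and the old and
	 * new page-table pages are entered read-only, the shared_info,
	 * console and xenstore frames are wired to their machine addresses,
	 * and everything else is mapped read/write.
	 */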
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i ++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *) avail;
		avail += PAGE_SIZE;

		memset(pte, 0, PAGE_SIZE);
		while (pl2_pi(page) == pl2_pi (cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
				__PRINTK(("HYPERVISOR_shared_info "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    HYPERVISOR_shared_info, (int64_t)pte[pl1_pi(page)]));
			}
#ifdef XEN3
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console_mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xencons_interface "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    xencons_interface, (int64_t)pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xenstore_interface "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    xenstore_interface, (int64_t)pte[pl1_pi(page)]));
			}
#endif /* XEN3 */
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
			}
#endif
			pte[pl1_pi(page)] |= PG_k | PG_V;
			if (page < text_end) {
				/* map kernel text RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= old_pgd
			    && page < old_pgd + (old_count * PAGE_SIZE)) {
				/* map old page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
				/* map new page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else {
				/* map page RW */
				pte[pl1_pi(page)] |= PG_RW;
			}

			if ((page  >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE)) || page >= new_pgd)
				__PRINTK(("va 0x%lx pa 0x%lx "
				    "entry 0x%" PRIx64 " -> L1[0x%x]\n",
				    page, page - KERNBASE,
				    (int64_t)pte[pl1_pi(page)], pl1_pi(page)));
			page += PAGE_SIZE;
		}

		addr = ((u_long) pte) - KERNBASE;
		pde[pl2_pi(cur_page)] =
		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
		__PRINTK(("L1 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
		    " -> L2[0x%x]\n", pte, (int64_t)addr,
		    (int64_t)pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t) pte);
	}

	/* Install recursive page tables mapping */
#ifdef PAE
	/*
	 * we need a shadow page for the kernel's L2 page.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
	pmap_kl2pd = &pde[L2_SLOT_KERN + NPDPG];
	pmap_kl2paddr = (u_long)pmap_kl2pd - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD. Instead,
	 * we enter the first 4 L2 pages, which include the kernel's L2
	 * shadow. But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTE with the wrong type.
	 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
		    (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i, (long)addr,
		    (int64_t)pde[PDIR_SLOT_PTE + i]));
	}
#if 0
	addr += PAGE_SIZE; /* point to shadow L2 */
	pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
	    (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
	    (int64_t)pde[PDIR_SLOT_PTE + 3]));
#endif
	/* Mark tables RO, and pin the kernel's shadow as L2 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
		if (i == 2 || i == 3)
			continue;
#if 0
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
	}
	if (final) {
		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
	}
#if 0
	addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
	__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
	xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
#else /* PAE */
	/* recursive entry in higher-level PD */
	bt_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked(new_pgd - KERNBASE) | PG_k | PG_V;
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] va 0x%lx pa 0x%" PRIx64
	    " entry 0x%" PRIx64 "\n", new_pgd, (int64_t)new_pgd - KERNBASE,
	    (int64_t)bt_pgd[PDIR_SLOT_PTE]));
	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t) pde);
#endif
#if PTP_LEVELS > 2 || defined(PAE)
	xen_bt_set_readonly((vaddr_t) pdtpe);
#endif
#if PTP_LEVELS > 3
	xen_bt_set_readonly(new_pgd);
#endif
	/* Pin the PGD */
	__PRINTK(("pin PGD\n"));
#ifdef PAE
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif
#ifdef __i386__
	/* Save phys. addr of PDP, for libkvm. */
	PDPpaddr = (long)pde;
#ifdef PAE
	/* also save the address of the L3 page */
	pmap_l3pd = pdtpe;
	pmap_l3paddr = (new_pgd - KERNBASE);
#endif /* PAE */
#endif /* i386 */
	/* Switch to new tables */
	__PRINTK(("switch to PGD\n"));
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry 0x%" PRIx64 "\n",
	    (int64_t)bt_pgd[PDIR_SLOT_PTE]));
#ifdef PAE
	if (final) {
		/* now enter kernel's PTE mappings */
		addr =  (u_long)pde - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PG_k | PG_V);
		xpq_flush_queue();
	}
#endif



	/* Now we can safely reclaim space taken by old tables */
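	/*
	 * Unpinning the old PGD makes Xen drop the page-table type on those
	 * frames, after which they can be remapped read/write and reused as
	 * ordinary memory.
	 */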

	__PRINTK(("unpin old PGD\n"));
	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
	/* Mark old tables RW */
	page = old_pgd;
	addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
	addr = xpmap_mtop(addr);
	pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	__PRINTK(("*pde 0x%" PRIx64 " addr 0x%" PRIx64 " pte 0x%lx\n",
	    (int64_t)pde[pl2_pi(page)], (int64_t)addr, (long)pte));
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long) pte) - KERNBASE);
		XENPRINTK(("addr 0x%" PRIx64 " pte 0x%lx *pte 0x%" PRIx64 "\n",
		   (int64_t)addr, (long)pte, (int64_t)*pte));
		xpq_queue_pte_update(addr, *pte | PG_RW);
		page += PAGE_SIZE;
		/*
		 * Our ptes are contiguous
		 * so it's safe to just "++" here
		 */
		pte++;
	}
	xpq_flush_queue();
}


/*
 * Bootstrap helper functions
 */

/*
 * Mark a page readonly
 * XXX: assuming vaddr = paddr + KERNBASE
 */

static void
xen_bt_set_readonly (vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PG_k | PG_V;

	HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
}

#ifdef __x86_64__
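/*
 * amd64 only: Xen keeps separate kernel and user page-table base pointers
 * for a PV guest; MMUEXT_NEW_USER_BASEPTR installs the L4 page to be used
 * while the guest runs in user mode.
 */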
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;
	int s = splvm();

	xpq_flush_queue();
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = xpmap_phys_to_machine_mapping[page >> PAGE_SHIFT];
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
			" directory %lx", page);
	splx(s);
}
#endif /* __x86_64__ */