/*	$NetBSD: x86_xpmap.c,v 1.15 2009/07/29 12:02:08 cegger Exp $	*/

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.15 2009/07/29 12:02:08 cegger Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>

#include <machine/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#undef	XENDEBUG
/* #define XENDEBUG_SYNC */
/* #define	XENDEBUG_LOW */

#ifdef XENDEBUG
#define	XENPRINTF(x) printf x
#define	XENPRINTK(x) printk x
#define	XENPRINTK2(x) /* printk x */

static char XBUF[256];
#else
#define	XENPRINTF(x)
#define	XENPRINTK(x)
#define	XENPRINTK2(x)
#endif
#define	PRINTF(x) printf x
#define	PRINTK(x) printk x
/* On x86_64 the PV kernel runs in ring 3, so kernel mappings need PG_u. */
#ifdef __x86_64__
#define PG_k PG_u
#else
#define PG_k 0
#endif

volatile shared_info_t *HYPERVISOR_shared_info;
/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);
unsigned long *xpmap_phys_to_machine_mapping;

void xen_failsafe_handler(void);

#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)

void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}


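/*
 * Install a new LDT.  The pages backing the LDT must be mapped read-only
 * before MMUEXT_SET_LDT is issued, since Xen validates the descriptor
 * pages and will not accept them while they are writable by the guest.
 */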
void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		XENPRINTF(("xen_set_ldt %p %d %p\n", (void *)base,
			      entries, ptp));
		pmap_pte_clearbits(ptp, PG_RW);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	xpq_flush_queue();
	splx(s);
}

#ifdef XENDEBUG
void xpq_debug_dump(void);
#endif

#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue[XPQUEUE_SIZE];
static int xpq_idx = 0;
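
/*
 * MMU updates are batched in xpq_queue[] and handed to the hypervisor in
 * a single HYPERVISOR_mmu_update hypercall by xpq_flush_queue().  A
 * typical caller queues its PTE writes and flushes once, e.g. (sketch
 * only, `ma' and `npte' standing for a machine address and a new PTE):
 *
 *	xpq_queue_pte_update(ma, npte);
 *	...
 *	xpq_flush_queue();
 *
 * The MMUEXT operations below (pin/unpin, pt switch, TLB flush, invlpg,
 * set_ldt) flush the queue themselves before issuing their hypercall,
 * so queued updates are applied in order with respect to them.
 */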

void
xpq_flush_queue(void)
{
	int i, ok;

	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
	for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: 0x%08" PRIx64 " 0x%08" PRIx64 "\n", i,
		    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val));
	if (xpq_idx != 0 &&
	    HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok) < 0) {
		printf("xpq_flush_queue: %d entries\n", xpq_idx);
		for (i = 0; i < xpq_idx; i++)
			printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
			   (uint64_t)xpq_queue[i].ptr,
			   (uint64_t)xpq_queue[i].val);
		panic("HYPERVISOR_mmu_update failed\n");
	}
	xpq_idx = 0;
}

static inline void
xpq_increment_idx(void)
{

	xpq_idx++;
	if (__predict_false(xpq_idx == XPQUEUE_SIZE))
		xpq_flush_queue();
}

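/*
 * Queue an update of the machine-to-physical translation table: machine
 * page `ma' now backs the pseudo-physical address `pa'.
 */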
void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{
	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
	    "\n", (int64_t)ma, (int64_t)pa));
	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

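/*
 * Queue a normal page-table update: write `val' into the PTE whose
 * machine address is `ptr'.  The two low bits of `ptr' carry the update
 * type (MMU_NORMAL_PT_UPDATE here), hence the alignment check.
 */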
void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{

	KASSERT((ptr & 3) == 0);
	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

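/*
 * Switch %cr3 to the page directory whose machine address is `pa'.  Like
 * the other MMUEXT operations below, this flushes the pending update
 * queue first so earlier PTE writes are visible to the hypervisor.
 */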
void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pt_switch");
}

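/*
 * Pin `pa' as a top-level page table (L4 on amd64, L2 on non-PAE i386),
 * so that Xen validates it and starts enforcing page-table invariants
 * on it.
 */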
void
xpq_queue_pin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

#if defined(__x86_64__)
	op.cmd = MMUEXT_PIN_L4_TABLE;
#else
	op.cmd = MMUEXT_PIN_L2_TABLE;
#endif
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_table");
}

#ifdef PAE
static void
xpq_queue_pin_l3_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_l3_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;

	op.cmd = MMUEXT_PIN_L3_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_l3_table");
}
#endif

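/*
 * Unpin a previously pinned page table, allowing its pages to be
 * modified or recycled.
 */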
void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_unpin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = MMUEXT_UNPIN_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_unpin_table");
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_set_ldt");
}

void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_tlb_flush");
}

void
xpq_flush_cache(void)
{
	struct mmuext_op op;
	int s = splvm();
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_flush_cache\n"));
	op.cmd = MMUEXT_FLUSH_CACHE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_flush_cache");
	splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_invlpg");
}

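/*
 * Perform a single, synchronous PTE update on behalf of domain `dom',
 * bypassing the local batch queue.
 */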
int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t op;
	int ok;
	xpq_flush_queue();

	op.ptr = ptr;
	op.val = val;
	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
		return EFAULT;
	return (0);
}

#ifdef XENDEBUG
void
xpq_debug_dump(void)
{
	int i;

	XENPRINTK2(("idx: %d\n", xpq_idx));
	for (i = 0; i < xpq_idx; i++) {
		snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64,
		    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr,
			    (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr,
			    (uint64_t)xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    (uint64_t)xpq_queue[i].ptr,
			    (uint64_t)xpq_queue[i].val);
		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
	}
}
#endif


extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly (vaddr_t);
static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);

/* How many PDEs? */
#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif
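/*
 * NKL2_KIMG_ENTRIES is the number of L2 slots needed to map the kernel
 * image, plus one slot of slack; the count is doubled when there is an
 * additional mapping of the kernel at KERNBASE (L2_SLOT_KERNBASE > 0).
 */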

/*
 * Construct and switch to new page tables.
 * first_avail is the first vaddr we can use after
 * we get rid of the Xen page tables.
 */

vaddr_t xen_pmap_bootstrap (void);

/*
 * Function to get rid of Xen bootstrap tables
 */

/* How many PDPs do we need: */
#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
 * all of them mapped by the L3 page.  We also need a shadow page
 * for L3[3].
 */
static const int l2_4_count = 6;
#else
static const int l2_4_count = PTP_LEVELS - 1;
#endif
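
/*
 * l2_4_count is the number of page-directory pages above the L1 level.
 * For the non-PAE configurations it is PTP_LEVELS - 1 (L4 + L3 + L2 on
 * amd64); with PAE it is the L3 page, the four contiguous L2 pages and
 * the L2 shadow page, i.e. 6.
 */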

vaddr_t
xen_pmap_bootstrap(void)
{
	int count, oldcount;
	long mapsize;
	vaddr_t bootstrap_tables, init_tables;

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;
	init_tables = xen_start_info.pt_base;
	__PRINTK(("xen_pmap_bootstrap init_tables=0x%lx\n", init_tables));

	/* Space after the Xen bootstrap tables should be free */
	bootstrap_tables = xen_start_info.pt_base +
		(xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need.
	 * First, everything mapped before the Xen bootstrap tables.
	 */
	mapsize = init_tables - KERNTEXTOFF;
	/* after the tables we'll have:
	 *  - UAREA
	 *  - dummy user PGD (x86_64)
	 *  - HYPERVISOR_shared_info
	 *  - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * NBPG;
#ifdef __x86_64__
	mapsize += NBPG;
#endif
	mapsize += NBPG;

#ifdef DOM0OPS
	if (xendomain_is_dom0()) {
		/* space for ISA I/O mem */
		mapsize += IOM_SIZE;
	}
#endif
	/* at this point mapsize doesn't include the table size */

#ifdef __x86_64__
	count = TABLE_L2_ENTRIES;
#else
	count = (mapsize + (NBPD_L2 - 1)) >> L2_SHIFT;
#endif /* __x86_64__ */

	/* now compute how many L2 pages we need exactly */
	XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
	while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
	    ((long)count << L2_SHIFT) + KERNBASE) {
		count++;
	}
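	/*
	 * The loop above terminates when `count' L2 slots starting at
	 * KERNBASE are enough to cover everything accounted in `mapsize'
	 * plus the (count + l2_4_count) page-table pages themselves.
	 */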
#ifndef __x86_64__
	/*
	 * One more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here.  It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	count++;
	nkptp[1] = count;
#endif

	/*
	 * Install bootstrap pages.  We may need more L2 pages here than the
	 * final table will need, as the bootstrap tables are installed
	 * above the final ones.
	 */
	oldcount = count;

bootstrap_again:
	XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables if necessary.
	 */
	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
		bootstrap_tables = init_tables +
					((count + l2_4_count) * PAGE_SIZE);
	/* make sure we have enough to map the bootstrap_tables */
	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
	    ((long)oldcount << L2_SHIFT) + KERNBASE) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
		xen_start_info.nr_pt_frames, oldcount, 0);

	/* Create final tables */
	xen_bootstrap_tables(bootstrap_tables, init_tables,
	    oldcount + l2_4_count, count, 1);

	/* zero out free space after tables */
	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * NBPG);
	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
}


/*
 * Build a new set of page tables and switch to them.
 * old_count is the number of old table pages (including PGD, PDTPE and PDE);
 * new_count is the number of new table pages (PTEs only).
 * We assume the areas don't overlap.
 */


static void
xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
	int old_count, int new_count, int final)
{
	pd_entry_t *pdtpe, *pde, *pte;
	pd_entry_t *cur_pgd, *bt_pgd;
	paddr_t addr;
	vaddr_t page, avail, text_end, map_end;
	int i;
	extern char __data_start;

	__PRINTK(("xen_bootstrap_tables(0x%lx, 0x%lx, %d, %d)\n",
	    old_pgd, new_pgd, old_count, new_count));
	text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
	/*
	 * Size of the R/W area after the kernel text:
	 *  xencons_interface (if present)
	 *  xenstore_interface (if present)
	 *  table pages (new_count + l2_4_count entries)
	 * extra mappings (only when final is true):
	 *  UAREA
	 *  dummy user PGD (x86_64 only)/gdt page (i386 only)
	 *  HYPERVISOR_shared_info
	 *  ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
	if (final) {
		map_end += (UPAGES + 1) * NBPG;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += NBPG;
	}
	/*
	 * We always set atdevbase, as it's used by init386 to find the first
	 * available VA.  map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final)
		atdevbase = map_end;
#ifdef DOM0OPS
	if (final && xendomain_is_dom0()) {
		/* ISA I/O mem */
		map_end += IOM_SIZE;
	}
#endif /* DOM0OPS */

	__PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
	    text_end, map_end));
	__PRINTK(("console 0x%lx ", xen_start_info.console.domU.mfn));
	__PRINTK(("xenstore 0x%lx\n", xen_start_info.store_mfn));

	/*
	 * Create the bootstrap page tables.  What we need:
	 * - a PGD (level 4)
	 * - a PDTPE (level 3)
	 * - a PDE (level 2)
	 * - some PTEs (level 1)
	 */

	cur_pgd = (pd_entry_t *) old_pgd;
	bt_pgd = (pd_entry_t *) new_pgd;
	memset (bt_pgd, 0, PAGE_SIZE);
	avail = new_pgd + PAGE_SIZE;
#if PTP_LEVELS > 3
	/* Install level 3 */
	pdtpe = (pd_entry_t *) avail;
	memset (pdtpe, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pdtpe) - KERNBASE;
	bt_pgd[pl4_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;

	__PRINTK(("L3 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L4[0x%x]\n",
	    pdtpe, (uint64_t)addr, (uint64_t)bt_pgd[pl4_pi(KERNTEXTOFF)],
	    pl4_pi(KERNTEXTOFF)));
#else
	pdtpe = bt_pgd;
#endif /* PTP_LEVELS > 3 */

#if PTP_LEVELS > 2
	/* Level 2 */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pde) - KERNBASE;
	pdtpe[pl3_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
	__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L3[0x%x]\n",
	    pde, (int64_t)addr, (int64_t)pdtpe[pl3_pi(KERNTEXTOFF)],
	    pl3_pi(KERNTEXTOFF)));
#elif defined(PAE)
	/* our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow) */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;
	addr = ((u_long) pde) - KERNBASE;
	/*
	 * Enter the L2 pages in the L3.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries, it'll add
		 * them itself.
		 */
		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
		    " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * i,
		    (int64_t)addr, (int64_t)pdtpe[i], i));
	}
	addr += PAGE_SIZE;
	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
	    " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * 4,
	    (int64_t)addr, (int64_t)pdtpe[3], 3));

#else /* PAE */
	pde = bt_pgd;
#endif /* PTP_LEVELS > 2 */

	/* Level 1 */
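	/*
	 * Walk the kernel VA space from KERNTEXTOFF up to map_end, one L2
	 * slot at a time.  Each iteration carves a fresh PTE page out of
	 * `avail', fills it, hooks it into the L2 and write-protects it so
	 * Xen will accept it as a page table.  Kernel text and the old and
	 * new table pages are mapped read-only; everything else read/write.
	 */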
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i ++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *) avail;
		avail += PAGE_SIZE;

		memset(pte, 0, PAGE_SIZE);
		while (pl2_pi(page) == pl2_pi (cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
				__PRINTK(("HYPERVISOR_shared_info "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    HYPERVISOR_shared_info, (int64_t)pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console.domU.mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console.domU.mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xencons_interface "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    xencons_interface, (int64_t)pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xenstore_interface "
				    "va 0x%lx pte 0x%" PRIx64 "\n",
				    xenstore_interface, (int64_t)pte[pl1_pi(page)]));
			}
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
			}
#endif
			pte[pl1_pi(page)] |= PG_k | PG_V;
			if (page < text_end) {
				/* map kernel text RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= old_pgd
			    && page < old_pgd + (old_count * PAGE_SIZE)) {
				/* map old page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
				/* map new page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else {
				/* map page RW */
				pte[pl1_pi(page)] |= PG_RW;
			}

			if ((page  >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE))
			    || page >= new_pgd) {
				__PRINTK(("va 0x%lx pa 0x%lx "
				    "entry 0x%" PRIx64 " -> L1[0x%x]\n",
				    page, page - KERNBASE,
				    (int64_t)pte[pl1_pi(page)], pl1_pi(page)));
			}
			page += PAGE_SIZE;
		}

		addr = ((u_long) pte) - KERNBASE;
		pde[pl2_pi(cur_page)] =
		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
		__PRINTK(("L1 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
		    " -> L2[0x%x]\n", pte, (int64_t)addr,
		    (int64_t)pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t) pte);
	}

	/* Install recursive page tables mapping */
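	/*
	 * The recursive slot (PDIR_SLOT_PTE) maps the page-directory pages
	 * into their own address space, which is what gives the pmap its
	 * fixed virtual window onto every PTE.
	 */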
#ifdef PAE
	/*
	 * We need a shadow page for the kernel's L2 page.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
	pmap_kl2pd = &pde[L2_SLOT_KERN + NPDPG];
	pmap_kl2paddr = (u_long)pmap_kl2pd - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD.  Instead,
	 * we enter the first 4 L2 pages, which includes the kernel's L2
	 * shadow.  But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTE with the wrong type.
	 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
		    (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i, (long)addr,
		    (int64_t)pde[PDIR_SLOT_PTE + i]));
	}
#if 0
	addr += PAGE_SIZE; /* point to shadow L2 */
	pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
	    (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
	    (int64_t)pde[PDIR_SLOT_PTE + 3]));
#endif
	/* Mark tables RO, and pin the kernel's shadow as L2 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
		if (i == 2 || i == 3)
			continue;
#if 0
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
	}
	if (final) {
		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
		xpq_queue_pin_table(xpmap_ptom_masked(addr));
	}
#if 0
	addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
	__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
	xpq_queue_pin_table(xpmap_ptom_masked(addr));
#endif
#else /* PAE */
	/* recursive entry in higher-level PD */
	bt_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked(new_pgd - KERNBASE) | PG_k | PG_V;
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] va 0x%lx pa 0x%" PRIx64
	    " entry 0x%" PRIx64 "\n", new_pgd, (int64_t)new_pgd - KERNBASE,
	    (int64_t)bt_pgd[PDIR_SLOT_PTE]));
	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t) pde);
#endif
#if PTP_LEVELS > 2 || defined(PAE)
	xen_bt_set_readonly((vaddr_t) pdtpe);
#endif
#if PTP_LEVELS > 3
	xen_bt_set_readonly(new_pgd);
#endif
	/* Pin the PGD */
	__PRINTK(("pin PGD\n"));
#ifdef PAE
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif
#ifdef __i386__
	/* Save phys. addr of PDP, for libkvm. */
	PDPpaddr = (long)pde;
#ifdef PAE
	/* also save the address of the L3 page */
	pmap_l3pd = pdtpe;
	pmap_l3paddr = (new_pgd - KERNBASE);
#endif /* PAE */
#endif /* i386 */
	/* Switch to new tables */
	__PRINTK(("switch to PGD\n"));
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry 0x%" PRIx64 "\n",
	    (int64_t)bt_pgd[PDIR_SLOT_PTE]));
#ifdef PAE
	if (final) {
		/* now enter kernel's PTE mappings */
		addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PG_k | PG_V);
		xpq_flush_queue();
	}
#endif



	/* Now we can safely reclaim space taken by old tables */

	__PRINTK(("unpin old PGD\n"));
	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
	/* Mark old tables RW */
	page = old_pgd;
	addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
	addr = xpmap_mtop(addr);
	pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	__PRINTK(("*pde 0x%" PRIx64 " addr 0x%" PRIx64 " pte 0x%lx\n",
	    (int64_t)pde[pl2_pi(page)], (int64_t)addr, (long)pte));
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long) pte) - KERNBASE);
		XENPRINTK(("addr 0x%" PRIx64 " pte 0x%lx *pte 0x%" PRIx64 "\n",
		   (int64_t)addr, (long)pte, (int64_t)*pte));
		xpq_queue_pte_update(addr, *pte | PG_RW);
		page += PAGE_SIZE;
		/*
		 * Our ptes are contiguous
		 * so it's safe to just "++" here
		 */
		pte++;
	}
	xpq_flush_queue();
}


/*
 * Bootstrap helper functions
 */

/*
 * Mark a page readonly
 * XXX: assuming vaddr = paddr + KERNBASE
 */

static void
xen_bt_set_readonly (vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PG_k | PG_V;

	HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
}

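/*
 * amd64 only: install `page' (a pseudo-physical address) as the user-mode
 * page-table root.  64-bit PV guests use separate kernel and user page
 * tables, so Xen keeps a distinct user %cr3.
 */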
#ifdef __x86_64__
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;
	int s = splvm();

	xpq_flush_queue();
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = xpmap_phys_to_machine_mapping[page >> PAGE_SHIFT];
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
			" directory %lx", page);
	splx(s);
}
#endif /* __x86_64__ */
