1/*	$NetBSD: x86_xpmap.c,v 1.37 2012/01/09 13:04:13 cherry Exp $	*/
2
3/*
4 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19/*
20 * Copyright (c) 2006, 2007 Manuel Bouyer.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the above copyright
26 *    notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 *    notice, this list of conditions and the following disclaimer in the
29 *    documentation and/or other materials provided with the distribution.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
32 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
34 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
35 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
40 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 *
42 */
43
44/*
45 *
46 * Copyright (c) 2004 Christian Limpach.
47 * All rights reserved.
48 *
49 * Redistribution and use in source and binary forms, with or without
50 * modification, are permitted provided that the following conditions
51 * are met:
52 * 1. Redistributions of source code must retain the above copyright
53 *    notice, this list of conditions and the following disclaimer.
54 * 2. Redistributions in binary form must reproduce the above copyright
55 *    notice, this list of conditions and the following disclaimer in the
56 *    documentation and/or other materials provided with the distribution.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
59 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
60 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
61 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
63 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
64 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
65 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
66 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
67 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
68 */
69
70
71#include <sys/cdefs.h>
72__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.37 2012/01/09 13:04:13 cherry Exp $");
73
74#include "opt_xen.h"
75#include "opt_ddb.h"
76#include "ksyms.h"
77
78#include <sys/param.h>
79#include <sys/systm.h>
80#include <sys/simplelock.h>
81
82#include <uvm/uvm.h>
83
84#include <machine/pmap.h>
85#include <machine/gdt.h>
86#include <xen/xenfunc.h>
87
88#include <dev/isa/isareg.h>
89#include <machine/isa_machdep.h>
90
91#undef	XENDEBUG
92/* #define XENDEBUG_SYNC */
93/* #define	XENDEBUG_LOW */
94
95#ifdef XENDEBUG
96#define	XENPRINTF(x) printf x
97#define	XENPRINTK(x) printk x
98#define	XENPRINTK2(x) /* printk x */
99
100static char XBUF[256];
101#else
102#define	XENPRINTF(x)
103#define	XENPRINTK(x)
104#define	XENPRINTK2(x)
105#endif
106#define	PRINTF(x) printf x
107#define	PRINTK(x) printk x
108
/* On x86_64 the kernel runs in ring 3, so kernel mappings need PG_u. */
110#ifdef __x86_64__
111#define PG_k PG_u
112#else
113#define PG_k 0
114#endif
115
116volatile shared_info_t *HYPERVISOR_shared_info;
117/* Xen requires the start_info struct to be page aligned */
118union start_info_union start_info_union __aligned(PAGE_SIZE);
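/* Pseudo-physical to machine (P2M) frame table (xen_start_info.mfn_list). */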
119unsigned long *xpmap_phys_to_machine_mapping;
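/* Lock for Xen PTE update operations (initialized in xen_pmap_bootstrap()). */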
120kmutex_t pte_lock;
121
122void xen_failsafe_handler(void);
123
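/* HYPERVISOR_mmu_update() against our own domain (DOMID_SELF). */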
124#define HYPERVISOR_mmu_update_self(req, count, success_count) \
125	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)
126
127void
128xen_failsafe_handler(void)
129{
130
131	panic("xen_failsafe_handler called!\n");
132}
133
134
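/*
 * Install a new LDT: remap the pages backing it read-only (Xen
 * requires descriptor tables to be read-only) and hand the LDT to
 * the hypervisor via MMUEXT_SET_LDT.
 */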
135void
136xen_set_ldt(vaddr_t base, uint32_t entries)
137{
138	vaddr_t va;
139	vaddr_t end;
140	pt_entry_t *ptp;
141	int s;
142
143#ifdef __x86_64__
144	end = base + (entries << 3);
145#else
146	end = base + entries * sizeof(union descriptor);
147#endif
148
149	for (va = base; va < end; va += PAGE_SIZE) {
150		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
151		ptp = kvtopte(va);
152		XENPRINTF(("xen_set_ldt %#" PRIxVADDR " %d %p\n",
153		    base, entries, ptp));
154		pmap_pte_clearbits(ptp, PG_RW);
155	}
156	s = splvm();
157	xpq_queue_set_ldt(base, entries);
158	splx(s);
159}
160
161#ifdef XENDEBUG
162void xpq_debug_dump(void);
163#endif
164
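/*
 * Per-CPU queues of pending MMU updates.  The xpq_queue_*() functions
 * batch updates here; xpq_flush_queue() hands the whole batch to the
 * hypervisor in a single HYPERVISOR_mmu_update() call (see
 * xen_bootstrap_tables() below for a typical caller).
 */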
165#define XPQUEUE_SIZE 2048
166static mmu_update_t xpq_queue_array[MAXCPUS][XPQUEUE_SIZE];
167static int xpq_idx_array[MAXCPUS];
168
169extern struct cpu_info * (*xpq_cpu)(void);
170
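/*
 * Flush this CPU's queue of pending MMU updates to the hypervisor.
 * On partial failure, retry the entries that were not processed and
 * panic if the hypervisor keeps rejecting them.
 */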
171void
172xpq_flush_queue(void)
173{
174	int i, ok = 0, ret;
175
176	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
177	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];
178
179	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
180	for (i = 0; i < xpq_idx; i++)
181		XENPRINTK2(("%d: 0x%08" PRIx64 " 0x%08" PRIx64 "\n", i,
182		    xpq_queue[i].ptr, xpq_queue[i].val));
183
184retry:
185	ret = HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok);
186
187	if (xpq_idx != 0 && ret < 0) {
188		printf("xpq_flush_queue: %d entries (%d successful)\n",
189		    xpq_idx, ok);
190
191		if (ok != 0) {
192			xpq_queue += ok;
193			xpq_idx -= ok;
194			ok = 0;
195			goto retry;
196		}
197
198		for (i = 0; i < xpq_idx; i++)
199			printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
200			   xpq_queue[i].ptr, xpq_queue[i].val);
201		panic("HYPERVISOR_mmu_update failed, ret: %d\n", ret);
202	}
203	xpq_idx_array[xpq_cpu()->ci_cpuid] = 0;
204}
205
206static inline void
207xpq_increment_idx(void)
208{
209
210	if (__predict_false(++xpq_idx_array[xpq_cpu()->ci_cpuid] == XPQUEUE_SIZE))
211		xpq_flush_queue();
212}
213
214void
215xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
216{
217
218	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
219	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];
220
221	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
222	    "\n", (int64_t)ma, (int64_t)pa));
223
224	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
225	xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
226	xpq_increment_idx();
227#ifdef XENDEBUG_SYNC
228	xpq_flush_queue();
229#endif
230}
231
232void
233xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
234{
235
236	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
237	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];
238
239	KASSERT((ptr & 3) == 0);
240	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
241	xpq_queue[xpq_idx].val = val;
242	xpq_increment_idx();
243#ifdef XENDEBUG_SYNC
244	xpq_flush_queue();
245#endif
246}
247
248void
249xpq_queue_pt_switch(paddr_t pa)
250{
251	struct mmuext_op op;
252	xpq_flush_queue();
253
254	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
255	    (int64_t)pa, (int64_t)pa));
256	op.cmd = MMUEXT_NEW_BASEPTR;
257	op.arg1.mfn = pa >> PAGE_SHIFT;
258	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
259		panic("xpq_queue_pt_switch");
260}
261
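/*
 * Pin a page table page: 'lvl' is the MMUEXT_PIN_L*_TABLE command to
 * use (callers go through the xpq_queue_pin_l[234]_table() wrappers).
 */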
262void
263xpq_queue_pin_table(paddr_t pa, int lvl)
264{
265	struct mmuext_op op;
266
267	xpq_flush_queue();
268
269	XENPRINTK2(("xpq_queue_pin_l%d_table: %#" PRIxPADDR "\n",
270	    lvl + 1, pa));
271
272	op.arg1.mfn = pa >> PAGE_SHIFT;
273	op.cmd = lvl;
274
275	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
276		panic("xpq_queue_pin_table");
277}
278
279void
280xpq_queue_unpin_table(paddr_t pa)
281{
282	struct mmuext_op op;
283
284	xpq_flush_queue();
285
286	XENPRINTK2(("xpq_queue_unpin_table: %#" PRIxPADDR "\n", pa));
287	op.arg1.mfn = pa >> PAGE_SHIFT;
288	op.cmd = MMUEXT_UNPIN_TABLE;
289	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
290		panic("xpq_queue_unpin_table");
291}
292
293void
294xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
295{
296	struct mmuext_op op;
297
298	xpq_flush_queue();
299
300	XENPRINTK2(("xpq_queue_set_ldt\n"));
301	KASSERT(va == (va & ~PAGE_MASK));
302	op.cmd = MMUEXT_SET_LDT;
303	op.arg1.linear_addr = va;
304	op.arg2.nr_ents = entries;
305	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
306		panic("xpq_queue_set_ldt");
307}
308
309void
310xpq_queue_tlb_flush(void)
311{
312	struct mmuext_op op;
313
314	xpq_flush_queue();
315
316	XENPRINTK2(("xpq_queue_tlb_flush\n"));
317	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
318	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
319		panic("xpq_queue_tlb_flush");
320}
321
322void
323xpq_flush_cache(void)
324{
325	struct mmuext_op op;
326	int s = splvm(), err;
327
328	xpq_flush_queue();
329
330	XENPRINTK2(("xpq_queue_flush_cache\n"));
331	op.cmd = MMUEXT_FLUSH_CACHE;
332	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) < 0) {
333		panic("xpq_flush_cache, err %d", err);
334	}
335	splx(s); /* XXX: removeme */
336}
337
338void
339xpq_queue_invlpg(vaddr_t va)
340{
341	struct mmuext_op op;
342	xpq_flush_queue();
343
344	XENPRINTK2(("xpq_queue_invlpg %#" PRIxVADDR "\n", va));
345	op.cmd = MMUEXT_INVLPG_LOCAL;
346	op.arg1.linear_addr = (va & ~PAGE_MASK);
347	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
348		panic("xpq_queue_invlpg");
349}
350
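/* This is a synchronous call. */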
351void
352xen_mcast_invlpg(vaddr_t va, uint32_t cpumask)
353{
354	mmuext_op_t op;
355
356	/* Flush pending page updates */
357	xpq_flush_queue();
358
359	op.cmd = MMUEXT_INVLPG_MULTI;
360	op.arg1.linear_addr = va;
361	op.arg2.vcpumask = &cpumask;
362
363	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_mcast_invlpg");
365	}
366
367	return;
368}
369
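/* This is a synchronous call. */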
370void
371xen_bcast_invlpg(vaddr_t va)
372{
373	mmuext_op_t op;
374
375	/* Flush pending page updates */
376	xpq_flush_queue();
377
378	op.cmd = MMUEXT_INVLPG_ALL;
379	op.arg1.linear_addr = va;
380
381	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_bcast_invlpg");
383	}
384
385	return;
386}
387
388/* This is a synchronous call. */
389void
390xen_mcast_tlbflush(uint32_t cpumask)
391{
392	mmuext_op_t op;
393
394	/* Flush pending page updates */
395	xpq_flush_queue();
396
397	op.cmd = MMUEXT_TLB_FLUSH_MULTI;
398	op.arg2.vcpumask = &cpumask;
399
400	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_mcast_tlbflush");
402	}
403
404	return;
405}
406
407/* This is a synchronous call. */
408void
409xen_bcast_tlbflush(void)
410{
411	mmuext_op_t op;
412
413	/* Flush pending page updates */
414	xpq_flush_queue();
415
416	op.cmd = MMUEXT_TLB_FLUSH_ALL;
417
418	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_bcast_tlbflush");
420	}
421
422	return;
423}
424
425/* This is a synchronous call. */
426void
427xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr_t eva, uint32_t cpumask)
428{
429	KASSERT(eva > sva);
430
431	/* Flush pending page updates */
432	xpq_flush_queue();
433
	/* Truncate to page boundaries */
435	sva &= ~PAGE_MASK;
436	eva &= ~PAGE_MASK;
437
438	for ( ; sva <= eva; sva += PAGE_SIZE) {
439		xen_mcast_invlpg(sva, cpumask);
440	}
441
442	return;
443}
444
445/* This is a synchronous call. */
446void
447xen_vcpu_bcast_invlpg(vaddr_t sva, vaddr_t eva)
448{
449	KASSERT(eva > sva);
450
451	/* Flush pending page updates */
452	xpq_flush_queue();
453
	/* Truncate to page boundaries */
455	sva &= ~PAGE_MASK;
456	eva &= ~PAGE_MASK;
457
458	for ( ; sva <= eva; sva += PAGE_SIZE) {
459		xen_bcast_invlpg(sva);
460	}
461
462	return;
463}
464
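/*
 * Synchronous, unbatched MMU update issued against domain 'dom'
 * instead of DOMID_SELF.
 */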
465int
466xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
467{
468	mmu_update_t op;
469	int ok;
470
471	xpq_flush_queue();
472
473	op.ptr = ptr;
474	op.val = val;
475	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
476		return EFAULT;
477	return (0);
478}
479
480#ifdef XENDEBUG
481void
482xpq_debug_dump(void)
483{
484	int i;
485
486	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
487	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];
488
489	XENPRINTK2(("idx: %d\n", xpq_idx));
490	for (i = 0; i < xpq_idx; i++) {
491		snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64,
492		    xpq_queue[i].ptr, xpq_queue[i].val);
493		if (++i < xpq_idx)
494			snprintf(XBUF + strlen(XBUF),
495			    sizeof(XBUF) - strlen(XBUF),
496			    "%" PRIx64 " %08" PRIx64,
497			    xpq_queue[i].ptr, xpq_queue[i].val);
498		if (++i < xpq_idx)
499			snprintf(XBUF + strlen(XBUF),
500			    sizeof(XBUF) - strlen(XBUF),
501			    "%" PRIx64 " %08" PRIx64,
502			    xpq_queue[i].ptr, xpq_queue[i].val);
503		if (++i < xpq_idx)
504			snprintf(XBUF + strlen(XBUF),
505			    sizeof(XBUF) - strlen(XBUF),
506			    "%" PRIx64 " %08" PRIx64,
507			    xpq_queue[i].ptr, xpq_queue[i].val);
508		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
509	}
510}
511#endif
512
513
514extern volatile struct xencons_interface *xencons_interface; /* XXX */
515extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
516
517static void xen_bt_set_readonly (vaddr_t);
518static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);
519
/* How many PDEs? */
521#if L2_SLOT_KERNBASE > 0
522#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
523#else
524#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
525#endif
526
/*
 * Construct and switch to new page tables.  The returned vaddr is the
 * first one we can use after we get rid of the Xen bootstrap tables.
 */
532
533vaddr_t xen_pmap_bootstrap (void);
534
535/*
536 * Function to get rid of Xen bootstrap tables
537 */
538
/* How many PDPs do we need: */
540#ifdef PAE
541/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
543 * all of them mapped by the L3 page. We also need a shadow page
544 * for L3[3].
545 */
546static const int l2_4_count = 6;
547#elif defined(__x86_64__)
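/* amd64: per-CPU L4, pmap_kernel() shadow L4, L3 and L2 -- one page each */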
548static const int l2_4_count = PTP_LEVELS;
549#else
550static const int l2_4_count = PTP_LEVELS - 1;
551#endif
552
553vaddr_t
554xen_pmap_bootstrap(void)
555{
556	int count, oldcount;
557	long mapsize;
558	vaddr_t bootstrap_tables, init_tables;
559
560	memset(xpq_idx_array, 0, sizeof xpq_idx_array);
561
562	xpmap_phys_to_machine_mapping =
563	    (unsigned long *)xen_start_info.mfn_list;
564	init_tables = xen_start_info.pt_base;
565	__PRINTK(("xen_arch_pmap_bootstrap init_tables=0x%lx\n", init_tables));
566
	/* Space after the Xen bootstrap tables should be free */
568	bootstrap_tables = xen_start_info.pt_base +
569		(xen_start_info.nr_pt_frames * PAGE_SIZE);
570
	/*
	 * Calculate how much space we need.
	 * First, everything mapped before the Xen bootstrap tables.
	 */
575	mapsize = init_tables - KERNTEXTOFF;
576	/* after the tables we'll have:
577	 *  - UAREA
578	 *  - dummy user PGD (x86_64)
579	 *  - HYPERVISOR_shared_info
580	 *  - ISA I/O mem (if needed)
581	 */
582	mapsize += UPAGES * NBPG;
583#ifdef __x86_64__
584	mapsize += NBPG;
585#endif
586	mapsize += NBPG;
587
588#ifdef DOM0OPS
589	if (xendomain_is_dom0()) {
590		/* space for ISA I/O mem */
591		mapsize += IOM_SIZE;
592	}
593#endif
	/* At this point mapsize doesn't include the table size. */
595
596#ifdef __x86_64__
597	count = TABLE_L2_ENTRIES;
598#else
599	count = (mapsize + (NBPD_L2 -1)) >> L2_SHIFT;
600#endif /* __x86_64__ */
601
602	/* now compute how many L2 pages we need exactly */
603	XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
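	/*
	 * i.e. grow count until count L2 entries (one PTE page each,
	 * starting at KERNBASE) cover everything to be mapped, including
	 * the new page table pages themselves.
	 */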
604	while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
605	    ((long)count << L2_SHIFT) + KERNBASE) {
606		count++;
607	}
608#ifndef __x86_64__
609	/*
	 * one more L2 page: we'll allocate several pages after kva_start
611	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
612	 * counted here. It's not a big issue to allocate one more L2 as
613	 * pmap_growkernel() will be called anyway.
614	 */
615	count++;
616	nkptp[1] = count;
617#endif
618
	/*
	 * Install the bootstrap tables.  They may need more L2 pages than
	 * the final tables will, as they are installed after the final tables.
	 */
623	oldcount = count;
624
625bootstrap_again:
626	XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables further up if necessary.
	 */
631	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
632		bootstrap_tables = init_tables +
633					((count + l2_4_count) * PAGE_SIZE);
634	/* make sure we have enough to map the bootstrap_tables */
635	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
636	    ((long)oldcount << L2_SHIFT) + KERNBASE) {
637		oldcount++;
638		goto bootstrap_again;
639	}
640
641	/* Create temporary tables */
642	xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
643		xen_start_info.nr_pt_frames, oldcount, 0);
644
645	/* Create final tables */
646	xen_bootstrap_tables(bootstrap_tables, init_tables,
647	    oldcount + l2_4_count, count, 1);
648
649	/* zero out free space after tables */
650	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
651	    (UPAGES + 1) * NBPG);
652
653	/* Finally, flush TLB. */
654	xpq_queue_tlb_flush();
655
656	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
657
658	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
659}
660
/*
 * Build a new set of page tables and switch to them.
 * old_count is the # of old table pages (including PGD, PDTPE and PDE),
 * new_count is the # of new table pages (PTEs only).
 * We assume the areas don't overlap.
 */
667static void
668xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
669	int old_count, int new_count, int final)
670{
671	pd_entry_t *pdtpe, *pde, *pte;
672	pd_entry_t *cur_pgd, *bt_pgd;
673	paddr_t addr;
674	vaddr_t page, avail, text_end, map_end;
675	int i;
676	extern char __data_start;
677
678	__PRINTK(("xen_bootstrap_tables(%#" PRIxVADDR ", %#" PRIxVADDR ","
679	    " %d, %d)\n",
680	    old_pgd, new_pgd, old_count, new_count));
681	text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
682	/*
683	 * size of R/W area after kernel text:
684	 *  xencons_interface (if present)
685	 *  xenstore_interface (if present)
686	 *  table pages (new_count + l2_4_count entries)
687	 * extra mappings (only when final is true):
688	 *  UAREA
689	 *  dummy user PGD (x86_64 only)/gdt page (i386 only)
690	 *  HYPERVISOR_shared_info
691	 *  ISA I/O mem (if needed)
692	 */
693	map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
694	if (final) {
695		map_end += (UPAGES + 1) * NBPG;
696		HYPERVISOR_shared_info = (shared_info_t *)map_end;
697		map_end += NBPG;
698	}
699	/*
700	 * we always set atdevbase, as it's used by init386 to find the first
701	 * available VA. map_end is updated only if we are dom0, so
702	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
703	 * this case.
704	 */
705	if (final)
706		atdevbase = map_end;
707#ifdef DOM0OPS
708	if (final && xendomain_is_dom0()) {
709		/* ISA I/O mem */
710		map_end += IOM_SIZE;
711	}
712#endif /* DOM0OPS */
713
714	__PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
715	    text_end, map_end));
716	__PRINTK(("console %#lx ", xen_start_info.console_mfn));
717	__PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));
718
719	/*
720	 * Create bootstrap page tables
721	 * What we need:
722	 * - a PGD (level 4)
723	 * - a PDTPE (level 3)
	 * - a PDE (level 2)
725	 * - some PTEs (level 1)
726	 */
727
728	cur_pgd = (pd_entry_t *) old_pgd;
729	bt_pgd = (pd_entry_t *) new_pgd;
730	memset (bt_pgd, 0, PAGE_SIZE);
731	avail = new_pgd + PAGE_SIZE;
732#if PTP_LEVELS > 3
733	/* per-cpu L4 PD */
734	pd_entry_t *bt_cpu_pgd = bt_pgd;
735	/* pmap_kernel() "shadow" L4 PD */
736	bt_pgd = (pd_entry_t *) avail;
737	memset(bt_pgd, 0, PAGE_SIZE);
738	avail += PAGE_SIZE;
739
740	/* Install level 3 */
741	pdtpe = (pd_entry_t *) avail;
742	memset (pdtpe, 0, PAGE_SIZE);
743	avail += PAGE_SIZE;
744
745	addr = ((u_long) pdtpe) - KERNBASE;
746	bt_pgd[pl4_pi(KERNTEXTOFF)] = bt_cpu_pgd[pl4_pi(KERNTEXTOFF)] =
747	    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
748
749	__PRINTK(("L3 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
750	    " -> L4[%#x]\n",
751	    pdtpe, addr, bt_pgd[pl4_pi(KERNTEXTOFF)], pl4_pi(KERNTEXTOFF)));
752#else
753	pdtpe = bt_pgd;
754#endif /* PTP_LEVELS > 3 */
755
756#if PTP_LEVELS > 2
757	/* Level 2 */
758	pde = (pd_entry_t *) avail;
759	memset(pde, 0, PAGE_SIZE);
760	avail += PAGE_SIZE;
761
762	addr = ((u_long) pde) - KERNBASE;
763	pdtpe[pl3_pi(KERNTEXTOFF)] =
764	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
765	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
766	    " -> L3[%#x]\n",
767	    pde, addr, pdtpe[pl3_pi(KERNTEXTOFF)], pl3_pi(KERNTEXTOFF)));
768#elif defined(PAE)
	/* Our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow). */
770	pde = (pd_entry_t *) avail;
771	memset(pde, 0, PAGE_SIZE * 5);
772	avail += PAGE_SIZE * 5;
773	addr = ((u_long) pde) - KERNBASE;
	/*
	 * Enter the L2 pages into the L3.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
779	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries; it will add
		 * them itself.
		 */
784		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
785		__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
786		    " -> L3[%#x]\n",
787		    (vaddr_t)pde + PAGE_SIZE * i, addr, pdtpe[i], i));
788	}
789	addr += PAGE_SIZE;
790	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
791	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
792	    " -> L3[%#x]\n",
793	    (vaddr_t)pde + PAGE_SIZE * 4, addr, pdtpe[3], 3));
794
795#else /* PAE */
796	pde = bt_pgd;
797#endif /* PTP_LEVELS > 2 */
798
799	/* Level 1 */
800	page = KERNTEXTOFF;
801	for (i = 0; i < new_count; i ++) {
802		vaddr_t cur_page = page;
803
804		pte = (pd_entry_t *) avail;
805		avail += PAGE_SIZE;
806
807		memset(pte, 0, PAGE_SIZE);
808		while (pl2_pi(page) == pl2_pi (cur_page)) {
809			if (page >= map_end) {
810				/* not mapped at all */
811				pte[pl1_pi(page)] = 0;
812				page += PAGE_SIZE;
813				continue;
814			}
815			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
816			if (page == (vaddr_t)HYPERVISOR_shared_info) {
817				pte[pl1_pi(page)] = xen_start_info.shared_info;
818				__PRINTK(("HYPERVISOR_shared_info "
819				    "va %#lx pte %#" PRIxPADDR "\n",
820				    HYPERVISOR_shared_info, pte[pl1_pi(page)]));
821			}
822			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
823			    == xen_start_info.console.domU.mfn) {
824				xencons_interface = (void *)page;
825				pte[pl1_pi(page)] = xen_start_info.console_mfn;
826				pte[pl1_pi(page)] <<= PAGE_SHIFT;
827				__PRINTK(("xencons_interface "
828				    "va %#lx pte %#" PRIxPADDR "\n",
829				    xencons_interface, pte[pl1_pi(page)]));
830			}
831			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
832			    == xen_start_info.store_mfn) {
833				xenstore_interface = (void *)page;
834				pte[pl1_pi(page)] = xen_start_info.store_mfn;
835				pte[pl1_pi(page)] <<= PAGE_SHIFT;
836				__PRINTK(("xenstore_interface "
837				    "va %#lx pte %#" PRIxPADDR "\n",
838				    xenstore_interface, pte[pl1_pi(page)]));
839			}
840#ifdef DOM0OPS
841			if (page >= (vaddr_t)atdevbase &&
842			    page < (vaddr_t)atdevbase + IOM_SIZE) {
843				pte[pl1_pi(page)] =
844				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
845			}
846#endif
847			pte[pl1_pi(page)] |= PG_k | PG_V;
848			if (page < text_end) {
849				/* map kernel text RO */
850				pte[pl1_pi(page)] |= 0;
851			} else if (page >= old_pgd
852			    && page < old_pgd + (old_count * PAGE_SIZE)) {
853				/* map old page tables RO */
854				pte[pl1_pi(page)] |= 0;
855			} else if (page >= new_pgd &&
856			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
857				/* map new page tables RO */
858				pte[pl1_pi(page)] |= 0;
859			} else {
860				/* map page RW */
861				pte[pl1_pi(page)] |= PG_RW;
862			}
863
864			if ((page  >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE))
865			    || page >= new_pgd) {
866				__PRINTK(("va %#lx pa %#lx "
867				    "entry 0x%" PRIxPADDR " -> L1[%#x]\n",
868				    page, page - KERNBASE,
869				    pte[pl1_pi(page)], pl1_pi(page)));
870			}
871			page += PAGE_SIZE;
872		}
873
874		addr = ((u_long) pte) - KERNBASE;
875		pde[pl2_pi(cur_page)] =
876		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
877		__PRINTK(("L1 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
878		    " -> L2[%#x]\n",
879		    pte, addr, pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
880		/* Mark readonly */
881		xen_bt_set_readonly((vaddr_t) pte);
882	}
883
884	/* Install recursive page tables mapping */
885#ifdef PAE
	/*
	 * We need a shadow page for the kernel's L2 page.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
891	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
892	cpu_info_primary.ci_kpm_pdir = &pde[L2_SLOT_KERN + NPDPG];
893	cpu_info_primary.ci_kpm_pdirpa =
894	    (vaddr_t) cpu_info_primary.ci_kpm_pdir - KERNBASE;
895
	/*
	 * We don't enter a recursive entry from the L3 PD. Instead,
	 * we enter the first 4 L2 pages, which include the kernel's L2
	 * shadow. But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTEs with the wrong type.
	 */
902	addr = (u_long)pde - KERNBASE;
903	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
904		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
905		__PRINTK(("pde[%d] va %#" PRIxVADDR " pa %#" PRIxPADDR
906		    " entry %#" PRIxPADDR "\n",
907		    (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i,
908		    addr, pde[PDIR_SLOT_PTE + i]));
909	}
910#if 0
911	addr += PAGE_SIZE; /* point to shadow L2 */
912	pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
913	__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
914	    (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
915	    (int64_t)pde[PDIR_SLOT_PTE + 3]));
916#endif
917	/* Mark tables RO, and pin the kernel's shadow as L2 */
918	addr = (u_long)pde - KERNBASE;
919	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
920		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
921		if (i == 2 || i == 3)
922			continue;
923#if 0
924		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
925		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
926#endif
927	}
928	if (final) {
929		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
930		__PRINTK(("pin L2 %d addr %#" PRIxPADDR "\n", 2, addr));
931		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
932	}
933#if 0
934	addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
935	__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
936	xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
937#endif
938#else /* PAE */
939	/* recursive entry in higher-level per-cpu PD and pmap_kernel() */
	bt_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked((paddr_t)bt_pgd - KERNBASE) | PG_k | PG_V;
941#ifdef __x86_64__
	bt_cpu_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked((paddr_t)bt_cpu_pgd - KERNBASE) | PG_k | PG_V;
944#endif /* __x86_64__ */
945	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] va %#" PRIxVADDR " pa %#" PRIxPADDR
946	    " entry %#" PRIxPADDR "\n", new_pgd, (paddr_t)new_pgd - KERNBASE,
947	    bt_pgd[PDIR_SLOT_PTE]));
948	/* Mark tables RO */
949	xen_bt_set_readonly((vaddr_t) pde);
950#endif
951#if PTP_LEVELS > 2 || defined(PAE)
952	xen_bt_set_readonly((vaddr_t) pdtpe);
953#endif
954#if PTP_LEVELS > 3
955	xen_bt_set_readonly(new_pgd);
956#endif
957	/* Pin the PGD */
958	__PRINTK(("pin PGD: %"PRIxVADDR"\n", new_pgd - KERNBASE));
959#ifdef __x86_64__
960	xpq_queue_pin_l4_table(xpmap_ptom_masked(new_pgd - KERNBASE));
961#elif PAE
962	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
963#else
964	xpq_queue_pin_l2_table(xpmap_ptom_masked(new_pgd - KERNBASE));
965#endif
966
967	/* Save phys. addr of PDP, for libkvm. */
968#ifdef PAE
969	PDPpaddr = (u_long)pde - KERNBASE; /* PDP is the L2 with PAE */
970#else
971	PDPpaddr = (u_long)bt_pgd - KERNBASE;
972#endif
973
974	/* Switch to new tables */
975	__PRINTK(("switch to PGD\n"));
976	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
977	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry %#" PRIxPADDR "\n",
978	    bt_pgd[PDIR_SLOT_PTE]));
979
980#ifdef PAE
981	if (final) {
982		/* save the address of the L3 page */
983		cpu_info_primary.ci_pae_l3_pdir = pdtpe;
984		cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);
985
986		/* now enter kernel's PTE mappings */
987		addr =  (u_long)pde - KERNBASE + PAGE_SIZE * 3;
988		xpq_queue_pte_update(
989		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
990		    xpmap_ptom_masked(addr) | PG_k | PG_V);
991		xpq_flush_queue();
992	}
993#elif defined(__x86_64__)
994	if (final) {
995		/* save the address of the real per-cpu L4 pgd page */
996		cpu_info_primary.ci_kpm_pdir = bt_cpu_pgd;
997		cpu_info_primary.ci_kpm_pdirpa = ((paddr_t) bt_cpu_pgd - KERNBASE);
998	}
999#endif
1000
1001	/* Now we can safely reclaim space taken by old tables */
1002
1003	__PRINTK(("unpin old PGD\n"));
1004	/* Unpin old PGD */
1005	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
1006	/* Mark old tables RW */
1007	page = old_pgd;
1008	addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
1009	addr = xpmap_mtop(addr);
1010	pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
1011	pte += pl1_pi(page);
1012	__PRINTK(("*pde %#" PRIxPADDR " addr %#" PRIxPADDR " pte %#lx\n",
1013	    pde[pl2_pi(page)], addr, (long)pte));
1014	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
1015		addr = xpmap_ptom(((u_long) pte) - KERNBASE);
1016		XENPRINTK(("addr %#" PRIxPADDR " pte %#lx "
1017		   "*pte %#" PRIxPADDR "\n",
1018		   addr, (long)pte, *pte));
1019		xpq_queue_pte_update(addr, *pte | PG_RW);
1020		page += PAGE_SIZE;
1021		/*
1022		 * Our ptes are contiguous
1023		 * so it's safe to just "++" here
1024		 */
1025		pte++;
1026	}
1027	xpq_flush_queue();
1028}
1029
1030
1031/*
1032 * Bootstrap helper functions
1033 */
1034
1035/*
1036 * Mark a page readonly
1037 * XXX: assuming vaddr = paddr + KERNBASE
1038 */
1039
1040static void
1041xen_bt_set_readonly (vaddr_t page)
1042{
1043	pt_entry_t entry;
1044
1045	entry = xpmap_ptom_masked(page - KERNBASE);
1046	entry |= PG_k | PG_V;
1047
1048	HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
1049}
1050
1051#ifdef __x86_64__
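/* Install a new user-mode L4 page directory (MMUEXT_NEW_USER_BASEPTR). */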
1052void
1053xen_set_user_pgd(paddr_t page)
1054{
1055	struct mmuext_op op;
1056	int s = splvm();
1057
1058	xpq_flush_queue();
1059	op.cmd = MMUEXT_NEW_USER_BASEPTR;
1060	op.arg1.mfn = pfn_to_mfn(page >> PAGE_SHIFT);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
1062		panic("xen_set_user_pgd: failed to install new user page"
1063			" directory %#" PRIxPADDR, page);
1064	splx(s);
1065}
1066#endif /* __x86_64__ */
1067