x86_xpmap.c revision 1.13
1/* $NetBSD: x86_xpmap.c,v 1.13 2009/06/20 10:24:28 cegger Exp $ */ 2 3/* 4 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19/* 20 * Copyright (c) 2006, 2007 Manuel Bouyer. 21 * 22 * Redistribution and use in source and binary forms, with or without 23 * modification, are permitted provided that the following conditions 24 * are met: 25 * 1. Redistributions of source code must retain the above copyright 26 * notice, this list of conditions and the following disclaimer. 27 * 2. Redistributions in binary form must reproduce the above copyright 28 * notice, this list of conditions and the following disclaimer in the 29 * documentation and/or other materials provided with the distribution. 30 * 3. All advertising materials mentioning features or use of this software 31 * must display the following acknowledgement: 32 * This product includes software developed by Manuel Bouyer. 33 * 4. The name of the author may not be used to endorse or promote products 34 * derived from this software without specific prior written permission. 35 * 36 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 37 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 39 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 40 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 42 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 43 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 44 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 45 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 46 * 47 */ 48 49/* 50 * 51 * Copyright (c) 2004 Christian Limpach. 52 * All rights reserved. 53 * 54 * Redistribution and use in source and binary forms, with or without 55 * modification, are permitted provided that the following conditions 56 * are met: 57 * 1. Redistributions of source code must retain the above copyright 58 * notice, this list of conditions and the following disclaimer. 59 * 2. Redistributions in binary form must reproduce the above copyright 60 * notice, this list of conditions and the following disclaimer in the 61 * documentation and/or other materials provided with the distribution. 62 * 3. All advertising materials mentioning features or use of this software 63 * must display the following acknowledgement: 64 * This product includes software developed by Christian Limpach. 65 * 4. The name of the author may not be used to endorse or promote products 66 * derived from this software without specific prior written permission. 67 * 68 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 69 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 70 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 71 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 72 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 73 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 74 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 75 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 76 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 77 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 78 */ 79 80 81#include <sys/cdefs.h> 82__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.13 2009/06/20 10:24:28 cegger Exp $"); 83 84#include "opt_xen.h" 85#include "opt_ddb.h" 86#include "ksyms.h" 87 88#include <sys/param.h> 89#include <sys/systm.h> 90 91#include <uvm/uvm.h> 92 93#include <machine/pmap.h> 94#include <machine/gdt.h> 95#include <xen/xenfunc.h> 96 97#include <dev/isa/isareg.h> 98#include <machine/isa_machdep.h> 99 100#undef XENDEBUG 101/* #define XENDEBUG_SYNC */ 102/* #define XENDEBUG_LOW */ 103 104#ifdef XENDEBUG 105#define XENPRINTF(x) printf x 106#define XENPRINTK(x) printk x 107#define XENPRINTK2(x) /* printk x */ 108 109static char XBUF[256]; 110#else 111#define XENPRINTF(x) 112#define XENPRINTK(x) 113#define XENPRINTK2(x) 114#endif 115#define PRINTF(x) printf x 116#define PRINTK(x) printk x 117 118/* on x86_64 kernel runs in ring 3 */ 119#ifdef __x86_64__ 120#define PG_k PG_u 121#else 122#define PG_k 0 123#endif 124 125volatile shared_info_t *HYPERVISOR_shared_info; 126/* Xen requires the start_info struct to be page aligned */ 127union start_info_union start_info_union __aligned(PAGE_SIZE); 128unsigned long *xpmap_phys_to_machine_mapping; 129 130void xen_failsafe_handler(void); 131 132#ifdef XEN3 133#define HYPERVISOR_mmu_update_self(req, count, success_count) \ 134 HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF) 135#else 136#define HYPERVISOR_mmu_update_self(req, count, success_count) \ 137 HYPERVISOR_mmu_update((req), (count), (success_count)) 138#endif 139 140void 141xen_failsafe_handler(void) 142{ 143 144 panic("xen_failsafe_handler called!\n"); 145} 146 147 148void 149xen_set_ldt(vaddr_t base, uint32_t entries) 150{ 151 vaddr_t va; 152 vaddr_t end; 153 pt_entry_t *ptp; 154 int s; 155 156#ifdef __x86_64__ 157 end = base + (entries << 3); 158#else 159 end = base + entries * sizeof(union descriptor); 160#endif 161 162 for (va = base; va < end; va += PAGE_SIZE) { 163 KASSERT(va >= VM_MIN_KERNEL_ADDRESS); 164 ptp = kvtopte(va); 165 XENPRINTF(("xen_set_ldt %p %d %p\n", (void *)base, 166 entries, ptp)); 167 pmap_pte_clearbits(ptp, PG_RW); 168 } 169 s = splvm(); 170 xpq_queue_set_ldt(base, entries); 171 xpq_flush_queue(); 172 splx(s); 173} 174 175#ifdef XENDEBUG 176void xpq_debug_dump(void); 177#endif 178 179#define XPQUEUE_SIZE 2048 180static mmu_update_t xpq_queue[XPQUEUE_SIZE]; 181static int xpq_idx = 0; 182 183void 184xpq_flush_queue(void) 185{ 186 int i, ok; 187 188 XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx)); 189 for (i = 0; i < xpq_idx; i++) 190 XENPRINTK2(("%d: %p %08" PRIx64 "\n", i, 191 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val)); 192 if (xpq_idx != 0 && 193 HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok) < 0) { 194 printf("xpq_flush_queue: %d entries \n", xpq_idx); 195 for (i = 0; i < xpq_idx; i++) 196 printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n", 197 (uint64_t)xpq_queue[i].ptr, 198 (uint64_t)xpq_queue[i].val); 199 panic("HYPERVISOR_mmu_update failed\n"); 200 } 201 xpq_idx = 0; 202} 203 204static inline void 205xpq_increment_idx(void) 206{ 207 208 xpq_idx++; 209 if (__predict_false(xpq_idx == XPQUEUE_SIZE)) 210 xpq_flush_queue(); 211} 212 213void 214xpq_queue_machphys_update(paddr_t ma, paddr_t pa) 215{ 216 XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64 217 "\n", (int64_t)ma, (int64_t)pa)); 218 xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE; 219 xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT; 220 xpq_increment_idx(); 221#ifdef XENDEBUG_SYNC 222 xpq_flush_queue(); 223#endif 224} 225 226void 227xpq_queue_pte_update(paddr_t ptr, pt_entry_t val) 228{ 229 230 KASSERT((ptr & 3) == 0); 231 xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE; 232 xpq_queue[xpq_idx].val = val; 233 xpq_increment_idx(); 234#ifdef XENDEBUG_SYNC 235 xpq_flush_queue(); 236#endif 237} 238 239#ifdef XEN3 240void 241xpq_queue_pt_switch(paddr_t pa) 242{ 243 struct mmuext_op op; 244 xpq_flush_queue(); 245 246 XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n", 247 (int64_t)pa, (int64_t)pa)); 248 op.cmd = MMUEXT_NEW_BASEPTR; 249 op.arg1.mfn = pa >> PAGE_SHIFT; 250 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 251 panic("xpq_queue_pt_switch"); 252} 253 254void 255xpq_queue_pin_table(paddr_t pa) 256{ 257 struct mmuext_op op; 258 xpq_flush_queue(); 259 260 XENPRINTK2(("xpq_queue_pin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n", 261 (int64_t)pa, (int64_t)pa)); 262 op.arg1.mfn = pa >> PAGE_SHIFT; 263 264#if defined(__x86_64__) 265 op.cmd = MMUEXT_PIN_L4_TABLE; 266#else 267 op.cmd = MMUEXT_PIN_L2_TABLE; 268#endif 269 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 270 panic("xpq_queue_pin_table"); 271} 272 273#ifdef PAE 274static void 275xpq_queue_pin_l3_table(paddr_t pa) 276{ 277 struct mmuext_op op; 278 xpq_flush_queue(); 279 280 XENPRINTK2(("xpq_queue_pin_l2_table: 0x%" PRIx64 " 0x%" PRIx64 "\n", 281 (int64_t)pa, (int64_t)pa)); 282 op.arg1.mfn = pa >> PAGE_SHIFT; 283 284 op.cmd = MMUEXT_PIN_L3_TABLE; 285 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 286 panic("xpq_queue_pin_table"); 287} 288#endif 289 290void 291xpq_queue_unpin_table(paddr_t pa) 292{ 293 struct mmuext_op op; 294 xpq_flush_queue(); 295 296 XENPRINTK2(("xpq_queue_unpin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n", 297 (int64_t)pa, (int64_t)pa)); 298 op.arg1.mfn = pa >> PAGE_SHIFT; 299 op.cmd = MMUEXT_UNPIN_TABLE; 300 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 301 panic("xpq_queue_unpin_table"); 302} 303 304void 305xpq_queue_set_ldt(vaddr_t va, uint32_t entries) 306{ 307 struct mmuext_op op; 308 xpq_flush_queue(); 309 310 XENPRINTK2(("xpq_queue_set_ldt\n")); 311 KASSERT(va == (va & ~PAGE_MASK)); 312 op.cmd = MMUEXT_SET_LDT; 313 op.arg1.linear_addr = va; 314 op.arg2.nr_ents = entries; 315 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 316 panic("xpq_queue_set_ldt"); 317} 318 319void 320xpq_queue_tlb_flush(void) 321{ 322 struct mmuext_op op; 323 xpq_flush_queue(); 324 325 XENPRINTK2(("xpq_queue_tlb_flush\n")); 326 op.cmd = MMUEXT_TLB_FLUSH_LOCAL; 327 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 328 panic("xpq_queue_tlb_flush"); 329} 330 331void 332xpq_flush_cache(void) 333{ 334 struct mmuext_op op; 335 int s = splvm(); 336 xpq_flush_queue(); 337 338 XENPRINTK2(("xpq_queue_flush_cache\n")); 339 op.cmd = MMUEXT_FLUSH_CACHE; 340 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 341 panic("xpq_flush_cache"); 342 splx(s); 343} 344 345void 346xpq_queue_invlpg(vaddr_t va) 347{ 348 struct mmuext_op op; 349 xpq_flush_queue(); 350 351 XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va)); 352 op.cmd = MMUEXT_INVLPG_LOCAL; 353 op.arg1.linear_addr = (va & ~PAGE_MASK); 354 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 355 panic("xpq_queue_invlpg"); 356} 357 358int 359xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom) 360{ 361 mmu_update_t op; 362 int ok; 363 xpq_flush_queue(); 364 365 op.ptr = ptr; 366 op.val = val; 367 if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0) 368 return EFAULT; 369 return (0); 370} 371#else /* XEN3 */ 372void 373xpq_queue_pt_switch(paddr_t pa) 374{ 375 376 XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa)); 377 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND; 378 xpq_queue[xpq_idx].val = MMUEXT_NEW_BASEPTR; 379 xpq_increment_idx(); 380} 381 382void 383xpq_queue_pin_table(paddr_t pa) 384{ 385 386 XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa)); 387 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND; 388 xpq_queue[xpq_idx].val = MMUEXT_PIN_L2_TABLE; 389 xpq_increment_idx(); 390} 391 392void 393xpq_queue_unpin_table(paddr_t pa) 394{ 395 396 XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa)); 397 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND; 398 xpq_queue[xpq_idx].val = MMUEXT_UNPIN_TABLE; 399 xpq_increment_idx(); 400} 401 402void 403xpq_queue_set_ldt(vaddr_t va, uint32_t entries) 404{ 405 406 XENPRINTK2(("xpq_queue_set_ldt\n")); 407 KASSERT(va == (va & ~PAGE_MASK)); 408 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND | va; 409 xpq_queue[xpq_idx].val = MMUEXT_SET_LDT | (entries << MMUEXT_CMD_SHIFT); 410 xpq_increment_idx(); 411} 412 413void 414xpq_queue_tlb_flush(void) 415{ 416 417 XENPRINTK2(("xpq_queue_tlb_flush\n")); 418 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND; 419 xpq_queue[xpq_idx].val = MMUEXT_TLB_FLUSH; 420 xpq_increment_idx(); 421} 422 423void 424xpq_flush_cache(void) 425{ 426 int s = splvm(); 427 428 XENPRINTK2(("xpq_queue_flush_cache\n")); 429 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND; 430 xpq_queue[xpq_idx].val = MMUEXT_FLUSH_CACHE; 431 xpq_increment_idx(); 432 xpq_flush_queue(); 433 splx(s); 434} 435 436void 437xpq_queue_invlpg(vaddr_t va) 438{ 439 440 XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va)); 441 xpq_queue[xpq_idx].ptr = (va & ~PAGE_MASK) | MMU_EXTENDED_COMMAND; 442 xpq_queue[xpq_idx].val = MMUEXT_INVLPG; 443 xpq_increment_idx(); 444} 445 446int 447xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom) 448{ 449 mmu_update_t xpq_up[3]; 450 451 xpq_up[0].ptr = MMU_EXTENDED_COMMAND; 452 xpq_up[0].val = MMUEXT_SET_FOREIGNDOM | (dom << 16); 453 xpq_up[1].ptr = ptr; 454 xpq_up[1].val = val; 455 if (HYPERVISOR_mmu_update_self(xpq_up, 2, NULL) < 0) 456 return EFAULT; 457 return (0); 458} 459#endif /* XEN3 */ 460 461#ifdef XENDEBUG 462void 463xpq_debug_dump(void) 464{ 465 int i; 466 467 XENPRINTK2(("idx: %d\n", xpq_idx)); 468 for (i = 0; i < xpq_idx; i++) { 469 snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64, 470 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val); 471 if (++i < xpq_idx) 472 snprintf(XBUF + strlen(XBUF), 473 sizeof(XBUF) - strlen(XBUF), 474 "%" PRIx64 " %08" PRIx64, 475 (uint64_t)xpq_queue[i].ptr, 476 (uint64_t)xpq_queue[i].val); 477 if (++i < xpq_idx) 478 snprintf(XBUF + strlen(XBUF), 479 sizeof(XBUF) - strlen(XBUF), 480 "%" PRIx64 " %08" PRIx64, 481 (uint64_t)xpq_queue[i].ptr, 482 (uint64_t)xpq_queue[i].val); 483 if (++i < xpq_idx) 484 snprintf(XBUF + strlen(XBUF), 485 sizeof(XBUF) - strlen(XBUF), 486 "%" PRIx64 " %08" PRIx64, 487 (uint64_t)xpq_queue[i].ptr, 488 (uint64_t)xpq_queue[i].val); 489 XENPRINTK2(("%d: %s\n", xpq_idx, XBUF)); 490 } 491} 492#endif 493 494 495extern volatile struct xencons_interface *xencons_interface; /* XXX */ 496extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 497 498static void xen_bt_set_readonly (vaddr_t); 499static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int); 500 501/* How many PDEs ? */ 502#if L2_SLOT_KERNBASE > 0 503#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1)) 504#else 505#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1) 506#endif 507 508/* 509 * Construct and switch to new pagetables 510 * first_avail is the first vaddr we can use after 511 * we get rid of Xen pagetables 512 */ 513 514vaddr_t xen_pmap_bootstrap (void); 515 516/* 517 * Function to get rid of Xen bootstrap tables 518 */ 519 520/* How many PDP do we need: */ 521#ifdef PAE 522/* 523 * For PAE, we consider a single contigous L2 "superpage" of 4 pages, 524 * all of them mapped by the L3 page. We also need a shadow page 525 * for L3[3]. 526 */ 527static const int l2_4_count = 6; 528#else 529static const int l2_4_count = PTP_LEVELS - 1; 530#endif 531 532vaddr_t 533xen_pmap_bootstrap(void) 534{ 535 int count, oldcount; 536 long mapsize; 537 vaddr_t bootstrap_tables, init_tables; 538 539 xpmap_phys_to_machine_mapping = 540 (unsigned long *)xen_start_info.mfn_list; 541 init_tables = xen_start_info.pt_base; 542 __PRINTK(("xen_arch_pmap_bootstrap init_tables=0x%lx\n", init_tables)); 543 544 /* Space after Xen boostrap tables should be free */ 545 bootstrap_tables = xen_start_info.pt_base + 546 (xen_start_info.nr_pt_frames * PAGE_SIZE); 547 548 /* 549 * Calculate how many space we need 550 * first everything mapped before the Xen bootstrap tables 551 */ 552 mapsize = init_tables - KERNTEXTOFF; 553 /* after the tables we'll have: 554 * - UAREA 555 * - dummy user PGD (x86_64) 556 * - HYPERVISOR_shared_info 557 * - ISA I/O mem (if needed) 558 */ 559 mapsize += UPAGES * NBPG; 560#ifdef __x86_64__ 561 mapsize += NBPG; 562#endif 563 mapsize += NBPG; 564 565#ifdef DOM0OPS 566 if (xendomain_is_dom0()) { 567 /* space for ISA I/O mem */ 568 mapsize += IOM_SIZE; 569 } 570#endif 571 /* at this point mapsize doens't include the table size */ 572 573#ifdef __x86_64__ 574 count = TABLE_L2_ENTRIES; 575#else 576 count = (mapsize + (NBPD_L2 -1)) >> L2_SHIFT; 577#endif /* __x86_64__ */ 578 579 /* now compute how many L2 pages we need exactly */ 580 XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count)); 581 while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF > 582 ((long)count << L2_SHIFT) + KERNBASE) { 583 count++; 584 } 585#ifndef __x86_64__ 586 /* 587 * one more L2 page: we'll alocate several pages after kva_start 588 * in pmap_bootstrap() before pmap_growkernel(), which have not been 589 * counted here. It's not a big issue to allocate one more L2 as 590 * pmap_growkernel() will be called anyway. 591 */ 592 count++; 593 nkptp[1] = count; 594#endif 595 596 /* 597 * install bootstrap pages. We may need more L2 pages than will 598 * have the final table here, as it's installed after the final table 599 */ 600 oldcount = count; 601 602bootstrap_again: 603 XENPRINTK(("bootstrap_again oldcount %d\n", oldcount)); 604 /* 605 * Xen space we'll reclaim may not be enough for our new page tables, 606 * move bootstrap tables if necessary 607 */ 608 if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE)) 609 bootstrap_tables = init_tables + 610 ((count + l2_4_count) * PAGE_SIZE); 611 /* make sure we have enough to map the bootstrap_tables */ 612 if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) > 613 ((long)oldcount << L2_SHIFT) + KERNBASE) { 614 oldcount++; 615 goto bootstrap_again; 616 } 617 618 /* Create temporary tables */ 619 xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables, 620 xen_start_info.nr_pt_frames, oldcount, 0); 621 622 /* Create final tables */ 623 xen_bootstrap_tables(bootstrap_tables, init_tables, 624 oldcount + l2_4_count, count, 1); 625 626 /* zero out free space after tables */ 627 memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0, 628 (UPAGES + 1) * NBPG); 629 return (init_tables + ((count + l2_4_count) * PAGE_SIZE)); 630} 631 632 633/* 634 * Build a new table and switch to it 635 * old_count is # of old tables (including PGD, PDTPE and PDE) 636 * new_count is # of new tables (PTE only) 637 * we assume areas don't overlap 638 */ 639 640 641static void 642xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd, 643 int old_count, int new_count, int final) 644{ 645 pd_entry_t *pdtpe, *pde, *pte; 646 pd_entry_t *cur_pgd, *bt_pgd; 647 paddr_t addr; 648 vaddr_t page, avail, text_end, map_end; 649 int i; 650 extern char __data_start; 651 652 __PRINTK(("xen_bootstrap_tables(0x%lx, 0x%lx, %d, %d)\n", 653 old_pgd, new_pgd, old_count, new_count)); 654 text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK; 655 /* 656 * size of R/W area after kernel text: 657 * xencons_interface (if present) 658 * xenstore_interface (if present) 659 * table pages (new_count + l2_4_count entries) 660 * extra mappings (only when final is true): 661 * UAREA 662 * dummy user PGD (x86_64 only)/gdt page (i386 only) 663 * HYPERVISOR_shared_info 664 * ISA I/O mem (if needed) 665 */ 666 map_end = new_pgd + ((new_count + l2_4_count) * NBPG); 667 if (final) { 668 map_end += (UPAGES + 1) * NBPG; 669 HYPERVISOR_shared_info = (shared_info_t *)map_end; 670 map_end += NBPG; 671 } 672 /* 673 * we always set atdevbase, as it's used by init386 to find the first 674 * available VA. map_end is updated only if we are dom0, so 675 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in 676 * this case. 677 */ 678 if (final) 679 atdevbase = map_end; 680#ifdef DOM0OPS 681 if (final && xendomain_is_dom0()) { 682 /* ISA I/O mem */ 683 map_end += IOM_SIZE; 684 } 685#endif /* DOM0OPS */ 686 687 __PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n", 688 text_end, map_end)); 689 __PRINTK(("console 0x%lx ", xen_start_info.console.domU.mfn)); 690 __PRINTK(("xenstore 0x%lx\n", xen_start_info.store_mfn)); 691 692 /* 693 * Create bootstrap page tables 694 * What we need: 695 * - a PGD (level 4) 696 * - a PDTPE (level 3) 697 * - a PDE (level2) 698 * - some PTEs (level 1) 699 */ 700 701 cur_pgd = (pd_entry_t *) old_pgd; 702 bt_pgd = (pd_entry_t *) new_pgd; 703 memset (bt_pgd, 0, PAGE_SIZE); 704 avail = new_pgd + PAGE_SIZE; 705#if PTP_LEVELS > 3 706 /* Install level 3 */ 707 pdtpe = (pd_entry_t *) avail; 708 memset (pdtpe, 0, PAGE_SIZE); 709 avail += PAGE_SIZE; 710 711 addr = ((u_long) pdtpe) - KERNBASE; 712 bt_pgd[pl4_pi(KERNTEXTOFF)] = 713 xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V; 714 715 __PRINTK(("L3 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L4[0x%x]\n", 716 pdtpe, (uint64_t)addr, (uint64_t)bt_pgd[pl4_pi(KERNTEXTOFF)], 717 pl4_pi(KERNTEXTOFF))); 718#else 719 pdtpe = bt_pgd; 720#endif /* PTP_LEVELS > 3 */ 721 722#if PTP_LEVELS > 2 723 /* Level 2 */ 724 pde = (pd_entry_t *) avail; 725 memset(pde, 0, PAGE_SIZE); 726 avail += PAGE_SIZE; 727 728 addr = ((u_long) pde) - KERNBASE; 729 pdtpe[pl3_pi(KERNTEXTOFF)] = 730 xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW; 731 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L3[0x%x]\n", 732 pde, (int64_t)addr, (int64_t)pdtpe[pl3_pi(KERNTEXTOFF)], 733 pl3_pi(KERNTEXTOFF))); 734#elif defined(PAE) 735 /* our PAE-style level 2: 5 contigous pages (4 L2 + 1 shadow) */ 736 pde = (pd_entry_t *) avail; 737 memset(pde, 0, PAGE_SIZE * 5); 738 avail += PAGE_SIZE * 5; 739 addr = ((u_long) pde) - KERNBASE; 740 /* 741 * enter L2 pages in the L3. 742 * The real L2 kernel PD will be the last one (so that 743 * pde[L2_SLOT_KERN] always point to the shadow). 744 */ 745 for (i = 0; i < 3; i++, addr += PAGE_SIZE) { 746 /* 747 * Xen doens't want R/W mappings in L3 entries, it'll add it 748 * itself. 749 */ 750 pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V; 751 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 752 " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * i, 753 (int64_t)addr, (int64_t)pdtpe[i], i)); 754 } 755 addr += PAGE_SIZE; 756 pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V; 757 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 758 " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * 4, 759 (int64_t)addr, (int64_t)pdtpe[3], 3)); 760 761#else /* PAE */ 762 pde = bt_pgd; 763#endif /* PTP_LEVELS > 2 */ 764 765 /* Level 1 */ 766 page = KERNTEXTOFF; 767 for (i = 0; i < new_count; i ++) { 768 vaddr_t cur_page = page; 769 770 pte = (pd_entry_t *) avail; 771 avail += PAGE_SIZE; 772 773 memset(pte, 0, PAGE_SIZE); 774 while (pl2_pi(page) == pl2_pi (cur_page)) { 775 if (page >= map_end) { 776 /* not mapped at all */ 777 pte[pl1_pi(page)] = 0; 778 page += PAGE_SIZE; 779 continue; 780 } 781 pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE); 782 if (page == (vaddr_t)HYPERVISOR_shared_info) { 783 pte[pl1_pi(page)] = xen_start_info.shared_info; 784 __PRINTK(("HYPERVISOR_shared_info " 785 "va 0x%lx pte 0x%" PRIx64 "\n", 786 HYPERVISOR_shared_info, (int64_t)pte[pl1_pi(page)])); 787 } 788#ifdef XEN3 789 if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT) 790 == xen_start_info.console.domU.mfn) { 791 xencons_interface = (void *)page; 792 pte[pl1_pi(page)] = xen_start_info.console.domU.mfn; 793 pte[pl1_pi(page)] <<= PAGE_SHIFT; 794 __PRINTK(("xencons_interface " 795 "va 0x%lx pte 0x%" PRIx64 "\n", 796 xencons_interface, (int64_t)pte[pl1_pi(page)])); 797 } 798 if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT) 799 == xen_start_info.store_mfn) { 800 xenstore_interface = (void *)page; 801 pte[pl1_pi(page)] = xen_start_info.store_mfn; 802 pte[pl1_pi(page)] <<= PAGE_SHIFT; 803 __PRINTK(("xenstore_interface " 804 "va 0x%lx pte 0x%" PRIx64 "\n", 805 xenstore_interface, (int64_t)pte[pl1_pi(page)])); 806 } 807#endif /* XEN3 */ 808#ifdef DOM0OPS 809 if (page >= (vaddr_t)atdevbase && 810 page < (vaddr_t)atdevbase + IOM_SIZE) { 811 pte[pl1_pi(page)] = 812 IOM_BEGIN + (page - (vaddr_t)atdevbase); 813 } 814#endif 815 pte[pl1_pi(page)] |= PG_k | PG_V; 816 if (page < text_end) { 817 /* map kernel text RO */ 818 pte[pl1_pi(page)] |= 0; 819 } else if (page >= old_pgd 820 && page < old_pgd + (old_count * PAGE_SIZE)) { 821 /* map old page tables RO */ 822 pte[pl1_pi(page)] |= 0; 823 } else if (page >= new_pgd && 824 page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) { 825 /* map new page tables RO */ 826 pte[pl1_pi(page)] |= 0; 827 } else { 828 /* map page RW */ 829 pte[pl1_pi(page)] |= PG_RW; 830 } 831 832 if ((page >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE)) 833 || page >= new_pgd) { 834 __PRINTK(("va 0x%lx pa 0x%lx " 835 "entry 0x%" PRIx64 " -> L1[0x%x]\n", 836 page, page - KERNBASE, 837 (int64_t)pte[pl1_pi(page)], pl1_pi(page))); 838 } 839 page += PAGE_SIZE; 840 } 841 842 addr = ((u_long) pte) - KERNBASE; 843 pde[pl2_pi(cur_page)] = 844 xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V; 845 __PRINTK(("L1 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 846 " -> L2[0x%x]\n", pte, (int64_t)addr, 847 (int64_t)pde[pl2_pi(cur_page)], pl2_pi(cur_page))); 848 /* Mark readonly */ 849 xen_bt_set_readonly((vaddr_t) pte); 850 } 851 852 /* Install recursive page tables mapping */ 853#ifdef PAE 854 /* 855 * we need a shadow page for the kernel's L2 page 856 * The real L2 kernel PD will be the last one (so that 857 * pde[L2_SLOT_KERN] always point to the shadow. 858 */ 859 memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE); 860 pmap_kl2pd = &pde[L2_SLOT_KERN + NPDPG]; 861 pmap_kl2paddr = (u_long)pmap_kl2pd - KERNBASE; 862 863 /* 864 * We don't enter a recursive entry from the L3 PD. Instead, 865 * we enter the first 4 L2 pages, which includes the kernel's L2 866 * shadow. But we have to entrer the shadow after switching 867 * %cr3, or Xen will refcount some PTE with the wrong type. 868 */ 869 addr = (u_long)pde - KERNBASE; 870 for (i = 0; i < 3; i++, addr += PAGE_SIZE) { 871 pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V; 872 __PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n", 873 (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i, (long)addr, 874 (int64_t)pde[PDIR_SLOT_PTE + i])); 875 } 876#if 0 877 addr += PAGE_SIZE; /* point to shadow L2 */ 878 pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V; 879 __PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n", 880 (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr, 881 (int64_t)pde[PDIR_SLOT_PTE + 3])); 882#endif 883 /* Mark tables RO, and pin the kenrel's shadow as L2 */ 884 addr = (u_long)pde - KERNBASE; 885 for (i = 0; i < 5; i++, addr += PAGE_SIZE) { 886 xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i); 887 if (i == 2 || i == 3) 888 continue; 889#if 0 890 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr)); 891 xpq_queue_pin_table(xpmap_ptom_masked(addr)); 892#endif 893 } 894 if (final) { 895 addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE; 896 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr)); 897 xpq_queue_pin_table(xpmap_ptom_masked(addr)); 898 } 899#if 0 900 addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE; 901 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr)); 902 xpq_queue_pin_table(xpmap_ptom_masked(addr)); 903#endif 904#else /* PAE */ 905 /* recursive entry in higher-level PD */ 906 bt_pgd[PDIR_SLOT_PTE] = 907 xpmap_ptom_masked(new_pgd - KERNBASE) | PG_k | PG_V; 908 __PRINTK(("bt_pgd[PDIR_SLOT_PTE] va 0x%lx pa 0x%" PRIx64 909 " entry 0x%" PRIx64 "\n", new_pgd, (int64_t)new_pgd - KERNBASE, 910 (int64_t)bt_pgd[PDIR_SLOT_PTE])); 911 /* Mark tables RO */ 912 xen_bt_set_readonly((vaddr_t) pde); 913#endif 914#if PTP_LEVELS > 2 || defined(PAE) 915 xen_bt_set_readonly((vaddr_t) pdtpe); 916#endif 917#if PTP_LEVELS > 3 918 xen_bt_set_readonly(new_pgd); 919#endif 920 /* Pin the PGD */ 921 __PRINTK(("pin PDG\n")); 922#ifdef PAE 923 xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE)); 924#else 925 xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE)); 926#endif 927#ifdef __i386__ 928 /* Save phys. addr of PDP, for libkvm. */ 929 PDPpaddr = (long)pde; 930#ifdef PAE 931 /* also save the address of the L3 page */ 932 pmap_l3pd = pdtpe; 933 pmap_l3paddr = (new_pgd - KERNBASE); 934#endif /* PAE */ 935#endif /* i386 */ 936 /* Switch to new tables */ 937 __PRINTK(("switch to PDG\n")); 938 xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE)); 939 __PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry 0x%" PRIx64 "\n", 940 (int64_t)bt_pgd[PDIR_SLOT_PTE])); 941#ifdef PAE 942 if (final) { 943 /* now enter kernel's PTE mappings */ 944 addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3; 945 xpq_queue_pte_update( 946 xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE), 947 xpmap_ptom_masked(addr) | PG_k | PG_V); 948 xpq_flush_queue(); 949 } 950#endif 951 952 953 954 /* Now we can safely reclaim space taken by old tables */ 955 956 __PRINTK(("unpin old PDG\n")); 957 /* Unpin old PGD */ 958 xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE)); 959 /* Mark old tables RW */ 960 page = old_pgd; 961 addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME; 962 addr = xpmap_mtop(addr); 963 pte = (pd_entry_t *) ((u_long)addr + KERNBASE); 964 pte += pl1_pi(page); 965 __PRINTK(("*pde 0x%" PRIx64 " addr 0x%" PRIx64 " pte 0x%lx\n", 966 (int64_t)pde[pl2_pi(page)], (int64_t)addr, (long)pte)); 967 while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) { 968 addr = xpmap_ptom(((u_long) pte) - KERNBASE); 969 XENPRINTK(("addr 0x%" PRIx64 " pte 0x%lx *pte 0x%" PRIx64 "\n", 970 (int64_t)addr, (long)pte, (int64_t)*pte)); 971 xpq_queue_pte_update(addr, *pte | PG_RW); 972 page += PAGE_SIZE; 973 /* 974 * Our ptes are contiguous 975 * so it's safe to just "++" here 976 */ 977 pte++; 978 } 979 xpq_flush_queue(); 980} 981 982 983/* 984 * Bootstrap helper functions 985 */ 986 987/* 988 * Mark a page readonly 989 * XXX: assuming vaddr = paddr + KERNBASE 990 */ 991 992static void 993xen_bt_set_readonly (vaddr_t page) 994{ 995 pt_entry_t entry; 996 997 entry = xpmap_ptom_masked(page - KERNBASE); 998 entry |= PG_k | PG_V; 999 1000 HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG); 1001} 1002 1003#ifdef __x86_64__ 1004void 1005xen_set_user_pgd(paddr_t page) 1006{ 1007 struct mmuext_op op; 1008 int s = splvm(); 1009 1010 xpq_flush_queue(); 1011 op.cmd = MMUEXT_NEW_USER_BASEPTR; 1012 op.arg1.mfn = xpmap_phys_to_machine_mapping[page >> PAGE_SHIFT]; 1013 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) 1014 panic("xen_set_user_pgd: failed to install new user page" 1015 " directory %lx", page); 1016 splx(s); 1017} 1018#endif /* __x86_64__ */ 1019