/*	$NetBSD: x86_xpmap.c,v 1.87 2020/05/06 17:28:26 bouyer Exp $	*/

/*
 * Copyright (c) 2017 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.87 2020/05/06 17:28:26 bouyer Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

#include <uvm/uvm.h>

#include <x86/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#ifdef XENDEBUG
#define __PRINTK(x) printk x
#else
#define __PRINTK(x)
#endif

/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);

volatile shared_info_t *HYPERVISOR_shared_info __read_mostly;
unsigned long *xpmap_phys_to_machine_mapping __read_mostly;
kmutex_t pte_lock __cacheline_aligned;
vaddr_t xen_dummy_page;
pt_entry_t xpmap_pg_nx __read_mostly;

#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue_array[MAXCPUS][XPQUEUE_SIZE];

void xen_failsafe_handler(void);

extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly(vaddr_t);
static void xen_bootstrap_tables(vaddr_t, vaddr_t, size_t, size_t, bool);

vaddr_t xen_locore(void);

/*
 * kcpuset internally uses an array of uint32_t while Xen uses an array of
 * u_long. As we're little-endian, we can cast one to the other.
 */
typedef union {
#ifdef _LP64
        uint32_t xcpum_km[2];
#else
        uint32_t xcpum_km[1];
#endif
        u_long xcpum_xm;
} xcpumask_t;

void
xen_failsafe_handler(void)
{

        panic("xen_failsafe_handler called!\n");
}

void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
        vaddr_t va;
        vaddr_t end;
        pt_entry_t *ptp;
        int s;

#ifdef __x86_64__
        end = base + (entries << 3);
#else
        end = base + entries * sizeof(union descriptor);
#endif

        for (va = base; va < end; va += PAGE_SIZE) {
                KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
                ptp = kvtopte(va);
                pmap_pte_clearbits(ptp, PTE_W);
        }
        s = splvm(); /* XXXSMP */
        xpq_queue_set_ldt(base, entries);
        splx(s);
}
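/*
 * MMU update queue. Page table updates are not applied immediately: they
 * are batched in a per-CPU queue and submitted to the hypervisor in a
 * single HYPERVISOR_mmu_update hypercall, either when the queue fills up
 * or when a caller needs the updates to be visible. A typical caller-side
 * pattern looks like this (a sketch, not a verbatim excerpt from the
 * pmap code):
 *
 *	s = splvm();
 *	xpq_queue_pte_update(ma, npte);
 *	xpq_flush_queue();
 *	splx(s);
 */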
void
xpq_flush_queue(void)
{
        mmu_update_t *xpq_queue;
        int done = 0, ret;
        size_t xpq_idx;

        KASSERT(curcpu()->ci_ilevel >= IPL_VM || cold);

        xpq_idx = curcpu()->ci_xpq_idx;
        xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];

retry:
        ret = HYPERVISOR_mmu_update(xpq_queue, xpq_idx, &done, DOMID_SELF);

        if (ret < 0 && xpq_idx != 0) {
                printf("xpq_flush_queue: %zu entries (%d successful) on "
                    "cpu%d (%ld)\n",
                    xpq_idx, done, curcpu()->ci_index, curcpu()->ci_cpuid);

                if (done != 0) {
                        xpq_queue += done;
                        xpq_idx -= done;
                        done = 0;
                        goto retry;
                }

                panic("HYPERVISOR_mmu_update failed, ret: %d\n", ret);
        }
        curcpu()->ci_xpq_idx = 0;
}

static inline void
xpq_increment_idx(void)
{
        KASSERT(curcpu()->ci_ilevel >= IPL_VM || cold);
        if (__predict_false(++curcpu()->ci_xpq_idx == XPQUEUE_SIZE))
                xpq_flush_queue();
}

void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{
        mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
        size_t xpq_idx = curcpu()->ci_xpq_idx;

        xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
        xpq_queue[xpq_idx].val = pa >> PAGE_SHIFT;
        xpq_increment_idx();
}

void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{
        mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
        size_t xpq_idx = curcpu()->ci_xpq_idx;

        xpq_queue[xpq_idx].ptr = ptr | MMU_NORMAL_PT_UPDATE;
        xpq_queue[xpq_idx].val = val;
        xpq_increment_idx();
}
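/*
 * The wrappers below each issue a single synchronous mmuext_op hypercall.
 * The ones that modify or depend on page-table state flush the pending
 * MMU update queue first, so that queued PTE updates are applied in order.
 */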
void
xpq_queue_pt_switch(paddr_t pa)
{
        struct mmuext_op op;

        xpq_flush_queue();

        op.cmd = MMUEXT_NEW_BASEPTR;
        op.arg1.mfn = pa >> PAGE_SHIFT;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xpq_queue_pin_table(paddr_t pa, int lvl)
{
        struct mmuext_op op;

        xpq_flush_queue();

        op.cmd = lvl;
        op.arg1.mfn = pa >> PAGE_SHIFT;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xpq_queue_unpin_table(paddr_t pa)
{
        struct mmuext_op op;

        xpq_flush_queue();

        op.cmd = MMUEXT_UNPIN_TABLE;
        op.arg1.mfn = pa >> PAGE_SHIFT;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
        struct mmuext_op op;

        xpq_flush_queue();

        KASSERT(va == (va & ~PAGE_MASK));
        op.cmd = MMUEXT_SET_LDT;
        op.arg1.linear_addr = va;
        op.arg2.nr_ents = entries;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xpq_queue_tlb_flush(void)
{
        struct mmuext_op op;

        xpq_flush_queue();

        op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xpq_flush_cache(void)
{
        int s = splvm();

        xpq_flush_queue();

        asm("wbinvd":::"memory");
        splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{
        struct mmuext_op op;

        xpq_flush_queue();

        op.cmd = MMUEXT_INVLPG_LOCAL;
        op.arg1.linear_addr = (va & ~PAGE_MASK);
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xen_mcast_invlpg(vaddr_t va, kcpuset_t *kc)
{
        xcpumask_t xcpumask;
        mmuext_op_t op;

        kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

        xpq_flush_queue();

        op.cmd = MMUEXT_INVLPG_MULTI;
        op.arg1.linear_addr = va;
        set_xen_guest_handle(op.arg2.vcpumask, &xcpumask.xcpum_xm);

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xen_bcast_invlpg(vaddr_t va)
{
        mmuext_op_t op;

        xpq_flush_queue();

        op.cmd = MMUEXT_INVLPG_ALL;
        op.arg1.linear_addr = va;

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

/* This is a synchronous call. */
void
xen_mcast_tlbflush(kcpuset_t *kc)
{
        xcpumask_t xcpumask;
        mmuext_op_t op;

        kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

        xpq_flush_queue();

        op.cmd = MMUEXT_TLB_FLUSH_MULTI;
        set_xen_guest_handle(op.arg2.vcpumask, &xcpumask.xcpum_xm);

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

/* This is a synchronous call. */
void
xen_bcast_tlbflush(void)
{
        mmuext_op_t op;

        xpq_flush_queue();

        op.cmd = MMUEXT_TLB_FLUSH_ALL;

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xen_copy_page(paddr_t srcpa, paddr_t dstpa)
{
        mmuext_op_t op;

        op.cmd = MMUEXT_COPY_PAGE;
        op.arg1.mfn = xpmap_ptom(dstpa) >> PAGE_SHIFT;
        op.arg2.src_mfn = xpmap_ptom(srcpa) >> PAGE_SHIFT;

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}

void
xen_pagezero(paddr_t pa)
{
        mmuext_op_t op;

        op.cmd = MMUEXT_CLEAR_PAGE;
        op.arg1.mfn = xpmap_ptom(pa) >> PAGE_SHIFT;

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic(__func__);
}
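/*
 * Single synchronous update of one PTE, possibly in another domain's
 * address space (dom is passed through to the hypervisor). This bypasses
 * the per-CPU batch queue, and returns EFAULT instead of panicking on
 * failure; it is presumably intended for infrequent, immediate updates
 * such as mapping pages belonging to a foreign domain.
 */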
int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
        mmu_update_t op;
        int ok;

        xpq_flush_queue();

        op.ptr = ptr;
        op.val = val;
        if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
                return EFAULT;
        return 0;
}

#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

#ifdef __x86_64__
#define PDIRSZ PTP_LEVELS
#else
/*
 * For PAE, we need an L3 page, a single contiguous L2 "superpage" of 4 pages
 * (all of them mapped by the L3 page), and a shadow page for L3[3].
 */
#define PDIRSZ (1 + 4 + 1)
#endif

/*
 * Xen locore: get rid of the Xen bootstrap tables. Build and switch to new
 * page tables.
 *
 * Virtual address space of the kernel when leaving this function:
 * +--------------+------------------+-------------+------------+--------------
 * | KERNEL IMAGE | BOOTSTRAP TABLES | PROC0 UAREA | DUMMY PAGE | HYPER. SHARED
 * +--------------+------------------+-------------+------------+--------------
 *
 * ------+-----------------+-------------+
 *  INFO | EARLY ZERO PAGE | ISA I/O MEM |
 * ------+-----------------+-------------+
 *
 * DUMMY PAGE is either a PGD for amd64 or a GDT for i386.
 *
 * (HYPER. SHARED INFO + EARLY ZERO PAGE + ISA I/O MEM) have no physical
 * addresses preallocated.
 */
vaddr_t
xen_locore(void)
{
        size_t nL2, oldcount, mapsize;
        vaddr_t our_tables, xen_tables;
        u_int descs[4];

        xen_init_features();

        xpmap_phys_to_machine_mapping =
            (unsigned long *)xen_start_info.mfn_list;

        /* Set the NX/XD bit, if available. descs[3] = %edx. */
        x86_cpuid(0x80000001, descs);
        xpmap_pg_nx = (descs[3] & CPUID_NOX) ? PTE_NX : 0;

        /* Space after the Xen bootstrap tables should be free. */
        xen_tables = xen_start_info.pt_base;
        our_tables = xen_tables + (xen_start_info.nr_pt_frames * PAGE_SIZE);

        /*
         * Calculate how much space we need. First, everything mapped before
         * the Xen bootstrap tables.
         */
        mapsize = xen_tables - KERNTEXTOFF;

        /*
         * After the tables we'll have:
         *  - UAREA
         *  - dummy user PGD (x86_64)
         *  - HYPERVISOR_shared_info
         *  - early_zerop
         *  - ISA I/O mem (if needed)
         */
        mapsize += UPAGES * PAGE_SIZE;
#ifdef __x86_64__
        mapsize += PAGE_SIZE;
#endif
        mapsize += PAGE_SIZE;
        mapsize += PAGE_SIZE;
#ifdef DOM0OPS
        if (xendomain_is_dom0()) {
                mapsize += IOM_SIZE;
        }
#endif

        /*
         * At this point, mapsize doesn't include the table size.
         */
#ifdef __x86_64__
        nL2 = TABLE_L2_ENTRIES;
#else
        nL2 = (mapsize + (NBPD_L2 - 1)) >> L2_SHIFT;
#endif

        /*
         * Now compute how many L2 pages we need exactly. This is useful only
         * on i386, since the initial count for amd64 is already enough.
         */
        while (KERNTEXTOFF + mapsize + (nL2 + PDIRSZ) * PAGE_SIZE >
            KERNBASE + (nL2 << L2_SHIFT)) {
                nL2++;
        }

#ifdef i386
        /*
         * One more L2 page: we'll allocate several pages after kva_start
         * in pmap_bootstrap() before pmap_growkernel(), which have not been
         * counted here. It's not a big issue to allocate one more L2 as
         * pmap_growkernel() will be called anyway.
         */
        nL2++;
        nkptp[1] = nL2;
#endif

        /*
         * Install bootstrap pages. We may need more L2 pages here than the
         * final table will have, since the bootstrap tables are installed
         * after the final table.
         */
        oldcount = nL2;

bootstrap_again:

        /*
         * The Xen space we'll reclaim may not be enough for our new page
         * tables; move the bootstrap tables if necessary.
         */
        if (our_tables < xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE))
                our_tables = xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE);

        /*
         * Make sure the number of L2 pages we have is enough to map everything
         * from KERNBASE to the bootstrap tables themselves.
         */
        if (our_tables + ((oldcount + PDIRSZ) * PAGE_SIZE) >
            KERNBASE + (oldcount << L2_SHIFT)) {
                oldcount++;
                goto bootstrap_again;
        }

        /* Create temporary tables */
        xen_bootstrap_tables(xen_tables, our_tables,
            xen_start_info.nr_pt_frames, oldcount, false);

        /* Create final tables */
        xen_bootstrap_tables(our_tables, xen_tables,
            oldcount + PDIRSZ, nL2, true);

        /* Zero out PROC0 UAREA and DUMMY PAGE. */
        memset((void *)(xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE)), 0,
            (UPAGES + 1) * PAGE_SIZE);

        /* Finally, flush TLB. */
        xpq_queue_tlb_flush();

        return (xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE));
}
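/*
 * xen_bootstrap_tables() below is called twice from xen_locore(): first to
 * build a temporary set of tables in the free space past Xen's bootstrap
 * tables, then again, once the temporary set is active, to build the final
 * tables back at the original location and reclaim the temporary area.
 */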
/*
 * Build a new set of page tables and switch to them.
 *  - old_count is the number of old tables (including the L4, L3 and L2).
 *  - new_count is the number of new tables (PTE only).
 * We assume the areas don't overlap.
 */
static void
xen_bootstrap_tables(vaddr_t old_pgd, vaddr_t new_pgd, size_t old_count,
    size_t new_count, bool final)
{
        pd_entry_t *L4cpu, *L4, *L3, *L2, *pte;
        paddr_t addr;
        vaddr_t page, avail, map_end;
        int i;
        extern char __rodata_start;
        extern char __data_start;
        extern char __kernel_end;
        extern char *early_zerop; /* from pmap.c */
#ifdef i386
        extern union descriptor tmpgdt[];
#endif

        /*
         * Layout of RW area after the kernel image:
         *     xencons_interface (if present)
         *     xenstore_interface (if present)
         *     table pages (new_count + PDIRSZ entries)
         * Extra mappings (only when final is true):
         *     UAREA
         *     dummy user PGD (x86_64 only) / GDT page (i386 only)
         *     HYPERVISOR_shared_info
         *     early_zerop
         *     ISA I/O mem (if needed)
         */
        map_end = new_pgd + ((new_count + PDIRSZ) * PAGE_SIZE);
        if (final) {
                map_end += UPAGES * PAGE_SIZE;
                xen_dummy_page = (vaddr_t)map_end;
                map_end += PAGE_SIZE;
                HYPERVISOR_shared_info = (shared_info_t *)map_end;
                map_end += PAGE_SIZE;
                early_zerop = (char *)map_end;
                map_end += PAGE_SIZE;
        }

        /*
         * We always set atdevbase, as it's used by init386 to find the first
         * available VA. map_end is updated only if we are dom0, so
         * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
         * this case.
         */
        if (final) {
                atdevbase = map_end;
#ifdef DOM0OPS
                if (xendomain_is_dom0()) {
                        /* ISA I/O mem */
                        map_end += IOM_SIZE;
                }
#endif
        }

        __PRINTK(("xen_bootstrap_tables map_end 0x%lx\n", map_end));
        __PRINTK(("console %#lx ", xen_start_info.console_mfn));
        __PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));

        avail = new_pgd;

        /*
         * Create our page tables.
         */

#ifdef __x86_64__
        /* per-cpu L4 */
        L4cpu = (pd_entry_t *)avail;
        memset(L4cpu, 0, PAGE_SIZE);
        avail += PAGE_SIZE;

        /* pmap_kernel L4 */
        L4 = (pd_entry_t *)avail;
        memset(L4, 0, PAGE_SIZE);
        avail += PAGE_SIZE;

        /* L3 */
        L3 = (pd_entry_t *)avail;
        memset(L3, 0, PAGE_SIZE);
        avail += PAGE_SIZE;

        /* link L4->L3 */
        addr = ((u_long)L3) - KERNBASE;
        L4cpu[pl4_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;
        L4[pl4_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;

        /* L2 */
        L2 = (pd_entry_t *)avail;
        memset(L2, 0, PAGE_SIZE);
        avail += PAGE_SIZE;

        /* link L3->L2 */
        addr = ((u_long)L2) - KERNBASE;
        L3[pl3_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;
#else
        /* no L4 on i386PAE */
        __USE(L4cpu);
        __USE(L4);

        /* L3 */
        L3 = (pd_entry_t *)avail;
        memset(L3, 0, PAGE_SIZE);
        avail += PAGE_SIZE;

        /*
         * Our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow).
         *                  +-----------------+----------------+---------+
         * Physical layout: | 3 * USERLAND L2 | L2 KERN SHADOW | L2 KERN |
         *                  +-----------------+----------------+---------+
         * However, we enter L3[3] into L2 KERN, and not L2 KERN SHADOW.
         * This way, L2[L2_SLOT_KERN] always points to the shadow.
         */
        L2 = (pd_entry_t *)avail;
        memset(L2, 0, PAGE_SIZE * 5);
        avail += PAGE_SIZE * 5;

        /*
         * Link the L2 pages in L3, with a special case for L2 KERN. Xen
         * doesn't want RW permissions in L3 entries; it'll add them itself.
         */
        addr = ((u_long)L2) - KERNBASE;
        for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
                L3[i] = xpmap_ptom_masked(addr) | PTE_P;
        }
        addr += PAGE_SIZE;
        L3[3] = xpmap_ptom_masked(addr) | PTE_P;
#endif
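        /*
         * The level-1 loop below builds one PTE page per L2 slot, covering
         * KERNTEXTOFF up to map_end. The protections it applies: kernel
         * text RX, rodata R, old and new page tables R, everything else
         * (data, bss, free pages) RW. The Xen console and xenstore rings
         * and the shared info page are remapped to their machine frames
         * along the way.
         */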
        /* Level 1 */
        page = KERNTEXTOFF;
        for (i = 0; i < new_count; i++) {
                vaddr_t cur_page = page;

                pte = (pd_entry_t *)avail;
                memset(pte, 0, PAGE_SIZE);
                avail += PAGE_SIZE;

                while (pl2_pi(page) == pl2_pi(cur_page)) {
                        if (page >= map_end) {
                                /* not mapped at all */
                                pte[pl1_pi(page)] = 0;
                                page += PAGE_SIZE;
                                continue;
                        }
                        pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
                        if (page == (vaddr_t)HYPERVISOR_shared_info) {
                                pte[pl1_pi(page)] = xen_start_info.shared_info;
                        }
                        if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
                            == xen_start_info.console.domU.mfn) {
                                xencons_interface = (void *)page;
                                pte[pl1_pi(page)] = xen_start_info.console_mfn;
                                pte[pl1_pi(page)] <<= PAGE_SHIFT;
                        }
                        if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
                            == xen_start_info.store_mfn) {
                                xenstore_interface = (void *)page;
                                pte[pl1_pi(page)] = xen_start_info.store_mfn;
                                pte[pl1_pi(page)] <<= PAGE_SHIFT;
                        }
#ifdef DOM0OPS
                        if (page >= (vaddr_t)atdevbase &&
                            page < (vaddr_t)atdevbase + IOM_SIZE) {
                                pte[pl1_pi(page)] =
                                    IOM_BEGIN + (page - (vaddr_t)atdevbase);
                                pte[pl1_pi(page)] |= xpmap_pg_nx;
                        }
#endif

                        pte[pl1_pi(page)] |= PTE_P;
                        if (page < (vaddr_t)&__rodata_start) {
                                /* Map the kernel text RX. Nothing to do. */
                        } else if (page >= (vaddr_t)&__rodata_start &&
                            page < (vaddr_t)&__data_start) {
                                /* Map the kernel rodata R. */
                                pte[pl1_pi(page)] |= xpmap_pg_nx;
                        } else if (page >= old_pgd &&
                            page < old_pgd + (old_count * PAGE_SIZE)) {
                                /* Map the old page tables R. */
                                pte[pl1_pi(page)] |= xpmap_pg_nx;
                        } else if (page >= new_pgd &&
                            page < new_pgd + ((new_count + PDIRSZ) * PAGE_SIZE)) {
                                /* Map the new page tables R. */
                                pte[pl1_pi(page)] |= xpmap_pg_nx;
#ifdef i386
                        } else if (page == (vaddr_t)tmpgdt) {
                                /*
                                 * Map the bootstrap GDT R/O. Later, we will
                                 * re-add this page to uvm after making it
                                 * writable.
                                 */
                                pte[pl1_pi(page)] = 0;
                                page += PAGE_SIZE;
                                continue;
#endif
                        } else if (page >= (vaddr_t)&__data_start &&
                            page < (vaddr_t)&__kernel_end) {
                                /* Map the kernel data+bss RW. */
                                pte[pl1_pi(page)] |= PTE_W | xpmap_pg_nx;
                        } else {
                                /* Map the page RW. */
                                pte[pl1_pi(page)] |= PTE_W | xpmap_pg_nx;
                        }

                        page += PAGE_SIZE;
                }

                addr = ((u_long)pte) - KERNBASE;
                L2[pl2_pi(cur_page)] = xpmap_ptom_masked(addr) | PTE_W | PTE_P;

                /* Mark readonly */
                xen_bt_set_readonly((vaddr_t)pte);
        }
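        /*
         * The recursive slot (PDIR_SLOT_PTE) maps the page-table pages
         * themselves into virtual address space at a fixed location, which
         * is what lets kvtopte() and the pmap reach any PTE without an
         * explicit mapping of the table pages.
         */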
        /* Install the recursive page tables mapping. */
#ifdef __x86_64__
        /* Recursive entry in pmap_kernel(). */
        L4[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)L4 - KERNBASE)
            | PTE_P | xpmap_pg_nx;
        /* Recursive entry in the higher-level per-cpu PD. */
        L4cpu[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)L4cpu - KERNBASE)
            | PTE_P | xpmap_pg_nx;

        /* Mark tables RO */
        xen_bt_set_readonly((vaddr_t)L2);
#else
        /* Copy L2 KERN into L2 KERN SHADOW, and reference the latter in cpu0. */
        memcpy(&L2[L2_SLOT_KERN + NPDPG], &L2[L2_SLOT_KERN], PAGE_SIZE);
        cpu_info_primary.ci_kpm_pdir = &L2[L2_SLOT_KERN + NPDPG];
        cpu_info_primary.ci_kpm_pdirpa =
            (vaddr_t)cpu_info_primary.ci_kpm_pdir - KERNBASE;

        /*
         * We don't enter a recursive entry from the L3 PD. Instead, we enter
         * the first 4 L2 pages, which includes the kernel's L2 shadow. But we
         * have to enter the shadow after switching %cr3, or Xen will refcount
         * some PTEs with the wrong type.
         */
        addr = (u_long)L2 - KERNBASE;
        for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
                L2[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PTE_P |
                    xpmap_pg_nx;
        }

        /* Mark tables RO, and pin L2 KERN SHADOW. */
        addr = (u_long)L2 - KERNBASE;
        for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
                xen_bt_set_readonly(((vaddr_t)L2) + PAGE_SIZE * i);
        }
        if (final) {
                addr = (u_long)L2 - KERNBASE + 3 * PAGE_SIZE;
                xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
        }
#endif

        xen_bt_set_readonly((vaddr_t)L3);
#ifdef __x86_64__
        xen_bt_set_readonly((vaddr_t)L4cpu);
#endif

        /* Pin the PGD */
#ifdef __x86_64__
        xpq_queue_pin_l4_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
        xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif

        /* Save the physical address of the PDP, for libkvm. */
#ifdef __x86_64__
        PDPpaddr = (u_long)L4 - KERNBASE;
#else
        PDPpaddr = (u_long)L2 - KERNBASE; /* PDP is the L2 with PAE */
#endif

        /* Switch to the new tables. */
        xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));

        if (final) {
#ifdef __x86_64__
                /* Save the address of the real per-cpu L4 page. */
                cpu_info_primary.ci_kpm_pdir = L4cpu;
                cpu_info_primary.ci_kpm_pdirpa = ((paddr_t)L4cpu - KERNBASE);
#else
                /* Save the address of the L3 page */
                cpu_info_primary.ci_pae_l3_pdir = L3;
                cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);

                /* Now enter the kernel's PTE mappings */
                addr = (u_long)L2 - KERNBASE + PAGE_SIZE * 3;
                xpq_queue_pte_update(
                    xpmap_ptom(((vaddr_t)&L2[PDIR_SLOT_PTE + 3]) - KERNBASE),
                    xpmap_ptom_masked(addr) | PTE_P);
                xpq_flush_queue();
#endif
        }

        /*
         * Now we can safely reclaim the space taken by the old tables.
         */

        /* Unpin the old PGD */
        xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));

        /* Mark the old tables RW */
        page = old_pgd;
        addr = xpmap_mtop((paddr_t)L2[pl2_pi(page)] & PTE_4KFRAME);
        pte = (pd_entry_t *)((u_long)addr + KERNBASE);
        pte += pl1_pi(page);
        while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
                addr = xpmap_ptom(((u_long)pte) - KERNBASE);
                xpq_queue_pte_update(addr, *pte | PTE_W);
                page += PAGE_SIZE;
                /*
                 * Our PTEs are contiguous, so it's safe to just "++" here.
                 */
                pte++;
        }
        xpq_flush_queue();
}
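/*
 * Xen requires that page-table pages be mapped read-only in the guest
 * before they can be pinned or installed in a page directory, hence the
 * helper below. It uses a direct HYPERVISOR_update_va_mapping hypercall
 * rather than the batch queue, so the downgrade takes effect immediately.
 */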
/*
 * Mark a page read-only, assuming vaddr = paddr + KERNBASE.
 */
static void
xen_bt_set_readonly(vaddr_t page)
{
        pt_entry_t entry;

        entry = xpmap_ptom_masked(page - KERNBASE);
        entry |= PTE_P | xpmap_pg_nx;

        HYPERVISOR_update_va_mapping(page, entry, UVMF_INVLPG);
}

#ifdef __x86_64__
void
xen_set_user_pgd(paddr_t page)
{
        struct mmuext_op op;
        int s = splvm(); /* XXXSMP */

        xpq_flush_queue();
        op.cmd = MMUEXT_NEW_USER_BASEPTR;
        op.arg1.mfn = xpmap_ptom_masked(page) >> PAGE_SHIFT;
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                panic("xen_set_user_pgd: failed to install new user page"
                    " directory %#" PRIxPADDR, page);
        splx(s);
}
#endif /* __x86_64__ */