1/* 2 * Copyright 2014, General Dynamics C4 Systems 3 * 4 * This software may be distributed and modified according to the terms of 5 * the GNU General Public License version 2. Note that NO WARRANTY is provided. 6 * See "LICENSE_GPLv2.txt" for details. 7 * 8 * @TAG(GD_GPL) 9 */ 10 11#include <config.h> 12#include <kernel/boot.h> 13#include <machine/io.h> 14#include <model/statedata.h> 15#include <object/interrupt.h> 16#include <arch/object/interrupt.h> 17#include <arch/machine.h> 18#include <arch/kernel/apic.h> 19#include <arch/kernel/boot.h> 20#include <arch/kernel/boot_sys.h> 21#include <arch/kernel/vspace.h> 22#include <machine/fpu.h> 23#include <arch/machine/timer.h> 24#include <arch/object/ioport.h> 25#include <linker.h> 26#include <util.h> 27 28#include <plat/machine/intel-vtd.h> 29 30/* functions exactly corresponding to abstract specification */ 31 32BOOT_CODE static void 33init_irqs(cap_t root_cnode_cap) 34{ 35 irq_t i; 36 37 for (i = 0; i <= maxIRQ; i++) { 38 if (i == irq_timer) { 39 setIRQState(IRQTimer, i); 40#ifdef ENABLE_SMP_SUPPORT 41 } else if (i == irq_remote_call_ipi || i == irq_reschedule_ipi) { 42 setIRQState(IRQIPI, i); 43#endif /* ENABLE_SMP_SUPPORT */ 44#ifdef CONFIG_IOMMU 45 } else if (i == irq_iommu) { 46 setIRQState(IRQReserved, i); 47#endif 48 } else if (i == 2 && config_set(CONFIG_IRQ_PIC)) { 49 /* cascaded legacy PIC */ 50 setIRQState(IRQReserved, i); 51 } else if (i >= irq_isa_min && i <= irq_isa_max) { 52 if (config_set(CONFIG_IRQ_PIC)) { 53 setIRQState(IRQInactive, i); 54 } else { 55 setIRQState(IRQReserved, i); 56 } 57 } else if (i >= irq_user_min && i <= irq_user_max) { 58 if (config_set(CONFIG_IRQ_IOAPIC)) { 59 setIRQState(IRQInactive, i); 60 } else { 61 setIRQState(IRQReserved, i); 62 } 63 } else { 64 setIRQState(IRQReserved, i); 65 } 66 } 67 Arch_irqStateInit(); 68 /* provide the IRQ control cap */ 69 write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIRQControl), cap_irq_control_cap_new()); 70} 71 72/* The maximum number of reserved regions we have is 1 for each physical memory region (+ MAX_NUM_FREEMEM_REG) 73 * plus 1 for each kernel device. For kernel devices we have the ioapics (+ CONFIG_MAX_NUM_IOAPIC), 74 * iommus (+ MAX_NUM_DRHU), apic (+ 1) and the reserved MSI region (+ 1) */ 75#define NUM_RESERVED_REGIONS (MAX_NUM_FREEMEM_REG + CONFIG_MAX_NUM_IOAPIC + MAX_NUM_DRHU + 2) 76typedef struct allocated_p_region { 77 p_region_t regs[NUM_RESERVED_REGIONS]; 78 word_t cur_pos; 79} allocated_p_region_t; 80 81BOOT_BSS static allocated_p_region_t allocated_p_regions; 82 83BOOT_CODE static void 84merge_regions(void) 85{ 86 unsigned int i, j; 87 /* Walk through all the regions and see if any can get merged */ 88 for (i = 1; i < allocated_p_regions.cur_pos;) { 89 if (allocated_p_regions.regs[i - 1].end == allocated_p_regions.regs[i].start) { 90 /* move this down */ 91 allocated_p_regions.regs[i - 1].end = allocated_p_regions.regs[i].end; 92 /* fill the rest down */ 93 for (j = i; j < allocated_p_regions.cur_pos - 1; j++) { 94 allocated_p_regions.regs[j] = allocated_p_regions.regs[j + 1]; 95 } 96 allocated_p_regions.cur_pos--; 97 /* don't increment 'i' since we want to recheck that the 98 * region we just moved to this slot doesn't also need merging */ 99 } else { 100 i++; 101 } 102 } 103} 104 105static UNUSED BOOT_CODE bool_t p_region_overlaps(p_region_t reg) 106{ 107 unsigned int i; 108 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 109 if (allocated_p_regions.regs[i].start < reg.end && 110 allocated_p_regions.regs[i].end > reg.start) { 111 return true; 112 } 113 } 114 return false; 115} 116 117BOOT_CODE bool_t 118add_allocated_p_region(p_region_t reg) 119{ 120 unsigned int i, j; 121 122 assert(reg.start <= reg.end); 123 assert(!p_region_overlaps(reg)); 124 125 /* Walk the existing regions and see if we can merge with an existing 126 * region, or insert in order */ 127 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 128 /* see if we can merge before or after this region */ 129 if (allocated_p_regions.regs[i].end == reg.start) { 130 allocated_p_regions.regs[i].end = reg.end; 131 merge_regions(); 132 return true; 133 } 134 if (allocated_p_regions.regs[i].start == reg.end) { 135 allocated_p_regions.regs[i].start = reg.start; 136 merge_regions(); 137 return true; 138 } 139 /* see if this new one should be inserted before */ 140 if (reg.end < allocated_p_regions.regs[i].start) { 141 /* ensure there's space to bump the regions up */ 142 if (allocated_p_regions.cur_pos + 1 == NUM_RESERVED_REGIONS) { 143 printf("Ran out of reserved physical regions\n"); 144 return false; 145 } 146 /* Copy the regions up to make a gap */ 147 for (j = allocated_p_regions.cur_pos; j != i; j--) { 148 allocated_p_regions.regs[j] = allocated_p_regions.regs[j - 1]; 149 } 150 /* Put this region in the gap */ 151 allocated_p_regions.regs[i] = reg; 152 allocated_p_regions.cur_pos++; 153 return true; 154 } 155 } 156 157 /* nothing else matched, put this one at the end */ 158 if (i + 1 == NUM_RESERVED_REGIONS) { 159 printf("Ran out of reserved physical regions\n"); 160 return false; 161 } 162 allocated_p_regions.regs[i] = reg; 163 allocated_p_regions.cur_pos = i + 1; 164 return true; 165} 166 167BOOT_CODE void 168init_allocated_p_regions() 169{ 170 allocated_p_regions.cur_pos = 0; 171} 172 173BOOT_CODE static bool_t 174create_untypeds( 175 cap_t root_cnode_cap, 176 region_t boot_mem_reuse_reg) 177{ 178 seL4_SlotPos slot_pos_before; 179 seL4_SlotPos slot_pos_after; 180 word_t i; 181 182 paddr_t start = 0; 183 184 slot_pos_before = ndks_boot.slot_pos_cur; 185 create_kernel_untypeds(root_cnode_cap, boot_mem_reuse_reg, slot_pos_before); 186 187 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 188 if (start != allocated_p_regions.regs[i].start) { 189 if (!create_untypeds_for_region(root_cnode_cap, true, 190 paddr_to_pptr_reg((p_region_t) { 191 start, allocated_p_regions.regs[i].start 192 }), 193 slot_pos_before)) { 194 return false; 195 } 196 } 197 start = allocated_p_regions.regs[i].end; 198 } 199 200 if (start != PADDR_USER_DEVICE_TOP) { 201 if (!create_untypeds_for_region(root_cnode_cap, true, 202 paddr_to_pptr_reg((p_region_t) { 203 start, PADDR_USER_DEVICE_TOP 204 }), 205 slot_pos_before)) { 206 return false; 207 } 208 } 209 210 slot_pos_after = ndks_boot.slot_pos_cur; 211 ndks_boot.bi_frame->untyped = (seL4_SlotRegion) { 212 slot_pos_before, slot_pos_after 213 }; 214 return true; 215} 216 217BOOT_CODE static void 218init_freemem(p_region_t ui_p_reg, mem_p_regs_t mem_p_regs) 219{ 220 word_t i; 221 /* we are guaranteed that we started loading the user image after the kernel 222 * so we only include addresses above ui_info.p_reg.end */ 223 pptr_t floor = ui_p_reg.end; 224 for (i = 0; i < MAX_NUM_FREEMEM_REG; i++) { 225 ndks_boot.freemem[i] = REG_EMPTY; 226 } 227 for (i = 0; i < mem_p_regs.count; i++) { 228 pptr_t start = mem_p_regs.list[i].start; 229 pptr_t end = mem_p_regs.list[i].end; 230 if (start < floor) { 231 start = floor; 232 } 233 if (end < floor) { 234 end = floor; 235 } 236 insert_region(paddr_to_pptr_reg((p_region_t) { 237 start, end 238 })); 239 } 240} 241 242/* This function initialises a node's kernel state. It does NOT initialise the CPU. */ 243 244BOOT_CODE bool_t 245init_sys_state( 246 cpu_id_t cpu_id, 247 mem_p_regs_t mem_p_regs, 248 ui_info_t ui_info, 249 p_region_t boot_mem_reuse_p_reg, 250 /* parameters below not modeled in abstract specification */ 251 uint32_t num_drhu, 252 paddr_t* drhu_list, 253 acpi_rmrr_list_t *rmrr_list, 254 acpi_rsdp_t *acpi_rsdp, 255 seL4_X86_BootInfo_VBE *vbe, 256 seL4_X86_BootInfo_mmap_t *mb_mmap, 257 seL4_X86_BootInfo_fb_t *fb_info 258) 259{ 260 cap_t root_cnode_cap; 261 vptr_t extra_bi_frame_vptr; 262 vptr_t bi_frame_vptr; 263 vptr_t ipcbuf_vptr; 264 cap_t it_vspace_cap; 265 cap_t it_ap_cap; 266 cap_t ipcbuf_cap; 267 pptr_t bi_frame_pptr; 268 word_t extra_bi_size = sizeof(seL4_BootInfoHeader); 269 region_t extra_bi_region; 270 pptr_t extra_bi_offset = 0; 271 uint32_t tsc_freq; 272 create_frames_of_region_ret_t create_frames_ret; 273 create_frames_of_region_ret_t extra_bi_ret; 274 275 /* convert from physical addresses to kernel pptrs */ 276 region_t ui_reg = paddr_to_pptr_reg(ui_info.p_reg); 277 region_t boot_mem_reuse_reg = paddr_to_pptr_reg(boot_mem_reuse_p_reg); 278 279 /* convert from physical addresses to userland vptrs */ 280 v_region_t ui_v_reg; 281 v_region_t it_v_reg; 282 ui_v_reg.start = ui_info.p_reg.start - ui_info.pv_offset; 283 ui_v_reg.end = ui_info.p_reg.end - ui_info.pv_offset; 284 285 ipcbuf_vptr = ui_v_reg.end; 286 bi_frame_vptr = ipcbuf_vptr + BIT(PAGE_BITS); 287 extra_bi_frame_vptr = bi_frame_vptr + BIT(PAGE_BITS); 288 289 if (vbe->vbeMode != -1) { 290 extra_bi_size += sizeof(seL4_X86_BootInfo_VBE); 291 } 292 if (acpi_rsdp) { 293 extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*acpi_rsdp); 294 } 295 if (fb_info && fb_info->addr) { 296 extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*fb_info); 297 } 298 299 word_t mb_mmap_size = sizeof(seL4_X86_BootInfo_mmap_t); 300 extra_bi_size += mb_mmap_size; 301 302 // room for tsc frequency 303 extra_bi_size += sizeof(seL4_BootInfoHeader) + 4; 304 305 /* The region of the initial thread is the user image + ipcbuf and boot info */ 306 it_v_reg.start = ui_v_reg.start; 307 it_v_reg.end = ROUND_UP(extra_bi_frame_vptr + extra_bi_size, PAGE_BITS); 308 309 init_freemem(ui_info.p_reg, mem_p_regs); 310 311 /* create the root cnode */ 312 root_cnode_cap = create_root_cnode(); 313 314 /* create the IO port cap */ 315 write_slot( 316 SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOPortControl), 317 cap_io_port_control_cap_new() 318 ); 319 320 /* create the cap for managing thread domains */ 321 create_domain_cap(root_cnode_cap); 322 323 /* create the IRQ CNode */ 324 if (!create_irq_cnode()) { 325 return false; 326 } 327 328 /* initialise the IRQ states and provide the IRQ control cap */ 329 init_irqs(root_cnode_cap); 330 331 tsc_freq = tsc_init(); 332 333 /* create the bootinfo frame */ 334 bi_frame_pptr = allocate_bi_frame(0, ksNumCPUs, ipcbuf_vptr); 335 if (!bi_frame_pptr) { 336 return false; 337 } 338 339 extra_bi_region = allocate_extra_bi_region(extra_bi_size); 340 if (extra_bi_region.start == 0) { 341 return false; 342 } 343 344 /* populate vbe info block */ 345 if (vbe->vbeMode != -1) { 346 vbe->header.id = SEL4_BOOTINFO_HEADER_X86_VBE; 347 vbe->header.len = sizeof(seL4_X86_BootInfo_VBE); 348 memcpy((void*)(extra_bi_region.start + extra_bi_offset), vbe, sizeof(seL4_X86_BootInfo_VBE)); 349 extra_bi_offset += sizeof(seL4_X86_BootInfo_VBE); 350 } 351 352 /* populate acpi rsdp block */ 353 if (acpi_rsdp) { 354 seL4_BootInfoHeader header; 355 header.id = SEL4_BOOTINFO_HEADER_X86_ACPI_RSDP; 356 header.len = sizeof(header) + sizeof(*acpi_rsdp); 357 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 358 extra_bi_offset += sizeof(header); 359 memcpy((void*)(extra_bi_region.start + extra_bi_offset), acpi_rsdp, sizeof(*acpi_rsdp)); 360 extra_bi_offset += sizeof(*acpi_rsdp); 361 } 362 363 /* populate framebuffer information block */ 364 if (fb_info && fb_info->addr) { 365 seL4_BootInfoHeader header; 366 header.id = SEL4_BOOTINFO_HEADER_X86_FRAMEBUFFER; 367 header.len = sizeof(header) + sizeof(*fb_info); 368 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 369 extra_bi_offset += sizeof(header); 370 memcpy((void*)(extra_bi_region.start + extra_bi_offset), fb_info, sizeof(*fb_info)); 371 extra_bi_offset += sizeof(*fb_info); 372 } 373 374 /* populate multiboot mmap block */ 375 mb_mmap->header.id = SEL4_BOOTINFO_HEADER_X86_MBMMAP; 376 mb_mmap->header.len = mb_mmap_size; 377 memcpy((void*)(extra_bi_region.start + extra_bi_offset), mb_mmap, mb_mmap_size); 378 extra_bi_offset += mb_mmap_size; 379 380 /* populate tsc frequency block */ 381 { 382 seL4_BootInfoHeader header; 383 header.id = SEL4_BOOTINFO_HEADER_X86_TSC_FREQ; 384 header.len = sizeof(header) + 4; 385 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 386 extra_bi_offset += sizeof(header); 387 *(uint32_t*)(extra_bi_region.start + extra_bi_offset) = tsc_freq; 388 extra_bi_offset += 4; 389 } 390 391 /* provde a chunk for any leftover padding in the extended boot info */ 392 seL4_BootInfoHeader padding_header; 393 padding_header.id = SEL4_BOOTINFO_HEADER_PADDING; 394 padding_header.len = (extra_bi_region.end - extra_bi_region.start) - extra_bi_offset; 395 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = padding_header; 396 397 /* Construct an initial address space with enough virtual addresses 398 * to cover the user image + ipc buffer and bootinfo frames */ 399 it_vspace_cap = create_it_address_space(root_cnode_cap, it_v_reg); 400 if (cap_get_capType(it_vspace_cap) == cap_null_cap) { 401 return false; 402 } 403 404 /* Create and map bootinfo frame cap */ 405 create_bi_frame_cap( 406 root_cnode_cap, 407 it_vspace_cap, 408 bi_frame_pptr, 409 bi_frame_vptr 410 ); 411 412 /* create and map extra bootinfo region */ 413 extra_bi_ret = 414 create_frames_of_region( 415 root_cnode_cap, 416 it_vspace_cap, 417 extra_bi_region, 418 true, 419 pptr_to_paddr((void*)(extra_bi_region.start - extra_bi_frame_vptr)) 420 ); 421 if (!extra_bi_ret.success) { 422 return false; 423 } 424 ndks_boot.bi_frame->extraBIPages = extra_bi_ret.region; 425 426 /* create the initial thread's IPC buffer */ 427 ipcbuf_cap = create_ipcbuf_frame(root_cnode_cap, it_vspace_cap, ipcbuf_vptr); 428 if (cap_get_capType(ipcbuf_cap) == cap_null_cap) { 429 return false; 430 } 431 432 /* create all userland image frames */ 433 create_frames_ret = 434 create_frames_of_region( 435 root_cnode_cap, 436 it_vspace_cap, 437 ui_reg, 438 true, 439 ui_info.pv_offset 440 ); 441 if (!create_frames_ret.success) { 442 return false; 443 } 444 ndks_boot.bi_frame->userImageFrames = create_frames_ret.region; 445 446 /* create the initial thread's ASID pool */ 447 it_ap_cap = create_it_asid_pool(root_cnode_cap); 448 if (cap_get_capType(it_ap_cap) == cap_null_cap) { 449 return false; 450 } 451 write_it_asid_pool(it_ap_cap, it_vspace_cap); 452 453 /* create the idle thread */ 454 if (!create_idle_thread()) { 455 return false; 456 } 457 458 /* create the initial thread */ 459 tcb_t *initial = create_initial_thread(root_cnode_cap, 460 it_vspace_cap, 461 ui_info.v_entry, 462 bi_frame_vptr, 463 ipcbuf_vptr, 464 ipcbuf_cap); 465 if (initial == NULL) { 466 return false; 467 } 468 init_core_state(initial); 469 470#ifdef CONFIG_IOMMU 471 /* initialise VTD-related data structures and the IOMMUs */ 472 if (!vtd_init(cpu_id, num_drhu, rmrr_list)) { 473 return false; 474 } 475 476 /* write number of IOMMU PT levels into bootinfo */ 477 ndks_boot.bi_frame->numIOPTLevels = x86KSnumIOPTLevels; 478 479 /* write IOSpace master cap */ 480 write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOSpace), master_iospace_cap()); 481#else 482 ndks_boot.bi_frame->numIOPTLevels = -1; 483#endif 484 485 /* create all of the untypeds. Both devices and kernel window memory */ 486 if (!create_untypeds(root_cnode_cap, boot_mem_reuse_reg)) { 487 return false; 488 } 489 /* WARNING: alloc_region() must not be called anymore after here! */ 490 491 /* finalise the bootinfo frame */ 492 bi_finalise(); 493 494 return true; 495} 496 497/* This function initialises the CPU. It does NOT initialise any kernel state. */ 498 499BOOT_CODE bool_t 500init_cpu( 501 bool_t mask_legacy_irqs 502) 503{ 504 /* initialise virtual-memory-related data structures */ 505 if (!init_vm_state()) { 506 return false; 507 } 508 509 /* initialise CPU's descriptor table registers (GDTR, IDTR, LDTR, TR) */ 510 init_dtrs(); 511 512 if (config_set(CONFIG_SYSENTER)) { 513 /* initialise MSRs (needs an initialised TSS) */ 514 init_sysenter_msrs(); 515 } else if (config_set(CONFIG_SYSCALL)) { 516 init_syscall_msrs(); 517 } else { 518 return false; 519 } 520 521 /* setup additional PAT MSR */ 522 if (!init_pat_msr()) { 523 return false; 524 } 525 526 /* enable the Write Protect bit in cr0. This prevents the kernel from writing to 527 * read only memory, which we shouldn't do under correct execution */ 528 write_cr0(read_cr0() | CR0_WRITE_PROTECT); 529 530 /* check for SMAP and SMEP and enable */ 531 cpuid_007h_ebx_t ebx_007; 532 ebx_007.words[0] = x86_cpuid_ebx(0x7, 0); 533 if (cpuid_007h_ebx_get_smap(ebx_007)) { 534 /* if we have user stack trace enabled or dangerous code injection then we cannot 535 * enable this as SMAP will make them fault. */ 536 if (!config_set(CONFIG_PRINTING) && !config_set(CONFIG_DANGEROUS_CODE_INJECTION)) { 537 write_cr4(read_cr4() | CR4_SMAP); 538 } 539 } 540 if (cpuid_007h_ebx_get_smep(ebx_007)) { 541 /* similar to smap we cannot enable smep if using dangerous code injenction. it 542 * does not affect stack trace printing though */ 543 if (!config_set(CONFIG_DANGEROUS_CODE_INJECTION)) { 544 write_cr4(read_cr4() | CR4_SMEP); 545 } 546 } 547 548 if (!init_ibrs()) { 549 return false; 550 } 551 552#ifdef CONFIG_HARDWARE_DEBUG_API 553 /* Initialize hardware breakpoints */ 554 Arch_initHardwareBreakpoints(); 555#endif 556 557 /* initialise floating-point unit */ 558 if (!Arch_initFpu()) { 559 return false; 560 } 561 562 /* initialise local APIC */ 563 if (!apic_init(mask_legacy_irqs)) { 564 return false; 565 } 566 567#ifdef CONFIG_DEBUG_DISABLE_PREFETCHERS 568 if (!disablePrefetchers()) { 569 return false; 570 } 571#endif 572 573 if (config_set(CONFIG_EXPORT_PMC_USER)) { 574 enablePMCUser(); 575 } 576 577#ifdef CONFIG_VTX 578 /* initialise Intel VT-x extensions */ 579 if (!vtx_init()) { 580 return false; 581 } 582#endif 583 584 return true; 585} 586