1/* 2 * Copyright 2014, General Dynamics C4 Systems 3 * 4 * This software may be distributed and modified according to the terms of 5 * the GNU General Public License version 2. Note that NO WARRANTY is provided. 6 * See "LICENSE_GPLv2.txt" for details. 7 * 8 * @TAG(GD_GPL) 9 */ 10 11#include <config.h> 12#include <kernel/boot.h> 13#include <machine/io.h> 14#include <model/statedata.h> 15#include <object/interrupt.h> 16#include <arch/object/interrupt.h> 17#include <arch/machine.h> 18#include <arch/kernel/apic.h> 19#include <arch/kernel/boot.h> 20#include <arch/kernel/boot_sys.h> 21#include <arch/kernel/vspace.h> 22#include <machine/fpu.h> 23#include <arch/machine/timer.h> 24#include <arch/object/ioport.h> 25#include <linker.h> 26#include <util.h> 27 28#include <plat/machine/intel-vtd.h> 29 30/* functions exactly corresponding to abstract specification */ 31 32BOOT_CODE static void 33init_irqs(cap_t root_cnode_cap) 34{ 35 irq_t i; 36 37 for (i = 0; i <= maxIRQ; i++) { 38 if (i == irq_timer) { 39 setIRQState(IRQTimer, i); 40#ifdef ENABLE_SMP_SUPPORT 41 } else if (i == irq_remote_call_ipi || i == irq_reschedule_ipi) { 42 setIRQState(IRQIPI, i); 43#endif /* ENABLE_SMP_SUPPORT */ 44#ifdef CONFIG_IOMMU 45 } else if (i == irq_iommu) { 46 setIRQState(IRQReserved, i); 47#endif 48 } else if (i == 2 && config_set(CONFIG_IRQ_PIC)) { 49 /* cascaded legacy PIC */ 50 setIRQState(IRQReserved, i); 51 } else if (i >= irq_isa_min && i <= irq_isa_max) { 52 if (config_set(CONFIG_IRQ_PIC)) { 53 setIRQState(IRQInactive, i); 54 } else { 55 setIRQState(IRQReserved, i); 56 } 57 } else if (i >= irq_user_min && i <= irq_user_max) { 58 if (config_set(CONFIG_IRQ_IOAPIC)) { 59 setIRQState(IRQInactive, i); 60 } else { 61 setIRQState(IRQReserved, i); 62 } 63 } else { 64 setIRQState(IRQReserved, i); 65 } 66 } 67 Arch_irqStateInit(); 68 /* provide the IRQ control cap */ 69 write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIRQControl), cap_irq_control_cap_new()); 70} 71 72/* The maximum number of reserved regions we have is 1 for each physical memory region (+ MAX_NUM_FREEMEM_REG) 73 * plus 1 for each kernel device. For kernel devices we have the ioapics (+ CONFIG_MAX_NUM_IOAPIC), 74 * iommus (+ MAX_NUM_DRHU), apic (+ 1) and the reserved MSI region (+ 1) */ 75#define NUM_RESERVED_REGIONS (MAX_NUM_FREEMEM_REG + CONFIG_MAX_NUM_IOAPIC + MAX_NUM_DRHU + 2) 76typedef struct allocated_p_region { 77 p_region_t regs[NUM_RESERVED_REGIONS]; 78 word_t cur_pos; 79} allocated_p_region_t; 80 81BOOT_BSS static allocated_p_region_t allocated_p_regions; 82 83BOOT_CODE static void 84merge_regions(void) 85{ 86 unsigned int i, j; 87 /* Walk through all the regions and see if any can get merged */ 88 for (i = 1; i < allocated_p_regions.cur_pos;) { 89 if (allocated_p_regions.regs[i - 1].end == allocated_p_regions.regs[i].start) { 90 /* move this down */ 91 allocated_p_regions.regs[i - 1].end = allocated_p_regions.regs[i].end; 92 /* fill the rest down */ 93 for (j = i; j < allocated_p_regions.cur_pos - 1; j++) { 94 allocated_p_regions.regs[j] = allocated_p_regions.regs[j + 1]; 95 } 96 allocated_p_regions.cur_pos--; 97 /* don't increment 'i' since we want to recheck that the 98 * region we just moved to this slot doesn't also need merging */ 99 } else { 100 i++; 101 } 102 } 103} 104 105static UNUSED BOOT_CODE bool_t p_region_overlaps(p_region_t reg) 106{ 107 unsigned int i; 108 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 109 if (allocated_p_regions.regs[i].start < reg.end && 110 allocated_p_regions.regs[i].end > reg.start) { 111 return true; 112 } 113 } 114 return false; 115} 116 117BOOT_CODE bool_t 118add_allocated_p_region(p_region_t reg) 119{ 120 unsigned int i, j; 121 122 assert(reg.start <= reg.end); 123 assert(!p_region_overlaps(reg)); 124 125 /* Walk the existing regions and see if we can merge with an existing 126 * region, or insert in order */ 127 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 128 /* see if we can merge before or after this region */ 129 if (allocated_p_regions.regs[i].end == reg.start) { 130 allocated_p_regions.regs[i].end = reg.end; 131 merge_regions(); 132 return true; 133 } 134 if (allocated_p_regions.regs[i].start == reg.end) { 135 allocated_p_regions.regs[i].start = reg.start; 136 merge_regions(); 137 return true; 138 } 139 /* see if this new one should be inserted before */ 140 if (reg.end < allocated_p_regions.regs[i].start) { 141 /* ensure there's space to bump the regions up */ 142 if (allocated_p_regions.cur_pos + 1 == NUM_RESERVED_REGIONS) { 143 printf("Ran out of reserved physical regions\n"); 144 return false; 145 } 146 /* Copy the regions up to make a gap */ 147 for (j = allocated_p_regions.cur_pos; j != i; j--) { 148 allocated_p_regions.regs[j] = allocated_p_regions.regs[j - 1]; 149 } 150 /* Put this region in the gap */ 151 allocated_p_regions.regs[i] = reg; 152 allocated_p_regions.cur_pos++; 153 return true; 154 } 155 } 156 157 /* nothing else matched, put this one at the end */ 158 if (i + 1 == NUM_RESERVED_REGIONS) { 159 printf("Ran out of reserved physical regions\n"); 160 return false; 161 } 162 allocated_p_regions.regs[i] = reg; 163 allocated_p_regions.cur_pos = i + 1; 164 return true; 165} 166 167BOOT_CODE void 168init_allocated_p_regions() 169{ 170 allocated_p_regions.cur_pos = 0; 171} 172 173BOOT_CODE static bool_t 174create_untypeds( 175 cap_t root_cnode_cap, 176 region_t boot_mem_reuse_reg) 177{ 178 seL4_SlotPos slot_pos_before; 179 seL4_SlotPos slot_pos_after; 180 word_t i; 181 182 paddr_t start = 0; 183 184 slot_pos_before = ndks_boot.slot_pos_cur; 185 create_kernel_untypeds(root_cnode_cap, boot_mem_reuse_reg, slot_pos_before); 186 187 for (i = 0; i < allocated_p_regions.cur_pos; i++) { 188 if (start != allocated_p_regions.regs[i].start) { 189 if (!create_untypeds_for_region(root_cnode_cap, true, 190 paddr_to_pptr_reg((p_region_t) { 191 start, allocated_p_regions.regs[i].start 192 }), 193 slot_pos_before)) { 194 return false; 195 } 196 } 197 start = allocated_p_regions.regs[i].end; 198 } 199 200 if (start != PADDR_USER_DEVICE_TOP) { 201 if (!create_untypeds_for_region(root_cnode_cap, true, 202 paddr_to_pptr_reg((p_region_t) { 203 start, PADDR_USER_DEVICE_TOP 204 }), 205 slot_pos_before)) { 206 return false; 207 } 208 } 209 210 slot_pos_after = ndks_boot.slot_pos_cur; 211 ndks_boot.bi_frame->untyped = (seL4_SlotRegion) { 212 slot_pos_before, slot_pos_after 213 }; 214 return true; 215} 216 217BOOT_CODE static void 218init_freemem(p_region_t ui_p_reg, mem_p_regs_t mem_p_regs) 219{ 220 word_t i; 221 /* we are guaranteed that we started loading the user image after the kernel 222 * so we only include addresses above ui_info.p_reg.end */ 223 pptr_t floor = ui_p_reg.end; 224 for (i = 0; i < MAX_NUM_FREEMEM_REG; i++) { 225 ndks_boot.freemem[i] = REG_EMPTY; 226 } 227 for (i = 0; i < mem_p_regs.count; i++) { 228 pptr_t start = mem_p_regs.list[i].start; 229 pptr_t end = mem_p_regs.list[i].end; 230 if (start < floor) { 231 start = floor; 232 } 233 if (end < floor) { 234 end = floor; 235 } 236 insert_region(paddr_to_pptr_reg((p_region_t) { 237 start, end 238 })); 239 } 240} 241 242/* This function initialises a node's kernel state. It does NOT initialise the CPU. */ 243 244BOOT_CODE bool_t 245init_sys_state( 246 cpu_id_t cpu_id, 247 mem_p_regs_t mem_p_regs, 248 ui_info_t ui_info, 249 p_region_t boot_mem_reuse_p_reg, 250 /* parameters below not modeled in abstract specification */ 251 uint32_t num_drhu, 252 paddr_t* drhu_list, 253 acpi_rmrr_list_t *rmrr_list, 254 acpi_rsdp_t *acpi_rsdp, 255 seL4_X86_BootInfo_VBE *vbe, 256 seL4_X86_BootInfo_mmap_t *mb_mmap, 257 seL4_X86_BootInfo_fb_t *fb_info 258) 259{ 260 cap_t root_cnode_cap; 261 vptr_t extra_bi_frame_vptr; 262 vptr_t bi_frame_vptr; 263 vptr_t ipcbuf_vptr; 264 cap_t it_vspace_cap; 265 cap_t it_ap_cap; 266 cap_t ipcbuf_cap; 267 pptr_t bi_frame_pptr; 268 word_t extra_bi_size = sizeof(seL4_BootInfoHeader); 269 region_t extra_bi_region; 270 pptr_t extra_bi_offset = 0; 271 uint32_t tsc_freq; 272 create_frames_of_region_ret_t create_frames_ret; 273 create_frames_of_region_ret_t extra_bi_ret; 274 275 /* convert from physical addresses to kernel pptrs */ 276 region_t ui_reg = paddr_to_pptr_reg(ui_info.p_reg); 277 region_t boot_mem_reuse_reg = paddr_to_pptr_reg(boot_mem_reuse_p_reg); 278 279 /* convert from physical addresses to userland vptrs */ 280 v_region_t ui_v_reg; 281 v_region_t it_v_reg; 282 ui_v_reg.start = ui_info.p_reg.start - ui_info.pv_offset; 283 ui_v_reg.end = ui_info.p_reg.end - ui_info.pv_offset; 284 285 ipcbuf_vptr = ui_v_reg.end; 286 bi_frame_vptr = ipcbuf_vptr + BIT(PAGE_BITS); 287 extra_bi_frame_vptr = bi_frame_vptr + BIT(PAGE_BITS); 288 289 if (vbe->vbeMode != -1) { 290 extra_bi_size += sizeof(seL4_X86_BootInfo_VBE); 291 } 292 if (acpi_rsdp) { 293 extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*acpi_rsdp); 294 } 295 if (fb_info && fb_info->addr) { 296 extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*fb_info); 297 } 298 299 word_t mb_mmap_size = sizeof(seL4_X86_BootInfo_mmap_t); 300 extra_bi_size += mb_mmap_size; 301 302 // room for tsc frequency 303 extra_bi_size += sizeof(seL4_BootInfoHeader) + 4; 304 305 /* The region of the initial thread is the user image + ipcbuf and boot info */ 306 it_v_reg.start = ui_v_reg.start; 307 it_v_reg.end = ROUND_UP(extra_bi_frame_vptr + extra_bi_size, PAGE_BITS); 308 309 init_freemem(ui_info.p_reg, mem_p_regs); 310 311 /* create the root cnode */ 312 root_cnode_cap = create_root_cnode(); 313 314 /* create the IO port cap */ 315 write_slot( 316 SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOPortControl), 317 cap_io_port_control_cap_new() 318 ); 319 320 /* create the cap for managing thread domains */ 321 create_domain_cap(root_cnode_cap); 322 323 /* create the IRQ CNode */ 324 if (!create_irq_cnode()) { 325 return false; 326 } 327 328 /* initialise the IRQ states and provide the IRQ control cap */ 329 init_irqs(root_cnode_cap); 330 331 tsc_freq = tsc_init(); 332 333 /* create the bootinfo frame */ 334 bi_frame_pptr = allocate_bi_frame(0, ksNumCPUs, ipcbuf_vptr); 335 if (!bi_frame_pptr) { 336 return false; 337 } 338 339 extra_bi_region = allocate_extra_bi_region(extra_bi_size); 340 if (extra_bi_region.start == 0) { 341 return false; 342 } 343 344 /* populate vbe info block */ 345 if (vbe->vbeMode != -1) { 346 vbe->header.id = SEL4_BOOTINFO_HEADER_X86_VBE; 347 vbe->header.len = sizeof(seL4_X86_BootInfo_VBE); 348 memcpy((void*)(extra_bi_region.start + extra_bi_offset), vbe, sizeof(seL4_X86_BootInfo_VBE)); 349 extra_bi_offset += sizeof(seL4_X86_BootInfo_VBE); 350 } 351 352 /* populate acpi rsdp block */ 353 if (acpi_rsdp) { 354 seL4_BootInfoHeader header; 355 header.id = SEL4_BOOTINFO_HEADER_X86_ACPI_RSDP; 356 header.len = sizeof(header) + sizeof(*acpi_rsdp); 357 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 358 extra_bi_offset += sizeof(header); 359 memcpy((void*)(extra_bi_region.start + extra_bi_offset), acpi_rsdp, sizeof(*acpi_rsdp)); 360 extra_bi_offset += sizeof(*acpi_rsdp); 361 } 362 363 /* populate framebuffer information block */ 364 if (fb_info && fb_info->addr) { 365 seL4_BootInfoHeader header; 366 header.id = SEL4_BOOTINFO_HEADER_X86_FRAMEBUFFER; 367 header.len = sizeof(header) + sizeof(*fb_info); 368 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 369 extra_bi_offset += sizeof(header); 370 memcpy((void*)(extra_bi_region.start + extra_bi_offset), fb_info, sizeof(*fb_info)); 371 extra_bi_offset += sizeof(*fb_info); 372 } 373 374 /* populate multiboot mmap block */ 375 mb_mmap->header.id = SEL4_BOOTINFO_HEADER_X86_MBMMAP; 376 mb_mmap->header.len = mb_mmap_size; 377 memcpy((void*)(extra_bi_region.start + extra_bi_offset), mb_mmap, mb_mmap_size); 378 extra_bi_offset += mb_mmap_size; 379 380 /* populate tsc frequency block */ 381 { 382 seL4_BootInfoHeader header; 383 header.id = SEL4_BOOTINFO_HEADER_X86_TSC_FREQ; 384 header.len = sizeof(header) + 4; 385 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header; 386 extra_bi_offset += sizeof(header); 387 *(uint32_t*)(extra_bi_region.start + extra_bi_offset) = tsc_freq; 388 extra_bi_offset += 4; 389 } 390 391 /* provde a chunk for any leftover padding in the extended boot info */ 392 seL4_BootInfoHeader padding_header; 393 padding_header.id = SEL4_BOOTINFO_HEADER_PADDING; 394 padding_header.len = (extra_bi_region.end - extra_bi_region.start) - extra_bi_offset; 395 *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = padding_header; 396 memcpy((void*)(extra_bi_region.start + extra_bi_offset), &padding_header, sizeof(seL4_BootInfoHeader)); 397 /* set up sched control for each core */ 398 init_sched_control(root_cnode_cap, CONFIG_MAX_NUM_NODES); 399 400 /* Construct an initial address space with enough virtual addresses 401 * to cover the user image + ipc buffer and bootinfo frames */ 402 it_vspace_cap = create_it_address_space(root_cnode_cap, it_v_reg); 403 if (cap_get_capType(it_vspace_cap) == cap_null_cap) { 404 return false; 405 } 406 407 /* Create and map bootinfo frame cap */ 408 create_bi_frame_cap( 409 root_cnode_cap, 410 it_vspace_cap, 411 bi_frame_pptr, 412 bi_frame_vptr 413 ); 414 415 /* create and map extra bootinfo region */ 416 extra_bi_ret = 417 create_frames_of_region( 418 root_cnode_cap, 419 it_vspace_cap, 420 extra_bi_region, 421 true, 422 pptr_to_paddr((void*)(extra_bi_region.start - extra_bi_frame_vptr)) 423 ); 424 if (!extra_bi_ret.success) { 425 return false; 426 } 427 ndks_boot.bi_frame->extraBIPages = extra_bi_ret.region; 428 429 /* create the initial thread's IPC buffer */ 430 ipcbuf_cap = create_ipcbuf_frame(root_cnode_cap, it_vspace_cap, ipcbuf_vptr); 431 if (cap_get_capType(ipcbuf_cap) == cap_null_cap) { 432 return false; 433 } 434 435 /* create all userland image frames */ 436 create_frames_ret = 437 create_frames_of_region( 438 root_cnode_cap, 439 it_vspace_cap, 440 ui_reg, 441 true, 442 ui_info.pv_offset 443 ); 444 if (!create_frames_ret.success) { 445 return false; 446 } 447 ndks_boot.bi_frame->userImageFrames = create_frames_ret.region; 448 449 /* create the initial thread's ASID pool */ 450 it_ap_cap = create_it_asid_pool(root_cnode_cap); 451 if (cap_get_capType(it_ap_cap) == cap_null_cap) { 452 return false; 453 } 454 write_it_asid_pool(it_ap_cap, it_vspace_cap); 455 456 NODE_STATE(ksCurTime) = getCurrentTime(); 457 458 /* create the idle thread */ 459 if (!create_idle_thread()) { 460 return false; 461 } 462 463 /* create the initial thread */ 464 tcb_t *initial = create_initial_thread(root_cnode_cap, 465 it_vspace_cap, 466 ui_info.v_entry, 467 bi_frame_vptr, 468 ipcbuf_vptr, 469 ipcbuf_cap); 470 if (initial == NULL) { 471 return false; 472 } 473 init_core_state(initial); 474 475#ifdef CONFIG_IOMMU 476 /* initialise VTD-related data structures and the IOMMUs */ 477 if (!vtd_init(cpu_id, num_drhu, rmrr_list)) { 478 return false; 479 } 480 481 /* write number of IOMMU PT levels into bootinfo */ 482 ndks_boot.bi_frame->numIOPTLevels = x86KSnumIOPTLevels; 483 484 /* write IOSpace master cap */ 485 write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOSpace), master_iospace_cap()); 486#else 487 ndks_boot.bi_frame->numIOPTLevels = -1; 488#endif 489 490 /* create all of the untypeds. Both devices and kernel window memory */ 491 if (!create_untypeds(root_cnode_cap, boot_mem_reuse_reg)) { 492 return false; 493 } 494 /* WARNING: alloc_region() must not be called anymore after here! */ 495 496 /* finalise the bootinfo frame */ 497 bi_finalise(); 498 499 return true; 500} 501 502/* This function initialises the CPU. It does NOT initialise any kernel state. */ 503 504BOOT_CODE bool_t 505init_cpu( 506 bool_t mask_legacy_irqs 507) 508{ 509 /* initialise virtual-memory-related data structures */ 510 if (!init_vm_state()) { 511 return false; 512 } 513 514 /* initialise CPU's descriptor table registers (GDTR, IDTR, LDTR, TR) */ 515 init_dtrs(); 516 517 if (config_set(CONFIG_SYSENTER)) { 518 /* initialise MSRs (needs an initialised TSS) */ 519 init_sysenter_msrs(); 520 } else if (config_set(CONFIG_SYSCALL)) { 521 init_syscall_msrs(); 522 } else { 523 return false; 524 } 525 526 /* setup additional PAT MSR */ 527 if (!init_pat_msr()) { 528 return false; 529 } 530 531 /* enable the Write Protect bit in cr0. This prevents the kernel from writing to 532 * read only memory, which we shouldn't do under correct execution */ 533 write_cr0(read_cr0() | CR0_WRITE_PROTECT); 534 535 /* check for SMAP and SMEP and enable */ 536 cpuid_007h_ebx_t ebx_007; 537 ebx_007.words[0] = x86_cpuid_ebx(0x7, 0); 538 if (cpuid_007h_ebx_get_smap(ebx_007)) { 539 /* if we have user stack trace enabled or dangerous code injection then we cannot 540 * enable this as SMAP will make them fault. */ 541 if (!config_set(CONFIG_PRINTING) && !config_set(CONFIG_DANGEROUS_CODE_INJECTION)) { 542 write_cr4(read_cr4() | CR4_SMAP); 543 } 544 } 545 if (cpuid_007h_ebx_get_smep(ebx_007)) { 546 /* similar to smap we cannot enable smep if using dangerous code injenction. it 547 * does not affect stack trace printing though */ 548 if (!config_set(CONFIG_DANGEROUS_CODE_INJECTION)) { 549 write_cr4(read_cr4() | CR4_SMEP); 550 } 551 } 552 553 if (!init_ibrs()) { 554 return false; 555 } 556 557#ifdef CONFIG_HARDWARE_DEBUG_API 558 /* Initialize hardware breakpoints */ 559 Arch_initHardwareBreakpoints(); 560#endif 561 562 /* initialise floating-point unit */ 563 if (!Arch_initFpu()) { 564 return false; 565 } 566 567 /* initialise local APIC */ 568 if (!apic_init(mask_legacy_irqs)) { 569 return false; 570 } 571 572#ifdef CONFIG_DEBUG_DISABLE_PREFETCHERS 573 if (!disablePrefetchers()) { 574 return false; 575 } 576#endif 577 578 if (config_set(CONFIG_EXPORT_PMC_USER)) { 579 enablePMCUser(); 580 } 581 582#ifdef CONFIG_VTX 583 /* initialise Intel VT-x extensions */ 584 if (!vtx_init()) { 585 return false; 586 } 587#endif 588 589 return true; 590} 591