/**
 * \file
 * \brief x86 kernel bootup code.
 */

/*
 * Copyright (c) 2007, 2008, 2009, 2010, 2013, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <kernel.h>

#include <dispatch.h>
#include <elf/elf.h>
#include <exec.h>
#include <init.h>
#include <getopt/getopt.h>
#include <kcb.h>
#include <kernel_multiboot.h>
#include <irq.h>
#include <kputchar.h>
#include <mdb/mdb_tree.h>
#ifdef CONFIG_MICROBENCHMARKS
#include <microbenchmarks.h>
#endif
#include <paging_kernel_arch.h>
#include <startup.h>
#include <string.h>
#include <wakeup.h>
#include <barrelfish_kpi/cpu.h>
#include <barrelfish_kpi/init.h>
#include <barrelfish_kpi/paging_arch.h>
#include <barrelfish_kpi/syscalls.h>
#include <arch/x86/apic.h>
#include <target/x86/barrelfish_kpi/coredata_target.h>
#include <arch/x86/startup_x86.h>
#include <dev/ia32_dev.h>

/// Optional core ID to use for the BSP core (command-line argument).
/// Zero-initialized; only overrides my_core_id when set non-zero on the
/// kernel command line (see kernel_startup()).
static int bsp_coreid;

/// Quick way to find the base address of a cnode capability
#define CNODE(cte)      get_address(&(cte)->cap)

/// Pointer to bootinfo structure for init; fixed at the well-known
/// BOOTINFO_BASE virtual address shared with userspace.
static struct bootinfo *bootinfo = (struct bootinfo *)BOOTINFO_BASE;

/**
 * Each kernel has a local copy of global and locks. However, during booting and
 * kernel relocation, these are set to point to global of the pristine kernel,
 * so that all the kernels can share it.
 */
static struct global myglobal;
struct global *global = &myglobal;

/**
 * \brief Map init user-space memory.
 *
 * This function maps pages of the init user-space module.
 * It expects the virtual base address 'vbase' of a program segment of the
 * init executable, its size 'size' and its ELF64 access control flags. It
 * maps pages into physical memory that is allocated on the fly and puts
 * corresponding frame caps into init's segcn.
 *
 * \param state  Opaque pointer; actually a struct spawn_state * for init.
 * \param gvbase Virtual base address of program segment.
 * \param size   Size of program segment in bytes.
 * \param flags  ELF64 access control flags of program segment.
 * \param ret    Used to return base region pointer (virtual address of the
 *               mapped segment, including the sub-page offset of gvbase).
 *
 * \return SYS_ERR_OK on success, or an error from create_caps_to_cnode().
 */
errval_t startup_alloc_init(void *state, genvaddr_t gvbase, size_t size,
                            uint32_t flags, void **ret)
{

    errval_t err;

    struct spawn_state *spawn_state = state;

    lvaddr_t vbase = (lvaddr_t)gvbase; /* XXX */
    // Remember the sub-page offset so we can return the exact (unaligned)
    // virtual address of the segment to the caller at the end.
    lvaddr_t offset = BASE_PAGE_OFFSET(vbase);

    /* Page align the parameters */
    paging_align(&vbase, NULL, &size, BASE_PAGE_SIZE);

    // Allocate and map one physical page per virtual page of the segment.
    // pbase records the first allocated physical page.
    // NOTE(review): the caps created below assume the allocator hands out
    // consecutive physical pages across iterations — verify against
    // bsp_alloc_phys()/app_alloc_phys().
    lpaddr_t pbase = 0, paddr = 0;
    for(lvaddr_t i = vbase; i < vbase + size; i += BASE_PAGE_SIZE) {
        // BSP and application cores draw from different physical allocators.
        if (apic_is_bsp()) {
            paddr = bsp_alloc_phys(BASE_PAGE_SIZE);
        } else {
            paddr = app_alloc_phys(BASE_PAGE_SIZE);
        }

        if(pbase == 0) {
            pbase = paddr;
        }

        err = startup_map_init(i, paddr, BASE_PAGE_SIZE, flags);
        assert(err_is_ok(err));
    }

    if (apic_is_bsp()) {
        // Create frame caps for segcn
        // paddr currently points at the START of the last allocated page;
        // advance it so (paddr - pbase) is the total size in bytes.
        paddr += BASE_PAGE_SIZE;

        // NOTE(review): despite the "[a, b]" bracket notation, the second
        // value printed is the SIZE (paddr - pbase), not the end address.
        debug(SUBSYS_STARTUP,
              "Allocated physical memory [0x%"PRIxLPADDR", 0x%"PRIxLPADDR"]\n",
              pbase, paddr - pbase);

        err = create_caps_to_cnode(pbase, paddr - pbase,
                                   RegionType_RootTask, spawn_state, bootinfo);
        if (err_is_fail(err)) {
            return err;
        }
    }

    assert(ret != NULL);
    // Return the virtual address of the segment, restoring the original
    // sub-page offset that paging_align() stripped off vbase.
    *ret = (void *)(vbase + offset);

    return SYS_ERR_OK;
}

/**
 * Setup the module cnode, which contains frame caps to all multiboot modules.
 *
 * Slot 0 of the module cnode receives a Frame cap to a one-page "strings
 * area" holding a copy of every module's command-line string; each module
 * then gets a DevFrame cap (DevFrame to avoid zeroing the module contents)
 * in the following slots, and a matching RegionType_Module entry is appended
 * to the bootinfo regions array. Only called on the BSP (uses
 * bsp_alloc_phys()).
 *
 * \param st Spawn state for init; st->modulecn/modulecn_slot track the cnode.
 */
void create_module_caps(struct spawn_state *st)
{
    errval_t err;

    /* Create caps for multiboot modules */
    struct multiboot_modinfo *module =
        (struct multiboot_modinfo *)local_phys_to_mem(glbl_core_data->mods_addr);

    // Allocate strings area
    lpaddr_t mmstrings_phys = bsp_alloc_phys(BASE_PAGE_SIZE);
    lvaddr_t mmstrings_base = local_phys_to_mem(mmstrings_phys);
    lvaddr_t mmstrings = mmstrings_base;   // bump pointer into the strings area

    // create cap for strings area in first slot of modulecn
    assert(st->modulecn_slot == 0);
    err = caps_create_new(ObjType_Frame, mmstrings_phys, BASE_PAGE_SIZE,
                          BASE_PAGE_SIZE, my_core_id,
                          caps_locate_slot(CNODE(st->modulecn),
                                           st->modulecn_slot++));
    assert(err_is_ok(err));

    /* Walk over multiboot modules, creating frame caps */
    for (int i = 0; i < glbl_core_data->mods_count; i++) {
        struct multiboot_modinfo *m = &module[i];

        // Set memory regions within bootinfo
        struct mem_region *region =
            &bootinfo->regions[bootinfo->regions_length++];

        genpaddr_t remain = MULTIBOOT_MODULE_SIZE(*m);
        genpaddr_t base_addr = local_phys_to_gen_phys(m->mod_start);

        region->mr_type = RegionType_Module;
        region->mr_base = base_addr;
        region->mrmod_slot = st->modulecn_slot;  // first slot containing caps
        region->mrmod_size = remain;             // size of image _in bytes_
        region->mrmod_data = mmstrings - mmstrings_base;  // offset of string in area

        // round up to page size for caps
        remain = ROUND_UP(remain, BASE_PAGE_SIZE);

        // Create max-sized caps to multiboot module in module cnode
        // (modules must be page-aligned for this to work).
        assert((base_addr & BASE_PAGE_MASK) == 0);
        assert((remain & BASE_PAGE_MASK) == 0);

        assert(st->modulecn_slot < cnode_get_slots(&st->modulecn->cap));
        // create as DevFrame cap to avoid zeroing memory contents
        err = caps_create_new(ObjType_DevFrame, base_addr, remain,
                              remain, my_core_id,
                              caps_locate_slot(CNODE(st->modulecn),
                                               st->modulecn_slot++));
        assert(err_is_ok(err));

        // Copy multiboot module string to mmstrings area
        strcpy((char *)mmstrings, MBADDR_ASSTRING(m->string));
        mmstrings += strlen(MBADDR_ASSTRING(m->string)) + 1;
        // All strings must fit in the single page allocated above.
        assert(mmstrings < mmstrings_base + BASE_PAGE_SIZE);
    }
}

/**
 * \brief Normalize the BIOS/multiboot memory map.
 *
 * Copies the raw multiboot memory map into freshly allocated memory, then:
 *  1. sorts entries by base address (bubble sort; input is small and
 *     usually already ordered),
 *  2. merges/splits overlapping entries, giving the entry with the HIGHER
 *     multiboot type number priority over the overlap,
 *  3. page-aligns region boundaries, again rounding in favour of the
 *     higher type.
 *
 * Throughout, multiboot mmap entries are variable-length: each entry's
 * 'size' field does not include the 4-byte size field itself, hence the
 * pervasive "+ 4" when stepping to the next entry.
 *
 * \param mmap_addr       Raw memory map (virtual address).
 * \param new_mmap_addr   Out: address of the cleaned-up map.
 * \param new_mmap_length Out: length in bytes of the cleaned-up map.
 */
void cleanup_bios_regions(char *mmap_addr, char **new_mmap_addr,
                          uint32_t *new_mmap_length)
{
    assert(new_mmap_addr);
    assert(new_mmap_length);

// Debug dump of every entry in a (possibly variable-entry-size) mmap.
#define PRINT_REGIONS(map, map_length) do {\
        for(char * printcur = map; printcur < map + map_length;) {\
            struct multiboot_mmap * printcurmmap = (struct multiboot_mmap * SAFE)TC(printcur);\
            printf("\t0x%08"PRIx64" - 0x%08"PRIx64" Type: %"PRIu32" Length: 0x%"PRIx64"\n", printcurmmap->base_addr, printcurmmap->base_addr + printcurmmap->length, printcurmmap->type, printcurmmap->length);\
            printcur += printcurmmap->size + 4;\
        }\
    } while (0)

    printf("Raw MMAP from BIOS\n");
    PRINT_REGIONS(mmap_addr, glbl_core_data->mmap_length);

    // normalize memory regions: work on a private copy so the raw map stays
    // intact
    lpaddr_t clean_base = bsp_alloc_phys(glbl_core_data->mmap_length);
    char *clean_mmap_addr = (char *)local_phys_to_mem(clean_base);
    uint32_t clean_mmap_length = glbl_core_data->mmap_length;
    memcpy(clean_mmap_addr, mmap_addr, glbl_core_data->mmap_length);

    // first of all, sort regions by base address
    // (ties broken by LONGER region first, so a longer region always
    // precedes a shorter one starting at the same address)
    // yes, it's a bubble sort, but the dataset is small and usually in the right order
    bool swapped;
    do {
        swapped = false;

        for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
            struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
            if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
                break; // do not try to move this check into the forloop as entries do not have to be the same length

            struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

            if (nextmmap->base_addr < curmmap->base_addr ||
                (nextmmap->base_addr == curmmap->base_addr && nextmmap->length > curmmap->length)) {
                // swap — only safe while both entries are the same size
                assert(curmmap->size == 20); // FIXME: The multiboot specification does not require this size
                assert(nextmmap->size == 20);

                struct multiboot_mmap tmp;
                tmp = *curmmap;
                *curmmap = *nextmmap;
                *nextmmap = tmp;

                swapped = true;
            }

            cur += curmmap->size + 4;
        }
    } while(swapped);

    printf("Sorted MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

    // now merge consecutive memory regions of the same or lower type
    for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
        struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
        if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
            break; // do not try to move this check into the forloop as entries do not have to be the same length

        struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

        /* On some machines (brie1) the IOAPIC region is only 1kB.
         * Currently we're not able to map regions that are <4kB so we
         * make sure that every region (if there is no problematic overlap)
         * is at least BASE_PAGE_SIZEd (==4kB) here.
         */
        if ((curmmap->length < BASE_PAGE_SIZE) && (curmmap->base_addr + BASE_PAGE_SIZE <= nextmmap->base_addr)) {
            curmmap->length = BASE_PAGE_SIZE;
        }

// Remove the entry after 'cur' by sliding the tail of the map down over it.
// Mutates clean_mmap_length; 'nextmmap' then aliases whatever slid into place.
#define DISCARD_NEXT_MMAP do {\
    uint32_t discardsize = nextmmap->size + 4;\
    memmove(cur + curmmap->size + 4, cur + curmmap->size + 4 + discardsize, clean_mmap_length - (cur - clean_mmap_addr) - curmmap->size - 4 - discardsize);\
    clean_mmap_length -= discardsize;\
    } while (0)

// Re-sort the tail: bubble the entry after 'cur' rightwards (one pass,
// stopping at the first already-ordered pair) to restore sort order after
// its base_addr was modified.
#define BUBBLE_NEXT_MMAP do {\
    for (char * bubblecur = cur + curmmap->size + 4; bubblecur < clean_mmap_addr + clean_mmap_length;){\
        struct multiboot_mmap * bubblecur_mmap = (struct multiboot_mmap * SAFE)TC(bubblecur);\
        if (bubblecur + bubblecur_mmap->size + 4 >= clean_mmap_addr + clean_mmap_length)\
            break;\
        struct multiboot_mmap * bubblenext_mmap = (struct multiboot_mmap * SAFE)TC(bubblecur + bubblecur_mmap->size + 4);\
        if (bubblenext_mmap->base_addr < bubblecur_mmap->base_addr || (bubblecur_mmap->base_addr == bubblenext_mmap->base_addr && bubblenext_mmap->length > bubblecur_mmap->length)) {\
            struct multiboot_mmap bubbletmp; bubbletmp = *bubblecur_mmap; *bubblecur_mmap = *bubblenext_mmap; *bubblenext_mmap = bubbletmp;\
        } else break;\
    }} while(0)

        // Repeatedly reduce cur/next until no rule applies, then advance.
        bool reduced = false;
        do {
            reduced = false;

            if (curmmap->base_addr == nextmmap->base_addr) {
                // regions start at the same location
                if (curmmap->length == nextmmap->length) {
                    // trivial case. They are the same. Choose higher type and discard next
                    curmmap->type = max(curmmap->type, nextmmap->type);

                    DISCARD_NEXT_MMAP;

                    reduced = true;
                    continue;
                } else {
                    // next region is smaller (we sorted that way)
                    if (nextmmap->type <= curmmap->type) {
                        // next regions type is the same or smaller. discard
                        DISCARD_NEXT_MMAP;

                        reduced = true;
                        continue;
                    } else {
                        // next regions type is higher, so it gets priority
                        // change type of current region and shrink next:
                        // cur becomes the (higher-type) prefix, next becomes
                        // the (original-type) remainder after it
                        uint32_t tmptype = curmmap->type;
                        uint64_t newlength = curmmap->length - nextmmap->length;
                        curmmap->type = nextmmap->type;
                        curmmap->length = nextmmap->length;
                        nextmmap->type = tmptype;
                        nextmmap->base_addr += nextmmap->length;
                        nextmmap->length = newlength;

                        // now we need to bubble next to the right place to restore order
                        BUBBLE_NEXT_MMAP;

                        reduced = true;
                        continue;
                    }
                }
            }

            // regions overlap (next starts strictly inside cur)
            if (nextmmap->base_addr > curmmap->base_addr && nextmmap->base_addr < curmmap->base_addr + curmmap->length) {
                // same type
                if (curmmap->type == nextmmap->type) {
                    // simple. just extend if necessary and discard next
                    if (nextmmap->base_addr + nextmmap->length > curmmap->base_addr + curmmap->length)
                        curmmap->length = (nextmmap->base_addr + nextmmap->length) - curmmap->base_addr;

                    DISCARD_NEXT_MMAP;

                    reduced = true;
                    continue;
                } else {
                    // type is not the same
                    if (nextmmap->base_addr + nextmmap->length < curmmap->base_addr + curmmap->length) {
                        // there is a chunk at the end. create a new region
                        struct multiboot_mmap tmpmmap;
                        tmpmmap.size = 20;
                        tmpmmap.base_addr = nextmmap->base_addr + nextmmap->length;
                        tmpmmap.length = (curmmap->base_addr + curmmap->length) - (nextmmap->base_addr + nextmmap->length);
                        tmpmmap.type = curmmap->type;

                        // move everything to make room
                        // NOTE(review): this assert checks tmpmmap.length
                        // (the region's byte length) against BOOTINFO_SIZE,
                        // but the buffer actually grows by tmpmmap.size + 4
                        // entry bytes — looks like it should be tmpmmap.size;
                        // verify intended bound.
                        assert(clean_mmap_length + tmpmmap.length + 4 < BOOTINFO_SIZE);
                        memmove(cur + curmmap->size + 4 + tmpmmap.size + 4, cur + curmmap->size + 4, clean_mmap_length - ((cur - clean_mmap_addr) + curmmap->size + 4));
                        clean_mmap_length += tmpmmap.size + 4;

                        // insert new
                        *nextmmap = tmpmmap;

                        // restore order
                        BUBBLE_NEXT_MMAP;

                        reduced = true;
                    }

                    // after the previous step, the next region either ends
                    // at the same location as the current or is longer
                    uint64_t overlap = (curmmap->base_addr + curmmap->length) - nextmmap->base_addr;

                    if (curmmap-> type > nextmmap->type) {
                        // current has priority, shrink next and extend current
                        nextmmap->length -= overlap;
                        nextmmap->base_addr += overlap;
                        curmmap->length += overlap;

                        if (nextmmap->length == 0)
                            DISCARD_NEXT_MMAP;

                        reduced = true;
                        continue;
                    } else {
                        // next has priority, shrink current and extend next
                        // (next grows downwards to cover the ceded overlap)
                        nextmmap->length += overlap;
                        nextmmap->base_addr -= overlap;
                        curmmap->length -= overlap;

                        reduced = true;
                        continue;
                    }
                }
            }
        } while (reduced);

        cur += curmmap->size + 4;

#undef DISCARD_NEXT_MMAP
#undef BUBBLE_NEXT_MMAP
    }

    printf("Preprocessed MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

    // we can only map pages. Therefore page align regions
    // (each boundary between cur and next is moved to a page boundary,
    // rounding in favour of the higher-type region; the final entry's end
    // is never reached by this loop)
    for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
        struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
        if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
            break; // do not try to move this check into the forloop as entries do not have to be the same length

        struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

        if (nextmmap->base_addr & BASE_PAGE_MASK) {
            // offset of next's base into its page
            uint64_t offset = nextmmap->base_addr - ((nextmmap->base_addr >> BASE_PAGE_BITS) << BASE_PAGE_BITS);

            // round in favour of higher type
            if (curmmap->type > nextmmap->type) {
                // round the boundary UP: cur keeps the partial page
                curmmap->length += BASE_PAGE_SIZE - offset;
                nextmmap->base_addr += BASE_PAGE_SIZE - offset;
                nextmmap->length -= BASE_PAGE_SIZE - offset;
            } else {
                // round the boundary DOWN: next absorbs the partial page
                curmmap->length -= offset;
                nextmmap->base_addr -= offset;
                nextmmap->length += offset;
            }
        }

        cur += curmmap->size + 4;
    }

    printf("Pagealigned MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

#undef PRINT_REGIONS
    *new_mmap_addr = clean_mmap_addr;
    *new_mmap_length = clean_mmap_length;
    return;
}

// XXX from serial.c
extern int serial_portbase;

/// Kernel command-line arguments recognised by parse_commandline().
static struct cmdarg cmdargs[] = {
    {"loglevel", ArgType_Int, { .integer = &kernel_loglevel }},
    {"logmask", ArgType_Int, { .integer = &kernel_log_subsystem_mask }},
    {"ticks", ArgType_Bool, { .boolean = &kernel_ticks_enabled }},
    {"timeslice", ArgType_UInt, { .uinteger = &config_timeslice }},
    {"serial", ArgType_Int, { .integer = &serial_portbase }},
    {"bsp_coreid", ArgType_Int, { .integer = &bsp_coreid }},
    {NULL, 0, {NULL}}   // sentinel terminating the table
};

/**
 * Name of multiboot module containing program for init domains.
 */
#if defined(__k1om__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "k1om/sbin/init"
#elif defined(__x86_64__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "x86_64/sbin/init"
#elif defined(__i386__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "x86_32/sbin/init"
#else
#       error "Unknown x86"
#endif
#define BSP_INIT_PROG_NAME       "init"
#define APP_INIT_PROG_NAME       "monitor"

/**
 * \brief Kernel's early startup code, called from arch-specific bootstrap.
 *
 * Parses the kernel command line (from the multiboot core data) against
 * the cmdargs table above, filling in loglevel, timeslice, bsp_coreid, etc.
 */
void kernel_startup_early(void)
{
    const char *cmdline;
    assert(glbl_core_data != NULL);
    cmdline = MBADDR_ASSTRING(glbl_core_data->cmdline);
    parse_commandline(cmdline, cmdargs);
}

/**
 * \brief Kernel's main startup code, called from arch-specific bootstrap.
 *
 * On the BSP: sets up the physical allocator, allocates the initial KCB and
 * spawns the "init" domain. On an application core: either restores saved
 * state from a handed-over KCB (re-entering the scheduler directly), or
 * initializes its allocator from the core_data handoff area and spawns the
 * "monitor" domain. Either way it ends in dispatch(), which does not return.
 *
 * This function never returns.
 */
extern bool verbose_dispatch;
void kernel_startup(void)
{
#ifdef CONFIG_MICROBENCHMARKS
    printk(LOG_NOTE, "\nRunning microbenchmarks...\n");
    microbenchmarks_run_all();
#endif

    /* Initialize the core_data */
    /* Used when bringing up other cores, must be at consistent global address
     * seen by all cores: the page immediately below the kernel image */
    struct x86_core_data *core_data
        = (void *)((lvaddr_t)&_start_kernel - BASE_PAGE_SIZE);

    struct dcb *init_dcb;
    if (apic_is_bsp()) {
        // Optional override of the BSP's core ID from the command line
        if (bsp_coreid != 0) {
            my_core_id = bsp_coreid;
        }

        /* Initialize the location to allocate phys memory from */
        bsp_init_alloc_addr = glbl_core_data->start_free_ram;

        /* allocate initial KCB */
        kcb_current = (struct kcb *) local_phys_to_mem(bsp_alloc_phys(sizeof(*kcb_current)));
        memset(kcb_current, 0, sizeof(*kcb_current));
        assert(kcb_current);

        /* spawn init */
        init_dcb = spawn_bsp_init(BSP_INIT_MODULE_PATH);
    } else {
        // kcb_current was handed over as a physical address; translate it
        kcb_current = (struct kcb *)
            local_phys_to_mem((lpaddr_t) kcb_current);

        start_ap_signal();
        // if we have a kernel control block, use it
        if (kcb_current && kcb_current->is_valid) {
            debug(SUBSYS_STARTUP, "have valid kcb, restoring state\n");
            print_kcb();

            // restore mdb
            errval_t err = mdb_init(kcb_current);
            if (err_is_fail(err)) {
                panic("couldn't restore mdb");
            }
            // figure out if we need to convert scheduler state
            // NOTE(review): the #elif below tests CONFIG_SCHEDULER_RBED
            // without defined(); this relies on the macro expanding to a
            // truthy value (undefined macros evaluate to 0 in #elif) —
            // consider #elif defined(CONFIG_SCHEDULER_RBED).
#ifdef CONFIG_SCHEDULER_RR
            if (kcb_current->sched != SCHED_RR) {
                printf("converting scheduler state to RR\n");
                scheduler_convert();
            }
#elif CONFIG_SCHEDULER_RBED
            if (kcb_current->sched != SCHED_RBED) {
                printf("converting scheduler state to RBED\n");
                scheduler_convert();
            }
#else
#error must define scheduler
#endif
            // update core id of domains
            kcb_update_core_id(kcb_current);
            // set queue pointers
            scheduler_restore_state();
            // restore wakeup queue state
            printk(LOG_DEBUG, "%s:%s:%d: kcb_current->wakeup_queue_head = %p\n",
                   __FILE__, __FUNCTION__, __LINE__, kcb_current->wakeup_queue_head);
            wakeup_set_queue_head(kcb_current->wakeup_queue_head);

            printk(LOG_DEBUG, "%s:%s:%d: dcb_current = %p\n",
                   __FILE__, __FUNCTION__, __LINE__, dcb_current);
            // hand control straight to the restored schedule; never returns
            struct dcb *next = schedule();
            debug(SUBSYS_STARTUP, "next = %p\n", next);
            if (next != NULL) {
                assert (next->disp);
                struct dispatcher_shared_generic *dst =
                    get_dispatcher_shared_generic(next->disp);
                debug(SUBSYS_STARTUP, "scheduling '%s' from restored state\n",
                      dst->name);
            }
            // interrupt state should be fine, as it's used directly from the
            // kcb.
            dispatch(next);
            panic("should not get here!");
        }
        // no valid KCB: fresh application-core bringup from core_data
        my_core_id = core_data->dst_core_id;

        /* Initialize the allocator with the window the BSP assigned us */
        app_alloc_phys_start = core_data->memory_base_start;
        app_alloc_phys_end   = ((lpaddr_t)1 << core_data->memory_bits) +
                                    app_alloc_phys_start;

        init_dcb = spawn_app_init(core_data, APP_INIT_PROG_NAME);
    }

    // Should not return
    //if (apic_is_bsp()) {
    dispatch(init_dcb);
    //}
    panic("Error spawning init!");
}

/**
 * \brief Configure the IA32_PAT_MSR register such that PA4 is
 * write-combining and PA5 is write-protect.
 *
 * Uses the Mackerel-generated ia32 device accessors: read-modify-write of
 * the PAT MSR, leaving entries PA0-PA3, PA6 and PA7 at their reset values.
 */
void configure_page_attribute_table(void)
{
    ia32_t ia32;
    ia32_cr_pat_t pat;

    ia32_initialize(&ia32);

    pat = ia32_cr_pat_rd(&ia32);

    pat = ia32_cr_pat_pa4_insert(pat, ia32_wc);
    pat = ia32_cr_pat_pa5_insert(pat, ia32_wp);

    ia32_cr_pat_wr(&ia32, pat);

    debug(SUBSYS_STARTUP, "Configured IA32_PAT_MSR.\n");
}