/**
 * \file
 * \brief x86 kernel bootup code.
 */

/*
 * Copyright (c) 2007, 2008, 2009, 2010, 2013, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <kernel.h>

#include <dispatch.h>
#include <elf/elf.h>
#include <exec.h>
#include <init.h>
#include <getopt/getopt.h>
#include <kcb.h>
#include <kernel_multiboot.h>
#include <irq.h>
#include <kputchar.h>
#include <mdb/mdb_tree.h>
#ifdef CONFIG_MICROBENCHMARKS
#include <microbenchmarks.h>
#endif
#include <paging_kernel_arch.h>
#include <startup.h>
#include <string.h>
#include <wakeup.h>
#include <barrelfish_kpi/cpu.h>
#include <barrelfish_kpi/init.h>
#include <barrelfish_kpi/paging_arch.h>
#include <barrelfish_kpi/syscalls.h>
#include <arch/x86/apic.h>
#include <target/x86/barrelfish_kpi/coredata_target.h>
#include <arch/x86/startup_x86.h>
#include <dev/ia32_dev.h>

/// Optional core ID to use for the BSP core (command-line argument)
static int bsp_coreid;

/// Quick way to find the base address of a cnode capability
#define CNODE(cte)     get_address(&(cte)->cap)

/// Pointer to bootinfo structure for init
static struct bootinfo *bootinfo = (struct bootinfo *)BOOTINFO_BASE;

/**
 * Each kernel has a local copy of the global state and its locks. However,
 * during booting and kernel relocation, these are set to point to the global
 * state of the pristine kernel, so that all kernels can share it.
 */
static struct global myglobal;
struct global *global = &myglobal;

/**
 * \brief Map init user-space memory.
 *
 * This function maps pages of the init user-space module. It expects
 * the virtual base address 'gvbase' of a program segment of the init
 * executable, its size 'size' and its ELF64 access control flags. It maps
 * pages into physical memory that is allocated on the fly and puts the
 * corresponding frame caps into init's segcn.
 *
 * \param state  Spawn state pointer (a struct spawn_state).
 * \param gvbase Virtual base address of program segment.
 * \param size   Size of program segment in bytes.
 * \param flags  ELF64 access control flags of program segment.
 * \param ret    Used to return base region pointer.
 */
errval_t startup_alloc_init(void *state, genvaddr_t gvbase, size_t size,
                            uint32_t flags, void **ret)
{
    errval_t err;

    struct spawn_state *spawn_state = state;

    lvaddr_t vbase = (lvaddr_t)gvbase; /* XXX */
    lvaddr_t offset = BASE_PAGE_OFFSET(vbase);

    /* Page align the parameters */
    paging_align(&vbase, NULL, &size, BASE_PAGE_SIZE);

    lpaddr_t pbase = 0, paddr = 0;
    for (lvaddr_t i = vbase; i < vbase + size; i += BASE_PAGE_SIZE) {
        if (apic_is_bsp()) {
            paddr = bsp_alloc_phys(BASE_PAGE_SIZE);
        } else {
            paddr = app_alloc_phys(BASE_PAGE_SIZE);
        }

        if (pbase == 0) {
            pbase = paddr;
        }

        err = startup_map_init(i, paddr, BASE_PAGE_SIZE, flags);
        assert(err_is_ok(err));
    }

    if (apic_is_bsp()) {
        // Create frame caps for segcn
        paddr += BASE_PAGE_SIZE; // advance to the end of the last allocated page

        debug(SUBSYS_STARTUP,
              "Allocated physical memory base 0x%"PRIxLPADDR", size 0x%"PRIxLPADDR"\n",
              pbase, paddr - pbase);

        err = create_caps_to_cnode(pbase, paddr - pbase,
                                   RegionType_RootTask, spawn_state, bootinfo);
        if (err_is_fail(err)) {
            return err;
        }
    }

    assert(ret != NULL);
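    // hand back the original (possibly unaligned) virtual address, now backed
    // by the freshly mapped pages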
    *ret = (void *)(vbase + offset);

    return SYS_ERR_OK;
}
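
/*
 * Note: startup_alloc_init() is intended as the segment-allocation callback
 * used while loading the init ELF image. A sketch of such a call, assuming an
 * elf_load()-style loader interface and hypothetical binary_base/binary_size
 * variables, would be:
 *
 *   genvaddr_t entry;
 *   err = elf_load(EM_X86_64, startup_alloc_init, &spawn_state,
 *                  binary_base, binary_size, &entry);
 */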

/// Setup the module cnode, which contains frame caps to all multiboot modules
void create_module_caps(struct spawn_state *st)
{
    errval_t err;

    /* Create caps for multiboot modules */
    struct multiboot_modinfo *module =
        (struct multiboot_modinfo *)local_phys_to_mem(glbl_core_data->mods_addr);

    // Allocate strings area
    lpaddr_t mmstrings_phys = bsp_alloc_phys(BASE_PAGE_SIZE);
    lvaddr_t mmstrings_base = local_phys_to_mem(mmstrings_phys);
    lvaddr_t mmstrings = mmstrings_base;
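    // The strings area accumulates each module's multiboot command line;
    // init later locates a module's string through the mrmod_data offset
    // stored in its bootinfo mem_region below.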

    // create cap for strings area in first slot of modulecn
    assert(st->modulecn_slot == 0);
    err = caps_create_new(ObjType_Frame, mmstrings_phys, BASE_PAGE_SIZE,
                          BASE_PAGE_SIZE, my_core_id,
                          caps_locate_slot(CNODE(st->modulecn),
                                           st->modulecn_slot++));
    assert(err_is_ok(err));

    /* Walk over multiboot modules, creating frame caps */
    for (int i = 0; i < glbl_core_data->mods_count; i++) {
        struct multiboot_modinfo *m = &module[i];

        // Set memory regions within bootinfo
        struct mem_region *region =
            &bootinfo->regions[bootinfo->regions_length++];

        genpaddr_t remain = MULTIBOOT_MODULE_SIZE(*m);
        genpaddr_t base_addr = local_phys_to_gen_phys(m->mod_start);

        region->mr_type = RegionType_Module;
        region->mr_base = base_addr;
        region->mrmod_slot = st->modulecn_slot;  // first slot containing caps
        region->mrmod_size = remain;  // size of image _in bytes_
        region->mrmod_data = mmstrings - mmstrings_base; // offset of string in area

        // round up to page size for caps
        remain = ROUND_UP(remain, BASE_PAGE_SIZE);

        // Create max-sized caps to multiboot module in module cnode
        assert((base_addr & BASE_PAGE_MASK) == 0);
        assert((remain & BASE_PAGE_MASK) == 0);

        assert(st->modulecn_slot < cnode_get_slots(&st->modulecn->cap));
        // create as DevFrame cap to avoid zeroing memory contents
        err = caps_create_new(ObjType_DevFrame, base_addr, remain,
                              remain, my_core_id,
                              caps_locate_slot(CNODE(st->modulecn),
                                               st->modulecn_slot++));
        assert(err_is_ok(err));

        // Copy multiboot module string to mmstrings area
        strcpy((char *)mmstrings, MBADDR_ASSTRING(m->string));
        mmstrings += strlen(MBADDR_ASSTRING(m->string)) + 1;
        assert(mmstrings < mmstrings_base + BASE_PAGE_SIZE);
    }
}

void cleanup_bios_regions(char *mmap_addr, char **new_mmap_addr,
                          uint32_t *new_mmap_length)
{
    assert(new_mmap_addr);
    assert(new_mmap_length);
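
    /*
     * Multiboot memory-map entries are variable-sized: each entry's 'size'
     * field gives the number of bytes in the entry excluding the size field
     * itself, which is why every traversal below advances by 'size + 4'.
     */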
#define PRINT_REGIONS(map, map_length) do {\
        for(char * printcur = map; printcur < map + map_length;) {\
            struct multiboot_mmap * printcurmmap = (struct multiboot_mmap * SAFE)TC(printcur);\
            printf("\t0x%08"PRIx64" - 0x%08"PRIx64" Type: %"PRIu32" Length: 0x%"PRIx64"\n",\
                   printcurmmap->base_addr, printcurmmap->base_addr + printcurmmap->length,\
                   printcurmmap->type, printcurmmap->length);\
            printcur += printcurmmap->size + 4;\
        }\
    } while (0)

    printf("Raw MMAP from BIOS\n");
    PRINT_REGIONS(mmap_addr, glbl_core_data->mmap_length);

    // normalize memory regions
    lpaddr_t clean_base = bsp_alloc_phys(glbl_core_data->mmap_length);
    char *clean_mmap_addr = (char *)local_phys_to_mem(clean_base);
    uint32_t clean_mmap_length = glbl_core_data->mmap_length;
    memcpy(clean_mmap_addr, mmap_addr, glbl_core_data->mmap_length);

    // first of all, sort regions by base address
    // yes, it's a bubble sort, but the dataset is small and usually in the right order
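    // resulting order: ascending base address, and for equal base addresses
    // the longer region first (the merge step below relies on this)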
    bool swapped;
    do {
        swapped = false;

        for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
            struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
            if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
                break; // do not move this check into the for loop, as entries do not have to be the same length

            struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

            if (nextmmap->base_addr < curmmap->base_addr ||
                (nextmmap->base_addr == curmmap->base_addr && nextmmap->length > curmmap->length)) {
                // swap
                assert(curmmap->size == 20); // FIXME: the multiboot specification does not require this size
                assert(nextmmap->size == 20);

                struct multiboot_mmap tmp;
                tmp = *curmmap;
                *curmmap = *nextmmap;
                *nextmmap = tmp;

                swapped = true;
            }

            cur += curmmap->size + 4;
        }
    } while(swapped);

    printf("Sorted MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

    // now merge consecutive memory regions of the same or lower type
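    // Each pair of adjacent entries is reduced in two steps: entries that
    // start at the same address, then entries that merely overlap; in both
    // cases the entry with the numerically higher type keeps the contested
    // range.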
    for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
        struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
        if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
            break; // do not move this check into the for loop, as entries do not have to be the same length

        struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

        /* On some machines (brie1) the IOAPIC region is only 1kB.
         * Currently we're not able to map regions that are <4kB, so here we
         * make sure that every region (if there is no problematic overlap)
         * is at least BASE_PAGE_SIZE (== 4kB) long.
         */
        if ((curmmap->length < BASE_PAGE_SIZE) && (curmmap->base_addr + BASE_PAGE_SIZE <= nextmmap->base_addr)) {
            curmmap->length = BASE_PAGE_SIZE;
        }

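// DISCARD_NEXT_MMAP: drop the entry following curmmap by shifting the rest of
// the map down over it and shrinking clean_mmap_length accordingly.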
#define DISCARD_NEXT_MMAP do {\
    uint32_t discardsize = nextmmap->size + 4;\
    memmove(cur + curmmap->size + 4, cur + curmmap->size + 4 + discardsize,\
            clean_mmap_length - (cur - clean_mmap_addr) - curmmap->size - 4 - discardsize);\
    clean_mmap_length -= discardsize;\
    } while (0)

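// BUBBLE_NEXT_MMAP: after nextmmap has been modified, swap it towards the end
// of the map until the sort order (by base address) is restored.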
#define BUBBLE_NEXT_MMAP do {\
    for (char * bubblecur = cur + curmmap->size + 4; bubblecur < clean_mmap_addr + clean_mmap_length;){\
        struct multiboot_mmap * bubblecur_mmap = (struct multiboot_mmap * SAFE)TC(bubblecur);\
        if (bubblecur + bubblecur_mmap->size + 4 >= clean_mmap_addr + clean_mmap_length)\
            break;\
        struct multiboot_mmap * bubblenext_mmap = (struct multiboot_mmap * SAFE)TC(bubblecur + bubblecur_mmap->size + 4);\
        if (bubblenext_mmap->base_addr < bubblecur_mmap->base_addr ||\
            (bubblecur_mmap->base_addr == bubblenext_mmap->base_addr &&\
             bubblenext_mmap->length > bubblecur_mmap->length)) {\
            struct multiboot_mmap bubbletmp;\
            bubbletmp = *bubblecur_mmap;\
            *bubblecur_mmap = *bubblenext_mmap;\
            *bubblenext_mmap = bubbletmp;\
        } else break;\
        /* advance to the swapped entry so it keeps bubbling; without this */\
        /* the pass would stop after a single swap */\
        bubblecur += bubblecur_mmap->size + 4;\
    }} while(0)

        bool reduced = false;
        do {
            reduced = false;

            if (curmmap->base_addr == nextmmap->base_addr) {
                // regions start at the same location
                if (curmmap->length == nextmmap->length) {
                    // trivial case: they are the same. Choose the higher type and discard next
                    curmmap->type = max(curmmap->type, nextmmap->type);

                    DISCARD_NEXT_MMAP;

                    reduced = true;
                    continue;
                } else {
                    // next region is smaller (we sorted that way)
                    if (nextmmap->type <= curmmap->type) {
                        // next region's type is the same or smaller: discard next
                        DISCARD_NEXT_MMAP;

                        reduced = true;
                        continue;
                    } else {
                        // next region's type is higher, so it gets priority:
                        // change the type of the current region and shrink next
                        uint32_t tmptype = curmmap->type;
                        uint64_t newlength = curmmap->length - nextmmap->length;
                        curmmap->type = nextmmap->type;
                        curmmap->length = nextmmap->length;
                        nextmmap->type = tmptype;
                        nextmmap->base_addr += nextmmap->length;
                        nextmmap->length = newlength;

                        // now we need to bubble next to the right place to restore order
                        BUBBLE_NEXT_MMAP;

                        reduced = true;
                        continue;
                    }
                }
            }

            // regions overlap
            if (nextmmap->base_addr > curmmap->base_addr && nextmmap->base_addr < curmmap->base_addr + curmmap->length) {
                // same type
                if (curmmap->type == nextmmap->type) {
                    // simple: just extend if necessary and discard next
                    if (nextmmap->base_addr + nextmmap->length > curmmap->base_addr + curmmap->length)
                        curmmap->length = (nextmmap->base_addr + nextmmap->length) - curmmap->base_addr;

                    DISCARD_NEXT_MMAP;

                    reduced = true;
                    continue;
                } else {
                    // type is not the same
                    if (nextmmap->base_addr + nextmmap->length < curmmap->base_addr + curmmap->length) {
                        // there is a chunk at the end: create a new region for it
                        struct multiboot_mmap tmpmmap;
                        tmpmmap.size = 20;
                        tmpmmap.base_addr = nextmmap->base_addr + nextmmap->length;
                        tmpmmap.length = (curmmap->base_addr + curmmap->length) - (nextmmap->base_addr + nextmmap->length);
                        tmpmmap.type = curmmap->type;

                        // move everything to make room for the new entry
                        assert(clean_mmap_length + tmpmmap.size + 4 < BOOTINFO_SIZE);
                        memmove(cur + curmmap->size + 4 + tmpmmap.size + 4, cur + curmmap->size + 4,
                                clean_mmap_length - ((cur - clean_mmap_addr) + curmmap->size + 4));
                        clean_mmap_length += tmpmmap.size + 4;

                        // insert the new entry
                        *nextmmap = tmpmmap;

                        // restore order
                        BUBBLE_NEXT_MMAP;

                        reduced = true;
                    }

                    // after the previous step, the next region either ends
                    // at the same location as the current or is longer
                    uint64_t overlap = (curmmap->base_addr + curmmap->length) - nextmmap->base_addr;

                    if (curmmap->type > nextmmap->type) {
                        // current has priority and already covers the overlap,
                        // so shrink next to start where current ends
                        nextmmap->length -= overlap;
                        nextmmap->base_addr += overlap;

                        if (nextmmap->length == 0)
                            DISCARD_NEXT_MMAP;

                        reduced = true;
                        continue;
                    } else {
                        // next has priority and already covers the overlap,
                        // so shrink current to end where next starts
                        curmmap->length -= overlap;

                        reduced = true;
                        continue;
                    }
                }
            }
        } while (reduced);

        cur += curmmap->size + 4;

#undef DISCARD_NEXT_MMAP
#undef BUBBLE_NEXT_MMAP
    }

    printf("Preprocessed MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

    // we can only map whole pages, so page-align the regions
    for(char * cur = clean_mmap_addr; cur < clean_mmap_addr + clean_mmap_length;) {
        struct multiboot_mmap * curmmap = (struct multiboot_mmap * SAFE)TC(cur);
        if (cur + curmmap->size + 4 >= clean_mmap_addr + clean_mmap_length)
            break; // do not move this check into the for loop, as entries do not have to be the same length

        struct multiboot_mmap * nextmmap = (struct multiboot_mmap * SAFE)TC(cur + curmmap->size + 4);

        if (nextmmap->base_addr & BASE_PAGE_MASK) {
            uint64_t offset = nextmmap->base_addr & BASE_PAGE_MASK;

            // round in favour of the higher type
            if (curmmap->type > nextmmap->type) {
                curmmap->length += BASE_PAGE_SIZE - offset;
                nextmmap->base_addr += BASE_PAGE_SIZE - offset;
                nextmmap->length -= BASE_PAGE_SIZE - offset;
            } else {
                curmmap->length -= offset;
                nextmmap->base_addr -= offset;
                nextmmap->length += offset;
            }
        }

        cur += curmmap->size + 4;
    }

    printf("Page-aligned MMAP\n");
    PRINT_REGIONS(clean_mmap_addr, clean_mmap_length);

#undef PRINT_REGIONS
    *new_mmap_addr = clean_mmap_addr;
    *new_mmap_length = clean_mmap_length;
}

// XXX from serial.c
extern int serial_portbase;

static struct cmdarg cmdargs[] = {
    {"loglevel", ArgType_Int, { .integer = &kernel_loglevel }},
    {"logmask", ArgType_Int, { .integer = &kernel_log_subsystem_mask }},
    {"ticks", ArgType_Bool, { .boolean = &kernel_ticks_enabled }},
    {"timeslice", ArgType_UInt, { .uinteger = &config_timeslice }},
    {"serial", ArgType_Int, { .integer = &serial_portbase }},
    {"bsp_coreid", ArgType_Int, { .integer = &bsp_coreid }},
    {NULL, 0, {NULL}}
};
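
// For illustration: given this table, a kernel command line like
// "loglevel=2 logmask=0 timeslice=80 bsp_coreid=1" would be a plausible way
// to set these variables (the exact syntax is defined by parse_commandline()
// from the getopt library).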

/**
 * Name of multiboot module containing program for init domains.
 */
#if defined(__k1om__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "k1om/sbin/init"
#elif defined(__x86_64__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "x86_64/sbin/init"
#elif defined(__i386__)
#       define BSP_INIT_MODULE_PATH     BF_BINARY_PREFIX "x86_32/sbin/init"
#else
#       error "Unknown x86"
#endif
#define BSP_INIT_PROG_NAME       "init"
#define APP_INIT_PROG_NAME       "monitor"

/**
 * \brief Kernel's early startup code, called from arch-specific bootstrap.
 */
void kernel_startup_early(void)
{
    const char *cmdline;
    assert(glbl_core_data != NULL);
    cmdline = MBADDR_ASSTRING(glbl_core_data->cmdline);
    parse_commandline(cmdline, cmdargs);
}

extern bool verbose_dispatch;

/**
 * \brief Kernel's main startup code, called from arch-specific bootstrap.
 *
 * This function never returns.
 */
void kernel_startup(void)
{
#ifdef CONFIG_MICROBENCHMARKS
    printk(LOG_NOTE, "\nRunning microbenchmarks...\n");
    microbenchmarks_run_all();
#endif

    /* Initialize the core_data */
    /* Used when bringing up other cores; must be at a consistent global
     * address seen by all cores */
    struct x86_core_data *core_data
        = (void *)((lvaddr_t)&_start_kernel - BASE_PAGE_SIZE);

    struct dcb *init_dcb;
    if (apic_is_bsp()) {
        if (bsp_coreid != 0) {
            my_core_id = bsp_coreid;
        }

        /* Initialize the location to allocate phys memory from */
        bsp_init_alloc_addr = glbl_core_data->start_free_ram;

        /* allocate initial KCB */
        kcb_current = (struct kcb *) local_phys_to_mem(bsp_alloc_phys(sizeof(*kcb_current)));
        assert(kcb_current);
        memset(kcb_current, 0, sizeof(*kcb_current));

        /* spawn init */
        init_dcb = spawn_bsp_init(BSP_INIT_MODULE_PATH);
    } else {
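        // kcb_current arrives as a physical address (set up by the core that
        // booted this one); translate it into the kernel's address space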
        kcb_current = (struct kcb *)
            local_phys_to_mem((lpaddr_t) kcb_current);

        start_ap_signal();
        // if we have a kernel control block, use it
        if (kcb_current && kcb_current->is_valid) {
            debug(SUBSYS_STARTUP, "have valid kcb, restoring state\n");
            print_kcb();

            // restore mdb
            errval_t err = mdb_init(kcb_current);
            if (err_is_fail(err)) {
                panic("couldn't restore mdb");
            }
            // figure out if we need to convert scheduler state
#ifdef CONFIG_SCHEDULER_RR
            if (kcb_current->sched != SCHED_RR) {
                printf("converting scheduler state to RR\n");
                scheduler_convert();
            }
#elif defined(CONFIG_SCHEDULER_RBED)
            if (kcb_current->sched != SCHED_RBED) {
                printf("converting scheduler state to RBED\n");
                scheduler_convert();
            }
#else
#error must define a scheduler
#endif
            // update core id of domains
            kcb_update_core_id(kcb_current);
            // set queue pointers
            scheduler_restore_state();
            // restore wakeup queue state
            printk(LOG_DEBUG, "%s:%s:%d: kcb_current->wakeup_queue_head = %p\n",
                   __FILE__, __FUNCTION__, __LINE__, kcb_current->wakeup_queue_head);
            wakeup_set_queue_head(kcb_current->wakeup_queue_head);

            printk(LOG_DEBUG, "%s:%s:%d: dcb_current = %p\n",
                   __FILE__, __FUNCTION__, __LINE__, dcb_current);
            struct dcb *next = schedule();
            debug(SUBSYS_STARTUP, "next = %p\n", next);
            if (next != NULL) {
                assert(next->disp);
                struct dispatcher_shared_generic *dst =
                    get_dispatcher_shared_generic(next->disp);
                debug(SUBSYS_STARTUP, "scheduling '%s' from restored state\n",
                      dst->name);
            }
            // interrupt state should be fine, as it's used directly from the
            // kcb.
            dispatch(next);
            panic("should not get here!");
        }
        my_core_id = core_data->dst_core_id;

        /* Initialize the allocator */
        app_alloc_phys_start = core_data->memory_base_start;
        app_alloc_phys_end   = ((lpaddr_t)1 << core_data->memory_bits) +
                                    app_alloc_phys_start;

        init_dcb = spawn_app_init(core_data, APP_INIT_PROG_NAME);
    }

    // Should not return
    dispatch(init_dcb);
    panic("Error spawning init!");
}

/*
 * Configure the IA32_PAT_MSR register such that PA4 is write-combining and
 * PA5 is write-protect.
 */
void configure_page_attribute_table(void)
{
    ia32_t ia32;
    ia32_cr_pat_t pat;

    ia32_initialize(&ia32);

    pat = ia32_cr_pat_rd(&ia32);

    pat = ia32_cr_pat_pa4_insert(pat, ia32_wc);
    pat = ia32_cr_pat_pa5_insert(pat, ia32_wp);
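    // Note: on reset the PAT holds WB, WT, UC-, UC twice over (PA0-PA3 and
    // PA4-PA7), so rewriting only PA4/PA5 leaves PA0-PA3, and with them all
    // existing mappings that do not set the PTE PAT bit, unchanged.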

    ia32_cr_pat_wr(&ia32, pat);

    debug(SUBSYS_STARTUP, "Configured IA32_PAT_MSR.\n");
}