/*
 * Copyright 2014, General Dynamics C4 Systems
 *
 * This software may be distributed and modified according to the terms of
 * the GNU General Public License version 2. Note that NO WARRANTY is provided.
 * See "LICENSE_GPLv2.txt" for details.
 *
 * @TAG(GD_GPL)
 */

#include <config.h>
#include <kernel/boot.h>
#include <machine/io.h>
#include <model/statedata.h>
#include <object/interrupt.h>
#include <arch/object/interrupt.h>
#include <arch/machine.h>
#include <arch/kernel/apic.h>
#include <arch/kernel/boot.h>
#include <arch/kernel/boot_sys.h>
#include <arch/kernel/vspace.h>
#include <machine/fpu.h>
#include <arch/machine/timer.h>
#include <arch/object/ioport.h>
#include <linker.h>
#include <util.h>

#include <plat/machine/intel-vtd.h>

/* functions exactly corresponding to abstract specification */

BOOT_CODE static void
init_irqs(cap_t root_cnode_cap)
{
    irq_t i;

    for (i = 0; i <= maxIRQ; i++) {
        if (i == irq_timer) {
            setIRQState(IRQTimer, i);
#ifdef ENABLE_SMP_SUPPORT
        } else if (i == irq_remote_call_ipi || i == irq_reschedule_ipi) {
            setIRQState(IRQIPI, i);
#endif /* ENABLE_SMP_SUPPORT */
#ifdef CONFIG_IOMMU
        } else if (i == irq_iommu) {
            setIRQState(IRQReserved, i);
#endif
        } else if (i == 2 && config_set(CONFIG_IRQ_PIC)) {
            /* cascaded legacy PIC */
            setIRQState(IRQReserved, i);
        } else if (i >= irq_isa_min && i <= irq_isa_max) {
            if (config_set(CONFIG_IRQ_PIC)) {
                setIRQState(IRQInactive, i);
            } else {
                setIRQState(IRQReserved, i);
            }
        } else if (i >= irq_user_min && i <= irq_user_max) {
            if (config_set(CONFIG_IRQ_IOAPIC)) {
                setIRQState(IRQInactive, i);
            } else {
                setIRQState(IRQReserved, i);
            }
        } else {
            setIRQState(IRQReserved, i);
        }
    }
    Arch_irqStateInit();
    /* provide the IRQ control cap */
    write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIRQControl), cap_irq_control_cap_new());
}
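
/* Note: the IRQ control cap installed above is what user level later uses to
 * derive per-IRQ handler caps for the interrupts left IRQInactive here, e.g.
 * via the seL4_IRQControl invocations in libsel4 (such as
 * seL4_IRQControl_GetIOAPIC on x86 IOAPIC configurations). */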

/* The maximum number of reserved regions is one for each physical memory
 * region (+ MAX_NUM_FREEMEM_REG) plus one for each kernel device: the IOAPICs
 * (+ CONFIG_MAX_NUM_IOAPIC), the IOMMUs (+ MAX_NUM_DRHU), the APIC (+ 1) and
 * the reserved MSI region (+ 1). */
#define NUM_RESERVED_REGIONS    (MAX_NUM_FREEMEM_REG + CONFIG_MAX_NUM_IOAPIC + MAX_NUM_DRHU + 2)
typedef struct allocated_p_region {
    p_region_t  regs[NUM_RESERVED_REGIONS];
    word_t      cur_pos;
} allocated_p_region_t;
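/* Entries in regs[] are kept sorted by start address and non-overlapping:
 * add_allocated_p_region() inserts in order and merge_regions() coalesces
 * adjacent entries. */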

BOOT_BSS static allocated_p_region_t allocated_p_regions;

BOOT_CODE static void
merge_regions(void)
{
    unsigned int i, j;
    /* Walk through all the regions and see if any can get merged */
    for (i = 1; i < allocated_p_regions.cur_pos;) {
        if (allocated_p_regions.regs[i - 1].end == allocated_p_regions.regs[i].start) {
            /* move this down */
            allocated_p_regions.regs[i - 1].end = allocated_p_regions.regs[i].end;
            /* fill the rest down */
            for (j = i; j < allocated_p_regions.cur_pos - 1; j++) {
                allocated_p_regions.regs[j] = allocated_p_regions.regs[j + 1];
            }
            allocated_p_regions.cur_pos--;
            /* don't increment 'i' since we want to recheck that the
             * region we just moved to this slot doesn't also need merging */
        } else {
            i++;
        }
    }
}

static UNUSED BOOT_CODE bool_t p_region_overlaps(p_region_t reg)
{
    unsigned int i;
    for (i = 0; i < allocated_p_regions.cur_pos; i++) {
        if (allocated_p_regions.regs[i].start < reg.end &&
                allocated_p_regions.regs[i].end > reg.start) {
            return true;
        }
    }
    return false;
}
116
117BOOT_CODE bool_t
118add_allocated_p_region(p_region_t reg)
119{
120    unsigned int i, j;
121
122    assert(reg.start <= reg.end);
123    assert(!p_region_overlaps(reg));
124
125    /* Walk the existing regions and see if we can merge with an existing
126     * region, or insert in order */
127    for (i = 0; i < allocated_p_regions.cur_pos; i++) {
128        /* see if we can merge before or after this region */
129        if (allocated_p_regions.regs[i].end == reg.start) {
130            allocated_p_regions.regs[i].end = reg.end;
131            merge_regions();
132            return true;
133        }
134        if (allocated_p_regions.regs[i].start == reg.end) {
135            allocated_p_regions.regs[i].start = reg.start;
136            merge_regions();
137            return true;
138        }
139        /* see if this new one should be inserted before */
140        if (reg.end < allocated_p_regions.regs[i].start) {
141            /* ensure there's space to bump the regions up */
142            if (allocated_p_regions.cur_pos + 1 == NUM_RESERVED_REGIONS) {
143                printf("Ran out of reserved physical regions\n");
144                return false;
145            }
146            /* Copy the regions up to make a gap */
147            for (j = allocated_p_regions.cur_pos; j != i; j--) {
148                allocated_p_regions.regs[j] = allocated_p_regions.regs[j - 1];
149            }
150            /* Put this region in the gap */
151            allocated_p_regions.regs[i] = reg;
152            allocated_p_regions.cur_pos++;
153            return true;
154        }
155    }
156
157    /* nothing else matched, put this one at the end */
158    if (i + 1 == NUM_RESERVED_REGIONS) {
159        printf("Ran out of reserved physical regions\n");
160        return false;
161    }
162    allocated_p_regions.regs[i] = reg;
163    allocated_p_regions.cur_pos = i + 1;
164    return true;
165}
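
/* Regions recorded here (kernel device frames and other physical memory
 * claimed by the kernel) are excluded from the device untypeds handed to
 * user level in create_untypeds() below. */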

BOOT_CODE void
init_allocated_p_regions(void)
{
    allocated_p_regions.cur_pos = 0;
}

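/* Create this node's untyped caps: device untypeds covering the parts of the
 * physical address space below PADDR_USER_DEVICE_TOP that were not allocated
 * to the kernel (i.e. the gaps between allocated_p_regions), plus the regular
 * untypeds produced by create_kernel_untypeds(). */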
BOOT_CODE static bool_t
create_untypeds(
    cap_t root_cnode_cap,
    region_t boot_mem_reuse_reg)
{
    seL4_SlotPos     slot_pos_before;
    seL4_SlotPos     slot_pos_after;
    word_t      i;

    paddr_t     start = 0;

    slot_pos_before = ndks_boot.slot_pos_cur;
    create_kernel_untypeds(root_cnode_cap, boot_mem_reuse_reg, slot_pos_before);

    for (i = 0; i < allocated_p_regions.cur_pos; i++) {
        if (start != allocated_p_regions.regs[i].start) {
            if (!create_untypeds_for_region(root_cnode_cap, true,
                                            paddr_to_pptr_reg((p_region_t) {
                                                start, allocated_p_regions.regs[i].start
                                            }),
                                            slot_pos_before)) {
                return false;
            }
        }
        start = allocated_p_regions.regs[i].end;
    }

    if (start != PADDR_USER_DEVICE_TOP) {
        if (!create_untypeds_for_region(root_cnode_cap, true,
                                        paddr_to_pptr_reg((p_region_t) {
                                            start, PADDR_USER_DEVICE_TOP
                                        }),
                                        slot_pos_before)) {
            return false;
        }
    }

    slot_pos_after = ndks_boot.slot_pos_cur;
    ndks_boot.bi_frame->untyped = (seL4_SlotRegion) {
        slot_pos_before, slot_pos_after
    };
    return true;
}

BOOT_CODE static void
init_freemem(p_region_t ui_p_reg, mem_p_regs_t mem_p_regs)
{
    word_t i;
    /* we are guaranteed that we started loading the user image after the kernel
     * so we only include addresses above ui_info.p_reg.end */
    pptr_t floor = ui_p_reg.end;
    for (i = 0; i < MAX_NUM_FREEMEM_REG; i++) {
        ndks_boot.freemem[i] = REG_EMPTY;
    }
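    /* Clamp each region so that nothing below the end of the user image ends
     * up in the freemem list. */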
    for (i = 0; i < mem_p_regs.count; i++) {
        pptr_t start = mem_p_regs.list[i].start;
        pptr_t end = mem_p_regs.list[i].end;
        if (start < floor) {
            start = floor;
        }
        if (end < floor) {
            end = floor;
        }
        insert_region(paddr_to_pptr_reg((p_region_t) {
            start, end
        }));
    }
}

/* This function initialises a node's kernel state. It does NOT initialise the CPU. */

BOOT_CODE bool_t
init_sys_state(
    cpu_id_t      cpu_id,
    mem_p_regs_t  mem_p_regs,
    ui_info_t     ui_info,
    p_region_t    boot_mem_reuse_p_reg,
    /* parameters below not modeled in abstract specification */
    uint32_t      num_drhu,
    paddr_t*      drhu_list,
    acpi_rmrr_list_t *rmrr_list,
    acpi_rsdp_t      *acpi_rsdp,
    seL4_X86_BootInfo_VBE *vbe,
    seL4_X86_BootInfo_mmap_t *mb_mmap,
    seL4_X86_BootInfo_fb_t *fb_info
)
{
    cap_t         root_cnode_cap;
    vptr_t        extra_bi_frame_vptr;
    vptr_t        bi_frame_vptr;
    vptr_t        ipcbuf_vptr;
    cap_t         it_vspace_cap;
    cap_t         it_ap_cap;
    cap_t         ipcbuf_cap;
    pptr_t        bi_frame_pptr;
    word_t        extra_bi_size = sizeof(seL4_BootInfoHeader);
    region_t      extra_bi_region;
    pptr_t        extra_bi_offset = 0;
    uint32_t      tsc_freq;
    create_frames_of_region_ret_t create_frames_ret;
    create_frames_of_region_ret_t extra_bi_ret;

    /* convert from physical addresses to kernel pptrs */
    region_t ui_reg             = paddr_to_pptr_reg(ui_info.p_reg);
    region_t boot_mem_reuse_reg = paddr_to_pptr_reg(boot_mem_reuse_p_reg);

    /* convert from physical addresses to userland vptrs */
    v_region_t ui_v_reg;
    v_region_t it_v_reg;
    ui_v_reg.start = ui_info.p_reg.start - ui_info.pv_offset;
    ui_v_reg.end   = ui_info.p_reg.end   - ui_info.pv_offset;

    ipcbuf_vptr = ui_v_reg.end;
    bi_frame_vptr = ipcbuf_vptr + BIT(PAGE_BITS);
    extra_bi_frame_vptr = bi_frame_vptr + BIT(PAGE_BITS);
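    /* Virtual memory layout seen by the initial thread, from low to high:
     * user image, IPC buffer frame, bootinfo frame, extra bootinfo frames. */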

    if (vbe->vbeMode != -1) {
        extra_bi_size += sizeof(seL4_X86_BootInfo_VBE);
    }
    if (acpi_rsdp) {
        extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*acpi_rsdp);
    }
    if (fb_info && fb_info->addr) {
        extra_bi_size += sizeof(seL4_BootInfoHeader) + sizeof(*fb_info);
    }

    word_t mb_mmap_size = sizeof(seL4_X86_BootInfo_mmap_t);
    extra_bi_size += mb_mmap_size;

    /* room for the TSC frequency */
    extra_bi_size += sizeof(seL4_BootInfoHeader) + 4;

    /* The region of the initial thread is the user image + ipcbuf and boot info */
    it_v_reg.start = ui_v_reg.start;
    it_v_reg.end = ROUND_UP(extra_bi_frame_vptr + extra_bi_size, PAGE_BITS);

    init_freemem(ui_info.p_reg, mem_p_regs);

    /* create the root cnode */
    root_cnode_cap = create_root_cnode();

    /* create the IO port cap */
    write_slot(
        SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOPortControl),
        cap_io_port_control_cap_new()
    );

    /* create the cap for managing thread domains */
    create_domain_cap(root_cnode_cap);

    /* create the IRQ CNode */
    if (!create_irq_cnode()) {
        return false;
    }

    /* initialise the IRQ states and provide the IRQ control cap */
    init_irqs(root_cnode_cap);

    tsc_freq = tsc_init();

    /* create the bootinfo frame */
    bi_frame_pptr = allocate_bi_frame(0, ksNumCPUs, ipcbuf_vptr);
    if (!bi_frame_pptr) {
        return false;
    }

    extra_bi_region = allocate_extra_bi_region(extra_bi_size);
    if (extra_bi_region.start == 0) {
        return false;
    }

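    /* The extra bootinfo region is filled as a sequence of records, each a
     * seL4_BootInfoHeader followed by its payload; any space left over at the
     * end is described by a padding header (see below). */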
    /* populate vbe info block */
    if (vbe->vbeMode != -1) {
        vbe->header.id = SEL4_BOOTINFO_HEADER_X86_VBE;
        vbe->header.len = sizeof(seL4_X86_BootInfo_VBE);
        memcpy((void*)(extra_bi_region.start + extra_bi_offset), vbe, sizeof(seL4_X86_BootInfo_VBE));
        extra_bi_offset += sizeof(seL4_X86_BootInfo_VBE);
    }

    /* populate acpi rsdp block */
    if (acpi_rsdp) {
        seL4_BootInfoHeader header;
        header.id = SEL4_BOOTINFO_HEADER_X86_ACPI_RSDP;
        header.len = sizeof(header) + sizeof(*acpi_rsdp);
        *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header;
        extra_bi_offset += sizeof(header);
        memcpy((void*)(extra_bi_region.start + extra_bi_offset), acpi_rsdp, sizeof(*acpi_rsdp));
        extra_bi_offset += sizeof(*acpi_rsdp);
    }

    /* populate framebuffer information block */
    if (fb_info && fb_info->addr) {
        seL4_BootInfoHeader header;
        header.id = SEL4_BOOTINFO_HEADER_X86_FRAMEBUFFER;
        header.len = sizeof(header) + sizeof(*fb_info);
        *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header;
        extra_bi_offset += sizeof(header);
        memcpy((void*)(extra_bi_region.start + extra_bi_offset), fb_info, sizeof(*fb_info));
        extra_bi_offset += sizeof(*fb_info);
    }

    /* populate multiboot mmap block */
    mb_mmap->header.id = SEL4_BOOTINFO_HEADER_X86_MBMMAP;
    mb_mmap->header.len = mb_mmap_size;
    memcpy((void*)(extra_bi_region.start + extra_bi_offset), mb_mmap, mb_mmap_size);
    extra_bi_offset += mb_mmap_size;

    /* populate tsc frequency block */
    {
        seL4_BootInfoHeader header;
        header.id = SEL4_BOOTINFO_HEADER_X86_TSC_FREQ;
        header.len = sizeof(header) + 4;
        *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = header;
        extra_bi_offset += sizeof(header);
        *(uint32_t*)(extra_bi_region.start + extra_bi_offset) = tsc_freq;
        extra_bi_offset += 4;
    }

    /* provide a chunk for any leftover padding in the extended boot info */
    seL4_BootInfoHeader padding_header;
    padding_header.id = SEL4_BOOTINFO_HEADER_PADDING;
    padding_header.len = (extra_bi_region.end - extra_bi_region.start) - extra_bi_offset;
    *(seL4_BootInfoHeader*)(extra_bi_region.start + extra_bi_offset) = padding_header;

    /* Construct an initial address space with enough virtual addresses
     * to cover the user image + ipc buffer and bootinfo frames */
    it_vspace_cap = create_it_address_space(root_cnode_cap, it_v_reg);
    if (cap_get_capType(it_vspace_cap) == cap_null_cap) {
        return false;
    }

    /* Create and map bootinfo frame cap */
    create_bi_frame_cap(
        root_cnode_cap,
        it_vspace_cap,
        bi_frame_pptr,
        bi_frame_vptr
    );

    /* create and map extra bootinfo region */
    extra_bi_ret =
        create_frames_of_region(
            root_cnode_cap,
            it_vspace_cap,
            extra_bi_region,
            true,
            pptr_to_paddr((void*)(extra_bi_region.start - extra_bi_frame_vptr))
        );
    if (!extra_bi_ret.success) {
        return false;
    }
    ndks_boot.bi_frame->extraBIPages = extra_bi_ret.region;

    /* create the initial thread's IPC buffer */
    ipcbuf_cap = create_ipcbuf_frame(root_cnode_cap, it_vspace_cap, ipcbuf_vptr);
    if (cap_get_capType(ipcbuf_cap) == cap_null_cap) {
        return false;
    }

    /* create all userland image frames */
    create_frames_ret =
        create_frames_of_region(
            root_cnode_cap,
            it_vspace_cap,
            ui_reg,
            true,
            ui_info.pv_offset
        );
    if (!create_frames_ret.success) {
        return false;
    }
    ndks_boot.bi_frame->userImageFrames = create_frames_ret.region;

    /* create the initial thread's ASID pool */
    it_ap_cap = create_it_asid_pool(root_cnode_cap);
    if (cap_get_capType(it_ap_cap) == cap_null_cap) {
        return false;
    }
    write_it_asid_pool(it_ap_cap, it_vspace_cap);

    /* create the idle thread */
    if (!create_idle_thread()) {
        return false;
    }

    /* create the initial thread */
    tcb_t *initial = create_initial_thread(root_cnode_cap,
                                           it_vspace_cap,
                                           ui_info.v_entry,
                                           bi_frame_vptr,
                                           ipcbuf_vptr,
                                           ipcbuf_cap);
    if (initial == NULL) {
        return false;
    }
    init_core_state(initial);

#ifdef CONFIG_IOMMU
    /* initialise VTD-related data structures and the IOMMUs */
    if (!vtd_init(cpu_id, num_drhu, rmrr_list)) {
        return false;
    }

    /* write number of IOMMU PT levels into bootinfo */
    ndks_boot.bi_frame->numIOPTLevels = x86KSnumIOPTLevels;

    /* write IOSpace master cap */
    write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapIOSpace), master_iospace_cap());
#else
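    /* no IOMMU support: report -1 IO page-table levels to userland */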
    ndks_boot.bi_frame->numIOPTLevels = -1;
#endif

    /* create all of the untypeds, both device untypeds and kernel window memory */
    if (!create_untypeds(root_cnode_cap, boot_mem_reuse_reg)) {
        return false;
    }
    /* WARNING: alloc_region() must not be called after this point! */

    /* finalise the bootinfo frame */
    bi_finalise();

    return true;
}

/* This function initialises the CPU. It does NOT initialise any kernel state. */

BOOT_CODE bool_t
init_cpu(
    bool_t   mask_legacy_irqs
)
{
    /* initialise virtual-memory-related data structures */
    if (!init_vm_state()) {
        return false;
    }

    /* initialise CPU's descriptor table registers (GDTR, IDTR, LDTR, TR) */
    init_dtrs();

    if (config_set(CONFIG_SYSENTER)) {
        /* initialise MSRs (needs an initialised TSS) */
        init_sysenter_msrs();
    } else if (config_set(CONFIG_SYSCALL)) {
        init_syscall_msrs();
    } else {
        return false;
    }

    /* setup additional PAT MSR */
    if (!init_pat_msr()) {
        return false;
    }

    /* Enable the Write Protect bit in CR0. This prevents the kernel from writing to
     * read-only memory, which we should never do under correct execution. */
    write_cr0(read_cr0() | CR0_WRITE_PROTECT);

    /* check for SMAP and SMEP and enable */
    cpuid_007h_ebx_t ebx_007;
    ebx_007.words[0] = x86_cpuid_ebx(0x7, 0);
    if (cpuid_007h_ebx_get_smap(ebx_007)) {
        /* If user stack trace printing or dangerous code injection is enabled
         * we cannot enable SMAP, as it would make those accesses to user
         * memory fault. */
        if (!config_set(CONFIG_PRINTING) && !config_set(CONFIG_DANGEROUS_CODE_INJECTION)) {
            write_cr4(read_cr4() | CR4_SMAP);
        }
    }
    if (cpuid_007h_ebx_get_smep(ebx_007)) {
        /* As with SMAP, we cannot enable SMEP when dangerous code injection is
         * in use. SMEP does not affect stack trace printing, though. */
        if (!config_set(CONFIG_DANGEROUS_CODE_INJECTION)) {
            write_cr4(read_cr4() | CR4_SMEP);
        }
    }

    if (!init_ibrs()) {
        return false;
    }

#ifdef CONFIG_HARDWARE_DEBUG_API
    /* initialise hardware breakpoints */
    Arch_initHardwareBreakpoints();
#endif

    /* initialise floating-point unit */
    if (!Arch_initFpu()) {
        return false;
    }

    /* initialise local APIC */
    if (!apic_init(mask_legacy_irqs)) {
        return false;
    }

#ifdef CONFIG_DEBUG_DISABLE_PREFETCHERS
    if (!disablePrefetchers()) {
        return false;
    }
#endif

    if (config_set(CONFIG_EXPORT_PMC_USER)) {
        enablePMCUser();
    }

#ifdef CONFIG_VTX
    /* initialise Intel VT-x extensions */
    if (!vtx_init()) {
        return false;
    }
#endif

    return true;
}