1/*
2 * Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
3 *
4 * SPDX-License-Identifier: GPL-2.0-only
5 */
6
7#include <config.h>
8#include <api/syscall.h>
9#include <machine/io.h>
10#include <kernel/boot.h>
11#include <model/statedata.h>
12#include <arch/kernel/vspace.h>
13#include <arch/kernel/boot.h>
14#include <arch/kernel/boot_sys.h>
15#include <arch/api/invocation.h>
16#include <mode/kernel/tlb.h>
17#include <arch/kernel/tlb_bitmap.h>
18#include <object/structures.h>
19
/* When using the SKIM window to isolate the kernel from the user we also need to
 * not use global mappings as having global mappings and entries in the TLB is
 * equivalent, for the purpose of exploitation, to having the mappings in the
 * kernel window */
/* Value of the 'global' bit used for kernel mappings: 1 (global) normally,
 * 0 when the SKIM window is enabled so kernel TLB entries are flushed on
 * address-space switch. */
#define KERNEL_IS_GLOBAL() (config_set(CONFIG_KERNEL_SKIM_WINDOW) ? 0 : 1)
25
/* For the boot code we create two windows into the physical address space
 * One is at the same location as the kernel window, and is placed up high
 * The other is a 1-to-1 mapping of the first 512gb of memory. The purpose
 * of this is to have a 1-to-1 mapping for the low parts of memory, so that
 * when we switch paging on, and are still running at physical addresses,
 * we don't explode. Then we also want the high mappings so we can start
 * running at proper kernel virtual addresses */
pml4e_t boot_pml4[BIT(PML4_INDEX_BITS)] ALIGN(BIT(seL4_PageBits)) VISIBLE PHYS_BSS; /* boot-time top-level page map */
pdpte_t boot_pdpt[BIT(PDPT_INDEX_BITS)] ALIGN(BIT(seL4_PageBits)) VISIBLE PHYS_BSS; /* boot-time page directory pointer table */
35
/* 'gdt_idt_ptr' is declared globally because of a C-subset restriction.
 * It is only used in init_dtrs(), which therefore is non-reentrant.
 */
gdt_idt_ptr_t gdt_idt_ptr;
40
41BOOT_CODE bool_t map_kernel_window(
42    uint32_t num_ioapic,
43    paddr_t   *ioapic_paddrs,
44    uint32_t   num_drhu,
45    paddr_t   *drhu_list
46)
47{
48
49    uint64_t paddr;
50    uint64_t vaddr;
51
52#ifdef CONFIG_HUGE_PAGE
53    /* using 1 GiB page size */
54
55    /* verify that the kernel window as at the last entry of the PML4 */
56    assert(GET_PML4_INDEX(PPTR_BASE) == BIT(PML4_INDEX_BITS) - 1);
57    /* verify that the kernel_base is located in the last entry of the PML4,
58     * the second last entry of the PDPT, is 1gb aligned and 1gb in size */
59    assert(GET_PML4_INDEX(KERNEL_ELF_BASE) == BIT(PML4_INDEX_BITS) - 1);
60    assert(GET_PDPT_INDEX(KERNEL_ELF_BASE) == BIT(PML4_INDEX_BITS) - 2);
61    assert(GET_PDPT_INDEX(KDEV_BASE) == BIT(PML4_INDEX_BITS) - 1);
62    assert(IS_ALIGNED(KERNEL_ELF_BASE - KERNEL_ELF_PADDR_BASE, seL4_HugePageBits));
63    assert(IS_ALIGNED(KDEV_BASE, seL4_HugePageBits));
64    /* place the PDPT into the PML4 */
65    x64KSKernelPML4[GET_PML4_INDEX(PPTR_BASE)] = pml4e_new(
66                                                     0, /* xd */
67                                                     kpptr_to_paddr(x64KSKernelPDPT),
68                                                     0, /* accessed */
69                                                     0, /* cache_disabled */
70                                                     0, /* write_through */
71                                                     0, /* super_user */
72                                                     1, /* read_write */
73                                                     1  /* present */
74                                                 );
75    /* put the 1GB kernel_base mapping into the PDPT */
76    x64KSKernelPDPT[GET_PDPT_INDEX(KERNEL_ELF_BASE)] = pdpte_pdpte_1g_new(
77                                                           0, /* xd */
78                                                           PADDR_BASE,
79                                                           0, /* PAT */
80                                                           KERNEL_IS_GLOBAL(), /* global */
81                                                           0, /* dirty */
82                                                           0, /* accessed */
83                                                           0, /* cache_disabled */
84                                                           0, /* write_through */
85                                                           0, /* super_user */
86                                                           1, /* read_write */
87                                                           1  /* present */
88                                                       );
89    /* also map the physical memory into the big kernel window */
90    paddr = 0;
91    vaddr = PPTR_BASE;
92    for (paddr = 0; paddr < PADDR_TOP;
93         paddr += BIT(seL4_HugePageBits)) {
94
95        int pdpte_index = GET_PDPT_INDEX(vaddr);
96        x64KSKernelPDPT[pdpte_index] = pdpte_pdpte_1g_new(
97                                           0,          /* xd               */
98                                           paddr,      /* physical address */
99                                           0,          /* PAT              */
100                                           KERNEL_IS_GLOBAL(), /* global   */
101                                           0,          /* dirty            */
102                                           0,          /* accessed         */
103                                           0,          /* cache_disabled   */
104                                           0,          /* write_through    */
105                                           0,          /* super_user       */
106                                           1,          /* read_write       */
107                                           1           /* present          */
108                                       );
109
110        vaddr += BIT(seL4_HugePageBits);
111    }
112
113    /* put the PD into the PDPT */
114    x64KSKernelPDPT[GET_PDPT_INDEX(KDEV_BASE)] = pdpte_pdpte_pd_new(
115                                                     0, /* xd */
116                                                     kpptr_to_paddr(x64KSKernelPD),
117                                                     0, /* accessed */
118                                                     0, /* cache_disabled */
119                                                     0, /* write_through */
120                                                     0, /* super_user */
121                                                     1, /* read_write */
122                                                     1  /* present */
123                                                 );
124    /* put the PT into the PD */
125    x64KSKernelPD[0] = pde_pde_pt_new(
126                           0, /* xd */
127                           kpptr_to_paddr(x64KSKernelPT),
128                           0, /* accessed */
129                           0, /* cache_disabled */
130                           0, /* write_through */
131                           0, /* super_user */
132                           1, /* read_write */
133                           1  /* present */
134                       );
135#else
136
137    int pd_index = 0;
138    /* use 2 MiB page size */
139    /* verify that the kernel window as at the last entry of the PML4 */
140    assert(GET_PML4_INDEX(PPTR_BASE) == BIT(PML4_INDEX_BITS) - 1);
141    /* verify that the kernel_base is located in the last entry of the PML4,
142     * the second last entry of the PDPT, is 1gb aligned and 1gb in size */
143    assert(GET_PML4_INDEX(KERNEL_ELF_BASE) == BIT(PML4_INDEX_BITS) - 1);
144    assert(GET_PDPT_INDEX(KERNEL_ELF_BASE) == BIT(PML4_INDEX_BITS) - 2);
145    assert(GET_PDPT_INDEX(KDEV_BASE) == BIT(PML4_INDEX_BITS) - 1);
146    assert(IS_ALIGNED(KERNEL_ELF_BASE - KERNEL_ELF_PADDR_BASE, seL4_HugePageBits));
147    assert(IS_ALIGNED(KDEV_BASE, seL4_HugePageBits));
148
149    /* place the PDPT into the PML4 */
150    x64KSKernelPML4[GET_PML4_INDEX(PPTR_BASE)] = pml4e_new(
151                                                     0, /* xd */
152                                                     kpptr_to_paddr(x64KSKernelPDPT),
153                                                     0, /* accessed */
154                                                     0, /* cache_disabled */
155                                                     0, /* write_through */
156                                                     0, /* super_user */
157                                                     1, /* read_write */
158                                                     1  /* present */
159                                                 );
160
161    for (pd_index = 0; pd_index < PADDR_TOP >> seL4_HugePageBits; pd_index++) {
162        /* put the 1GB kernel_base mapping into the PDPT */
163        x64KSKernelPDPT[GET_PDPT_INDEX(PPTR_BASE) + pd_index] = pdpte_pdpte_pd_new(
164                                                                    0, /* xd */
165                                                                    kpptr_to_paddr(&x64KSKernelPDs[pd_index][0]),
166                                                                    0, /* accessed */
167                                                                    0, /* cache disabled */
168                                                                    0, /* write through */
169                                                                    0, /* super user */
170                                                                    1, /* read write */
171                                                                    1 /* present */
172                                                                );
173    }
174
175    x64KSKernelPDPT[GET_PDPT_INDEX(KERNEL_ELF_BASE)] = pdpte_pdpte_pd_new(
176                                                           0, /* xd */
177                                                           kpptr_to_paddr(&x64KSKernelPDs[0][0]),
178                                                           0, /* accessed */
179                                                           0, /* cache disable */
180                                                           1, /* write through */
181                                                           0, /* super user */
182                                                           1, /* read write */
183                                                           1  /* present */
184                                                       );
185
186    paddr = 0;
187    vaddr = PPTR_BASE;
188
189    for (paddr = 0; paddr < PADDR_TOP;
190         paddr += 0x200000) {
191
192        int pd_index = GET_PDPT_INDEX(vaddr) - GET_PDPT_INDEX(PPTR_BASE);
193        int pde_index = GET_PD_INDEX(vaddr);
194
195        x64KSKernelPDs[pd_index][pde_index] = pde_pde_large_new(
196                                                  0, /* xd */
197                                                  paddr,
198                                                  0, /* pat */
199                                                  KERNEL_IS_GLOBAL(), /* global */
200                                                  0, /* dirty */
201                                                  0, /* accessed */
202                                                  0, /* cache disabled */
203                                                  0, /* write through */
204                                                  0, /* super user */
205                                                  1, /* read write */
206                                                  1  /* present */
207                                              );
208        vaddr += 0x200000;
209    }
210
211    /* put the PD into the PDPT */
212    x64KSKernelPDPT[GET_PDPT_INDEX(KDEV_BASE)] = pdpte_pdpte_pd_new(
213                                                     0, /* xd */
214                                                     kpptr_to_paddr(&x64KSKernelPDs[BIT(PDPT_INDEX_BITS) - 1][0]),
215                                                     0, /* accessed */
216                                                     0, /* cache_disabled */
217                                                     0, /* write_through */
218                                                     0, /* super_user */
219                                                     1, /* read_write */
220                                                     1  /* present */
221                                                 );
222
223    /* put the PT into the PD */
224    x64KSKernelPDs[BIT(PDPT_INDEX_BITS) - 1][0] = pde_pde_pt_new(
225                                                      0, /* xd */
226                                                      kpptr_to_paddr(x64KSKernelPT),
227                                                      0, /* accessed */
228                                                      0, /* cache_disabled */
229                                                      0, /* write_through */
230                                                      0, /* super_user */
231                                                      1, /* read_write */
232                                                      1  /* present */
233                                                  );
234#endif
235
236#if CONFIG_MAX_NUM_TRACE_POINTS > 0
237    /* use the last PD entry as the benchmark log storage.
238     * the actual backing physical memory will be filled
239     * later by using alloc_region */
240    ksLog = (ks_log_entry_t *)(KDEV_BASE + 0x200000 * (BIT(PD_INDEX_BITS) - 1));
241#endif
242
243    /* now map in the kernel devices */
244    if (!map_kernel_window_devices(x64KSKernelPT, num_ioapic, ioapic_paddrs, num_drhu, drhu_list)) {
245        return false;
246    }
247
248#ifdef ENABLE_SMP_SUPPORT
249    /* initialize the TLB bitmap */
250    tlb_bitmap_init(x64KSKernelPML4);
251#endif /* ENABLE_SMP_SUPPORT */
252
253    /* In boot code, so fine to just trash everything here */
254    invalidateLocalTranslationAll();
255    printf("Mapping kernel window is done\n");
256    return true;
257}
258
259#ifdef CONFIG_KERNEL_SKIM_WINDOW
/* Build the Static Kernel Image Map (SKIM) window: a minimal vspace that maps
 * only the kernel region [skim_start, skim_end) with 2 MiB pages, used to
 * hide the rest of the kernel from user-reachable translations.
 *
 * @param skim_start  kernel virtual start of the region, 2 MiB aligned
 * @param skim_end    kernel virtual end of the region, 2 MiB aligned
 * @return true (this function cannot fail)
 */
BOOT_CODE bool_t map_skim_window(vptr_t skim_start, vptr_t skim_end)
{
    /* place the PDPT into the PML4 */
    x64KSSKIMPML4[GET_PML4_INDEX(PPTR_BASE)] = pml4e_new(
                                                   0, /* xd */
                                                   kpptr_to_paddr(x64KSSKIMPDPT),
                                                   0, /* accessed */
                                                   0, /* cache_disabled */
                                                   0, /* write_through */
                                                   0, /* super_user */
                                                   1, /* read_write */
                                                   1  /* present */
                                               );
    /* place the PD into the kernel_base slot of the PDPT */
    x64KSSKIMPDPT[GET_PDPT_INDEX(KERNEL_ELF_BASE)] = pdpte_pdpte_pd_new(
                                                         0, /* xd */
                                                         kpptr_to_paddr(x64KSSKIMPD),
                                                         0, /* accessed */
                                                         0, /* cache_disabled */
                                                         0, /* write_through */
                                                         0, /* super_user */
                                                         1, /* read_write */
                                                         1  /* present */
                                                     );
    /* map the skim portion into the PD. we expect it to be 2M aligned */
    assert((skim_start % BIT(seL4_LargePageBits)) == 0);
    assert((skim_end % BIT(seL4_LargePageBits)) == 0);
    /* walk one PD slot per 2 MiB page, tracking the matching physical address */
    uint64_t paddr = kpptr_to_paddr((void *)skim_start);
    for (int i = GET_PD_INDEX(skim_start); i < GET_PD_INDEX(skim_end); i++) {
        x64KSSKIMPD[i] = pde_pde_large_new(
                             0, /* xd */
                             paddr,
                             0, /* pat */
                             KERNEL_IS_GLOBAL(), /* global */
                             0, /* dirty */
                             0, /* accessed */
                             0, /* cache_disabled */
                             0, /* write_through */
                             0, /* super_user */
                             1, /* read_write */
                             1  /* present */
                         );
        paddr += BIT(seL4_LargePageBits);
    }
    return true;
}
306#endif
307
/* Initialise this CPU's Task State Segment: point IST1 at the per-CPU IRQ
 * stack and block all user IN/OUT instructions via a fully-set IO bitmap. */
BOOT_CODE void init_tss(tss_t *tss)
{
    /* IST1 points one past the end of the IRQ stack array (stacks grow down) */
    word_t base = (word_t)&x64KSIRQStack[CURRENT_CPU_INDEX()][IRQ_STACK_SIZE];
    *tss = tss_new(
               sizeof(*tss),   /* io map base */
               0, 0,       /* ist 7 */
               0, 0,
               0, 0,
               0, 0,
               0, 0,
               0, 0,
               /* ist 1 is the stack frame we use for interrupts */
               base >> 32, base & 0xffffffff,  /* ist 1 */
               0, 0,       /* rsp 2 */
               0, 0,       /* rsp 1 */
               0, 0        /* rsp 0 */
           );
    /* set the IO map to all 1 to block user IN/OUT instructions */
    memset(&x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss.io_map[0], 0xff,
           sizeof(x86KSGlobalState[CURRENT_CPU_INDEX()].x86KStss.io_map));
}
329
/* Program the MSRs that control the SYSCALL/SYSRET fast system call path. */
BOOT_CODE void init_syscall_msrs(void)
{
    /* entry point the CPU jumps to when user code executes SYSCALL */
    x86_wrmsr(IA32_LSTAR_MSR, (uint64_t)&handle_fastsyscall);
    // mask bit 9 in the kernel (which is the interrupt enable bit)
    // also mask bit 8, which is the Trap Flag, to prevent the kernel
    // from single stepping
    x86_wrmsr(IA32_FMASK_MSR, FLAGS_TF | FLAGS_IF);
    /* kernel (SYSCALL) and user (SYSRET) segment selector bases */
    x86_wrmsr(IA32_STAR_MSR, ((uint64_t)SEL_CS_0 << 32) | ((uint64_t)SEL_CS_3 << 48));
}
339
/* Populate the Global Descriptor Table: flat 64-bit kernel and user code/data
 * segments, the user FS/GS data segments, and the (16-byte, two-slot) TSS
 * descriptor pointing at 'tss'. */
BOOT_CODE void init_gdt(gdt_entry_t *gdt, tss_t *tss)
{

    uint64_t tss_base = (uint64_t)tss;
    gdt_tss_t gdt_tss;

    gdt[GDT_NULL] = gdt_entry_gdt_null_new();

    gdt[GDT_CS_0] = gdt_entry_gdt_code_new(
                        0,                  /* base high */
                        1,                  /* granularity */
                        0,                  /* operation size, must be 0 when 64-bit is set */
                        1,                  /* long mode */
                        0,                  /* avl */
                        0xf,                /* limit high */
                        1,                  /* present */
                        0,                  /* dpl */
                        1,                  /* always 1 for segment */
                        0,                  /* base middle */
                        0,                  /* base low */
                        0xffff              /* limit low */
                    );

    gdt[GDT_DS_0] = gdt_entry_gdt_data_new(
                        0,                  /* base high */
                        1,                  /* granularity */
                        1,                  /* operation size */
                        0,                  /* avl */
                        0xf,                /* seg limit high */
                        1,                  /* present */
                        0,                  /* dpl */
                        1,                  /* always 1 */
                        0,                  /* base mid */
                        0,                  /* base low */
                        0xffff              /* seg limit low */
                    );

    gdt[GDT_CS_3] = gdt_entry_gdt_code_new(
                        0,                  /* base high */
                        1,                  /* granularity */
                        0,                  /* operation size, must be 0 when 64-bit is set */
                        1,                  /* long mode */
                        0,                  /* avl */
                        0xf,                /* limit high */
                        1,                  /* present */
                        3,                  /* dpl */
                        1,                  /* always 1 */
                        0,                  /* base middle */
                        0,                  /* base low */
                        0xffff              /* limit low */
                    );

    /* user data segment: same layout as GDT_DS_0 but DPL 3 */
    gdt[GDT_DS_3] = gdt_entry_gdt_data_new(
                        0,                  /* base high */
                        1,                  /* granularity */
                        1,                  /* operation size */
                        0,                  /* avl */
                        0xf,                /* seg limit high */
                        1,                  /* present */
                        3,                  /* dpl */
                        1,                  /* always 1 */
                        0,                  /* base mid */
                        0,                  /* base low */
                        0xffff              /* seg limit low */
                    );

    /* user FS segment (TLS); base is set later via the FS base MSR */
    gdt[GDT_FS] = gdt_entry_gdt_data_new(
                      0,                  /* base high */
                      1,                  /* granularity */
                      1,                  /* operation size */
                      0,                  /* avl */
                      0xf,                /* seg limit high */
                      1,                  /* present */
                      3,                  /* dpl */
                      1,                  /* always 1 */
                      0,                  /* base mid */
                      0,                  /* base low */
                      0xffff              /* seg limit low */
                  );

    /* user GS segment; base is set later via the GS base MSR */
    gdt[GDT_GS] = gdt_entry_gdt_data_new(
                      0,                  /* base high */
                      1,                  /* granularity */
                      1,                  /* operation size */
                      0,                  /* avl */
                      0xf,                /* seg limit high */
                      1,                  /* present */
                      3,                  /* dpl */
                      1,                  /* always 1 */
                      0,                  /* base mid */
                      0,                  /* base low */
                      0xffff              /* seg limit low */
                  );

    gdt_tss = gdt_tss_new(
                  tss_base >> 32,                     /* base 63 - 32 */
                  (tss_base & 0xff000000UL) >> 24,    /* base 31 - 24 */
                  1,                                  /* granularity */
                  0,                                  /* avl */
                  0,                                  /* limit high */
                  1,                                  /* present */
                  0,                                  /* dpl */
                  9,                                  /* desc type */
                  (tss_base & 0xff0000UL) >> 16,      /* base 23-16 */
                  (tss_base & 0xffffUL),              /* base 15 - 0 */
                  sizeof(tss_io_t) - 1
              );

    /* a 64-bit TSS descriptor is 16 bytes and occupies two GDT slots */
    gdt[GDT_TSS].words[0] = gdt_tss.words[0];
    gdt[GDT_TSS + 1].words[0] = gdt_tss.words[1];
}
451
452BOOT_CODE void init_idt_entry(idt_entry_t *idt, interrupt_t interrupt, void(*handler)(void))
453{
454    uint64_t handler_addr = (uint64_t)handler;
455    uint64_t dpl = 3;
456
457    if (interrupt < int_trap_min && interrupt != int_software_break_request) {
458        dpl = 0;
459    }
460
461    idt[interrupt] = idt_entry_interrupt_gate_new(
462                         handler_addr >> 32,                 /* offset 63 - 32 */
463                         ((handler_addr >> 16) & 0xffff),
464                         1,                                  /* present */
465                         dpl,                                /* dpl */
466                         1,                                  /* ist */
467                         SEL_CS_0,                           /* segment selector */
468                         (handler_addr & 0xffff)               /* offset 15 - 0 */
469                     );
470}
471
/* Install the address space of 'tcb' as the current user vspace root.
 * Falls back to the global kernel vspace root when the thread's vspace cap
 * is absent, unmapped, or its ASID no longer resolves to the same PML4. */
void setVMRoot(tcb_t *tcb)
{
    cap_t threadRoot;
    asid_t asid;
    pml4e_t *pml4;
    findVSpaceForASID_ret_t find_ret;
    cr3_t cr3;

    threadRoot = TCB_PTR_CTE_PTR(tcb, tcbVTable)->cap;

    /* no valid mapped PML4 cap: run the thread on the global vspace root */
    if (cap_get_capType(threadRoot) != cap_pml4_cap ||
        !cap_pml4_cap_get_capPML4IsMapped(threadRoot)) {
        setCurrentUserVSpaceRoot(kpptr_to_paddr(X86_GLOBAL_VSPACE_ROOT), 0);
        return;
    }

    pml4 = PML4E_PTR(cap_pml4_cap_get_capPML4BasePtr(threadRoot));
    asid = cap_pml4_cap_get_capPML4MappedASID(threadRoot);
    find_ret = findVSpaceForASID(asid);
    /* the ASID must still resolve to this PML4; otherwise the cap is stale */
    if (unlikely(find_ret.status != EXCEPTION_NONE || find_ret.vspace_root != pml4)) {
        setCurrentUserVSpaceRoot(kpptr_to_paddr(X86_GLOBAL_VSPACE_ROOT), 0);
        return;
    }
    cr3 = makeCR3(pptr_to_paddr(pml4), asid);
    /* skip the update when the desired CR3 value is already current */
    if (getCurrentUserCR3().words[0] != cr3.words[0]) {
        SMP_COND_STATEMENT(tlb_bitmap_set(pml4, getCurrentCPUIndex());)
        setCurrentUserCR3(cr3);
    }
}
501
502
/* Load this CPU's descriptor table registers: GDT, IDT, the null LDT and the
 * TSS selector. Uses the global 'gdt_idt_ptr' scratch descriptor (see its
 * declaration above), which makes this function non-reentrant. */
BOOT_CODE void init_dtrs(void)
{
    /* describe and install the per-CPU GDT */
    gdt_idt_ptr.limit = (sizeof(gdt_entry_t) * GDT_ENTRIES) - 1;
    gdt_idt_ptr.base = (uint64_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSgdt;

    /* When we install the gdt it will clobber any value of gs that
     * we have. Since we might be using it for TLS we can stash
     * and unstash any gs value using swapgs
     */
    swapgs();
    x64_install_gdt(&gdt_idt_ptr);
    swapgs();

    /* describe and install the per-CPU IDT (reusing the same scratch pointer) */
    gdt_idt_ptr.limit = (sizeof(idt_entry_t) * (int_max + 1)) - 1;
    gdt_idt_ptr.base = (uint64_t)x86KSGlobalState[CURRENT_CPU_INDEX()].x86KSidt;
    x64_install_idt(&gdt_idt_ptr);

    /* no LDT is used */
    x64_install_ldt(SEL_NULL);

    x64_install_tss(SEL_TSS);
}
524
/* Map an initial-thread 4 KiB frame at its recorded virtual address by walking
 * the 4-level paging structure rooted at 'pd_cap'. All intermediate levels
 * (PML4E, PDPTE, PDE) are expected to already be present. */
BOOT_CODE void map_it_frame_cap(cap_t pd_cap, cap_t frame_cap)
{
    pml4e_t *pml4 = PML4_PTR(pptr_of_cap(pd_cap));
    pdpte_t *pdpt;
    pde_t *pd;
    pte_t *pt;
    vptr_t vptr = cap_frame_cap_get_capFMappedAddress(frame_cap);
    void *pptr = (void *)cap_frame_cap_get_capFBasePtr(frame_cap);

    assert(cap_frame_cap_get_capFMapType(frame_cap) == X86_MappingVSpace);
    assert(cap_frame_cap_get_capFMappedASID(frame_cap) != asidInvalid);
    /* PML4 -> PDPT */
    pml4 += GET_PML4_INDEX(vptr);
    assert(pml4e_ptr_get_present(pml4));
    pdpt = paddr_to_pptr(pml4e_ptr_get_pdpt_base_address(pml4));
    /* PDPT -> PD */
    pdpt += GET_PDPT_INDEX(vptr);
    assert(pdpte_pdpte_pd_ptr_get_present(pdpt));
    pd = paddr_to_pptr(pdpte_pdpte_pd_ptr_get_pd_base_address(pdpt));
    /* PD -> PT */
    pd += GET_PD_INDEX(vptr);
    assert(pde_pde_pt_ptr_get_present(pd));
    pt = paddr_to_pptr(pde_pde_pt_ptr_get_pt_base_address(pd));
    /* write the final user-accessible, writable PTE */
    *(pt + GET_PT_INDEX(vptr)) = pte_new(
                                     0,                      /* xd                   */
                                     pptr_to_paddr(pptr),    /* page_base_address    */
                                     0,                      /* global               */
                                     0,                      /* pat                  */
                                     0,                      /* dirty                */
                                     0,                      /* accessed             */
                                     0,                      /* cache_disabled       */
                                     0,                      /* write_through        */
                                     1,                      /* super_user           */
                                     1,                      /* read_write           */
                                     1                       /* present              */
                                 );
}
559
560static BOOT_CODE void map_it_pdpt_cap(cap_t vspace_cap, cap_t pdpt_cap)
561{
562    pml4e_t *pml4 = PML4_PTR(pptr_of_cap(vspace_cap));
563    pdpte_t *pdpt = PDPT_PTR(cap_pdpt_cap_get_capPDPTBasePtr(pdpt_cap));
564    vptr_t vptr = cap_pdpt_cap_get_capPDPTMappedAddress(pdpt_cap);
565
566    assert(cap_pdpt_cap_get_capPDPTIsMapped(pdpt_cap));
567    *(pml4 + GET_PML4_INDEX(vptr)) = pml4e_new(
568                                         0,                      /* xd                   */
569                                         pptr_to_paddr(pdpt),    /* pdpt_base_address    */
570                                         0,                      /* accessed             */
571                                         0,                      /* cache_disabled       */
572                                         0,                      /* write_through        */
573                                         1,                      /* super_user           */
574                                         1,                      /* read_write           */
575                                         1                       /* present              */
576                                     );
577}
578
579BOOT_CODE void map_it_pd_cap(cap_t vspace_cap, cap_t pd_cap)
580{
581    pml4e_t *pml4 = PML4_PTR(pptr_of_cap(vspace_cap));
582    pdpte_t *pdpt;
583    pde_t *pd = PD_PTR(cap_page_directory_cap_get_capPDBasePtr(pd_cap));
584    vptr_t vptr = cap_page_directory_cap_get_capPDMappedAddress(pd_cap);
585
586    assert(cap_page_directory_cap_get_capPDIsMapped(pd_cap));
587    pml4 += GET_PML4_INDEX(vptr);
588    assert(pml4e_ptr_get_present(pml4));
589    pdpt = paddr_to_pptr(pml4e_ptr_get_pdpt_base_address(pml4));
590    *(pdpt + GET_PDPT_INDEX(vptr)) = pdpte_pdpte_pd_new(
591                                         0,                      /* xd                   */
592                                         pptr_to_paddr(pd),      /* pd_base_address      */
593                                         0,                      /* accessed             */
594                                         0,                      /* cache_disabled       */
595                                         0,                      /* write_through        */
596                                         1,                      /* super_user           */
597                                         1,                      /* read_write           */
598                                         1                       /* present              */
599                                     );
600}
601
602BOOT_CODE void map_it_pt_cap(cap_t vspace_cap, cap_t pt_cap)
603{
604    pml4e_t *pml4 = PML4_PTR(pptr_of_cap(vspace_cap));
605    pdpte_t *pdpt;
606    pde_t *pd;
607    pte_t *pt = PT_PTR(cap_page_table_cap_get_capPTBasePtr(pt_cap));
608    vptr_t vptr = cap_page_table_cap_get_capPTMappedAddress(pt_cap);
609
610    assert(cap_page_table_cap_get_capPTIsMapped(pt_cap));
611    pml4 += GET_PML4_INDEX(vptr);
612    assert(pml4e_ptr_get_present(pml4));
613    pdpt = paddr_to_pptr(pml4e_ptr_get_pdpt_base_address(pml4));
614    pdpt += GET_PDPT_INDEX(vptr);
615    assert(pdpte_pdpte_pd_ptr_get_present(pdpt));
616    pd = paddr_to_pptr(pdpte_pdpte_pd_ptr_get_pd_base_address(pdpt));
617    *(pd + GET_PD_INDEX(vptr)) = pde_pde_pt_new(
618                                     0,                      /* xd                   */
619                                     pptr_to_paddr(pt),      /* pt_base_address      */
620                                     0,                      /* accessed             */
621                                     0,                      /* cache_disabled       */
622                                     0,                      /* write_through        */
623                                     1,                      /* super_user           */
624                                     1,                      /* read_write           */
625                                     1                       /* present              */
626                                 );
627}
628
/* Return a pointer through which 'entry' can be accessed during early boot.
 * On x86_64 no temporary mapping is needed: this function exists for legacy
 * 32-bit systems where the ACPI tables might collide with the kernel window.
 * Here the low 4 GiB are already 1-to-1 mapped, so we just assert the table
 * is in fact below 4 GiB and hand the pointer straight back.
 * 'large_pages' is unused on this architecture. */
BOOT_CODE void *map_temp_boot_page(void *entry, uint32_t large_pages)
{
    /* this function is for legacy 32-bit systems where the ACPI tables might
     * collide with the kernel window. Here we just assert that the table is
     * in fact in the lower 4GiB region (which is already 1-to-1 mapped) and
     * continue */
    assert((word_t)entry < BIT(32));
    return entry;
}
638
639static BOOT_CODE cap_t create_it_pdpt_cap(cap_t vspace_cap, pptr_t pptr, vptr_t vptr, asid_t asid)
640{
641    cap_t cap;
642    cap = cap_pdpt_cap_new(
643              asid,   /* capPDPTMappedASID    */
644              pptr,   /* capPDPTBasePtr       */
645              1,      /* capPDPTIsMapped      */
646              vptr    /* capPDPTMappedAddress */
647          );
648    map_it_pdpt_cap(vspace_cap, cap);
649    return cap;
650}
651
652static BOOT_CODE cap_t create_it_pd_cap(cap_t vspace_cap, pptr_t pptr, vptr_t vptr, asid_t asid)
653{
654    cap_t cap;
655    cap = cap_page_directory_cap_new(
656              asid,   /* capPDMappedASID      */
657              pptr,   /* capPDBasePtr         */
658              1,      /* capPDIsMapped        */
659              vptr    /* capPDMappedAddress   */
660          );
661    map_it_pd_cap(vspace_cap, cap);
662    return cap;
663}
664
665static BOOT_CODE cap_t create_it_pt_cap(cap_t vspace_cap, pptr_t pptr, vptr_t vptr, asid_t asid)
666{
667    cap_t cap;
668    cap = cap_page_table_cap_new(
669              asid,   /* capPTMappedASID      */
670              pptr,   /* capPTBasePtr         */
671              1,      /* capPTIsMapped        */
672              vptr    /* capPTMappedAddress   */
673          );
674    map_it_pt_cap(vspace_cap, cap);
675    return cap;
676}
677
678
/* Return the number of paging structures (PTs, PDs and PDPTs) the boot code
 * must allocate to back the initial task's image region 'it_v_reg', plus —
 * when the IOMMU is enabled — the structures needed for RMRR mappings. */
BOOT_CODE word_t arch_get_n_paging(v_region_t it_v_reg)
{
    word_t n = get_n_paging(it_v_reg, PD_INDEX_OFFSET);
    n += get_n_paging(it_v_reg, PDPT_INDEX_OFFSET);
    n += get_n_paging(it_v_reg, PML4_INDEX_OFFSET);
#ifdef CONFIG_IOMMU
    n += vtd_get_n_paging(&boot_state.rmrr_list);
#endif
    return n;
}
689
/* Build the initial task's address space: create the PML4 (vspace) cap, then
 * allocate and map every PDPT, PD and PT needed to cover 'it_v_reg',
 * providing a cap for each in the root CNode. The slot range consumed is
 * recorded in the boot info so user land knows which caps it owns.
 * Returns the vspace cap, or a null cap if a CNode slot could not be
 * provided. */
BOOT_CODE cap_t create_it_address_space(cap_t root_cnode_cap, v_region_t it_v_reg)
{
    cap_t      vspace_cap;
    vptr_t     vptr;
    seL4_SlotPos slot_pos_before;
    seL4_SlotPos slot_pos_after;

    slot_pos_before = ndks_boot.slot_pos_cur;
    /* the new PML4 must contain the kernel mappings before first use */
    copyGlobalMappings(PML4_PTR(rootserver.vspace));
    vspace_cap = cap_pml4_cap_new(
                     IT_ASID,        /* capPML4MappedASID */
                     rootserver.vspace,           /* capPML4BasePtr   */
                     1               /* capPML4IsMapped   */
                 );


    write_slot(SLOT_PTR(pptr_of_cap(root_cnode_cap), seL4_CapInitThreadVSpace), vspace_cap);

    /* Create any PDPTs needed for the user land image */
    for (vptr = ROUND_DOWN(it_v_reg.start, PML4_INDEX_OFFSET);
         vptr < it_v_reg.end;
         vptr += BIT(PML4_INDEX_OFFSET)) {
        if (!provide_cap(root_cnode_cap,
                         create_it_pdpt_cap(vspace_cap, it_alloc_paging(), vptr, IT_ASID))
           ) {
            return cap_null_cap_new();
        }
    }

    /* Create any PDs needed for the user land image */
    for (vptr = ROUND_DOWN(it_v_reg.start, PDPT_INDEX_OFFSET);
         vptr < it_v_reg.end;
         vptr += BIT(PDPT_INDEX_OFFSET)) {
        if (!provide_cap(root_cnode_cap,
                         create_it_pd_cap(vspace_cap, it_alloc_paging(), vptr, IT_ASID))
           ) {
            return cap_null_cap_new();
        }
    }

    /* Create any PTs needed for the user land image */
    for (vptr = ROUND_DOWN(it_v_reg.start, PD_INDEX_OFFSET);
         vptr < it_v_reg.end;
         vptr += BIT(PD_INDEX_OFFSET)) {
        if (!provide_cap(root_cnode_cap,
                         create_it_pt_cap(vspace_cap, it_alloc_paging(), vptr, IT_ASID))
           ) {
            return cap_null_cap_new();
        }
    }

    /* record the cap slots consumed above for the boot info frame */
    slot_pos_after = ndks_boot.slot_pos_cur;
    ndks_boot.bi_frame->userImagePaging = (seL4_SlotRegion) {
        slot_pos_before, slot_pos_after
    };
    return vspace_cap;
}
747
748void copyGlobalMappings(vspace_root_t *new_vspace)
749{
750    unsigned long i;
751    pml4e_t *vspace = (pml4e_t *)new_vspace;
752
753    /* Copy from the tlbbitmap_pptr so that we copy the default entries of the
754     * tlb bitmap (if it exists). If it doesn't exist then this loop
755     * will be equivalent to copying from PPTR_BASE
756     */
757    for (i = GET_PML4_INDEX(TLBBITMAP_PPTR); i < BIT(PML4_INDEX_BITS); i++) {
758        vspace[i] = X86_GLOBAL_VSPACE_ROOT[i];
759    }
760}
761
762static BOOT_CODE cap_t create_it_frame_cap(pptr_t pptr, vptr_t vptr, asid_t asid, bool_t use_large, seL4_Word map_type)
763{
764    vm_page_size_t frame_size;
765
766    if (use_large) {
767        frame_size = X86_LargePage;
768    } else {
769        frame_size = X86_SmallPage;
770    }
771
772    return
773        cap_frame_cap_new(
774            asid,                          /* capFMappedASID     */
775            pptr,                          /* capFBasePtr        */
776            frame_size,                    /* capFSize           */
777            map_type,                      /* capFMapType        */
778            vptr,                          /* capFMappedAddress  */
779            wordFromVMRights(VMReadWrite), /* capFVMRights       */
780            0                              /* capFIsDevice       */
781        );
782}
783
784BOOT_CODE cap_t create_unmapped_it_frame_cap(pptr_t pptr, bool_t use_large)
785{
786    return create_it_frame_cap(pptr, 0, asidInvalid, use_large, X86_MappingNone);
787}
788
789BOOT_CODE cap_t create_mapped_it_frame_cap(cap_t vspace_cap, pptr_t pptr, vptr_t vptr, asid_t asid, bool_t use_large,
790                                           bool_t executable UNUSED)
791{
792    cap_t cap = create_it_frame_cap(pptr, vptr, asid, use_large, X86_MappingVSpace);
793    map_it_frame_cap(vspace_cap, cap);
794    return cap;
795}
796
797/* ====================== BOOT CODE FINISHES HERE ======================== */
798
799
800
/* Assign 'asid' to the vspace cap in 'vspaceCapSlot': mark the cap as mapped
 * with that ASID and enter the vspace root into the pool's lookup array.
 * With CONFIG_VTX the cap may alternatively be an EPT PML4, which gets an
 * EPT-flavoured asid_map entry instead. */
exception_t performASIDPoolInvocation(asid_t asid, asid_pool_t *poolPtr, cte_t *vspaceCapSlot)
{
    asid_map_t asid_map;
#ifdef CONFIG_VTX
    if (cap_get_capType(vspaceCapSlot->cap) == cap_ept_pml4_cap) {
        cap_ept_pml4_cap_ptr_set_capPML4MappedASID(&vspaceCapSlot->cap, asid);
        cap_ept_pml4_cap_ptr_set_capPML4IsMapped(&vspaceCapSlot->cap, 1);
        asid_map = asid_map_asid_map_ept_new(cap_ept_pml4_cap_get_capPML4BasePtr(vspaceCapSlot->cap));
    } else
#endif
    {
        assert(cap_get_capType(vspaceCapSlot->cap) == cap_pml4_cap);
        cap_pml4_cap_ptr_set_capPML4MappedASID(&vspaceCapSlot->cap, asid);
        cap_pml4_cap_ptr_set_capPML4IsMapped(&vspaceCapSlot->cap, 1);
        asid_map = asid_map_asid_map_vspace_new(cap_pml4_cap_get_capPML4BasePtr(vspaceCapSlot->cap));
    }
    /* only the low bits index within this pool; the high bits selected the pool */
    poolPtr->array[asid & MASK(asidLowBits)] = asid_map;
    return EXCEPTION_NONE;
}
820
821bool_t CONST isVTableRoot(cap_t cap)
822{
823    return cap_get_capType(cap) == cap_pml4_cap;
824}
825
826bool_t CONST isValidNativeRoot(cap_t cap)
827{
828    return isVTableRoot(cap) &&
829           cap_pml4_cap_get_capPML4IsMapped(cap);
830}
831
832static pml4e_t CONST makeUserPML4E(paddr_t paddr, vm_attributes_t vm_attr)
833{
834    return pml4e_new(
835               0,
836               paddr,
837               0,
838               vm_attributes_get_x86PCDBit(vm_attr),
839               vm_attributes_get_x86PWTBit(vm_attr),
840               1,
841               1,
842               1
843           );
844}
845
846static pml4e_t CONST makeUserPML4EInvalid(void)
847{
848    return pml4e_new(
849               0,                  /* xd               */
850               0,                  /* pdpt_base_addr   */
851               0,                  /* accessed         */
852               0,                  /* cache_disabled   */
853               0,                  /* write through    */
854               0,                  /* super user       */
855               0,                  /* read_write       */
856               0                   /* present          */
857           );
858}
859
/* Construct a present 1GiB leaf mapping at the PDPT level. Rights and
 * cacheability come from the caller; user mappings are never global. */
static pdpte_t CONST makeUserPDPTEHugePage(paddr_t paddr, vm_attributes_t vm_attr, vm_rights_t vm_rights)
{
    return pdpte_pdpte_1g_new(
               0,          /* xd               */
               paddr,      /* physical address */
               0,          /* PAT              */
               0,          /* global           */
               0,          /* dirty            */
               0,          /* accessed         */
               vm_attributes_get_x86PCDBit(vm_attr),  /* cache disabled */
               vm_attributes_get_x86PWTBit(vm_attr),  /* write through  */
               SuperUserFromVMRights(vm_rights),       /* super user     */
               WritableFromVMRights(vm_rights),        /* read write     */
               1                                       /* present        */
           );
}
876
/* Construct a present PDPT entry pointing at the page directory at 'paddr'.
 * Intermediate entries are always user-accessible and writable; access is
 * restricted at the leaf level. */
static pdpte_t CONST makeUserPDPTEPageDirectory(paddr_t paddr, vm_attributes_t vm_attr)
{
    return pdpte_pdpte_pd_new(
               0,                      /* xd       */
               paddr,                  /* paddr    */
               0,                      /* accessed */
               vm_attributes_get_x86PCDBit(vm_attr),  /* cache disabled */
               vm_attributes_get_x86PWTBit(vm_attr),  /* write through  */
               1,                      /* super user */
               1,                      /* read write */
               1                       /* present    */
           );
}
890
891static pdpte_t CONST makeUserPDPTEInvalid(void)
892{
893    return pdpte_pdpte_pd_new(
894               0,          /* xd               */
895               0,          /* physical address */
896               0,          /* accessed         */
897               0,          /* cache disabled */
898               0,          /* write through  */
899               0,          /* super user     */
900               0,          /* read write     */
901               0           /* present        */
902           );
903}
904
/* Construct a present 2MiB leaf mapping at the PD level with the requested
 * rights and cacheability. User mappings are never marked global. */
pde_t CONST makeUserPDELargePage(paddr_t paddr, vm_attributes_t vm_attr, vm_rights_t vm_rights)
{
    return pde_pde_large_new(
               0,                                              /* xd                   */
               paddr,                                          /* page_base_address    */
               vm_attributes_get_x86PATBit(vm_attr),          /* pat                  */
               0,                                              /* global               */
               0,                                              /* dirty                */
               0,                                              /* accessed             */
               vm_attributes_get_x86PCDBit(vm_attr),          /* cache_disabled       */
               vm_attributes_get_x86PWTBit(vm_attr),          /* write_through        */
               SuperUserFromVMRights(vm_rights),               /* super_user           */
               WritableFromVMRights(vm_rights),                /* read_write           */
               1                                               /* present              */
           );
}
921
/* Construct a present PD entry pointing at the page table at 'paddr'.
 * Intermediate entries are always user-accessible and writable; rights are
 * enforced at the PTE level. */
pde_t CONST makeUserPDEPageTable(paddr_t paddr, vm_attributes_t vm_attr)
{

    return  pde_pde_pt_new(
                0,                                  /* xd               */
                paddr,                              /* pt_base_address  */
                0,                                  /* accessed         */
                vm_attributes_get_x86PCDBit(vm_attr), /* cache_disabled   */
                vm_attributes_get_x86PWTBit(vm_attr), /* write_through    */
                1,                                  /* super_user       */
                1,                                  /* read_write       */
                1                                   /* present          */
            );
}
936
937pde_t CONST makeUserPDEInvalid(void)
938{
939    /* The bitfield only declares two kinds of PDE entries (page tables or large pages)
940     * and an invalid entry should really be a third type, but we can simulate it by
941     * creating an invalid (present bit 0) entry of either of the defined types */
942    return pde_pde_pt_new(
943               0,      /* xd               */
944               0,      /* pt_base_addr     */
945               0,      /* accessed         */
946               0,      /* cache_disabled   */
947               0,      /* write_through    */
948               0,      /* super_user       */
949               0,      /* read_write       */
950               0       /* present          */
951           );
952}
953
/* Construct a present 4KiB leaf PTE with the requested rights and
 * cacheability. User mappings are never marked global. */
pte_t CONST makeUserPTE(paddr_t paddr, vm_attributes_t vm_attr, vm_rights_t vm_rights)
{
    return pte_new(
               0,                                              /* xd                   */
               paddr,                                          /* page_base_address    */
               0,                                              /* global               */
               vm_attributes_get_x86PATBit(vm_attr),          /* pat                  */
               0,                                              /* dirty                */
               0,                                              /* accessed             */
               vm_attributes_get_x86PCDBit(vm_attr),          /* cache_disabled       */
               vm_attributes_get_x86PWTBit(vm_attr),          /* write_through        */
               SuperUserFromVMRights(vm_rights),               /* super_user           */
               WritableFromVMRights(vm_rights),                /* read_write           */
               1                                               /* present              */
           );
}
970
971pte_t CONST makeUserPTEInvalid(void)
972{
973    return pte_new(
974               0,                   /* xd                   */
975               0,                   /* page_base_address    */
976               0,                   /* global               */
977               0,                   /* pat                  */
978               0,                   /* dirty                */
979               0,                   /* accessed             */
980               0,                   /* cache_disabled       */
981               0,                   /* write_through        */
982               0,                   /* super_user           */
983               0,                   /* read_write           */
984               0                    /* present              */
985           );
986}
987
988
989static pml4e_t *lookupPML4Slot(vspace_root_t *pml4, vptr_t vptr)
990{
991    pml4e_t *pml4e = PML4E_PTR(pml4);
992    word_t pml4Index = GET_PML4_INDEX(vptr);
993    return pml4e + pml4Index;
994}
995
/* Walk one level from the PML4 to find the PDPT slot covering 'vptr'.
 * On a non-present PML4 entry, sets current_lookup_fault and returns
 * EXCEPTION_LOOKUP_FAULT with a NULL slot. */
static lookupPDPTSlot_ret_t lookupPDPTSlot(vspace_root_t *pml4, vptr_t vptr)
{
    pml4e_t *pml4Slot = lookupPML4Slot(pml4, vptr);
    lookupPDPTSlot_ret_t ret;

    if (!pml4e_ptr_get_present(pml4Slot)) {
        /* record how many bits of the address went unresolved */
        current_lookup_fault = lookup_fault_missing_capability_new(PML4_INDEX_OFFSET);

        ret.pdptSlot = NULL;
        ret.status = EXCEPTION_LOOKUP_FAULT;
        return ret;
    } else {
        pdpte_t *pdpt;
        pdpte_t *pdptSlot;
        word_t pdptIndex = GET_PDPT_INDEX(vptr);
        pdpt = paddr_to_pptr(pml4e_ptr_get_pdpt_base_address(pml4Slot));
        pdptSlot = pdpt + pdptIndex;

        ret.status = EXCEPTION_NONE;
        ret.pdptSlot = pdptSlot;
        return ret;
    }
}
1019
/* Walk two levels from the PML4 to find the page-directory slot covering
 * 'vptr'. Fails (EXCEPTION_LOOKUP_FAULT, NULL slot) if the PML4 or PDPT
 * entry is missing, or if the PDPT entry is a 1GiB page rather than a
 * pointer to a page directory. */
lookupPDSlot_ret_t lookupPDSlot(vspace_root_t *pml4, vptr_t vptr)
{
    lookupPDPTSlot_ret_t pdptSlot;
    lookupPDSlot_ret_t ret;

    pdptSlot = lookupPDPTSlot(pml4, vptr);

    if (pdptSlot.status != EXCEPTION_NONE) {
        /* current_lookup_fault already set by lookupPDPTSlot */
        ret.pdSlot = NULL;
        ret.status = pdptSlot.status;
        return ret;
    }
    if ((pdpte_ptr_get_page_size(pdptSlot.pdptSlot) != pdpte_pdpte_pd) ||
        !pdpte_pdpte_pd_ptr_get_present(pdptSlot.pdptSlot)) {
        current_lookup_fault = lookup_fault_missing_capability_new(PDPT_INDEX_OFFSET);

        ret.pdSlot = NULL;
        ret.status = EXCEPTION_LOOKUP_FAULT;
        return ret;
    } else {
        pde_t *pd;
        pde_t *pdSlot;
        word_t pdIndex = GET_PD_INDEX(vptr);
        pd = paddr_to_pptr(pdpte_pdpte_pd_ptr_get_pd_base_address(pdptSlot.pdptSlot));
        pdSlot = pd + pdIndex;

        ret.status = EXCEPTION_NONE;
        ret.pdSlot = pdSlot;
        return ret;
    }
}
1051
/* Flush TLB state after removing a page directory. 'vptr' and 'pd' identify
 * the removed mapping but are unused: instead of invlpg-ing each covered
 * virtual address we simply invalidate the whole PCID. */
static void flushPD(vspace_root_t *vspace, word_t vptr, pde_t *pd, asid_t asid)
{
    /* clearing the entire PCID vs flushing the virtual addresses
     * one by one using invplg.
     * choose the easy way, invalidate the PCID
     */
    invalidateASID(vspace, asid, SMP_TERNARY(tlb_bitmap_get(vspace), 0));

}
1061
1062static void flushPDPT(vspace_root_t *vspace, word_t vptr, pdpte_t *pdpt, asid_t asid)
1063{
1064    /* similar here */
1065    invalidateASID(vspace, asid, SMP_TERNARY(tlb_bitmap_get(vspace), 0));
1066    return;
1067}
1068
/* Drop all TLB entries tagged with 'asid'; on SMP this targets every node
 * recorded in the vspace's TLB bitmap. */
void hwASIDInvalidate(asid_t asid, vspace_root_t *vspace)
{
    invalidateASID(vspace, asid, SMP_TERNARY(tlb_bitmap_get(vspace), 0));
}
1073
/* Remove the page directory 'pd' from the vspace bound to 'asid' at 'vaddr'.
 * Silently does nothing if the ASID no longer resolves, the lookup fails, or
 * the PDPT slot does not actually reference this PD (the mapping may have
 * been replaced since the cap was created). */
void unmapPageDirectory(asid_t asid, vptr_t vaddr, pde_t *pd)
{
    findVSpaceForASID_ret_t find_ret;
    lookupPDPTSlot_ret_t    lu_ret;

    find_ret = findVSpaceForASID(asid);
    if (find_ret.status != EXCEPTION_NONE) {
        return;
    }

    lu_ret = lookupPDPTSlot(find_ret.vspace_root, vaddr);
    if (lu_ret.status != EXCEPTION_NONE) {
        return;
    }

    /* check if the PDPT has the PD */
    if (!(pdpte_ptr_get_page_size(lu_ret.pdptSlot) == pdpte_pdpte_pd &&
          pdpte_pdpte_pd_ptr_get_present(lu_ret.pdptSlot) &&
          (pdpte_pdpte_pd_ptr_get_pd_base_address(lu_ret.pdptSlot) == pptr_to_paddr(pd)))) {
        return;
    }

    /* flush before clearing the entry so no stale translations survive */
    flushPD(find_ret.vspace_root, vaddr, pd, asid);

    *lu_ret.pdptSlot = makeUserPDPTEInvalid();

    invalidatePageStructureCacheASID(pptr_to_paddr(find_ret.vspace_root), asid,
                                     SMP_TERNARY(tlb_bitmap_get(find_ret.vspace_root), 0));
}
1103
1104
/* Unmap the page directory referenced by 'cap' (if mapped), scrub its memory
 * so stale entries cannot leak into a future mapping, and clear the mapped
 * flag in the cap stored at 'ctSlot'. */
static exception_t performX64PageDirectoryInvocationUnmap(cap_t cap, cte_t *ctSlot)
{

    if (cap_page_directory_cap_get_capPDIsMapped(cap)) {
        pde_t *pd = PDE_PTR(cap_page_directory_cap_get_capPDBasePtr(cap));
        unmapPageDirectory(
            cap_page_directory_cap_get_capPDMappedASID(cap),
            cap_page_directory_cap_get_capPDMappedAddress(cap),
            pd
        );
        /* zero the PD so it is clean when next mapped */
        clearMemory((void *)pd, cap_get_capSizeBits(cap));
    }

    cap_page_directory_cap_ptr_set_capPDIsMapped(&(ctSlot->cap), 0);

    return EXCEPTION_NONE;
}
1122
/* Commit a page-directory mapping: store the updated cap, write the PDPT
 * entry, and flush the paging-structure caches for the affected ASID. */
static exception_t performX64PageDirectoryInvocationMap(cap_t cap, cte_t *ctSlot, pdpte_t pdpte, pdpte_t *pdptSlot,
                                                        vspace_root_t *vspace)
{
    ctSlot->cap = cap;
    *pdptSlot = pdpte;
    invalidatePageStructureCacheASID(pptr_to_paddr(vspace), cap_page_directory_cap_get_capPDMappedASID(cap),
                                     SMP_TERNARY(tlb_bitmap_get(vspace), 0));
    return EXCEPTION_NONE;
}
1132
1133
/* Decode a PageDirectory invocation (X86PageDirectoryMap / Unmap).
 * Validates the message, the vspace cap and the target PDPT slot, then
 * delegates to the matching perform function. Errors are reported through
 * current_syscall_error; the order of the checks is part of the documented
 * user-visible error semantics. */
static exception_t decodeX64PageDirectoryInvocation(
    word_t label,
    word_t length,
    cte_t *cte,
    cap_t cap,
    extra_caps_t extraCaps,
    word_t *buffer
)
{
    word_t              vaddr;
    vm_attributes_t     vm_attr;
    cap_t               vspaceCap;
    vspace_root_t      *vspace;
    pdpte_t             pdpte;
    paddr_t             paddr;
    asid_t              asid;
    lookupPDPTSlot_ret_t pdptSlot;

    if (label == X86PageDirectoryUnmap) {
        /* unmapping is only allowed on the last remaining cap to the object */
        if (!isFinalCapability(cte)) {
            current_syscall_error.type = seL4_RevokeFirst;
            userError("X86PageDirectory: Cannot unmap if more than one cap exist.");
            return EXCEPTION_SYSCALL_ERROR;
        }
        setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);

        return performX64PageDirectoryInvocationUnmap(cap, cte);
    }

    if (label != X86PageDirectoryMap) {
        userError("X64Directory: Illegal operation.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }

    /* Map needs two message registers (vaddr, attributes) and a vspace cap */
    if (length < 2 || extraCaps.excaprefs[0] == NULL) {
        userError("X64PageDirectory: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }

    if (cap_page_directory_cap_get_capPDIsMapped(cap)) {
        userError("X64PageDirectory: PD is already mapped to a PML4.");
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 0;

        return EXCEPTION_SYSCALL_ERROR;
    }

    /* align the requested address down to a PDPT-entry boundary (1GiB) */
    vaddr = getSyscallArg(0, buffer) & (~MASK(PDPT_INDEX_OFFSET));
    vm_attr = vmAttributesFromWord(getSyscallArg(1, buffer));
    vspaceCap = extraCaps.excaprefs[0]->cap;

    if (!isValidNativeRoot(vspaceCap)) {
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 1;

        return EXCEPTION_SYSCALL_ERROR;
    }

    vspace = (vspace_root_t *)pptr_of_cap(vspaceCap);
    asid = cap_get_capMappedASID(vspaceCap);

    if (vaddr > USER_TOP) {
        userError("X64PageDirectory: Mapping address too high.");
        current_syscall_error.type = seL4_InvalidArgument;
        current_syscall_error.invalidArgumentNumber = 0;

        return EXCEPTION_SYSCALL_ERROR;
    }

    findVSpaceForASID_ret_t find_ret;

    /* the ASID recorded in the cap must still resolve to this very vspace */
    find_ret = findVSpaceForASID(asid);
    if (find_ret.status != EXCEPTION_NONE) {
        current_syscall_error.type = seL4_FailedLookup;
        current_syscall_error.failedLookupWasSource = false;

        return EXCEPTION_SYSCALL_ERROR;
    }

    if (find_ret.vspace_root != vspace) {
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 1;

        return EXCEPTION_SYSCALL_ERROR;
    }

    pdptSlot = lookupPDPTSlot(vspace, vaddr);
    if (pdptSlot.status != EXCEPTION_NONE) {
        current_syscall_error.type = seL4_FailedLookup;
        current_syscall_error.failedLookupWasSource = false;

        return EXCEPTION_SYSCALL_ERROR;
    }

    /* the slot must be free: neither an existing PD nor a 1GiB page */
    if ((pdpte_ptr_get_page_size(pdptSlot.pdptSlot) == pdpte_pdpte_pd &&
         pdpte_pdpte_pd_ptr_get_present(pdptSlot.pdptSlot)) ||
        (pdpte_ptr_get_page_size(pdptSlot.pdptSlot) == pdpte_pdpte_1g
         && pdpte_pdpte_1g_ptr_get_present(pdptSlot.pdptSlot))) {
        current_syscall_error.type = seL4_DeleteFirst;

        return EXCEPTION_SYSCALL_ERROR;
    }

    paddr = pptr_to_paddr(PDE_PTR(cap_page_directory_cap_get_capPDBasePtr(cap)));
    pdpte = makeUserPDPTEPageDirectory(paddr, vm_attr);

    /* record the mapping in the cap before committing it */
    cap = cap_page_directory_cap_set_capPDIsMapped(cap, 1);
    cap = cap_page_directory_cap_set_capPDMappedASID(cap, asid);
    cap = cap_page_directory_cap_set_capPDMappedAddress(cap, vaddr);

    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return performX64PageDirectoryInvocationMap(cap, cte, pdpte, pdptSlot.pdptSlot, vspace);
}
1249
/* Remove the PDPT 'pdpt' from the vspace bound to 'asid' at 'vaddr'.
 * Silently does nothing if the ASID no longer resolves or the PML4 slot no
 * longer references this PDPT (the mapping may have been replaced). */
static void unmapPDPT(asid_t asid, vptr_t vaddr, pdpte_t *pdpt)
{
    findVSpaceForASID_ret_t find_ret;
    pml4e_t *pml4Slot;

    find_ret = findVSpaceForASID(asid);
    if (find_ret.status != EXCEPTION_NONE) {
        return;
    }

    pml4Slot = lookupPML4Slot(find_ret.vspace_root, vaddr);

    /* check if the PML4 has the PDPT */
    if (!(pml4e_ptr_get_present(pml4Slot) &&
          pml4e_ptr_get_pdpt_base_address(pml4Slot) == pptr_to_paddr(pdpt))) {
        return;
    }

    /* flush before clearing the entry so no stale translations survive */
    flushPDPT(find_ret.vspace_root, vaddr, pdpt, asid);

    *pml4Slot = makeUserPML4EInvalid();
}
1272
/* Unmap the PDPT referenced by 'cap' (if mapped), scrub its memory so stale
 * entries cannot leak into a future mapping, and clear the mapped flag in
 * the cap stored at 'ctSlot'. */
static exception_t performX64PDPTInvocationUnmap(cap_t cap, cte_t *ctSlot)
{
    if (cap_pdpt_cap_get_capPDPTIsMapped(cap)) {
        pdpte_t *pdpt = PDPTE_PTR(cap_pdpt_cap_get_capPDPTBasePtr(cap));
        unmapPDPT(cap_pdpt_cap_get_capPDPTMappedASID(cap),
                  cap_pdpt_cap_get_capPDPTMappedAddress(cap),
                  pdpt);
        /* zero the PDPT so it is clean when next mapped */
        clearMemory((void *)pdpt, cap_get_capSizeBits(cap));
    }

    cap_pdpt_cap_ptr_set_capPDPTIsMapped(&(ctSlot->cap), 0);

    return EXCEPTION_NONE;
}
1287
/* Commit a PDPT mapping: store the updated cap, write the PML4 entry, and
 * flush the paging-structure caches for the affected ASID. */
static exception_t performX64PDPTInvocationMap(cap_t cap, cte_t *ctSlot, pml4e_t pml4e, pml4e_t *pml4Slot,
                                               vspace_root_t *vspace)
{
    ctSlot->cap = cap;
    *pml4Slot = pml4e;
    invalidatePageStructureCacheASID(pptr_to_paddr(vspace), cap_pdpt_cap_get_capPDPTMappedASID(cap),
                                     SMP_TERNARY(tlb_bitmap_get(vspace), 0));

    return EXCEPTION_NONE;
}
1298
/* Decode a PDPT invocation (X86PDPTMap / Unmap). Mirrors the page-directory
 * decode path one level up: validates the message, the vspace cap and the
 * target PML4 slot, then delegates to the matching perform function. The
 * ordering of checks is part of the user-visible error semantics. */
static exception_t decodeX64PDPTInvocation(
    word_t  label,
    word_t length,
    cte_t   *cte,
    cap_t   cap,
    extra_caps_t extraCaps,
    word_t  *buffer)
{
    word_t                  vaddr;
    vm_attributes_t         attr;
    pml4e_t                *pml4Slot;
    cap_t                   vspaceCap;
    vspace_root_t          *vspace;
    pml4e_t                 pml4e;
    paddr_t                 paddr;
    asid_t                  asid;

    if (label == X86PDPTUnmap) {
        /* unmapping is only allowed on the last remaining cap to the object */
        if (!isFinalCapability(cte)) {
            current_syscall_error.type = seL4_RevokeFirst;
            userError("X86PDPT: Cannot unmap if more than one cap exist.");
            return EXCEPTION_SYSCALL_ERROR;
        }

        setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);

        return performX64PDPTInvocationUnmap(cap, cte);
    }

    if (label != X86PDPTMap) {
        userError("X86PDPT: Illegal operation.");
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;
    }

    /* Map needs two message registers (vaddr, attributes) and a vspace cap */
    if (length < 2 || extraCaps.excaprefs[0] == NULL) {
        userError("X64PDPT: Truncated message.");
        current_syscall_error.type = seL4_TruncatedMessage;
        return EXCEPTION_SYSCALL_ERROR;
    }

    if (cap_pdpt_cap_get_capPDPTIsMapped(cap)) {
        userError("X64PDPT: PDPT is already mapped to a PML4.");
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 0;

        return EXCEPTION_SYSCALL_ERROR;
    }

    /* align the requested address down to a PML4-entry boundary (512GiB) */
    vaddr = getSyscallArg(0, buffer) & (~MASK(PML4_INDEX_OFFSET));
    attr = vmAttributesFromWord(getSyscallArg(1, buffer));
    vspaceCap = extraCaps.excaprefs[0]->cap;

    if (!isValidNativeRoot(vspaceCap)) {
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 1;

        return EXCEPTION_SYSCALL_ERROR;
    }

    vspace = (vspace_root_t *)pptr_of_cap(vspaceCap);
    asid = cap_get_capMappedASID(vspaceCap);

    if (vaddr > USER_TOP) {
        userError("X64PDPT: Mapping address too high.");
        current_syscall_error.type = seL4_InvalidArgument;
        current_syscall_error.invalidArgumentNumber = 0;

        return EXCEPTION_SYSCALL_ERROR;
    }

    findVSpaceForASID_ret_t find_ret;

    /* the ASID recorded in the cap must still resolve to this very vspace */
    find_ret = findVSpaceForASID(asid);
    if (find_ret.status != EXCEPTION_NONE) {
        current_syscall_error.type = seL4_FailedLookup;
        current_syscall_error.failedLookupWasSource = false;

        return EXCEPTION_SYSCALL_ERROR;
    }

    if (find_ret.vspace_root != vspace) {
        current_syscall_error.type = seL4_InvalidCapability;
        current_syscall_error.invalidCapNumber = 1;

        return EXCEPTION_SYSCALL_ERROR;
    }

    pml4Slot = lookupPML4Slot(vspace, vaddr);

    /* the target slot must be empty */
    if (pml4e_ptr_get_present(pml4Slot)) {
        current_syscall_error.type = seL4_DeleteFirst;

        return EXCEPTION_SYSCALL_ERROR;
    }

    paddr = pptr_to_paddr(PDPTE_PTR((cap_pdpt_cap_get_capPDPTBasePtr(cap))));
    pml4e = makeUserPML4E(paddr, attr);

    /* record the mapping in the cap before committing it */
    cap = cap_pdpt_cap_set_capPDPTIsMapped(cap, 1);
    cap = cap_pdpt_cap_set_capPDPTMappedASID(cap, asid);
    cap = cap_pdpt_cap_set_capPDPTMappedAddress(cap, vaddr);

    setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
    return performX64PDPTInvocationMap(cap, cte, pml4e, pml4Slot, vspace);
}
1405
/* Dispatch mode-specific (x86_64) MMU invocations by cap type. PML4 caps
 * have no direct invocations; PDPT and page-directory caps are decoded by
 * their dedicated handlers. Any other cap type reaching here is a kernel
 * bug, hence fail(). */
exception_t decodeX86ModeMMUInvocation(
    word_t label,
    word_t length,
    cptr_t cptr,
    cte_t *cte,
    cap_t cap,
    extra_caps_t extraCaps,
    word_t *buffer
)
{
    switch (cap_get_capType(cap)) {

    case cap_pml4_cap:
        current_syscall_error.type = seL4_IllegalOperation;
        return EXCEPTION_SYSCALL_ERROR;

    case cap_pdpt_cap:
        return decodeX64PDPTInvocation(label, length, cte, cap, extraCaps, buffer);

    case cap_page_directory_cap:
        return decodeX64PageDirectoryInvocation(label, length, cte, cap, extraCaps, buffer);

    default:
        fail("Invalid arch cap type");
    }
}
1432
/* Unmap a mode-specific page size — on x86_64 only 1GiB huge pages, and only
 * when CONFIG_HUGE_PAGE is enabled. Returns true if the PDPT entry was
 * cleared, false if the lookup failed or the slot does not hold this exact
 * huge-page mapping. Any other page size reaching here is a kernel bug. */
bool_t modeUnmapPage(vm_page_size_t page_size, vspace_root_t *vroot, vptr_t vaddr, void *pptr)
{
    if (config_set(CONFIG_HUGE_PAGE) && page_size == X64_HugePage) {
        pdpte_t     *pdpte;
        lookupPDPTSlot_ret_t pdpt_ret = lookupPDPTSlot(vroot, vaddr);
        if (pdpt_ret.status != EXCEPTION_NONE) {
            return false;
        }
        pdpte = pdpt_ret.pdptSlot;

        /* the slot must hold a present 1GiB page backed by this frame */
        if (!(pdpte_ptr_get_page_size(pdpte) == pdpte_pdpte_1g
              && pdpte_pdpte_1g_ptr_get_present(pdpte)
              && (pdpte_pdpte_1g_ptr_get_page_base_address(pdpte)
                  == pptr_to_paddr(pptr)))) {
            return false;
        }

        *pdpte = makeUserPDPTEInvalid();
        return true;
    }
    fail("Invalid page type");
    return false;
}
1457
/* Write a new PDPT entry and flush the paging-structure caches for 'asid'
 * so no stale intermediate translations are used. */
static exception_t updatePDPTE(asid_t asid, pdpte_t pdpte, pdpte_t *pdptSlot, vspace_root_t *vspace)
{
    *pdptSlot = pdpte;
    invalidatePageStructureCacheASID(pptr_to_paddr(vspace), asid,
                                     SMP_TERNARY(tlb_bitmap_get(vspace), 0));
    return EXCEPTION_NONE;
}
1465
/* Commit a huge-page frame mapping: store the updated frame cap, then
 * install the PDPT entry via updatePDPTE. */
static exception_t performX64ModeMap(cap_t cap, cte_t *ctSlot, pdpte_t pdpte, pdpte_t *pdptSlot, vspace_root_t *vspace)
{
    ctSlot->cap = cap;
    return updatePDPTE(cap_frame_cap_get_capFMappedASID(cap), pdpte, pdptSlot, vspace);
}
1471
/* Result of validating a PDPT slot for a huge-page mapping: 'status' reports
 * lookup/validation failure; on success, 'pdpte' is the new entry to install
 * and 'pdptSlot' the slot to write it into. */
struct create_mapping_pdpte_return {
    exception_t status;
    pdpte_t pdpte;
    pdpte_t *pdptSlot;
};
typedef struct create_mapping_pdpte_return create_mapping_pdpte_return_t;
1478
/* Look up the PDPT slot for 'vaddr' in 'vspace' and, if it is free, build a
 * huge-page PDPT entry mapping 'base' with the given rights and attributes.
 * On failure, sets current_syscall_error and returns EXCEPTION_SYSCALL_ERROR
 * in 'status'. */
static create_mapping_pdpte_return_t createSafeMappingEntries_PDPTE(paddr_t base, word_t vaddr, vm_rights_t vmRights,
                                                                    vm_attributes_t attr,
                                                                    vspace_root_t *vspace)
{
    create_mapping_pdpte_return_t ret;
    lookupPDPTSlot_ret_t          lu_ret;

    lu_ret = lookupPDPTSlot(vspace, vaddr);
    if (lu_ret.status != EXCEPTION_NONE) {
        current_syscall_error.type = seL4_FailedLookup;
        current_syscall_error.failedLookupWasSource = false;
        ret.status = EXCEPTION_SYSCALL_ERROR;
        /* current_lookup_fault will have been set by lookupPDPTSlot */
        return ret;
    }
    ret.pdptSlot = lu_ret.pdptSlot;

    /* check for existing page directory: the slot must not already hold a
     * present PD mapping, which would have to be deleted first */
    if ((pdpte_ptr_get_page_size(ret.pdptSlot) == pdpte_pdpte_pd) &&
        (pdpte_pdpte_pd_ptr_get_present(ret.pdptSlot))) {
        current_syscall_error.type = seL4_DeleteFirst;
        ret.status = EXCEPTION_SYSCALL_ERROR;
        return ret;
    }

    ret.pdpte = makeUserPDPTEHugePage(base, attr, vmRights);
    ret.status = EXCEPTION_NONE;
    return ret;
}
1508
1509exception_t decodeX86ModeMapPage(word_t label, vm_page_size_t page_size, cte_t *cte, cap_t cap,
1510                                 vspace_root_t *vroot, vptr_t vaddr, paddr_t paddr, vm_rights_t vm_rights, vm_attributes_t vm_attr)
1511{
1512    if (config_set(CONFIG_HUGE_PAGE) && page_size == X64_HugePage) {
1513        create_mapping_pdpte_return_t map_ret;
1514
1515        map_ret = createSafeMappingEntries_PDPTE(paddr, vaddr, vm_rights, vm_attr, vroot);
1516        if (map_ret.status != EXCEPTION_NONE) {
1517            return map_ret.status;
1518        }
1519
1520        setThreadState(NODE_STATE(ksCurThread), ThreadState_Restart);
1521
1522        switch (label) {
1523        case X86PageMap:
1524            return performX64ModeMap(cap, cte, map_ret.pdpte, map_ret.pdptSlot, vroot);
1525
1526        default:
1527            current_syscall_error.type = seL4_IllegalOperation;
1528            return EXCEPTION_SYSCALL_ERROR;
1529        }
1530    }
1531    fail("Invalid Page type");
1532}
1533
1534#ifdef CONFIG_PRINTING
/* Result of a manual vspace walk: 'value' is only meaningful when 'status'
 * is EXCEPTION_NONE. */
typedef struct readWordFromVSpace_ret {
    exception_t status;
    word_t value;
} readWordFromVSpace_ret_t;
1539
1540static readWordFromVSpace_ret_t readWordFromVSpace(vspace_root_t *vspace, word_t vaddr)
1541{
1542    readWordFromVSpace_ret_t ret;
1543    lookupPTSlot_ret_t ptSlot;
1544    lookupPDSlot_ret_t pdSlot;
1545    lookupPDPTSlot_ret_t pdptSlot;
1546    paddr_t paddr;
1547    word_t offset;
1548    pptr_t kernel_vaddr;
1549    word_t *value;
1550
1551    pdptSlot = lookupPDPTSlot(vspace, vaddr);
1552    if (pdptSlot.status == EXCEPTION_NONE &&
1553        pdpte_ptr_get_page_size(pdptSlot.pdptSlot) == pdpte_pdpte_1g &&
1554        pdpte_pdpte_1g_ptr_get_present(pdptSlot.pdptSlot)) {
1555
1556        paddr = pdpte_pdpte_1g_ptr_get_page_base_address(pdptSlot.pdptSlot);
1557        offset = vaddr & MASK(seL4_HugePageBits);
1558    } else {
1559        pdSlot = lookupPDSlot(vspace, vaddr);
1560        if (pdSlot.status == EXCEPTION_NONE &&
1561            ((pde_ptr_get_page_size(pdSlot.pdSlot) == pde_pde_large) &&
1562             pde_pde_large_ptr_get_present(pdSlot.pdSlot))) {
1563
1564            paddr = pde_pde_large_ptr_get_page_base_address(pdSlot.pdSlot);
1565            offset = vaddr & MASK(seL4_LargePageBits);
1566        } else {
1567            ptSlot = lookupPTSlot(vspace, vaddr);
1568            if (ptSlot.status == EXCEPTION_NONE && pte_ptr_get_present(ptSlot.ptSlot)) {
1569                paddr = pte_ptr_get_page_base_address(ptSlot.ptSlot);
1570                offset = vaddr & MASK(seL4_PageBits);
1571            } else {
1572                ret.status = EXCEPTION_LOOKUP_FAULT;
1573                return ret;
1574            }
1575        }
1576    }
1577
1578
1579    kernel_vaddr = (word_t)paddr_to_pptr(paddr);
1580    value = (word_t *)(kernel_vaddr + offset);
1581    ret.status = EXCEPTION_NONE;
1582    ret.value = *value;
1583    return ret;
1584}
1585
1586void Arch_userStackTrace(tcb_t *tptr)
1587{
1588    cap_t threadRoot;
1589    vspace_root_t *vspace_root;
1590    word_t sp;
1591    int i;
1592
1593    threadRoot = TCB_PTR_CTE_PTR(tptr, tcbVTable)->cap;
1594
1595    /* lookup the PD */
1596    if (cap_get_capType(threadRoot) != cap_pml4_cap) {
1597        printf("Invalid vspace\n");
1598        return;
1599    }
1600
1601    vspace_root = (vspace_root_t *)pptr_of_cap(threadRoot);
1602
1603    sp = getRegister(tptr, RSP);
1604    /* check for alignment so we don't have to worry about accessing
1605     * words that might be on two different pages */
1606    if (!IS_ALIGNED(sp, seL4_WordSizeBits)) {
1607        printf("RSP not aligned\n");
1608        return;
1609    }
1610
1611    for (i = 0; i < CONFIG_USER_STACK_TRACE_LENGTH; i++) {
1612        word_t address = sp + (i * sizeof(word_t));
1613        readWordFromVSpace_ret_t result;
1614        result = readWordFromVSpace(vspace_root, address);
1615        if (result.status == EXCEPTION_NONE) {
1616            printf("0x%lx: 0x%lx\n", (long)address, (long)result.value);
1617        } else {
1618            printf("0x%lx: INVALID\n", (long)address);
1619        }
1620    }
1621}
1622#endif /* CONFIG_PRINTING */
1623
1624#ifdef CONFIG_KERNEL_LOG_BUFFER
/* Map the user-supplied large-page frame cap 'frame_cptr' as the kernel's
 * log buffer by installing it into a reserved kernel PD slot. On any
 * validation failure, sets current_fault and returns a syscall error. */
exception_t benchmark_arch_map_logBuffer(word_t frame_cptr)
{
    lookupCapAndSlot_ret_t lu_ret;
    vm_page_size_t frameSize;
    pptr_t frame_pptr;

    /* faulting section */
    lu_ret = lookupCapAndSlot(NODE_STATE(ksCurThread), frame_cptr);

    if (unlikely(lu_ret.status != EXCEPTION_NONE)) {
        userError("Invalid cap #%lu.", frame_cptr);
        current_fault = seL4_Fault_CapFault_new(frame_cptr, false);

        return EXCEPTION_SYSCALL_ERROR;
    }

    /* The log buffer must be backed by a frame cap, not any other cap type. */
    if (cap_get_capType(lu_ret.cap) != cap_frame_cap) {
        userError("Invalid cap. Log buffer should be of a frame cap");
        current_fault = seL4_Fault_CapFault_new(frame_cptr, false);

        return EXCEPTION_SYSCALL_ERROR;
    }

    frameSize = cap_frame_cap_get_capFSize(lu_ret.cap);

    /* Exactly a large page is required, since the buffer is installed as a
     * single large-page PDE below. */
    if (frameSize != X86_LargePage) {
        userError("Invalid size for log Buffer. The kernel expects at least 1M log buffer");
        current_fault = seL4_Fault_CapFault_new(frame_cptr, false);

        return EXCEPTION_SYSCALL_ERROR;
    }

    frame_pptr = cap_frame_cap_get_capFBasePtr(lu_ret.cap);

    ksUserLogBuffer = pptr_to_paddr((void *) frame_pptr);

    /* Build a global, kernel-only, write-through large-page entry for the
     * buffer's physical address.
     * NOTE(review): VMKernelOnly is passed for the 'pat' field — this relies
     * on its numeric value; confirm it encodes the intended PAT bit. */
    pde_t pde = pde_pde_large_new(
                    0,                 /* xd                   */
                    ksUserLogBuffer,   /* page_base_address    */
                    VMKernelOnly,      /* pat                  */
                    1,                 /* global               */
                    0,                 /* dirty                */
                    0,                 /* accessed             */
                    0,                 /* cache_disabled       */
                    1,                 /* write_through        */
                    1,                 /* super_user           */
                    1,                 /* read_write           */
                    1                  /* present              */
                );

    /* Stored in the PD slot after the device page table */
#ifdef CONFIG_HUGE_PAGE
    x64KSKernelPD[1] = pde;
#else
    x64KSKernelPDs[BIT(PDPT_INDEX_BITS) - 1][1] = pde;
#endif
    /* Flush stale translations on every node so the new mapping is visible. */
    invalidateTranslationAll(MASK(CONFIG_MAX_NUM_NODES));

    return EXCEPTION_NONE;
}
1685#endif /* CONFIG_KERNEL_LOG_BUFFER */
1686