/**
 * \file
 * \brief pmap management
 *
 * x86_64-specific management of page tables
 *
 * Warning: This code is coupled with the code in slot_alloc/ and pinned.c
 *
 * The maximum number of slots required to map a BASE_PAGE_SIZE-sized
 * page is the number of page table levels + 1.
 * The sum for x86_64 is 4.
 *
 * Warning: Additional slots will be required to map a BASE_PAGE_SIZE-sized
 * page if we also track the actual frames that are mapped.
 * Currently this is not the case.
 */

/*
 * Copyright (c) 2009-2013 ETH Zurich.
 * Copyright (c) 2014 HP Labs.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <barrelfish/barrelfish.h>
#include <barrelfish/dispatch.h>
#include "target/x86/pmap_x86.h"
#include <stdio.h>
#include <barrelfish/cap_predicates.h>
#include <pmap_priv.h>
#include <pmap_ds.h> // pull in selected pmap datastructure implementation

// For tracing
#include <trace/trace.h>
#include <trace_definitions/trace_defs.h>

/**
 * \brief Translate generic vregion flags to architecture specific pmap flags
 */
static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
        else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) {
            // the PTE's PAT bit selects PAT entry 4 (PA4), which is configured as write-combining
            pmap_flags |= PTABLE_ATTR_INDEX;
        }
    }

    return pmap_flags;
}
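
/*
 * Example (illustrative, not from the original source): with the translation
 * above, VREGION_FLAGS_READ_WRITE yields a user-accessible, writable,
 * execute-disabled PTE; VREGION_FLAGS_EXECUTE additionally clears the
 * execute-disable bit, while VREGION_FLAGS_GUARD ignores all permission flags
 * and keeps only the defaults.
 */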

// returns whether va1 and va2 share a page directory entry
// not using X86_64_PDIR_BASE() macro as this would give false positives (same
// entry in different directories)
static inline bool is_same_pdir(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_LARGE_PAGE_BITS) == ((va2-1)>>X86_64_LARGE_PAGE_BITS);
}
// returns whether va1 and va2 share a page directory pointer table entry
static inline bool is_same_pdpt(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_HUGE_PAGE_BITS) == ((va2-1)>>X86_64_HUGE_PAGE_BITS);
}
// returns whether va1 and va2 share a page map level 4 entry
static inline bool is_same_pml4(genvaddr_t va1, genvaddr_t va2)
{
    // the base macros work here as we only have one pml4.
    return X86_64_PML4_BASE(va1) == X86_64_PML4_BASE(va2-1);
}
// size indicates how many bits to shift
static inline genvaddr_t get_addr_prefix(genvaddr_t va, uint8_t size)
{
    return va >> size;
}
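// Note (added for clarity): callers pass `size' as page bits + table bits
// (e.g. X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS), so the returned prefix
// identifies the leaf page table covering `va'; two addresses with equal
// prefixes are served by the same leaf table.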

static inline bool is_large_page(struct vnode *p)
{
    return !p->v.is_vnode && p->v.u.frame.flags & VREGION_FLAGS_LARGE;
}

static inline bool is_huge_page(struct vnode *p)
{
    return !p->v.is_vnode && p->v.u.frame.flags & VREGION_FLAGS_HUGE;
}

/**
 * \brief Returns the vnode for the pdpt mapping a given vspace address
 */
errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdpt);
errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdpt)
{
    errval_t err;
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    if((*pdpt = pmap_find_vnode(root, X86_64_PML4_BASE(base))) == NULL) {
        enum objtype type = type_is_ept(pmap->root.v.type) ?
            ObjType_VNode_x86_64_ept_pdpt :
            ObjType_VNode_x86_64_pdpt;
        err = alloc_vnode(pmap, root, type, X86_64_PML4_BASE(base),
                pdpt, base);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdpt = pmap_find_vnode(root, X86_64_PML4_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdpt");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address
 */
errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdir);
errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdir)
{
    errval_t err;
    struct vnode *pdpt;
    err = get_pdpt(pmap, base, &pdpt);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdpt != NULL);

    // PDPT mapping
    if((*pdir = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base))) == NULL) {
        enum objtype type = type_is_ept(pmap->root.v.type) ?
            ObjType_VNode_x86_64_ept_pdir :
            ObjType_VNode_x86_64_pdir;
        err = alloc_vnode(pmap, pdpt, type,
                            X86_64_PDPT_BASE(base), pdir, base);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdir = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdir");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address
 */
errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                                   struct vnode **ptable);
errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                                  struct vnode **ptable)
{
    errval_t err;
    struct vnode *pdir;
    err = get_pdir(pmap, base, &pdir);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdir != NULL);

    // PDIR mapping
    if((*ptable = pmap_find_vnode(pdir, X86_64_PDIR_BASE(base))) == NULL) {
        enum objtype type = type_is_ept(pmap->root.v.type) ?
            ObjType_VNode_x86_64_ept_ptable :
            ObjType_VNode_x86_64_ptable;
        err = alloc_vnode(pmap, pdir, type,
                            X86_64_PDIR_BASE(base), ptable, base);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*ptable = pmap_find_vnode(pdir, X86_64_PDIR_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for ptable");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page directory pointer table mapping for a
 * given vspace address
 */
static inline struct vnode *find_pdpt(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PDPT mapping
    return pmap_find_vnode(root, X86_64_PML4_BASE(base));
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address, without performing allocations as get_pdir() does
 */
static inline struct vnode *find_pdir(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdpt = find_pdpt(pmap, base);

    if (pdpt) {
        // PDPT mapping
        return pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base));
    } else {
        return NULL;
    }
}

/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address,
 * without performing allocations as get_ptable() does
 */
static inline struct vnode *find_ptable(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdir = find_pdir(pmap, base);

    if (pdir) {
        // PDIR mapping
        return pmap_find_vnode(pdir, X86_64_PDIR_BASE(base));
    } else {
        return NULL;
    }
}

// TODO: documentation for this feature! -SG,2018-10-18
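// Best-effort description (not from the original author): ALL_THE_VNODES
// appears to be an optional, externally provided array that, when set,
// records every leaf page table used for 4K mappings (see do_single_map()
// below), presumably for instrumentation or debugging.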
size_t ALL_THE_VNODES_MAX_ENTRIES = (15*4096);
struct vnode **ALL_THE_VNODES = NULL;
size_t all_the_vnodes_cnt = 0;

static errval_t do_single_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                              genvaddr_t vend, struct capref frame,
                              size_t offset, size_t pte_count,
                              vregion_flags_t flags)
{
    if (pte_count == 0) {
        debug_printf("do_single_map: pte_count == 0, called from %p\n",
                __builtin_return_address(0));
        return SYS_ERR_OK;
    }
    assert(pte_count > 0);
    // translate flags
    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);

    // Get the paging structure and set paging relevant parameters
    struct vnode *ptable = NULL;
    errval_t err;
    size_t table_base;

    // get the right paging table and address part
    if (flags & VREGION_FLAGS_LARGE) {
        //large 2M pages, mapped into pdir
        err = get_pdir(pmap, vaddr, &ptable);
        table_base = X86_64_PDIR_BASE(vaddr);
    } else if (flags & VREGION_FLAGS_HUGE) {
        //huge 1GB pages, mapped into pdpt
        err = get_pdpt(pmap, vaddr, &ptable);
        table_base = X86_64_PDPT_BASE(vaddr);
    } else {
        //normal 4K pages, mapped into ptable
        err = get_ptable(pmap, vaddr, &ptable);
        table_base = X86_64_PTABLE_BASE(vaddr);
        if (ALL_THE_VNODES && (all_the_vnodes_cnt+1) < ALL_THE_VNODES_MAX_ENTRIES) {
            ALL_THE_VNODES[all_the_vnodes_cnt++] = ptable;
        }
    }
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
    }
    assert(ptable->v.is_vnode);

    // check if there is an overlapping mapping
    if (has_vnode(ptable, table_base, pte_count, false)) {
        if (has_vnode(ptable, table_base, pte_count, true)) {
            printf("page already exists in 0x%"
                    PRIxGENVADDR"--0x%"PRIxGENVADDR"\n", vaddr, vend);
            return LIB_ERR_PMAP_EXISTING_MAPPING;
        } else {
            // Only (empty) page tables overlap the range, not actual mappings:
            // clean them out now. We defer this cleanup until the tables are
            // in the way, because keeping page tables around pays off when
            // doing lots of small mappings.
            remove_empty_vnodes(pmap, ptable, table_base, pte_count);
        }
    }

    // setup userspace mapping
    struct vnode *page = slab_alloc(&pmap->p.m.slab);
    assert(page);
    page->v.is_vnode = false;
    page->is_cloned = false;
    page->v.entry = table_base;
    page->v.cap = frame;
    page->v.u.frame.offset = offset;
    page->v.u.frame.flags = flags;
    page->v.u.frame.pte_count = pte_count;
    page->u.frame.vaddr = vaddr;
    page->u.frame.cloned_count = 0;

    // only insert after vnode fully initialized
    pmap_vnode_insert_child(ptable, page);

    set_mapping_cap(&pmap->p, page, ptable, table_base);
    pmap->used_cap_slots ++;

    // do map
    assert(!capref_is_null(ptable->v.u.vnode.invokable));
    assert(!capref_is_null(page->v.mapping));
    err = vnode_map(ptable->v.u.vnode.invokable, frame, table_base,
                    pmap_flags, offset, pte_count, page->v.mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_MAP);
    }

    return SYS_ERR_OK;
}

/**
 * \brief Called when enough slabs exist for the given mapping
 */
errval_t do_map(struct pmap *pmap_gen, genvaddr_t vaddr,
                struct capref frame, size_t offset, size_t size,
                vregion_flags_t flags, size_t *retoff, size_t *retsize)
{
    struct pmap_x86 *pmap = (struct pmap_x86 *)pmap_gen;
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 0);
    errval_t err;

    // determine page size and relevant address part
    size_t page_size  = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits  = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    bool debug_out    = false;

    // get base address and size of frame
    struct frame_identity fi;
    err = cap_identify_mappable(frame, &fi);
    if (err_is_fail(err)) {
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
        return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY);
    }

    if ((flags & VREGION_FLAGS_HUGE) &&
        (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
        fi.bytes >= X86_64_HUGE_PAGE_SIZE &&
        ((fi.base & X86_64_HUGE_PAGE_MASK) == 0))
    {
        // huge page branch (1GB)
        page_size  = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits   = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out  = false;
        // remove large flag, if we're doing huge mapping
        flags     &= ~VREGION_FLAGS_LARGE;
    } else if ((flags & VREGION_FLAGS_LARGE) &&
               (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
               fi.bytes >= X86_64_LARGE_PAGE_SIZE &&
               ((fi.base & X86_64_LARGE_PAGE_MASK) == 0))
    {
        // large page branch (2MB)
        page_size  = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits   = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out  = false;
    } else {
        // remove large/huge flags
        flags &= ~(VREGION_FLAGS_LARGE|VREGION_FLAGS_HUGE);
    }

    // round to the next full page and calculate end address and #ptes
    size = ROUND_UP(size, page_size);
    size_t pte_count = DIVIDE_ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    if (offset+size > fi.bytes) {
        debug_printf("do_map: offset=%zu; size=%zu; frame size=%zu\n",
                offset, size, fi.bytes);
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
        return LIB_ERR_PMAP_FRAME_SIZE;
    }

#if 0
    if (true || debug_out) {
        genpaddr_t paddr = fi.base + offset;

        debug_printf("do_map: 0x%"
                PRIxGENVADDR"--0x%"PRIxGENVADDR" -> 0x%"PRIxGENPADDR
                "; pte_count = %zd; frame bytes = 0x%zx; page size = 0x%zx\n",
                vaddr, vend, paddr, pte_count, fi.bytes, page_size);
    }
#endif

    // does the whole mapping fit within one leaf page table?
    if (is_same_pdir(vaddr, vend) ||
        (flags & VREGION_FLAGS_LARGE && is_same_pdpt(vaddr, vend)) ||
        (flags & VREGION_FLAGS_HUGE && is_same_pml4(vaddr, vend))) {
        // fast path
        if (debug_out) {
            debug_printf("  do_map: fast path: %zd\n", pte_count);
        }
        err = do_single_map(pmap, vaddr, vend, frame, offset, pte_count, flags);
        if (err_is_fail(err)) {
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }
    }
    else { // multiple leaf page tables
        // first leaf
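        // c = number of entries from table_base up to the end of this leaf table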
        uint32_t c = X86_64_PTABLE_SIZE - table_base;
        if (debug_out) {
            debug_printf("  do_map: slow path: first leaf %"PRIu32"\n", c);
        }
        genvaddr_t temp_end = vaddr + c * page_size;
        err = do_single_map(pmap, vaddr, temp_end, frame, offset, c, flags);
        if (err_is_fail(err)) {
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        // map full leaves
        while (get_addr_prefix(temp_end, map_bits) <
                get_addr_prefix(vend, map_bits))
        {
            // update vars
            vaddr = temp_end;
            temp_end = vaddr + X86_64_PTABLE_SIZE * page_size;
            offset += c * page_size;
            c = X86_64_PTABLE_SIZE;

            // do mapping
            if (debug_out) {
                debug_printf("  do_map: slow path: full leaf\n");
            }
            err = do_single_map(pmap, vaddr, temp_end, frame, offset,
                    X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }

        // map remaining part
        offset += c * page_size;

        // calculate remaining pages (subtract ptable bits from map_bits to
        // get #ptes of last-level instead of 2nd-to-last).
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(temp_end, map_bits-X86_64_PTABLE_BITS);

        if (c) {
            // do mapping
            if (debug_out) {
                debug_printf("do_map: slow path: last leaf %"PRIu32"\n", c);
            }
            err = do_single_map(pmap, temp_end, vend, frame, offset, c, flags);
            if (err_is_fail(err)) {
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }
    }

    if (retoff) {
        *retoff = offset;
    }
    if (retsize) {
        *retsize = size;
    }

    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1);
    return SYS_ERR_OK;
}

/// Compute an upper limit on the number of slabs required to perform a mapping
static size_t max_slabs_for_mapping(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_BASE_PAGE_SIZE);
    size_t max_ptable = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdir   = DIVIDE_ROUND_UP(max_ptable, X86_64_PTABLE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    // Worst case, our mapping spans over two pdpts
    return 2 * (max_ptable + max_pdir + max_pdpt);
}
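// Worked example (added for illustration): for a 4 MiB request with 4 KiB
// pages, max_pages = 1024, max_ptable = 2, max_pdir = 1, max_pdpt = 1, so the
// bound is 2 * (2 + 1 + 1) = 8 slab-allocated vnodes.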

static size_t max_slabs_for_mapping_large(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_LARGE_PAGE_SIZE);
    size_t max_pdir   = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    // Worst case, our mapping spans over two pdpts
    return 2 * (max_pdir + max_pdpt);
}

static size_t max_slabs_for_mapping_huge(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_HUGE_PAGE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    // Worst case, our mapping spans over two pdpts
    return 2 * max_pdpt;
}

size_t max_slabs_required(size_t bytes)
{
    return max_slabs_for_mapping(bytes);
}

/**
 * \brief Create page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The virtual address to create the mapping for
 * \param frame    The frame cap to map in
 * \param offset   Offset into the frame cap
 * \param size     Size of the mapping
 * \param flags    Flags for the mapping
 * \param retoff   If non-NULL, filled in with adjusted offset of mapped region
 * \param retsize  If non-NULL, filled in with adjusted size of mapped region
 */
static errval_t map(struct pmap *pmap, genvaddr_t vaddr, struct capref frame,
                    size_t offset, size_t size, vregion_flags_t flags,
                    size_t *retoff, size_t *retsize)
{
    errval_t err;

    struct capability cap;
    err = cap_direct_identify(frame, &cap);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY);
    }
    struct frame_identity fi;
    fi.base = get_address(&cap);
    fi.bytes = get_size(&cap);

    size_t max_slabs;
    // Adjust the parameters to page boundaries
    // TODO: overestimating needed slabs shouldn't hurt much in the long run,
    // and would keep the code easier to read and possibly faster due to less
    // branching
    if ((flags & VREGION_FLAGS_LARGE) &&
        (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
        (fi.base & X86_64_LARGE_PAGE_MASK) == 0 &&
        fi.bytes >= offset+size) {
        //case large pages (2MB)
        size   += LARGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, LARGE_PAGE_SIZE);
        offset -= LARGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_large(size);
    } else if ((flags & VREGION_FLAGS_HUGE) &&
               (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
               (fi.base & X86_64_HUGE_PAGE_MASK) == 0 &&
               fi.bytes >= offset+size) {
        // case huge pages (1GB)
        size   += HUGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, HUGE_PAGE_SIZE);
        offset -= HUGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_huge(size);
    } else {
        //case normal pages (4KB)
        size   += BASE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, BASE_PAGE_SIZE);
        offset -= BASE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping(size);
    }

    max_slabs += 6; // minimum amount required to map a region spanning 2 ptables

    err = pmap_refill_slabs(pmap, max_slabs);
    if (err_is_fail(err)) {
        return err;
    }

    err = do_map(pmap, vaddr, frame, offset, size, flags, retoff, retsize);
    return err;
}
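
#if 0
/* Illustrative usage sketch (added for documentation, not compiled): mapping
 * 16 KiB of a frame capability read-write through the generic pmap interface.
 * `example_map_16k', `frame' and `va' are placeholder names. */
static void example_map_16k(struct pmap *pmap, struct capref frame, genvaddr_t va)
{
    size_t retsize;
    errval_t err = pmap->f.map(pmap, va, frame, /*offset*/ 0, /*size*/ 16 * 1024,
                               VREGION_FLAGS_READ_WRITE, NULL, &retsize);
    assert(err_is_ok(err));
}
#endif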
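/**
 * \brief Result of find_mapping(): the leaf page table vnode and page vnode
 * for a mapping, plus the page size, starting slot in the leaf table, and
 * "map bits" (page bits + table bits) for the mapping's granularity.
 */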
struct find_mapping_info {
    struct vnode *page_table;
    struct vnode *page;
    size_t page_size;
    size_t table_base;
    uint8_t map_bits;
};

/**
 * \brief Find mapping for `vaddr` in `pmap`.
 * \arg pmap the pmap to search in
 * \arg vaddr the virtual address to search for
 * \arg info if non-NULL, filled in with the page table and page meta-data we
 *      found (if any), plus the mapping's granularity
 * \returns `true` iff we found a mapping for vaddr
 */
static bool find_mapping(struct pmap_x86 *pmap, genvaddr_t vaddr,
                         struct find_mapping_info *info)
{
    struct vnode *pdpt = NULL, *pdir = NULL, *pt = NULL, *page = NULL;

    size_t page_size = 0;
    size_t table_base = 0;
    uint8_t map_bits = 0;

    // find page and last-level page table (can be pdir or pdpt)
    if ((pdpt = find_pdpt(pmap, vaddr)) != NULL) {
        page = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(vaddr));
        if (page && page->v.is_vnode) { // not 1G pages
            pdir = page;
            page = pmap_find_vnode(pdir, X86_64_PDIR_BASE(vaddr));
            if (page && page->v.is_vnode) { // not 2M pages
                pt = page;
                page = pmap_find_vnode(pt, X86_64_PTABLE_BASE(vaddr));
                page_size = X86_64_BASE_PAGE_SIZE;
                table_base = X86_64_PTABLE_BASE(vaddr);
                map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
            } else if (page) {
                assert(is_large_page(page));
                pt = pdir;
                page_size = X86_64_LARGE_PAGE_SIZE;
                table_base = X86_64_PDIR_BASE(vaddr);
                map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
            }
        } else if (page) {
            assert(is_huge_page(page));
            pt = pdpt;
            page_size = X86_64_HUGE_PAGE_SIZE;
            table_base = X86_64_PDPT_BASE(vaddr);
            map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        }
    }
    if (info) {
        info->page_table = pt;
        info->page = page;
        info->page_size = page_size;
        info->table_base = table_base;
        info->map_bits = map_bits;
    }
    if (pt && page) {
        return true;
    } else {
        return false;
    }
}

static errval_t do_single_unmap(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                size_t pte_count)
{
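    /* Note (added for clarity): this helper only tears down the mapping when
     * `pte_count' matches the recorded pte_count of the whole mapping;
     * otherwise it leaves the mapping in place and returns SYS_ERR_OK. */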
    errval_t err;
    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }
    assert(info.page_table && info.page_table->v.is_vnode && info.page && !info.page->v.is_vnode);

    if (info.page->v.u.frame.pte_count == pte_count) {
        err = vnode_unmap(info.page_table->v.cap, info.page->v.mapping);
        if (err_is_fail(err)) {
            debug_printf("vnode_unmap returned error: %s (%d)\n",
                    err_getstring(err), err_no(err));
            return err_push(err, LIB_ERR_VNODE_UNMAP);
        }

        // delete&free page->v.mapping after doing vnode_unmap()
        err = cap_delete(info.page->v.mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_CAP_DELETE);
        }
#ifndef GLOBAL_MCN
        err = pmap->p.slot_alloc->free(pmap->p.slot_alloc, info.page->v.mapping);
        if (err_is_fail(err)) {
            debug_printf("do_single_unmap: slot_free (mapping): %s\n",
                    err_getstring(err));
        }
#endif
        assert(pmap->used_cap_slots > 0);
        pmap->used_cap_slots --;
        // Free up the resources
        pmap_remove_vnode(info.page_table, info.page);
        slab_free(&pmap->p.m.slab, info.page);
    }

    return SYS_ERR_OK;
}

/**
 * \brief Remove page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The start of the virtual region to remove
 * \param size     The size of virtual region to remove
 * \param retsize  If non-NULL, filled in with the actual size removed
 */
static errval_t unmap(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                      size_t *retsize)
{
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 0);
    //printf("[unmap] 0x%"PRIxGENVADDR", %zu\n", vaddr, size);
    errval_t err, ret = SYS_ERR_OK;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    // determine whether we are unmapping a large or huge page
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        //TODO: better error --> LIB_ERR_PMAP_NOT_MAPPED
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
        return LIB_ERR_PMAP_UNMAP;
    }

    assert(!info.page->v.is_vnode);

    if (info.page->v.entry > info.table_base) {
        debug_printf("trying to partially unmap region\n");
        // XXX: error code
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page)))
    {
        // fast path
        err = do_single_unmap(x86, vaddr, size / info.page_size);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error fast path\n");
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }
    }
    else { // slow path
        // unmap first leaf
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;

        err = do_single_unmap(x86, vaddr, c);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error first leaf\n");
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }

        // unmap full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_unmap(x86, vaddr, X86_64_PTABLE_SIZE);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error while loop\n");
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
            vaddr += c * info.page_size;
        }

        // unmap remaining part
        // subtracting ptable bits from map_bits to get #ptes in last-level table
        // instead of 2nd-to-last.
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        assert(c < X86_64_PTABLE_SIZE);
        if (c) {
            err = do_single_unmap(x86, vaddr, c);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error remaining part\n");
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[unmap] exiting\n");
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1);
    return ret;
}

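/*
 * TLB-flush strategy knob for do_single_modify_flags() (description derived
 * from the code below): 0 = default, selective flush only for single-page
 * modifications; 1 = always let the kernel compute a selective flush;
 * 2 = always pass the first affected VA as a flush hint; 3 = always do a full
 * TLB flush.
 */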
int pmap_selective_flush = 0;
static errval_t do_single_modify_flags(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                       size_t pages, vregion_flags_t flags)
{
    errval_t err = SYS_ERR_OK;

    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    assert(info.page_table && info.page_table->v.is_vnode && info.page && !info.page->v.is_vnode);
    assert(pages <= PTABLE_SIZE);

    if (pmap_inside_region(info.page_table, info.table_base, pages)) {
        // we're modifying part of a valid mapped region
        // arguments to invocation: invoke frame cap, first affected
        // page (as offset from first page in mapping), #affected
        // pages, new flags. Invocation mask flags based on capability
        // access permissions.
        size_t off = info.table_base - info.page->v.entry;
        paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);
        // calculate TLB flushing hint
        genvaddr_t va_hint = 0;
        if (pmap_selective_flush == 3) {
            // always do full flush
            va_hint = 0;
        } else if (pmap_selective_flush == 2) {
            // always do assisted selective flush
            va_hint = vaddr & ~(info.page_size - 1);
        } else if (pmap_selective_flush == 1) {
            // always do computed selective flush
            va_hint = 1;
        } else {
            /*
             * default strategy is to only use selective flushing for single page
             */
            if (pages == 1) {
                // do assisted selective flush for single page
                va_hint = vaddr & ~(info.page_size - 1);
            }
        }
        err = invoke_mapping_modify_flags(info.page->v.mapping, off, pages,
                                          pmap_flags, va_hint);
        return err;
    } else {
        // overlaps some region border
        // XXX: need better error
        return LIB_ERR_PMAP_EXISTING_MAPPING;
    }

    return SYS_ERR_OK;
}


/**
 * \brief Modify page mapping
 *
 * \param pmap     The pmap object
 * \param vaddr    The first virtual address for which to change the flags
 * \param size     The length of the region to change in bytes
 * \param flags    New flags for the mapping
 * \param retsize  If non-NULL, filled in with the actual size modified
 */
static errval_t modify_flags(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                             vregion_flags_t flags, size_t *retsize)
{
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 0);
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    // determine the page size (granularity) of the existing mapping
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
        return LIB_ERR_PMAP_NOT_MAPPED;
    }

    assert(info.page && !info.page->v.is_vnode);
    // XXX: be more graceful about size == 0? -SG, 2017-11-28.
    assert(size > 0);

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    size_t pages = size / info.page_size;

    // vaddr and vend specify begin and end of the region (inside a mapping)
    // that should receive the new set of flags
    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page))) {
        // fast path
        assert(pages <= PTABLE_SIZE);
        err = do_single_modify_flags(x86, vaddr, pages, flags);
        if (err_is_fail(err)) {
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }
    }
    else { // slow path
        // modify first part
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;
        assert(c <= PTABLE_SIZE);
        err = do_single_modify_flags(x86, vaddr, c, flags);
        if (err_is_fail(err)) {
            trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }

        // modify full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_modify_flags(x86, vaddr, X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
            vaddr += c * info.page_size;
        }

        // modify remaining part
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
                get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        if (c) {
            assert(c <= PTABLE_SIZE);
            err = do_single_modify_flags(x86, vaddr, c, flags);
            if (err_is_fail(err)) {
                trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[modify_flags] exiting\n");
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1);
    return SYS_ERR_OK;
}

/**
 * \brief Query an existing page mapping
 *
 * \param pmap     The pmap object
 * \param vaddr    The virtual address to query
 * \param info     If non-NULL, filled in with the mapping's base virtual
 *                 address, size, mapped cap, offset within that cap, flags,
 *                 and mapping cap
 */
static errval_t lookup(struct pmap *pmap, genvaddr_t vaddr,
                       struct pmap_mapping_info *info)
{
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 0);
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct find_mapping_info find_info;
    bool found = find_mapping(x86, vaddr, &find_info);

    if (!found) {
        trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 1);
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    if (info) {
        info->vaddr = find_info.page->u.frame.vaddr;
        info->size = find_info.page_size * find_info.page->v.u.frame.pte_count;
        info->cap = find_info.page->v.cap;
        info->offset = find_info.page->v.u.frame.offset;
        info->flags = find_info.page->v.u.frame.flags;
        info->mapping = find_info.page->v.mapping;
    }
    trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 1);
    return SYS_ERR_OK;
}


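/**
 * \brief Dump the pmap's mapping meta-data into `buf' (up to `buflen'
 * entries); `items_written' returns the number of entries filled in. Two
 * implementations follow, one per pmap data-structure flavour (linked list
 * vs. array).
 */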
#if defined(PMAP_LL)
static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;
    struct pmap_dump_info *buf_ = buf;

    struct vnode *pml4 = &x86->root;
    struct vnode *pdpt, *pdir, *pt, *frame;
    assert(pml4 != NULL);

    *items_written = 0;

    // iterate over PML4 entries
    size_t pml4_index, pdpt_index, pdir_index;
    for (pdpt = pml4->v.u.vnode.children; pdpt != NULL; pdpt = pdpt->v.meta.next) {
        pml4_index = pdpt->v.entry;
        // iterate over pdpt entries
        for (pdir = pdpt->v.u.vnode.children; pdir != NULL; pdir = pdir->v.meta.next) {
            pdpt_index = pdir->v.entry;
            // iterate over pdir entries
            for (pt = pdir->v.u.vnode.children; pt != NULL; pt = pt->v.meta.next) {
                pdir_index = pt->v.entry;
                // iterate over pt entries
                for (frame = pt->v.u.vnode.children; frame != NULL; frame = frame->v.meta.next) {
                    if (*items_written < buflen) {
                        buf_->pml4_index = pml4_index;
                        buf_->pdpt_index = pdpt_index;
                        buf_->pdir_index = pdir_index;
                        buf_->pt_index = frame->v.entry;
                        buf_->cap = frame->v.cap;
                        buf_->offset = frame->v.u.frame.offset;
                        buf_->flags = frame->v.u.frame.flags;
                        buf_++;
                        (*items_written)++;
                    }
                }
            }
        }
    }
    return SYS_ERR_OK;
}
#elif defined(PMAP_ARRAY)
static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;
    struct pmap_dump_info *buf_ = buf;

    struct vnode *pml4 = &x86->root;
    struct vnode *pdpt, *pdir, *pt, *frame;
    assert(pml4 != NULL);

    *items_written = 0;

    // iterate over PML4 entries
    size_t pml4_index, pdpt_index, pdir_index, pt_index;
    for (pml4_index = 0; pml4_index < X86_64_PTABLE_SIZE; pml4_index++) {
        if (!(pdpt = pml4->v.u.vnode.children[pml4_index])) {
            // skip empty entries
            continue;
        }
        // iterate over pdpt entries
        for (pdpt_index = 0; pdpt_index < X86_64_PTABLE_SIZE; pdpt_index++) {
            if (!(pdir = pdpt->v.u.vnode.children[pdpt_index])) {
                // skip empty entries
                continue;
            }
            // iterate over pdir entries
            for (pdir_index = 0; pdir_index < X86_64_PTABLE_SIZE; pdir_index++) {
                if (!(pt = pdir->v.u.vnode.children[pdir_index])) {
                    // skip empty entries
                    continue;
                }
                // iterate over pt entries
                for (pt_index = 0; pt_index < X86_64_PTABLE_SIZE; pt_index++) {
                    if (!(frame = pt->v.u.vnode.children[pt_index])) {
                        // skip empty entries
                        continue;
                    }
                    if (*items_written < buflen) {
                        buf_->pml4_index = pml4_index;
                        buf_->pdpt_index = pdpt_index;
                        buf_->pdir_index = pdir_index;
                        buf_->pt_index   = pt_index;
                        buf_->cap = frame->v.cap;
                        buf_->offset = frame->v.u.frame.offset;
                        buf_->flags = frame->v.u.frame.flags;
                        buf_++;
                        (*items_written)++;
                    }
                }
            }
        }
    }
    return SYS_ERR_OK;
}
#else
#error Invalid pmap datastructure
#endif


/*
 * Creates pinned (preallocated) page tables covering [vaddr, vaddr+bytes) at
 * the granularity implied by flags, and maps each created page table
 * read-only into the pmap's meta-data region so its status bits can be read.
 */
static errval_t create_pts_pinned(struct pmap *pmap, genvaddr_t vaddr, size_t bytes,
                           vregion_flags_t flags)
{
    errval_t err = SYS_ERR_OK;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    size_t pagesize;
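    /* Note (added for clarity): `pagesize' is the span of virtual address
     * space covered by one leaf page table of the requested granularity
     * (2 MB for a ptable, 1 GB for a pdir, 512 GB for a pdpt); it is the step
     * width of the allocation loop below. */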

    /* work out the number of vnodes we may need and grow the slabs */
    size_t max_slabs;
    if ((flags & VREGION_FLAGS_LARGE)) {
        assert(!(vaddr & (LARGE_PAGE_SIZE -1)));
        assert(!(bytes & (LARGE_PAGE_SIZE -1)));
        pagesize = HUGE_PAGE_SIZE;
        max_slabs = max_slabs_for_mapping_huge(bytes);
    } else if ((flags & VREGION_FLAGS_HUGE)) {
        // case huge pages (1GB)
        assert(!(vaddr & (HUGE_PAGE_SIZE -1)));
        assert(!(bytes & (HUGE_PAGE_SIZE -1)));
        pagesize = HUGE_PAGE_SIZE * 512UL;
        max_slabs = (bytes / HUGE_PAGE_SIZE) + 1;
    } else {
        //case normal pages (4KB)
        assert(!(vaddr & (BASE_PAGE_SIZE -1)));
        assert(!(bytes & (BASE_PAGE_SIZE -1)));
        pagesize = LARGE_PAGE_SIZE;
        max_slabs = max_slabs_for_mapping_large(bytes);
    }

    max_slabs += 6; // minimum amount required to map a region spanning 2 ptables

    // Refill slab allocator if necessary
    err = pmap_refill_slabs(pmap, max_slabs);
    if (err_is_fail(err)) {
        return err;
    }

    /* do the actual creation of the page tables */
    for (size_t va = vaddr; va < (vaddr + bytes); va += pagesize) {
        struct vnode *vnode;
        if ((flags & VREGION_FLAGS_LARGE)) {
            err = get_pdir(x86, va, &vnode);
        } else if ((flags & VREGION_FLAGS_HUGE)) {
            err = get_pdpt(x86, va, &vnode);
        } else {
            err = get_ptable(x86, va, &vnode);
        }
        if (err_is_fail(err)) {
            return err;
        }

        /* map the page-table read only for access to status bits */
        genvaddr_t genvaddr = pmap->m.vregion_offset;
        pmap->m.vregion_offset += (genvaddr_t)4096;

        assert(pmap->m.vregion_offset < vregion_get_base_addr(&pmap->m.vregion) +
               vregion_get_size(&pmap->m.vregion));

        /* copy the page-table capability */
        /* XXX: this should be somewhere in struct vnode */
        struct capref slot;
        err = x86->p.slot_alloc->alloc(x86->p.slot_alloc, &slot);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_SLOT_ALLOC);
        }

        err = cap_copy(slot, vnode->v.cap);
        if (err_is_fail(err)) {
            x86->p.slot_alloc->free(x86->p.slot_alloc, slot);
            return err;
        }

        /* get slot for mapping */
        /* XXX: this should be in struct vnode somewhere! */
        struct capref mapping;
        err = x86->p.slot_alloc->alloc(x86->p.slot_alloc, &mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_SLOT_ALLOC);
        }

        /* get the page table of the reserved range and map the PT */
        struct vnode *ptable;
        err = get_ptable(x86, genvaddr, &ptable);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
        }
        err = vnode_map(ptable->v.cap, slot, X86_64_PTABLE_BASE(genvaddr),
                        vregion_to_pmap_flag(VREGION_FLAGS_READ), 0, 1, mapping);

        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        /* update the vnode structure */
        vnode->is_pinned = 1;
        vnode->u.vnode.virt_base = genvaddr;
    }

    return err;
}


/*
 * returns the virtual address of the leaf pagetable for a mapping
 */
static errval_t get_leaf_pt(struct pmap *pmap, genvaddr_t vaddr, lvaddr_t *ret_va)
{
    assert(ret_va);

    /* walk down the pt hierarchy and stop at the leaf */

    struct vnode *parent = NULL, *current = NULL;
    // find page and last-level page table (can be pdir or pdpt)
    if ((current = find_pdpt((struct pmap_x86 *)pmap, vaddr)) == NULL) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    parent = current;
    if ((current = pmap_find_vnode(parent, X86_64_PDPT_BASE(vaddr))) == NULL) {
        current = parent;
        goto out;
    }

    parent = current;
    if ((current = pmap_find_vnode(parent, X86_64_PDIR_BASE(vaddr))) == NULL) {
        current = parent;
        goto out;
    }

out:
    assert(current && current->v.is_vnode);

    *ret_va = current->u.vnode.virt_base;
    return SYS_ERR_OK;
}

static errval_t determine_addr_raw(struct pmap *pmap, size_t size,
                                   size_t alignment, genvaddr_t *retvaddr)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    if (alignment == 0) {
        alignment = BASE_PAGE_SIZE;
    } else {
        alignment = ROUND_UP(alignment, BASE_PAGE_SIZE);
    }
    size = ROUND_UP(size, alignment);
    assert(size < 512ul * 1024 * 1024 * 1024); // must fit in one pml4 entry (512 GB)

#if defined(PMAP_LL)
    struct vnode *walk_pml4 = x86->root.v.u.vnode.children;
    assert(walk_pml4 != NULL); // assume there's always at least one existing entry

    // try to find free pml4 entry
    bool f[512];
    for (int i = 0; i < 512; i++) {
        f[i] = true;
    }
    //debug_printf("entry: %d\n", walk_pml4->entry);
    f[walk_pml4->v.entry] = false;
    while (walk_pml4) {
        //debug_printf("looping over pml4 entries\n");
        assert(walk_pml4->v.is_vnode);
        f[walk_pml4->v.entry] = false;
        walk_pml4 = walk_pml4->v.meta.next;
    }
    genvaddr_t first_free = 16;
    for (; first_free < 512; first_free++) {
        //debug_printf("f[%"PRIuGENVADDR"] = %d\n", first_free, f[first_free]);
        if (f[first_free]) {
            break;
        }
    }
#elif defined(PMAP_ARRAY)
    genvaddr_t first_free = 16;
    for (; first_free < X86_64_PTABLE_SIZE; first_free++) {
        if (!x86->root.v.u.vnode.children[first_free]) {
            break;
        }
    }
#else
#error Invalid pmap datastructure
#endif
    //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
    if (first_free < X86_64_PTABLE_SIZE) {
        //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
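        // each PML4 entry covers 2^39 bytes (512 GB) of virtual address
        // space, hence the shift by 39 below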
        *retvaddr = first_free << 39;
        return SYS_ERR_OK;
    } else {
        return LIB_ERR_OUT_OF_VIRTUAL_ADDR;
    }
}

static struct pmap_funcs pmap_funcs = {
    .determine_addr = pmap_x86_determine_addr,
    .determine_addr_raw = determine_addr_raw,
    .map = map,
    .unmap = unmap,
    .lookup = lookup,
    .modify_flags = modify_flags,
    .serialise = pmap_serialise,
    .deserialise = pmap_deserialise,
    .dump = dump,
    .create_pts_pinned = create_pts_pinned,
    .get_leaf_pt = get_leaf_pt,
    .measure_res = pmap_x86_measure_res,
};

/**
 * \brief Initialize an x86 pmap object
 *
 * \param pmap Pmap object of type x86
 */
errval_t pmap_x86_64_init(struct pmap *pmap, struct vspace *vspace,
                          struct capref vnode,
                          struct slot_allocator *opt_slot_alloc)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    /* Generic portion */
    pmap->f = pmap_funcs;
    pmap->vspace = vspace;

    if (opt_slot_alloc != NULL) {
        pmap->slot_alloc = opt_slot_alloc;
    } else { /* use default allocator for this dispatcher */
        pmap->slot_alloc = get_default_slot_allocator();
    }
    x86->used_cap_slots = 0;

    errval_t err;
    err = pmap_vnode_mgmt_init(pmap);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_INIT);
    }

    x86->root.v.type = ObjType_VNode_x86_64_pml4;
    x86->root.v.is_vnode          = true;
    x86->root.v.cap       = vnode;
    x86->root.v.u.vnode.invokable = vnode;
    if (get_croot_addr(vnode) != CPTR_ROOTCN) {
        err = slot_alloc(&x86->root.v.u.vnode.invokable);
        assert(err_is_ok(err));
        x86->used_cap_slots ++;
        err = cap_copy(x86->root.v.u.vnode.invokable, vnode);
        assert(err_is_ok(err));
    }
    assert(!capref_is_null(x86->root.v.cap));
    assert(!capref_is_null(x86->root.v.u.vnode.invokable));
    pmap_vnode_init(pmap, &x86->root);
    x86->root.u.vnode.virt_base = 0;
    x86->root.u.vnode.page_table_frame  = NULL_CAP;

#ifdef GLOBAL_MCN
    if (pmap == get_current_pmap()) {
        /*
         * for now, for our own pmap, we use the left over slot allocator cnode to
         * provide the mapping cnode for the first half of the root page table as
         * we cannot allocate CNodes before establishing a connection to the
         * memory server!
         */
        x86->root.u.vnode.mcn[0].cnode = cnode_root;
        x86->root.u.vnode.mcn[0].slot = ROOTCN_SLOT_ROOT_MAPPING;
        x86->root.u.vnode.mcnode[0].croot = CPTR_ROOTCN;
        x86->root.u.vnode.mcnode[0].cnode = ROOTCN_SLOT_ADDR(ROOTCN_SLOT_ROOT_MAPPING);
        x86->root.u.vnode.mcnode[0].level = CNODE_TYPE_OTHER;
    } else {
        err = cnode_create_l2(&x86->root.u.vnode.mcn[0], &x86->root.u.vnode.mcnode[0]);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_CNODE);
        }
    }
#endif

    // choose a minimum mappable VA for most domains; enough to catch NULL
    // pointer derefs with suitably large offsets
    x86->min_mappable_va = 64 * 1024;

    // maximum mappable VA is derived from X86_64_MEMORY_OFFSET in kernel
    x86->max_mappable_va = (genvaddr_t)0xffffff8000000000;

    return SYS_ERR_OK;
}

errval_t pmap_x86_64_init_ept(struct pmap *pmap, struct vspace *vspace,
                              struct capref vnode,
                              struct slot_allocator *opt_slot_alloc)
{
    errval_t err;
    err = pmap_x86_64_init(pmap, vspace, vnode, opt_slot_alloc);
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    x86->root.v.type = ObjType_VNode_x86_64_ept_pml4;

    return err;
}

/**
 * \brief Initialize the current pmap. Reserve space for metadata
 *
 * This code is coupled with #vspace_current_init()
 */
errval_t pmap_x86_64_current_init(bool init_domain)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)get_current_pmap();

    pmap_vnode_mgmt_current_init((struct pmap *)x86);

    // We don't know the vnode layout for the first part of our address space
    // (which was set up by the kernel), so we avoid mapping there until we
    // are told about it.
    x86->min_mappable_va = x86->p.m.vregion.base;

    return SYS_ERR_OK;
}