/**
 * \file
 * \brief pmap management
 *
 * x86_64 specific management of page tables
 *
 * Warning: This code is coupled with the code in slot_alloc/ and pinned.c
 *
 * The maximum number of slots required to map a BASE_PAGE_SIZE
 * sized page is the number of page table levels that may have to be
 * allocated (pdpt, pdir, ptable) plus one for the page itself.
 * The sum for x86_64 is 4.
 *
 * Warning: Additional slots will be required to map a BASE_PAGE_SIZE-sized
 * page if we also track the actual frames that are mapped.
 * Currently this is not the case.
 */

/*
 * Copyright (c) 2009-2013 ETH Zurich.
 * Copyright (c) 2014 HP Labs.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <barrelfish/barrelfish.h>
#include <barrelfish/dispatch.h>
#include "target/x86/pmap_x86.h"
#include <stdio.h>

// Size of virtual region mapped by a single PML4 entry
#define PML4_MAPPING_SIZE ((genvaddr_t)512*512*512*BASE_PAGE_SIZE)

// Location and size of virtual address space reserved for mapping
// frames backing refill_slabs
#define META_DATA_RESERVED_BASE (PML4_MAPPING_SIZE * (disp_get_core_id() + 1))
#define META_DATA_RESERVED_SIZE (X86_64_BASE_PAGE_SIZE * 80000)
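
/*
 * For orientation (illustrative arithmetic, not used by the code below,
 * assuming 4 KiB base pages): one PML4 entry covers 512 * 512 * 512 * 4 KiB
 * = 2^39 bytes = 512 GiB, so each core's metadata region starts at a distinct
 * 512 GiB-aligned slot, and META_DATA_RESERVED_SIZE is 80000 * 4 KiB
 * = 312.5 MiB.
 */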

/**
 * \brief Translate generic vregion flags to architecture specific pmap flags
 */
static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
        else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) {
            // PAT entry 4 is configured as write-combining
            pmap_flags |= PTABLE_ATTR_INDEX;
        }
    }

    return pmap_flags;
}
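
/*
 * Example (illustrative, assuming VREGION_FLAGS_READ_WRITE includes
 * VREGION_FLAGS_WRITE): vregion_to_pmap_flag(VREGION_FLAGS_READ_WRITE)
 * yields PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE | PTABLE_READ_WRITE,
 * whereas a guard region skips the inner checks entirely and keeps only
 * PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE, i.e. no access bits.
 */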

// returns whether va1 and va2 share a page directory entry
// not using X86_64_PDIR_BASE() macro as this would give false positives (same
// entry in different directories)
static inline bool is_same_pdir(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_LARGE_PAGE_BITS) == ((va2-1)>>X86_64_LARGE_PAGE_BITS);
}
// returns whether va1 and va2 share a page directory pointer table entry
static inline bool is_same_pdpt(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_HUGE_PAGE_BITS) == ((va2-1)>>X86_64_HUGE_PAGE_BITS);
}
// returns whether va1 and va2 share a page map level 4 entry
static inline bool is_same_pml4(genvaddr_t va1, genvaddr_t va2)
{
    // the base macros work here as we only have one pml4.
    return X86_64_PML4_BASE(va1) == X86_64_PML4_BASE(va2-1);
}
// size indicates how many bits to shift
static inline genvaddr_t get_addr_prefix(genvaddr_t va, uint8_t size)
{
    return va >> size;
}
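
/*
 * Note (illustrative): va2 is treated as an exclusive end address, hence the
 * comparisons above use (va2 - 1). For example, with 2 MiB of address space
 * per page directory entry, is_same_pdir(0x200000, 0x400000) is true (the
 * range [0x200000, 0x400000) stays within one entry), while
 * is_same_pdir(0x200000, 0x400001) is false.
 */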

static inline bool is_large_page(struct vnode *p)
{
    return !p->is_vnode && (p->u.frame.flags & VREGION_FLAGS_LARGE);
}

static inline bool is_huge_page(struct vnode *p)
{
    return !p->is_vnode && (p->u.frame.flags & VREGION_FLAGS_HUGE);
}

/**
 * \brief Returns the vnode for the pdpt mapping a given vspace address
 */
static inline errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdpt)
{
    errval_t err;
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    if ((*pdpt = find_vnode(root, X86_64_PML4_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, root, ObjType_VNode_x86_64_pdpt,
                            X86_64_PML4_BASE(base), pdpt);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdpt = find_vnode(root, X86_64_PML4_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdpt");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address
 */
static inline errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdir)
{
    errval_t err;
    struct vnode *pdpt;
    err = get_pdpt(pmap, base, &pdpt);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdpt != NULL);

    // PDPT mapping
    if ((*pdir = find_vnode(pdpt, X86_64_PDPT_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdpt, ObjType_VNode_x86_64_pdir,
                            X86_64_PDPT_BASE(base), pdir);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdir = find_vnode(pdpt, X86_64_PDPT_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdir");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address
 */
static inline errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                                  struct vnode **ptable)
{
    errval_t err;
    struct vnode *pdir;
    err = get_pdir(pmap, base, &pdir);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdir != NULL);

    // PDIR mapping
    if ((*ptable = find_vnode(pdir, X86_64_PDIR_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdir, ObjType_VNode_x86_64_ptable,
                            X86_64_PDIR_BASE(base), ptable);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*ptable = find_vnode(pdir, X86_64_PDIR_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for ptable");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page directory pointer table mapping for a
 * given vspace address
 */
static inline struct vnode *find_pdpt(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PDPT mapping
    return find_vnode(root, X86_64_PML4_BASE(base));
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address, without performing allocations as get_pdir() does
 */
static inline struct vnode *find_pdir(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdpt = find_pdpt(pmap, base);

    if (pdpt) {
        // PDPT mapping
        return find_vnode(pdpt, X86_64_PDPT_BASE(base));
    } else {
        return NULL;
    }
}

/**
 * \brief Returns the vnode for the pagetable mapping a given vspace address,
 * without performing allocations as get_ptable() does
 */
static inline struct vnode *find_ptable(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdir = find_pdir(pmap, base);

    if (pdir) {
        // PDIR mapping
        return find_vnode(pdir, X86_64_PDIR_BASE(base));
    } else {
        return NULL;
    }
}

static errval_t do_single_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                              genvaddr_t vend, struct capref frame,
                              size_t offset, size_t pte_count,
                              vregion_flags_t flags)
{
    if (pte_count == 0) {
        debug_printf("do_single_map: pte_count == 0, called from %p\n",
                __builtin_return_address(0));
        return SYS_ERR_OK;
    }
    assert(pte_count > 0);
    // translate flags
    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);

    // Get the paging structure and set paging relevant parameters
    struct vnode *ptable = NULL;
    errval_t err;
    size_t table_base;

    // get the right paging table and address part
    if (flags & VREGION_FLAGS_LARGE) {
        // large 2M pages, mapped into pdir
        err = get_pdir(pmap, vaddr, &ptable);
        table_base = X86_64_PDIR_BASE(vaddr);
    } else if (flags & VREGION_FLAGS_HUGE) {
        // huge 1GB pages, mapped into pdpt
        err = get_pdpt(pmap, vaddr, &ptable);
        table_base = X86_64_PDPT_BASE(vaddr);
    } else {
        // normal 4K pages, mapped into ptable
        err = get_ptable(pmap, vaddr, &ptable);
        table_base = X86_64_PTABLE_BASE(vaddr);
    }
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
    }
    assert(ptable->is_vnode);

    // check if there is an overlapping mapping
    if (has_vnode(ptable, table_base, pte_count, false)) {
        if (has_vnode(ptable, table_base, pte_count, true)) {
            printf("page already exists in 0x%"
                    PRIxGENVADDR"--0x%"PRIxGENVADDR"\n", vaddr, vend);
            return LIB_ERR_PMAP_EXISTING_MAPPING;
        } else {
            // clean out empty page tables. We do this here because we benefit
            // from having the page tables in place when doing lots of small
            // mappings
            remove_empty_vnodes(pmap, ptable, table_base, pte_count);
        }
    }

    // setup userspace mapping
    struct vnode *page = slab_alloc(&pmap->slab);
    assert(page);
    page->is_vnode = false;
    page->entry = table_base;
    page->next  = ptable->u.vnode.children;
    ptable->u.vnode.children = page;
    page->u.frame.cap = frame;
    page->u.frame.offset = offset;
    page->u.frame.flags = flags;
    page->u.frame.pte_count = pte_count;

    err = pmap->p.slot_alloc->alloc(pmap->p.slot_alloc, &page->mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC);
    }

    // do map
    assert(!capref_is_null(ptable->u.vnode.invokable));
    err = vnode_map(ptable->u.vnode.invokable, frame, table_base,
                    pmap_flags, offset, pte_count, page->mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_MAP);
    }

    return SYS_ERR_OK;
}

/**
 * \brief Called when enough slabs exist for the given mapping
 */
static errval_t do_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                       struct capref frame, size_t offset, size_t size,
                       vregion_flags_t flags, size_t *retoff, size_t *retsize)
{
    errval_t err;

    // determine page size and relevant address part
    size_t page_size  = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits  = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    bool debug_out    = false;

    // get base address and size of frame
    struct frame_identity fi;
    err = frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_DO_MAP);
    }

    if ((flags & VREGION_FLAGS_HUGE) &&
        (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
        fi.bytes >= X86_64_HUGE_PAGE_SIZE &&
        ((fi.base & X86_64_HUGE_PAGE_MASK) == 0))
    {
        // huge page branch (1GB)
        page_size  = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits   = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out  = false;
        // remove large flag, if we're doing huge mapping
        flags     &= ~VREGION_FLAGS_LARGE;
    } else if ((flags & VREGION_FLAGS_LARGE) &&
               (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
               fi.bytes >= X86_64_LARGE_PAGE_SIZE &&
               ((fi.base & X86_64_LARGE_PAGE_MASK) == 0))
    {
        // large page branch (2MB)
        page_size  = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits   = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out  = false;
    } else {
        // remove large/huge flags
        flags &= ~(VREGION_FLAGS_LARGE|VREGION_FLAGS_HUGE);
    }

    // round to the next full page and calculate end address and #ptes
    size = ROUND_UP(size, page_size);
    size_t pte_count = DIVIDE_ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    if (offset+size > fi.bytes) {
        debug_printf("do_map: offset=%zu; size=%zu; frame size=%zu\n",
                offset, size, fi.bytes);
        return LIB_ERR_PMAP_FRAME_SIZE;
    }

#if 0
    if (true || debug_out) {
        genpaddr_t paddr = fi.base + offset;

        debug_printf("do_map: 0x%"
                PRIxGENVADDR"--0x%"PRIxGENVADDR" -> 0x%"PRIxGENPADDR
                "; pte_count = %zd; frame bytes = 0x%zx; page size = 0x%zx\n",
                vaddr, vend, paddr, pte_count, fi.bytes, page_size);
    }
#endif

    // is the whole mapping contained in one leaf table?
    if (is_same_pdir(vaddr, vend) ||
        (flags & VREGION_FLAGS_LARGE && is_same_pdpt(vaddr, vend)) ||
        (flags & VREGION_FLAGS_HUGE && is_same_pml4(vaddr, vend))) {
        // fast path
        if (debug_out) {
            debug_printf("  do_map: fast path: %zd\n", pte_count);
        }
        err = do_single_map(pmap, vaddr, vend, frame, offset, pte_count, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }
    }
    else { // multiple leaf page tables
        // first leaf
        uint32_t c = X86_64_PTABLE_SIZE - table_base;
        if (debug_out) {
            debug_printf("  do_map: slow path: first leaf %"PRIu32"\n", c);
        }
        genvaddr_t temp_end = vaddr + c * page_size;
        err = do_single_map(pmap, vaddr, temp_end, frame, offset, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        // map full leaves
        while (get_addr_prefix(temp_end, map_bits) <
                get_addr_prefix(vend, map_bits))
        {
            // update vars
            vaddr = temp_end;
            temp_end = vaddr + X86_64_PTABLE_SIZE * page_size;
            offset += c * page_size;
            c = X86_64_PTABLE_SIZE;

            // do mapping
            if (debug_out) {
                debug_printf("  do_map: slow path: full leaf\n");
            }
            err = do_single_map(pmap, vaddr, temp_end, frame, offset,
                    X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }

        // map remaining part
        offset += c * page_size;

        // calculate remaining pages (subtract ptable bits from map_bits to
        // get #ptes of last-level instead of 2nd-to-last).
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(temp_end, map_bits-X86_64_PTABLE_BITS);

        if (c) {
            // do mapping
            if (debug_out) {
                debug_printf("do_map: slow path: last leaf %"PRIu32"\n", c);
            }
            err = do_single_map(pmap, temp_end, vend, frame, offset, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }
    }

    if (retoff) {
        *retoff = offset;
    }
    if (retsize) {
        *retsize = size;
    }
    return SYS_ERR_OK;
}
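
/*
 * Illustrative walk through the slow path above (4 KiB pages): mapping
 * 0x403000 bytes at vaddr 0x1FE000 gives vend = 0x601000 and 1027 PTEs.
 * The first leaf maps 2 PTEs (0x1FE000--0x200000), two full leaves map
 * 512 PTEs each (0x200000--0x400000 and 0x400000--0x600000), and the
 * remaining part maps 1 PTE (0x600000--0x601000), each via do_single_map().
 */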

/// Compute an upper limit on the number of slabs required to perform a mapping
static size_t max_slabs_for_mapping(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_BASE_PAGE_SIZE);
    size_t max_ptable = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdir   = DIVIDE_ROUND_UP(max_ptable, X86_64_PTABLE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages + max_ptable + max_pdir + max_pdpt;
}
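
/*
 * Worked example (illustrative): for a 2 MiB mapping with 4 KiB pages,
 * max_pages = 512, max_ptable = 1, max_pdir = 1 and max_pdpt = 1, so up to
 * 515 vnode slabs may be needed. The large- and huge-page variants below
 * return smaller bounds because fewer intermediate tables are involved.
 */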

static size_t max_slabs_for_mapping_large(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_LARGE_PAGE_SIZE);
    size_t max_pdir   = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages  + max_pdir + max_pdpt;
}

static size_t max_slabs_for_mapping_huge(size_t bytes)
{
    size_t max_pages  = DIVIDE_ROUND_UP(bytes, X86_64_HUGE_PAGE_SIZE);
    size_t max_pdpt   = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    return max_pages  + max_pdpt;
}

/**
 * \brief Refill slabs used for metadata
 *
 * \param pmap     The pmap to refill in
 * \param request  The number of slabs the allocator must have
 *                 when the function returns
 *
 * When the current pmap is initialized,
 * it reserves some virtual address space for metadata.
 * This reserved address space is used here.
 *
 * Can only be called for the current pmap.
 * Will recursively call into itself until it has enough slabs.
 */
static errval_t refill_slabs(struct pmap_x86 *pmap, size_t request)
{
    errval_t err;

    /* Keep looping till we have #request slabs */
    while (slab_freecount(&pmap->slab) < request) {
        // Amount of bytes required for #request
        size_t bytes = SLAB_STATIC_SIZE(request - slab_freecount(&pmap->slab),
                                        sizeof(struct vnode));

        /* Get a frame of that size */
        struct capref cap;
        err = frame_alloc(&cap, bytes, &bytes);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_FRAME_ALLOC);
        }

        /* If we do not have enough slabs to map the frame in, recurse */
        size_t required_slabs_for_frame = max_slabs_for_mapping(bytes);
        if (slab_freecount(&pmap->slab) < required_slabs_for_frame) {
            // If we recurse, we require more slabs than to map a single page
            assert(required_slabs_for_frame > 4);

            err = refill_slabs(pmap, required_slabs_for_frame);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        }

        /* Perform mapping */
        genvaddr_t genvaddr = pmap->vregion_offset;
        pmap->vregion_offset += (genvaddr_t)bytes;
        assert(pmap->vregion_offset < vregion_get_base_addr(&pmap->vregion) +
               vregion_get_size(&pmap->vregion));

        err = do_map(pmap, genvaddr, cap, 0, bytes,
                     VREGION_FLAGS_READ_WRITE, NULL, NULL);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        /* Grow the slab */
        lvaddr_t buf = vspace_genvaddr_to_lvaddr(genvaddr);
        slab_grow(&pmap->slab, (void*)buf, bytes);
    }

    return SYS_ERR_OK;
}

/// Minimally refill the slab allocator
static errval_t min_refill_slabs(struct pmap_x86 *pmap)
{
    return refill_slabs(pmap, 5);
}
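
/*
 * Interpretation (not stated in the original source): the constant 5
 * presumably corresponds to the 4 slabs needed to map one BASE_PAGE_SIZE
 * page (pdpt, pdir, ptable and the page itself, see the file header) plus
 * one spare slab.
 */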

/**
 * \brief Create page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The virtual address to create the mapping for
 * \param frame    The frame cap to map in
 * \param offset   Offset into the frame cap
 * \param size     Size of the mapping
 * \param flags    Flags for the mapping
 * \param retoff   If non-NULL, filled in with adjusted offset of mapped region
 * \param retsize  If non-NULL, filled in with adjusted size of mapped region
 */
static errval_t map(struct pmap *pmap, genvaddr_t vaddr, struct capref frame,
                    size_t offset, size_t size, vregion_flags_t flags,
                    size_t *retoff, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    struct frame_identity fi;
    err = frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY);
    }

    size_t max_slabs;
    // Adjust the parameters to page boundaries
    // TODO: overestimating needed slabs shouldn't hurt much in the long run,
    // and would keep the code easier to read and possibly faster due to less
    // branching
    if ((flags & VREGION_FLAGS_LARGE) &&
        (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
        (fi.base & X86_64_LARGE_PAGE_MASK) == 0 &&
        fi.bytes >= offset+size) {
        // case large pages (2MB)
        size   += LARGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, LARGE_PAGE_SIZE);
        offset -= LARGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_large(size);
    } else if ((flags & VREGION_FLAGS_HUGE) &&
               (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
               (fi.base & X86_64_HUGE_PAGE_MASK) == 0 &&
               fi.bytes >= offset+size) {
        // case huge pages (1GB)
        size   += HUGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, HUGE_PAGE_SIZE);
        offset -= HUGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_huge(size);
    } else {
        // case normal pages (4KB)
        size   += BASE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, BASE_PAGE_SIZE);
        offset -= BASE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping(size);
    }

    // Refill slab allocator if necessary
    size_t slabs_free = slab_freecount(&x86->slab);

    max_slabs += 5; // minimum amount required to map a page
    if (slabs_free < max_slabs) {
        struct pmap *mypmap = get_current_pmap();
        if (pmap == mypmap) {
            err = refill_slabs(x86, max_slabs);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        } else {
            size_t bytes = SLAB_STATIC_SIZE(max_slabs - slabs_free,
                                            sizeof(struct vnode));
            void *buf = malloc(bytes);
            if (!buf) {
                return LIB_ERR_MALLOC_FAIL;
            }
            slab_grow(&x86->slab, buf, bytes);
        }
    }

    err = do_map(x86, vaddr, frame, offset, size, flags, retoff, retsize);
    return err;
}

struct find_mapping_info {
    struct vnode *page_table;
    struct vnode *page;
    size_t page_size;
    size_t table_base;
    uint8_t map_bits;
};

/**
 * \brief Find mapping for `vaddr` in `pmap`.
 * \arg pmap the pmap to search in
 * \arg vaddr the virtual address to search for
 * \arg info if non-NULL, filled in with the last-level page table and page
 *      meta-data that were found, if any
 * \returns `true` iff we found a mapping for vaddr
 */
static bool find_mapping(struct pmap_x86 *pmap, genvaddr_t vaddr,
                         struct find_mapping_info *info)
{
    struct vnode *pdpt = NULL, *pdir = NULL, *pt = NULL, *page = NULL;

    size_t page_size = 0;
    size_t table_base = 0;
    uint8_t map_bits = 0;

    // find page and last-level page table (can be pdir or pdpt)
    if ((pdpt = find_pdpt(pmap, vaddr)) != NULL) {
        page = find_vnode(pdpt, X86_64_PDPT_BASE(vaddr));
        if (page && page->is_vnode) { // not 1G pages
            pdir = page;
            page = find_vnode(pdir, X86_64_PDIR_BASE(vaddr));
            if (page && page->is_vnode) { // not 2M pages
                pt = page;
                page = find_vnode(pt, X86_64_PTABLE_BASE(vaddr));
                page_size = X86_64_BASE_PAGE_SIZE;
                table_base = X86_64_PTABLE_BASE(vaddr);
                map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
            } else if (page) {
                assert(is_large_page(page));
                pt = pdir;
                page_size = X86_64_LARGE_PAGE_SIZE;
                table_base = X86_64_PDIR_BASE(vaddr);
                map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
            }
        } else if (page) {
            assert(is_huge_page(page));
            pt = pdpt;
            page_size = X86_64_HUGE_PAGE_SIZE;
            table_base = X86_64_PDPT_BASE(vaddr);
            map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        }
    }
    if (info) {
        info->page_table = pt;
        info->page = page;
        info->page_size = page_size;
        info->table_base = table_base;
        info->map_bits = map_bits;
    }
    if (pt && page) {
        return true;
    } else {
        return false;
    }
}

static errval_t do_single_unmap(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                size_t pte_count)
{
    errval_t err;
    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }
    assert(info.page_table && info.page_table->is_vnode && info.page && !info.page->is_vnode);

    if (info.page->u.frame.pte_count == pte_count) {
        err = vnode_unmap(info.page_table->u.vnode.cap, info.page->mapping);
        if (err_is_fail(err)) {
            printf("vnode_unmap returned error: %s (%d)\n",
                    err_getstring(err), err_no(err));
            return err_push(err, LIB_ERR_VNODE_UNMAP);
        }

        // delete & free page->mapping after doing vnode_unmap()
        err = cap_delete(info.page->mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_CAP_DELETE);
        }
        err = pmap->p.slot_alloc->free(pmap->p.slot_alloc, info.page->mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_SLOT_FREE);
        }
        // Free up the resources
        remove_vnode(info.page_table, info.page);
        slab_free(&pmap->slab, info.page);
    }

    return SYS_ERR_OK;
}

/**
 * \brief Remove page mappings
 *
 * \param pmap     The pmap object
 * \param vaddr    The start of the virtual region to remove
 * \param size     The size of virtual region to remove
 * \param retsize  If non-NULL, filled in with the actual size removed
 */
static errval_t unmap(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                      size_t *retsize)
{
    //printf("[unmap] 0x%"PRIxGENVADDR", %zu\n", vaddr, size);
    errval_t err, ret = SYS_ERR_OK;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    // determine if we unmap a larger page
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        // TODO: better error --> LIB_ERR_PMAP_NOT_MAPPED
        return LIB_ERR_PMAP_UNMAP;
    }

    assert(!info.page->is_vnode);

    if (info.page->entry > info.table_base) {
        debug_printf("trying to partially unmap region\n");
        // XXX: error code
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page)))
    {
        // fast path
        err = do_single_unmap(x86, vaddr, size / info.page_size);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error fast path\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }
    }
    else { // slow path
        // unmap first leaf
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;

        err = do_single_unmap(x86, vaddr, c);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error first leaf\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }

        // unmap full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_unmap(x86, vaddr, X86_64_PTABLE_SIZE);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error while loop\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
            vaddr += c * info.page_size;
        }

        // unmap remaining part
        // subtracting ptable bits from map_bits to get #ptes in last-level table
        // instead of 2nd-to-last.
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        assert(c < X86_64_PTABLE_SIZE);
        if (c) {
            err = do_single_unmap(x86, vaddr, c);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error remaining part\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[unmap] exiting\n");
    return ret;
}

static errval_t do_single_modify_flags(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                       size_t pages, vregion_flags_t flags)
{
    errval_t err = SYS_ERR_OK;

    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    assert(info.page_table && info.page_table->is_vnode && info.page && !info.page->is_vnode);

    if (inside_region(info.page_table, info.table_base, pages)) {
        // we're modifying part of a valid mapped region
        // arguments to invocation: invoke frame cap, first affected
        // page (as offset from first page in mapping), #affected
        // pages, new flags. Invocation mask flags based on capability
        // access permissions.
        size_t off = info.table_base - info.page->entry;
        paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);
        // calculate TLB flushing hint
        genvaddr_t va_hint = 0;
        if (pages == 1) {
            // do assisted selective flush for single page
            va_hint = vaddr & ~(info.page_size - 1);
        }
        err = invoke_mapping_modify_flags(info.page->mapping, off, pages,
                                          pmap_flags, va_hint);
        return err;
    } else {
        // overlaps some region border
        // XXX: need better error
        return LIB_ERR_PMAP_EXISTING_MAPPING;
    }

    return SYS_ERR_OK;
}

/**
 * \brief Modify page mapping
 *
 * \param pmap     The pmap object
 * \param vaddr    The first virtual address for which to change the flags
 * \param size     The length of the region to change in bytes
 * \param flags    New flags for the mapping
 * \param retsize  If non-NULL, filled in with the actual size modified
 */
static errval_t modify_flags(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                             vregion_flags_t flags, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    // determine the page size of the mapping whose flags we are modifying
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        return LIB_ERR_PMAP_NOT_MAPPED;
    }

    assert(info.page && !info.page->is_vnode);

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    size_t pages = size / info.page_size;

    // vaddr and vend specify begin and end of the region (inside a mapping)
    // that should receive the new set of flags
    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page))) {
        // fast path
        err = do_single_modify_flags(x86, vaddr, pages, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }
    }
    else { // slow path
        // modify first part
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;
        err = do_single_modify_flags(x86, vaddr, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }

        // modify full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_modify_flags(x86, vaddr, X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
            vaddr += c * info.page_size;
        }

        // modify remaining part
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
                get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        if (c) {
            err = do_single_modify_flags(x86, vaddr, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[modify_flags] exiting\n");
    return SYS_ERR_OK;
}

/**
 * \brief Query existing page mapping
 *
 * \param pmap     The pmap object
 * \param vaddr    The virtual address to query
 * \param info     If non-NULL, filled in with the page-aligned base address
 *                 and page size of the mapping covering vaddr, plus its frame
 *                 cap, offset within that cap, flags and mapping cap
 */
static errval_t lookup(struct pmap *pmap, genvaddr_t vaddr,
                       struct pmap_mapping_info *info)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct find_mapping_info find_info;
    bool found = find_mapping(x86, vaddr, &find_info);

    if (!found) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    if (info) {
        info->vaddr = vaddr & ~(genvaddr_t)(find_info.page_size - 1);
        info->size = find_info.page_size;
        info->cap = find_info.page->u.frame.cap;
        info->offset = find_info.page->u.frame.offset;
        info->flags = find_info.page->u.frame.flags;
        info->mapping = find_info.page->mapping;
    }
    return SYS_ERR_OK;
}

static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;
    struct pmap_dump_info *buf_ = buf;

    struct vnode *pml4 = &x86->root;
    struct vnode *pdpt, *pdir, *pt, *frame;
    assert(pml4 != NULL);

    *items_written = 0;

    // iterate over PML4 entries
    size_t pml4_index, pdpt_index, pdir_index;
    for (pdpt = pml4->u.vnode.children; pdpt != NULL; pdpt = pdpt->next) {
        pml4_index = pdpt->entry;
        // iterate over pdpt entries
        for (pdir = pdpt->u.vnode.children; pdir != NULL; pdir = pdir->next) {
            pdpt_index = pdir->entry;
            // iterate over pdir entries
            for (pt = pdir->u.vnode.children; pt != NULL; pt = pt->next) {
                pdir_index = pt->entry;
                // iterate over pt entries
                for (frame = pt->u.vnode.children; frame != NULL; frame = frame->next) {
                    if (*items_written < buflen) {
                        buf_->pml4_index = pml4_index;
                        buf_->pdpt_index = pdpt_index;
                        buf_->pdir_index = pdir_index;
                        buf_->pt_index = frame->entry;
                        buf_->cap = frame->u.frame.cap;
                        buf_->offset = frame->u.frame.offset;
                        buf_->flags = frame->u.frame.flags;
                        buf_++;
                        (*items_written)++;
                    }
                }
            }
        }
    }
    return SYS_ERR_OK;
}

static errval_t determine_addr_raw(struct pmap *pmap, size_t size,
                                   size_t alignment, genvaddr_t *retvaddr)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct vnode *walk_pml4 = x86->root.u.vnode.children;
    assert(walk_pml4 != NULL); // assume there's always at least one existing entry

    if (alignment == 0) {
        alignment = BASE_PAGE_SIZE;
    } else {
        alignment = ROUND_UP(alignment, BASE_PAGE_SIZE);
    }
    size = ROUND_UP(size, alignment);
    assert(size < 512ul * 1024 * 1024 * 1024); // size mapped by one PML4 entry (512 GiB)

    // try to find a free pml4 entry
    bool f[512];
    for (int i = 0; i < 512; i++) {
        f[i] = true;
    }
    //debug_printf("entry: %d\n", walk_pml4->entry);
    f[walk_pml4->entry] = false;
    while (walk_pml4) {
        //debug_printf("looping over pml4 entries\n");
        assert(walk_pml4->is_vnode);
        f[walk_pml4->entry] = false;
        walk_pml4 = walk_pml4->next;
    }
    genvaddr_t first_free = 16;
    for (; first_free < 512; first_free++) {
        //debug_printf("f[%"PRIuGENVADDR"] = %d\n", first_free, f[first_free]);
        if (f[first_free]) {
            break;
        }
    }
    //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
    if (first_free < 512) {
        //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
        *retvaddr = first_free << 39;
        return SYS_ERR_OK;
    } else {
        return LIB_ERR_OUT_OF_VIRTUAL_ADDR;
    }
}
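
/*
 * Illustrative note: the search above starts at PML4 slot 16, so the lowest
 * address this can return is 16 << 39 = 0x80000000000 (8 TiB); each
 * subsequent free slot adds another 512 GiB.
 */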

static struct pmap_funcs pmap_funcs = {
    .determine_addr = pmap_x86_determine_addr,
    .determine_addr_raw = determine_addr_raw,
    .map = map,
    .unmap = unmap,
    .lookup = lookup,
    .modify_flags = modify_flags,
    .serialise = pmap_x86_serialise,
    .deserialise = pmap_x86_deserialise,
    .dump = dump,
};

/**
 * \brief Initialize an x86 pmap object
 *
 * \param pmap Pmap object of type x86
 */
errval_t pmap_x86_64_init(struct pmap *pmap, struct vspace *vspace,
                          struct capref vnode,
                          struct slot_allocator *opt_slot_alloc)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    /* Generic portion */
    pmap->f = pmap_funcs;
    pmap->vspace = vspace;

    if (opt_slot_alloc != NULL) {
        pmap->slot_alloc = opt_slot_alloc;
    } else { /* use default allocator for this dispatcher */
        pmap->slot_alloc = get_default_slot_allocator();
    }

    /* x86 specific portion */
    slab_init(&x86->slab, sizeof(struct vnode), NULL);
    slab_grow(&x86->slab, x86->slab_buffer,
              sizeof(x86->slab_buffer));
    x86->refill_slabs = min_refill_slabs;

    x86->root.is_vnode          = true;
    x86->root.u.vnode.cap       = vnode;
    x86->root.u.vnode.invokable = vnode;
    if (get_croot_addr(vnode) != CPTR_ROOTCN) {
        errval_t err = slot_alloc(&x86->root.u.vnode.invokable);
        assert(err_is_ok(err));
        err = cap_copy(x86->root.u.vnode.invokable, vnode);
        assert(err_is_ok(err));
    }
    assert(!capref_is_null(x86->root.u.vnode.cap));
    assert(!capref_is_null(x86->root.u.vnode.invokable));
    x86->root.u.vnode.children  = NULL;
    x86->root.next              = NULL;

    // choose a minimum mappable VA for most domains; enough to catch NULL
    // pointer derefs with suitably large offsets
    x86->min_mappable_va = 64 * 1024;

    // maximum mappable VA is derived from X86_64_MEMORY_OFFSET in kernel
    x86->max_mappable_va = (genvaddr_t)0xffffff8000000000;

    return SYS_ERR_OK;
}

/**
 * \brief Initialize the current pmap. Reserve space for metadata
 *
 * This code is coupled with #vspace_current_init()
 */
errval_t pmap_x86_64_current_init(bool init_domain)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)get_current_pmap();

    // To reserve a block of virtual address space,
    // a vregion representing the address space is required.
    // We construct a superficial one here and add it to the vregion list.
    struct vregion *vregion = &x86->vregion;
    vregion->vspace = NULL;
    vregion->memobj = NULL;
    vregion->base   = META_DATA_RESERVED_BASE;
    vregion->offset = 0;
    vregion->size   = META_DATA_RESERVED_SIZE;
    vregion->flags  = 0;
    vregion->next = NULL;

    struct vspace *vspace = x86->p.vspace;
    assert(!vspace->head);
    vspace->head = vregion;

    x86->vregion_offset = x86->vregion.base;

    // We don't know the vnode layout for the first part of our address space
    // (which was setup by the kernel), so we avoid mapping there until told it.
    x86->min_mappable_va = META_DATA_RESERVED_BASE;

    return SYS_ERR_OK;
}