#include <barrelfish/barrelfish.h>
#include <barrelfish/except.h>
#include <barrelfish_kpi/paging_target.h>
#include <assert.h>

#include "pmap_cow.h"
#include "debug.h"
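
/*
 * Copy-on-write support for the x86-64 pmap.
 *
 * pmap_setup_cow() makes the PML4 entry covering a region read-only and
 * clones it into a fresh PML4 slot, so the same physical pages become
 * visible at two virtual addresses.  The first write to any page in either
 * range raises a page fault; cow_handler() then clones the page-table
 * hierarchy down to the faulting entry and remaps the touched page to a
 * fresh frame via vnode_copy_remap().  Frames and page tables for the
 * copies come from a pool of pre-retyped 4kB capabilities, refilled by
 * get_ram_caps() and handed out by cow_get_page().
 */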

/* The benchmark USER_PANICs when PMAP_COW is not set, instead of a
 * preprocessor error being generated here, so that BF can still be built
 * with PMAP_LL.
#ifndef PMAP_ARRAY
#error need PMAP_ARRAY for pmap_cow to work
#endif
*/

static struct vnode *cow_root_pte = NULL;
#define EX_STACK_SIZE 16384
static char ex_stack[EX_STACK_SIZE];

// default allocation size: 1MB
static size_t default_frame_bytes = 1ULL << 20;
static struct capref current_ram, current_frame;
static cslot_t current_slot_count = 0;
size_t get_ram_caps_count = 0;
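/**
 * \brief Refill the pool of 4kB RAM capabilities.
 *
 * Allocates a RAM cap of default_frame_bytes (falling back to a single
 * base page early in bootstrap, and halving the request when memory runs
 * short), creates one L2 CNode to hold the retyped 4kB RAM caps and a
 * second, empty one for the frame/ptable retypes done in cow_get_page(),
 * then retypes the RAM cap into BASE_PAGE_SIZE RAM caps.
 */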
static errval_t get_ram_caps(void)
{
    get_ram_caps_count++;
    struct capref ram;
    size_t alloc_bytes = default_frame_bytes;
    errval_t err;
ram_alloc_retry:
    err = ram_alloc(&ram, log2ceil(alloc_bytes));
    if (err_no(err) == LIB_ERR_RAM_ALLOC_WRONG_SIZE) {
        DEBUG_COW("early ram_alloc, retry with BASE_PAGE_BITS\n");
        // this is probably before we have a connection to init and are using
        // ram_alloc_fixed(), which can only hand out 4kB pages, so we do not
        // yet touch the default allocation size
        alloc_bytes = BASE_PAGE_SIZE;
        err = ram_alloc(&ram, BASE_PAGE_BITS);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "early ram_alloc failed\n");
            return err;
        }
    } else if (err_no(err) == MM_ERR_NOT_FOUND && alloc_bytes > BASE_PAGE_SIZE) {
        DEBUG_COW("err: %s\n", err_getstring(err));
        default_frame_bytes >>= 1; // halve default allocation size
        DEBUG_COW("smaller allocation size: %zu\n", default_frame_bytes);
        alloc_bytes = default_frame_bytes;
        goto ram_alloc_retry;
    } else if (err_is_fail(err)) {
        debug_printf("error in ram_alloc: %s\n", err_getstring(err));
        return err;
    }
    // make sure we got a RAM cap that is a multiple of the base page size
    assert(alloc_bytes >= BASE_PAGE_SIZE);
    assert(alloc_bytes % BASE_PAGE_SIZE == 0);

    // retype into 4kB caps in a new cnode
    cslot_t slots_needed = alloc_bytes / BASE_PAGE_SIZE;
    current_slot_count = slots_needed;
    debug_printf("slots_needed = %"PRIuCSLOT"\n", slots_needed);
    if (slots_needed == 1) {
        USER_PANIC("OOM");
    }
    if (slots_needed < L2_CNODE_SLOTS) {
        debug_printf("slowly running out of RAM: only got %"PRIuCSLOT" pages\n",
                     slots_needed);
    }
    assert(slots_needed <= L2_CNODE_SLOTS);
    struct capref nextcncap;
    struct cnoderef nextcn;
    DEBUG_COW("%s: need CNode with %"PRIuCSLOT" slots\n", __FUNCTION__,
              slots_needed);
    err = cnode_create_l2(&nextcncap, &nextcn);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "cnode_create");
        return err;
    }
    current_ram = (struct capref) {
        .cnode = nextcn,
        .slot = 0,
    };
    // Create an empty cnode for the retypes to frames/ptables in cow_get_page()
    err = cnode_create_l2(&nextcncap, &nextcn);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "cnode_create");
        return err;
    }
    current_frame = (struct capref) {
        .cnode = nextcn,
        .slot = 0,
    };

    // Retype into 4kB RAM caps
    err = cap_retype(current_ram, ram, 0, ObjType_RAM, BASE_PAGE_SIZE, slots_needed);
    if (err_is_fail(err)) {
        debug_printf("error in cap_retype: %s\n", err_getstring(err));
        return err;
    }

    return SYS_ERR_OK;
}

size_t cow_get_page_count = 0;
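/**
 * \brief Hand out one freshly retyped 4kB capability of `type'.
 *
 * Retypes the next 4kB RAM cap from the pool into `type' (a frame or a
 * page-table type) and refills the pool via get_ram_caps() when it is
 * exhausted.
 */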
static errval_t cow_get_page(struct capref *f, enum objtype type)
{
    cow_get_page_count++;
    errval_t err;
    assert(f);
    if (current_slot_count == 0 || current_ram.slot == current_slot_count) {
        err = get_ram_caps();
        if (err_is_fail(err)) {
            return err;
        }
    }
    err = cap_retype(current_frame, current_ram, 0, type, BASE_PAGE_SIZE, 1);
    if (err_is_fail(err)) {
        return err;
    }
    *f = current_frame;
    current_frame.slot++;
    current_ram.slot++;
    return SYS_ERR_OK;
}

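/**
 * \brief Map the existing vnode capability `vnodecap' at `entry' in `root'
 *        and wrap it in pmap metadata, without allocating a new vnode cap.
 */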
static errval_t alloc_vnode_noalloc(struct pmap_x86 *pmap, struct vnode *root,
                     struct capref vnodecap, uint32_t entry,
                     struct vnode **retvnode)
{
    errval_t err;

    struct vnode *newvnode = slab_alloc(&pmap->p.m.slab);
    if (newvnode == NULL) {
        return LIB_ERR_SLAB_ALLOC_FAIL;
    }
    newvnode->v.cap = vnodecap;

    err = slot_alloc(&newvnode->v.mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC);
    }

    // Map it
    err = vnode_map(root->v.cap, newvnode->v.cap, entry,
                    PTABLE_ACCESS_DEFAULT, 0, 1, newvnode->v.mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_MAP);
    }

    // The VNode meta data
    newvnode->v.is_vnode  = true;
    newvnode->is_cloned = false;
    newvnode->v.entry     = entry;
#ifdef PMAP_LL
    newvnode->v.meta.next      = root->v.u.vnode.children;
    root->v.u.vnode.children = newvnode;
    newvnode->v.u.vnode.children = NULL;
#elif defined(PMAP_ARRAY)
    memset(newvnode->v.u.vnode.children, 0, sizeof(struct vnode *)*PTABLE_SIZE);
    root->v.u.vnode.children[entry] = newvnode;
#else
#error Invalid pmap datastructure
#endif

    *retvnode = newvnode;
    return SYS_ERR_OK;
}

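/**
 * \brief Allocate a fresh page-table capability of `type' from the COW pool
 *        and map it at `entry' in `root'.
 */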
static errval_t alloc_vnode(struct pmap_x86 *pmap, struct vnode *root,
                     enum objtype type, uint32_t entry,
                     struct vnode **retvnode)
{
    errval_t err;

    struct capref vnodecap;
    // Get the VNode capability
    err = cow_get_page(&vnodecap, type);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_CREATE);
    }

    return alloc_vnode_noalloc(pmap, root, vnodecap, entry, retvnode);
}

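/*
 * find_vnode() and vnode_clone() come in two flavours, matching the data
 * structure the pmap uses for vnode children: a linked list (PMAP_LL) or a
 * PTABLE_SIZE-entry array (PMAP_ARRAY).  Only the array variant implements
 * cloning; see the disabled preprocessor guard at the top of this file.
 */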
#if defined(PMAP_LL)
static struct vnode *find_vnode(struct vnode *root, uint16_t entry)
{
    assert(root != NULL);
    assert(root->v.is_vnode);
    struct vnode *n;

    for (n = root->v.u.vnode.children; n != NULL; n = n->v.meta.next) {
        if (!n->v.is_vnode) {
            // check whether entry lies inside a large region
            uint16_t end = n->v.entry + n->v.u.frame.pte_count;
            if (n->v.entry <= entry && entry < end) {
                return n;
            }
        }
        else if (n->v.entry == entry) {
            // return n if n is a vnode and the indices match
            return n;
        }
    }
    return NULL;
}

static errval_t vnode_clone(struct pmap_x86 *x86,
        struct vnode *parent, size_t entry,
        struct vnode **dest, struct vnode *src)
{
    return LIB_ERR_NOT_IMPLEMENTED;
}

#elif defined(PMAP_ARRAY)
static struct vnode *find_vnode(struct vnode *root, uint16_t entry)
{
    assert(root != NULL);
    assert(root->v.is_vnode);
    assert(entry < PTABLE_SIZE);

    if (root->v.u.vnode.children) {
        return root->v.u.vnode.children[entry];
    } else {
        return NULL;
    }
}

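/**
 * \brief Clone the page table `src' into a new vnode at `entry' in `parent'.
 *
 * Makes all entries of `src' read-only, creates a capability copy of it,
 * maps the copy at `entry', and duplicates the child metadata so both
 * mappings point at the same next-level tables.
 */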
static errval_t vnode_clone(struct pmap_x86 *x86,
        struct vnode *parent, size_t entry,
        struct vnode **dest, struct vnode *src)
{
    errval_t err;
    // TODO: better to change to r/o on the pml4e or the pdpt?
    err = vnode_modify_flags(src->v.cap, 0,
            PTABLE_SIZE, PTABLE_ACCESS_READONLY);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "vnode_modify_flags");
    }
    // create a copy of the pdpt cap
    struct capref copy;
    err = slot_alloc(&copy);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "slot_alloc");
    }
    err = cap_copy(copy, src->v.cap);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "cap_copy");
    }

    err = alloc_vnode_noalloc(x86, parent, copy, entry, dest);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "alloc_vnode_noalloc");
    }
    assert(*dest);
    // copy children metadata
    // XXX: should copy the caps to preserve revoke safety
    memcpy((*dest)->v.u.vnode.children, src->v.u.vnode.children,
            PTABLE_SIZE * sizeof(struct vnode *));

    return SYS_ERR_OK;
}
#else
#error Invalid pmap datastructure
#endif

size_t cow_pt_alloc_count = 0, cow_pd_alloc_count = 0, cow_pdpt_alloc_count = 0;
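/**
 * \brief Look up the vnode at `entry' in `parent', cloning it first if it
 *        has not been cloned yet, or allocating it if it does not exist.
 *
 * After this returns successfully, *ptable refers to a page table that is
 * private to the COW mapping and safe to modify.
 */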
static errval_t find_or_clone_vnode(struct pmap_x86 *pmap,
        struct vnode *parent, enum objtype type,
        size_t entry, struct vnode **ptable)
{
    errval_t err;
    *ptable = find_vnode(parent, entry);
    if (*ptable == NULL || !(*ptable)->is_cloned) {
        switch (type) {
            case ObjType_VNode_x86_64_ptable:
                cow_pt_alloc_count++;
                break;
            case ObjType_VNode_x86_64_pdir:
                cow_pd_alloc_count++;
                break;
            case ObjType_VNode_x86_64_pdpt:
                cow_pdpt_alloc_count++;
                break;
            default:
                break;
        }
    }
    if (*ptable == NULL) {
        err = alloc_vnode(pmap, parent, type, entry, ptable);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    } else if (!(*ptable)->is_cloned) {
        // need to clone the ptable to ensure copy on write
        struct vnode *newptable;
        err = alloc_vnode(pmap, parent, type, entry, &newptable);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
        err = vnode_inherit_attr(newptable->v.cap,
                (*ptable)->v.cap, 0, PTABLE_SIZE, PTABLE_ACCESS_READONLY,
                (*ptable)->u.vnode.mcn, newptable->u.vnode.mcn);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_CLONE_VNODE);
        }
        memcpy(newptable->v.u.vnode.children, (*ptable)->v.u.vnode.children,
                PTABLE_SIZE * sizeof(struct vnode *));
        newptable->is_cloned = true;
        *ptable = newptable;
    }
    assert(*ptable);

    return SYS_ERR_OK;
}

// We assume that a struct vnode exists for each pml4 entry, but that the
// actual pte page has not been cloned yet.
static errval_t cow_get_pdpt(struct pmap_x86 *pmap,
        genvaddr_t base, struct vnode **pdpt)
{
    DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base);
    errval_t err;
    struct vnode *root = &pmap->root;
    size_t entry = X86_64_PML4_BASE(base);
    *pdpt = find_vnode(root, entry);
    assert(*pdpt);
    DEBUG_COW("%s: is_cloned=%d\n", __FUNCTION__, (*pdpt)->is_cloned);
    if (!(*pdpt)->is_cloned) {
        // need to clone the ptable to ensure copy on write
        struct vnode *newptable;
        err = alloc_vnode(pmap, root, ObjType_VNode_x86_64_pdpt, entry,
                &newptable);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
        err = vnode_inherit_attr(newptable->v.cap,
                (*pdpt)->v.cap, 0, PTABLE_SIZE, PTABLE_ACCESS_READONLY,
                (*pdpt)->u.vnode.mcn, newptable->u.vnode.mcn);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_CLONE_VNODE);
        }
        memcpy(newptable->v.u.vnode.children, (*pdpt)->v.u.vnode.children,
                PTABLE_SIZE * sizeof(struct vnode *));
        newptable->is_cloned = true;
        *pdpt = newptable;
    }
    return SYS_ERR_OK;
}

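/**
 * \brief Return the (cloned) page directory covering `base', cloning the
 *        pdpt on the way down if necessary.
 */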
static errval_t cow_get_pdir(struct pmap_x86 *pmap,
        genvaddr_t base, struct vnode **pdir)
{
    DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base);
    errval_t err;
    struct vnode *pdpt = NULL;
    err = cow_get_pdpt(pmap, base, &pdpt);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdpt != NULL);
    assert(pdpt->is_cloned);

    return find_or_clone_vnode(pmap, pdpt,
            ObjType_VNode_x86_64_pdir,
            X86_64_PDPT_BASE(base), pdir);
}

/**
 * \brief Returns the (potentially cloned) page table covering `base'
 */
static errval_t cow_get_ptable(struct pmap_x86 *pmap,
        genvaddr_t base, struct vnode **ptable)
{
    DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base);
    errval_t err;
    struct vnode *pdir;
    err = cow_get_pdir(pmap, base, &pdir);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdir != NULL);
    assert(pdir->is_cloned);

    return find_or_clone_vnode(pmap, pdir,
            ObjType_VNode_x86_64_ptable,
            X86_64_PDIR_BASE(base), ptable);
}

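/**
 * \brief Translate vregion flags into x86-64 page-table entry flags.
 *
 * Pages are user-accessible and non-executable by default; write, execute,
 * no-cache, and write-combining flags are applied when the region is not a
 * guard region.
 */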
static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
        else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) {
            // PA4 is configured as write-combining
            pmap_flags |= PTABLE_ATTR_INDEX;
        }
    }

    return pmap_flags;
}

static exception_handler_fn next_handler = NULL;
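/**
 * \brief Page-fault handler implementing the copy-on-write.
 *
 * Exceptions other than write page faults are delegated to the previously
 * installed handler.  For a write fault, the page-table hierarchy down to
 * the faulting entry is cloned and the touched page is remapped writable
 * to a fresh frame via vnode_copy_remap().
 */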
static void cow_handler(enum exception_type type, int subtype, void *vaddr,
        arch_registers_state_t *regs)
{
    errval_t err;
    DEBUG_COW("got exception %d(%d) on %p\n", type, subtype, vaddr);
    if (next_handler && type != EXCEPT_PAGEFAULT) {
        next_handler(type, subtype, vaddr, regs);
        return;
    }
    assert(type == EXCEPT_PAGEFAULT);
    if (next_handler && subtype != PAGEFLT_WRITE) {
        next_handler(type, subtype, vaddr, regs);
        return;
    }
    assert(subtype == PAGEFLT_WRITE);
    uintptr_t addr = (uintptr_t) vaddr;
    uintptr_t faddr = addr & ~BASE_PAGE_MASK;
    // TODO: check whether the fault is inside a registered COW region
    DEBUG_COW("got write pagefault on %p, creating copy of page\n", vaddr);
    struct vnode *ptable = NULL;
    struct capref newframe;
    struct pmap_x86 *pmap = (struct pmap_x86 *)get_current_pmap();
    err = cow_get_ptable(pmap, faddr, &ptable);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "cow_get_ptable");
    }
    err = cow_get_page(&newframe, ObjType_Frame);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "cow_get_page");
    }
    struct capref mapping;
    err = slot_alloc(&mapping);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "slot_alloc");
    }
    err = vnode_copy_remap(ptable->v.cap, newframe, X86_64_PTABLE_BASE(faddr),
            vregion_to_pmap_flag(VREGION_FLAGS_READ_WRITE), 0, 1, mapping);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "vnode_copy_remap");
    }
}

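/**
 * \brief Install cow_handler as this thread's exception handler, chaining
 *        any previously installed handler through next_handler.
 */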
errval_t pmap_cow_init(void)
{
    errval_t err;
    err = thread_set_exception_handler(cow_handler, &next_handler, ex_stack,
            ex_stack+EX_STACK_SIZE, NULL, NULL);
    if (err_is_fail(err)) {
        return err;
    }
    return SYS_ERR_OK;
}


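/**
 * \brief Set up a copy-on-write alias for `vregion', returning the aliased
 *        address in `retbuf'.
 *
 * The PML4 entry covering the region is made read-only and cloned into a
 * fresh PML4 slot, so writes through either address range fault into
 * cow_handler(), which transparently remaps a private copy of the touched
 * page.  The region must lie within a single PML4 entry.
 *
 * A minimal usage sketch (assuming `vregion' is an already-mapped region
 * that fits in one PML4 entry):
 *
 *   void *alias;
 *   err = pmap_cow_init();                 // once per thread
 *   ...
 *   err = pmap_setup_cow(vregion, &alias);
 *   ((char *)alias)[0] = 42;               // first write to a page faults
 *                                          // and gets a private copy
 */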
errval_t pmap_setup_cow(struct vregion *vregion, void **retbuf)
{
    errval_t err;
    struct pmap *pmap = get_current_pmap();
    genvaddr_t vregion_base = vregion_get_base_addr(vregion);
    size_t vregion_size = vregion_get_size(vregion);

    size_t pml4e = X86_64_PML4_BASE(vregion_base);
    // no support for regions that do not fit in a single pml4 entry
    if (pml4e != X86_64_PML4_BASE(vregion_base + vregion_size - 1)) {
        debug_printf("vregion spanning pml4 entries\n");
        return LIB_ERR_NOT_IMPLEMENTED; //XXX
    }

    genvaddr_t new_vaddr;
    // XXX: right now this allocates whole pml4 entries
    err = pmap->f.determine_addr_raw(pmap, vregion_size, 0, &new_vaddr);
    if (err_is_fail(err)) {
        return err;
    }
    size_t new_pml4e = X86_64_PML4_BASE(new_vaddr);
    if ((new_pml4e << 39) != new_vaddr) {
        USER_PANIC("new_vaddr not pml4e aligned: %"PRIxGENVADDR"\n",
                new_vaddr);
    }
    DEBUG_COW("using pml4e %zu to alias pml4e %zu\n",
            new_pml4e, pml4e);

    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    // get the pml4 vnode for the region that we want to COW
    cow_root_pte = find_vnode(&x86->root, pml4e);
    if (!cow_root_pte) {
        USER_PANIC("cow_root_pte NULL");
    }
    DEBUG_COW("cow_root_pte:%p\n", cow_root_pte);

    // create a vnode for the new aliased mapping
    struct vnode *root_pte_copy = NULL;
    err = vnode_clone(x86, &x86->root, new_pml4e,
            &root_pte_copy, cow_root_pte);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "vnode_clone");
    }

    default_frame_bytes = L2_CNODE_SLOTS * BASE_PAGE_SIZE;
    DEBUG_COW("setting up frame pool (%zuMB) for remapping pages\n",
            default_frame_bytes / 1024 / 1024);
    err = get_ram_caps();
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "get_ram_caps");
    }

    //XXX: fix this once we have a better determine_addr()
    *retbuf = (void *)new_vaddr;

    return SYS_ERR_OK;
}