1#include <barrelfish/barrelfish.h> 2#include <barrelfish/except.h> 3#include <barrelfish_kpi/paging_target.h> 4#include <assert.h> 5 6#include "pmap_cow.h" 7#include "debug.h" 8 9/* the benchmark USER_PANICs when PMAP_COW is not set, instead of generating a 10 * preprocessor error here, so we can build BF with PMAP_LL. 11#ifndef PMAP_ARRAY 12#error need PMAP_ARRAY for pmap_cow to work 13#endif 14*/ 15 16static struct vnode *cow_root_pte = NULL; 17#define EX_STACK_SIZE 16384 18static char ex_stack[EX_STACK_SIZE]; 19 20// default alloc 1MB 21static size_t default_frame_bytes = 1ULL << 20; 22static struct capref current_ram, current_frame; 23static cslot_t current_slot_count = 0; 24size_t get_ram_caps_count = 0; 25static errval_t get_ram_caps(void) 26{ 27 get_ram_caps_count ++; 28 struct capref ram; 29 size_t alloc_bytes = default_frame_bytes; 30 errval_t err; 31ram_alloc_retry: 32 err = ram_alloc(&ram, log2ceil(alloc_bytes)); 33 if (err_no(err) == LIB_ERR_RAM_ALLOC_WRONG_SIZE) { 34 DEBUG_COW("early ram_alloc, retry with BASE_PAGE_BITS\n"); 35 // this is probably before we have a connection to init and are using 36 // ram_alloc_fixed() which can only do 4kB pages, so we don't yet 37 // touch the default allocation size 38 alloc_bytes = BASE_PAGE_SIZE; 39 err = ram_alloc(&ram, BASE_PAGE_BITS); 40 if (err_is_fail(err)) { 41 USER_PANIC_ERR(err, "early ram_alloc failed\n"); 42 return err; 43 } 44 } else if (err_no(err) == MM_ERR_NOT_FOUND && alloc_bytes > BASE_PAGE_SIZE) { 45 DEBUG_COW("err: %s\n", err_getstring(err)); 46 default_frame_bytes >>= 1; // halve default allocation size 47 DEBUG_COW("smaller allocation size: %zd\n", default_frame_bytes); 48 alloc_bytes = default_frame_bytes; 49 goto ram_alloc_retry; 50 } else if (err_is_fail(err)) { 51 debug_printf("error in ram_alloc: %s\n", err_getstring(err)); 52 return err; 53 } 54 // make sure we have a RAM cap that is a multiple of base pages 55 assert(alloc_bytes >= BASE_PAGE_SIZE); 56 assert(alloc_bytes % BASE_PAGE_SIZE == 0); 57 58 // retype into 4k caps in new cnode 59 cslot_t slots_needed = alloc_bytes / BASE_PAGE_SIZE; 60 current_slot_count = slots_needed; 61 debug_printf("slots_needed = %"PRIuCSLOT"\n", slots_needed); 62 if (slots_needed == 1) { 63 USER_PANIC("OOM"); 64 } 65 if (slots_needed < L2_CNODE_SLOTS) { 66 debug_printf("slowly running out of RAM: only got %d pages\n", slots_needed); 67 } 68 assert(slots_needed > 1); 69 assert(slots_needed <= L2_CNODE_SLOTS); 70 struct capref nextcncap; 71 struct cnoderef nextcn; 72 DEBUG_COW("%s: need CNode with %d slots\n", __FUNCTION__, slots_needed); 73 err = cnode_create_l2(&nextcncap, &nextcn); 74 if (err_is_fail(err)) { 75 DEBUG_ERR(err, "cnode_create"); 76 return err; 77 } 78 current_ram = (struct capref) { 79 .cnode = nextcn, 80 .slot = 0, 81 }; 82 // Create empty cnode for retypes to frames/ptables in cow_get_page 83 err = cnode_create_l2(&nextcncap, &nextcn); 84 if (err_is_fail(err)) { 85 DEBUG_ERR(err, "cnode_create"); 86 return err; 87 } 88 current_frame = (struct capref) { 89 .cnode = nextcn, 90 .slot = 0, 91 }; 92 93 // Retype into 4kB RAM caps 94 err = cap_retype(current_ram, ram, 0, ObjType_RAM, BASE_PAGE_SIZE, slots_needed); 95 if (err_is_fail(err)) { 96 debug_printf("error in cap_retype: %s\n", err_getstring(err)); 97 return err; 98 } 99 100 return SYS_ERR_OK; 101} 102 103size_t cow_get_page_count = 0; 104static errval_t cow_get_page(struct capref *f, enum objtype type) 105{ 106 cow_get_page_count++; 107 errval_t err; 108 assert(f); 109 if (current_slot_count == 0 || current_ram.slot == current_slot_count) { 110 err = get_ram_caps(); 111 if (err_is_fail(err)) { 112 return err; 113 } 114 } 115 err = cap_retype(current_frame, current_ram, 0, type, BASE_PAGE_SIZE, 1); 116 if (err_is_fail(err)) { 117 return err; 118 } 119 *f = current_frame; 120 current_frame.slot++; 121 current_ram.slot++; 122 return SYS_ERR_OK; 123} 124 125static errval_t alloc_vnode_noalloc(struct pmap_x86 *pmap, struct vnode *root, 126 struct capref vnodecap, uint32_t entry, 127 struct vnode **retvnode) 128{ 129 errval_t err; 130 131 struct vnode *newvnode = slab_alloc(&pmap->p.m.slab); 132 if (newvnode == NULL) { 133 return LIB_ERR_SLAB_ALLOC_FAIL; 134 } 135 newvnode->v.cap = vnodecap; 136 137 err = slot_alloc(&newvnode->v.mapping); 138 assert(err_is_ok(err)); 139 140 // Map it 141 err = vnode_map(root->v.cap, newvnode->v.cap, entry, 142 PTABLE_ACCESS_DEFAULT, 0, 1, newvnode->v.mapping); 143 if (err_is_fail(err)) { 144 return err_push(err, LIB_ERR_VNODE_MAP); 145 } 146 147 // The VNode meta data 148 newvnode->v.is_vnode = true; 149 newvnode->is_cloned = false; 150 newvnode->v.entry = entry; 151#ifdef PMAP_LL 152 newvnode->v.meta.next = root->v.u.vnode.children; 153 root->v.u.vnode.children = newvnode; 154 newvnode->v.u.vnode.children = NULL; 155#elif defined(PMAP_ARRAY) 156 memset(newvnode->v.u.vnode.children, 0, sizeof(struct vode *)*PTABLE_SIZE); 157 root->v.u.vnode.children[entry] = newvnode; 158#else 159#error Invalid pmap datastructure 160#endif 161 162 *retvnode = newvnode; 163 return SYS_ERR_OK; 164} 165 166static errval_t alloc_vnode(struct pmap_x86 *pmap, struct vnode *root, 167 enum objtype type, uint32_t entry, 168 struct vnode **retvnode) 169{ 170 errval_t err; 171 172 struct capref vnodecap; 173 // Get the VNode capability 174 err = cow_get_page(&vnodecap, type); 175 if (err_is_fail(err)) { 176 return err_push(err, LIB_ERR_VNODE_CREATE); 177 } 178 179 return alloc_vnode_noalloc(pmap, root, vnodecap, entry, retvnode); 180} 181 182#if defined(PMAP_LL) 183static struct vnode *find_vnode(struct vnode *root, uint16_t entry) 184{ 185 assert(root != NULL); 186 assert(root->v.is_vnode); 187 struct vnode *n; 188 189 for(n = root->v.u.vnode.children; n != NULL; n = n->v.meta.next) { 190 if (!n->v.is_vnode) { 191 // check whether entry is inside a large region 192 uint16_t end = n->v.entry + n->v.u.frame.pte_count; 193 if (n->v.entry <= entry && entry < end) { 194 //if (n->v.entry < entry) { 195 // debug_printf("%d \\in [%d, %d]\n", entry, n->v.entry, end); 196 //} 197 return n; 198 } 199 } 200 else if(n->v.entry == entry) { 201 // return n if n is a vnode and the indices match 202 return n; 203 } 204 } 205 return NULL; 206} 207 208static errval_t vnode_clone(struct pmap_x86 *x86, 209 struct vnode *parent, size_t entry, 210 struct vnode **dest, struct vnode *src) 211{ 212 return LIB_ERR_NOT_IMPLEMENTED; 213} 214 215#elif defined(PMAP_ARRAY) 216static struct vnode *find_vnode(struct vnode *root, uint16_t entry) 217{ 218 assert(root != NULL); 219 assert(root->v.is_vnode); 220 assert(entry < PTABLE_SIZE); 221 222 if (root->v.u.vnode.children) { 223 return root->v.u.vnode.children[entry]; 224 } else { 225 return NULL; 226 } 227} 228 229static errval_t vnode_clone(struct pmap_x86 *x86, 230 struct vnode *parent, size_t entry, 231 struct vnode **dest, struct vnode *src) 232{ 233 errval_t err; 234 // TODO: better change to r/o on pml4e or pdpt? 235 err = vnode_modify_flags(src->v.cap, 0, 236 PTABLE_SIZE, PTABLE_ACCESS_READONLY); 237 if (err_is_fail(err)) { 238 USER_PANIC_ERR(err, "vnode_modify_flags"); 239 } 240 assert(err_is_ok(err)); 241 // create copy of pdpt cap 242 struct capref copy; 243 err = slot_alloc(©); 244 if (err_is_fail(err)) { 245 USER_PANIC_ERR(err, "slot_alloc"); 246 } 247 assert(err_is_ok(err)); 248 err = cap_copy(copy, src->v.cap); 249 if (err_is_fail(err)) { 250 USER_PANIC_ERR(err, "cap_copy"); 251 } 252 assert(err_is_ok(err)); 253 254 err = alloc_vnode_noalloc(x86, parent, copy, 255 entry, dest); 256 if (err_is_fail(err)) { 257 USER_PANIC_ERR(err, "alloc_vnode_noalloc"); 258 } 259 assert(*dest); 260 // copy children metadata 261 // XXX: should copy caps to keep revoke safety 262 memcpy((*dest)->v.u.vnode.children, src->v.u.vnode.children, 263 PTABLE_SIZE * sizeof(struct vnode *)); 264 265 return SYS_ERR_OK; 266} 267#else 268#error Invalid pmap datastructure 269#endif 270 271size_t cow_pt_alloc_count = 0, cow_pd_alloc_count = 0, cow_pdpt_alloc_count = 0; 272static errval_t find_or_clone_vnode(struct pmap_x86 *pmap, 273 struct vnode *parent, enum objtype type, 274 size_t entry, struct vnode **ptable) 275{ 276 errval_t err; 277 *ptable = find_vnode(parent, entry); 278 if (*ptable == NULL || !(*ptable)->is_cloned) { 279 switch(type) { 280 case ObjType_VNode_x86_64_ptable: 281 cow_pt_alloc_count++; 282 break; 283 case ObjType_VNode_x86_64_pdir: 284 cow_pd_alloc_count++; 285 break; 286 case ObjType_VNode_x86_64_pdpt: 287 cow_pdpt_alloc_count++; 288 break; 289 default: 290 break; 291 } 292 } 293 if(*ptable == NULL) { 294 err = alloc_vnode(pmap, parent, type, entry, ptable); 295 if (err_is_fail(err)) { 296 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 297 } 298 } else if (!(*ptable)->is_cloned) { 299 // need to clone ptable to ensure copy on write 300 struct vnode *newptable; 301 err = alloc_vnode(pmap, parent, type, entry, &newptable); 302 if (err_is_fail(err)) { 303 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 304 } 305 err = vnode_inherit_attr(newptable->v.cap, 306 (*ptable)->v.cap, 0, PTABLE_SIZE, PTABLE_ACCESS_READONLY, 307 (*ptable)->u.vnode.mcn, newptable->u.vnode.mcn); 308 if (err_is_fail(err)) { 309 return err_push(err, LIB_ERR_PMAP_CLONE_VNODE); 310 } 311 memcpy(newptable->v.u.vnode.children, (*ptable)->v.u.vnode.children, 312 PTABLE_SIZE * sizeof(struct vnode *)); 313 newptable->is_cloned = true; 314 *ptable = newptable; 315 } 316 assert(*ptable); 317 318 return SYS_ERR_OK; 319} 320 321// assume that we created a struct vnode but didn't clone the actual pte page 322// for pml4 entries 323static errval_t cow_get_pdpt(struct pmap_x86 *pmap, 324 genvaddr_t base, struct vnode **pdpt) 325{ 326 DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base); 327 errval_t err; 328 struct vnode *root = &pmap->root; 329 size_t entry = X86_64_PML4_BASE(base); 330 *pdpt = find_vnode(root, entry); 331 assert(*pdpt); 332 DEBUG_COW("%s: is_cloned=%d\n", __FUNCTION__, (*pdpt)->is_cloned); 333 if (!(*pdpt)->is_cloned) { 334 // need to clone ptable to ensure copy on write 335 struct vnode *newptable; 336 err = alloc_vnode(pmap, root, ObjType_VNode_x86_64_pdpt, entry, 337 &newptable); 338 if (err_is_fail(err)) { 339 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 340 } 341 err = vnode_inherit_attr(newptable->v.cap, 342 (*pdpt)->v.cap, 0, PTABLE_SIZE, PTABLE_ACCESS_READONLY, 343 (*pdpt)->u.vnode.mcn, newptable->u.vnode.mcn); 344 if (err_is_fail(err)) { 345 return err_push(err, LIB_ERR_PMAP_CLONE_VNODE); 346 } 347 memcpy(newptable->v.u.vnode.children, (*pdpt)->v.u.vnode.children, 348 PTABLE_SIZE * sizeof(struct vnode *)); 349 newptable->is_cloned = true; 350 *pdpt = newptable; 351 } 352 return SYS_ERR_OK; 353} 354 355static errval_t cow_get_pdir(struct pmap_x86 *pmap, 356 genvaddr_t base, struct vnode **pdir) 357{ 358 DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base); 359 errval_t err; 360 struct vnode *pdpt = NULL; 361 err = cow_get_pdpt(pmap, base, &pdpt); 362 if (err_is_fail(err)) { 363 return err; 364 } 365 assert(pdpt != NULL); 366 assert(pdpt->is_cloned); 367 368 return find_or_clone_vnode(pmap, pdpt, 369 ObjType_VNode_x86_64_pdir, 370 X86_64_PDPT_BASE(base), pdir); 371} 372/** 373 * \brief Returns the vnode (potentially cloned) for `base' 374 */ 375static errval_t cow_get_ptable(struct pmap_x86 *pmap, 376 genvaddr_t base, struct vnode **ptable) 377{ 378 DEBUG_COW("%s: %"PRIxGENVADDR"\n", __FUNCTION__, base); 379 errval_t err; 380 struct vnode *pdir; 381 err = cow_get_pdir(pmap, base, &pdir); 382 if (err_is_fail(err)) { 383 return err; 384 } 385 assert(pdir != NULL); 386 assert(pdir->is_cloned); 387 388 return find_or_clone_vnode(pmap, pdir, 389 ObjType_VNode_x86_64_ptable, 390 X86_64_PDIR_BASE(base), ptable); 391} 392 393static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags) 394{ 395 paging_x86_64_flags_t pmap_flags = 396 PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE; 397 398 if (!(vregion_flags & VREGION_FLAGS_GUARD)) { 399 if (vregion_flags & VREGION_FLAGS_WRITE) { 400 pmap_flags |= PTABLE_READ_WRITE; 401 } 402 if (vregion_flags & VREGION_FLAGS_EXECUTE) { 403 pmap_flags &= ~PTABLE_EXECUTE_DISABLE; 404 } 405 if (vregion_flags & VREGION_FLAGS_NOCACHE) { 406 pmap_flags |= PTABLE_CACHE_DISABLED; 407 } 408 else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) { 409 // PA4 is configured as write-combining 410 pmap_flags |= PTABLE_ATTR_INDEX; 411 } 412 } 413 414 return pmap_flags; 415} 416 417static exception_handler_fn next_handler = NULL; 418static void cow_handler(enum exception_type type, int subtype, void *vaddr, 419 arch_registers_state_t *regs) 420{ 421 errval_t err; 422 DEBUG_COW("got exception %d(%d) on %p\n", type, subtype, vaddr); 423 if (next_handler && type != EXCEPT_PAGEFAULT) { 424 next_handler(type, subtype, vaddr, regs); 425 } 426 assert(type == EXCEPT_PAGEFAULT); 427 if (next_handler && subtype != PAGEFLT_WRITE) { 428 next_handler(type, subtype, vaddr, regs); 429 } 430 assert(subtype == PAGEFLT_WRITE); 431 uintptr_t addr = (uintptr_t) vaddr; 432 uintptr_t faddr = addr & ~BASE_PAGE_MASK; 433 // TODO: check whether fault inside a registered COW region 434 DEBUG_COW("got write pagefault on %p, creating copy of page\n", vaddr); 435 struct vnode *ptable = NULL; 436 struct capref newframe; 437 struct pmap_x86 *pmap = (struct pmap_x86 *)get_current_pmap(); 438 err = cow_get_ptable(pmap, faddr, &ptable); 439 if (err_is_fail(err)) { 440 USER_PANIC_ERR(err, "cow_get_ptable"); 441 } 442 err = cow_get_page(&newframe, ObjType_Frame); 443 if (err_is_fail(err)) { 444 USER_PANIC_ERR(err, "cow_get_page"); 445 } 446 struct capref mapping; 447 err = slot_alloc(&mapping); 448 assert(err_is_ok(err)); 449 err = vnode_copy_remap(ptable->v.cap, newframe, X86_64_PTABLE_BASE(faddr), 450 vregion_to_pmap_flag(VREGION_FLAGS_READ_WRITE), 0, 1, mapping); 451 if (err_is_fail(err)) { 452 USER_PANIC_ERR(err, "frame_alloc"); 453 } 454} 455 456errval_t pmap_cow_init(void) 457{ 458 errval_t err; 459 err = thread_set_exception_handler(cow_handler, &next_handler, ex_stack, 460 ex_stack+EX_STACK_SIZE, NULL, NULL); 461 assert(err_is_ok(err)); 462 return SYS_ERR_OK; 463} 464 465 466errval_t pmap_setup_cow(struct vregion *vregion, void **retbuf) 467{ 468 errval_t err; 469 struct pmap *pmap = get_current_pmap(); 470 genvaddr_t vregion_base = vregion_get_base_addr(vregion); 471 size_t vregion_size = vregion_get_size(vregion); 472 473 size_t pml4e = X86_64_PML4_BASE(vregion_base); 474 // no support for regions that are not in a single pml4e 475 if (pml4e != X86_64_PML4_BASE(vregion_base + vregion_size - 1)) { 476 debug_printf("vregion spanning pml4 entries\n"); 477 return LIB_ERR_NOT_IMPLEMENTED; //XXX 478 } 479 480 genvaddr_t new_vaddr; 481 // XXX: right now this allocates pml4 entries 482 err = pmap->f.determine_addr_raw(pmap, vregion_size, 0, &new_vaddr); 483 assert(err_is_ok(err)); 484 size_t new_pml4e = X86_64_PML4_BASE(new_vaddr); 485 if ((new_pml4e << 39) != new_vaddr) { 486 USER_PANIC("new_vaddr not pml4e aligned: %"PRIxGENVADDR"\n", 487 new_vaddr); 488 } 489 assert((new_pml4e << 39) == new_vaddr); 490 DEBUG_COW("using pml4e %zu to alias pml4e %zu\n", 491 new_pml4e, pml4e); 492 493 struct pmap_x86 *x86 = (struct pmap_x86*)pmap; 494 495 // get pml4 vnode for region that we wanna cow 496 cow_root_pte = find_vnode(&x86->root, pml4e); 497 if (!cow_root_pte) { 498 USER_PANIC("cow_root_pte NULL"); 499 } 500 DEBUG_COW("cow_root_pte:%p\n", cow_root_pte); 501 assert(cow_root_pte); 502 503 // create vnode for new aliased mapping 504 struct vnode *root_pte_copy = NULL; 505 err = vnode_clone(x86, &x86->root, new_pml4e, 506 &root_pte_copy, cow_root_pte); 507 if (err_is_fail(err)) { 508 USER_PANIC_ERR(err, "vnode_clone"); 509 } 510 assert(err_is_ok(err)); 511 512 default_frame_bytes = L2_CNODE_SLOTS * BASE_PAGE_SIZE; 513 DEBUG_COW("setting up frame pool (%uMB) for remapping pages\n", 514 default_frame_bytes / 1024 / 1024); 515 err = get_ram_caps(); 516 if (err_is_fail(err)) { 517 USER_PANIC_ERR(err, "get_frames"); 518 } 519 520 //XXX: fix this if we have better determine_addr() 521 *retbuf = (void *)new_vaddr; 522 523 return err; 524} 525