/**
 * \file
 * \brief pmap management
 *
 * x86_64-specific management of page tables.
 *
 * Warning: This code is coupled with the code in slot_alloc/ and pinned.c.
 *
 * The maximum number of slots required to map a BASE_PAGE_SIZE
 * sized page is the number of page table levels that may have to be
 * created (3) plus one for the frame mapping itself.
 * The sum for x86_64 is 4.
 *
 * Warning: Additional slots will be required to map a BASE_PAGE_SIZE-sized
 * page if we also track the actual frames that are mapped.
 * Currently this is not the case.
 */

/*
 * Copyright (c) 2009-2013 ETH Zurich.
 * Copyright (c) 2014 HP Labs.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <barrelfish/barrelfish.h>
#include <barrelfish/dispatch.h>
#include "target/x86/pmap_x86.h"
#include <stdio.h>

// Size of virtual region mapped by a single PML4 entry
#define PML4_MAPPING_SIZE ((genvaddr_t)512*512*512*BASE_PAGE_SIZE)

// Location and size of virtual address space reserved for mapping
// frames backing refill_slabs
#define META_DATA_RESERVED_BASE (PML4_MAPPING_SIZE * (disp_get_core_id() + 1))
#define META_DATA_RESERVED_SIZE (X86_64_BASE_PAGE_SIZE * 80000)

/**
 * \brief Translate generic vregion flags to architecture-specific pmap flags
 */
static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags)
{
    paging_x86_64_flags_t pmap_flags =
        PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE;

    if (!(vregion_flags & VREGION_FLAGS_GUARD)) {
        if (vregion_flags & VREGION_FLAGS_WRITE) {
            pmap_flags |= PTABLE_READ_WRITE;
        }
        if (vregion_flags & VREGION_FLAGS_EXECUTE) {
            pmap_flags &= ~PTABLE_EXECUTE_DISABLE;
        }
        if (vregion_flags & VREGION_FLAGS_NOCACHE) {
            pmap_flags |= PTABLE_CACHE_DISABLED;
        }
        else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) {
            // PA4 (PAT entry 4) is configured as write-combining
            pmap_flags |= PTABLE_ATTR_INDEX;
        }
    }

    return pmap_flags;
}

// Returns whether va1 and va2 share a page directory entry.
// Not using the X86_64_PDIR_BASE() macro as this would give false positives
// (same entry in different directories).
static inline bool is_same_pdir(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_LARGE_PAGE_BITS) == ((va2-1)>>X86_64_LARGE_PAGE_BITS);
}
// Returns whether va1 and va2 share a page directory pointer table entry
static inline bool is_same_pdpt(genvaddr_t va1, genvaddr_t va2)
{
    return (va1>>X86_64_HUGE_PAGE_BITS) == ((va2-1)>>X86_64_HUGE_PAGE_BITS);
}
// Returns whether va1 and va2 share a page map level 4 entry
static inline bool is_same_pml4(genvaddr_t va1, genvaddr_t va2)
{
    // the base macros work here as we only have one pml4.
    return X86_64_PML4_BASE(va1) == X86_64_PML4_BASE(va2-1);
}
// size indicates how many bits to shift
static inline genvaddr_t get_addr_prefix(genvaddr_t va, uint8_t size)
{
    return va >> size;
}
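/*
 * Worked example (illustrative only): with 4 KiB pages, one leaf page table
 * spans 2 MiB of virtual address space, so do_map() below may take its fast
 * path only if the whole region falls under a single page-directory entry:
 *
 *   is_same_pdir(0x200000, 0x400000) == true   // [2 MiB, 4 MiB): one ptable
 *   is_same_pdir(0x1ff000, 0x201000) == false  // straddles two ptables
 *
 * The second argument is an exclusive end address, which is why all three
 * predicates compare against va2 - 1.
 */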
static inline bool is_large_page(struct vnode *p)
{
    return !p->is_vnode && p->u.frame.flags & VREGION_FLAGS_LARGE;
}

static inline bool is_huge_page(struct vnode *p)
{
    return !p->is_vnode && p->u.frame.flags & VREGION_FLAGS_HUGE;
}

/**
 * \brief Returns the vnode for the pdpt mapping a given vspace address
 */
static inline errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdpt)
{
    errval_t err;
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    if((*pdpt = find_vnode(root, X86_64_PML4_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, root, ObjType_VNode_x86_64_pdpt,
                          X86_64_PML4_BASE(base), pdpt);
        // Someone else may have installed the entry concurrently; in that
        // case alloc_vnode fails with exactly this error stack and we simply
        // retry the lookup.
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdpt = find_vnode(root, X86_64_PML4_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdpt");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address
 */
static inline errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base,
                                struct vnode **pdir)
{
    errval_t err;
    struct vnode *pdpt;
    err = get_pdpt(pmap, base, &pdpt);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdpt != NULL);

    // PDPT mapping
    if((*pdir = find_vnode(pdpt, X86_64_PDPT_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdpt, ObjType_VNode_x86_64_pdir,
                          X86_64_PDPT_BASE(base), pdir);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*pdir = find_vnode(pdpt, X86_64_PDPT_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for pdir");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief Returns the vnode for the page table mapping a given vspace address
 */
static inline errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base,
                                  struct vnode **ptable)
{
    errval_t err;
    struct vnode *pdir;
    err = get_pdir(pmap, base, &pdir);
    if (err_is_fail(err)) {
        return err;
    }
    assert(pdir != NULL);

    // PDIR mapping
    if ((*ptable = find_vnode(pdir, X86_64_PDIR_BASE(base))) == NULL) {
        err = alloc_vnode(pmap, pdir, ObjType_VNode_x86_64_ptable,
                          X86_64_PDIR_BASE(base), ptable);
        errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP);
        if (err == expected_concurrent) {
            if ((*ptable = find_vnode(pdir, X86_64_PDIR_BASE(base))) != NULL) {
                return SYS_ERR_OK;
            }
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "alloc_vnode for ptable");
            return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE);
        }
    }

    return SYS_ERR_OK;
}
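// The sketch below (disabled, illustrative only) shows how the allocating
// walk above is typically used: do_single_map() calls get_ptable() to reach
// the leaf table for a 4 KiB mapping, with intermediate pdpt/pdir vnodes
// created on demand along the way.
#if 0
static errval_t example_get_leaf(struct pmap_x86 *pmap, genvaddr_t vaddr)
{
    struct vnode *ptable = NULL;
    errval_t err = get_ptable(pmap, vaddr, &ptable);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
    }
    assert(ptable->is_vnode);
    // ptable is now the vnode covering the 2 MiB region containing vaddr
    return SYS_ERR_OK;
}
#endif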
/**
 * \brief Returns the vnode for the page directory pointer table mapping a
 * given vspace address
 */
static inline struct vnode *find_pdpt(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *root = &pmap->root;
    assert(root != NULL);

    // PML4 mapping
    return find_vnode(root, X86_64_PML4_BASE(base));
}

/**
 * \brief Returns the vnode for the page directory mapping a given vspace
 * address, without performing the allocations that get_pdir() does
 */
static inline struct vnode *find_pdir(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdpt = find_pdpt(pmap, base);

    if (pdpt) {
        // PDPT mapping
        return find_vnode(pdpt, X86_64_PDPT_BASE(base));
    } else {
        return NULL;
    }
}

/**
 * \brief Returns the vnode for the page table mapping a given vspace address,
 * without performing the allocations that get_ptable() does
 */
static inline struct vnode *find_ptable(struct pmap_x86 *pmap, genvaddr_t base)
{
    struct vnode *pdir = find_pdir(pmap, base);

    if (pdir) {
        // PDIR mapping
        return find_vnode(pdir, X86_64_PDIR_BASE(base));
    } else {
        return NULL;
    }
}
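/*
 * Note: the find_* helpers above are the non-allocating counterparts of the
 * get_* helpers. They return NULL instead of growing the paging hierarchy,
 * which makes them safe to use on lookup and unmap paths that must not
 * consume slabs or capability slots.
 */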
static errval_t do_single_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                              genvaddr_t vend, struct capref frame,
                              size_t offset, size_t pte_count,
                              vregion_flags_t flags)
{
    if (pte_count == 0) {
        debug_printf("do_single_map: pte_count == 0, called from %p\n",
                     __builtin_return_address(0));
        return SYS_ERR_OK;
    }
    assert(pte_count > 0);
    // translate flags
    paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);

    // Get the paging structure and set paging-relevant parameters
    struct vnode *ptable = NULL;
    errval_t err;
    size_t table_base;

    // get the right paging table and address part
    if (flags & VREGION_FLAGS_LARGE) {
        // large 2M pages, mapped into pdir
        err = get_pdir(pmap, vaddr, &ptable);
        table_base = X86_64_PDIR_BASE(vaddr);
    } else if (flags & VREGION_FLAGS_HUGE) {
        // huge 1GB pages, mapped into pdpt
        err = get_pdpt(pmap, vaddr, &ptable);
        table_base = X86_64_PDPT_BASE(vaddr);
    } else {
        // normal 4K pages, mapped into ptable
        err = get_ptable(pmap, vaddr, &ptable);
        table_base = X86_64_PTABLE_BASE(vaddr);
    }
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_GET_PTABLE);
    }
    assert(ptable->is_vnode);

    // check if there is an overlapping mapping
    if (has_vnode(ptable, table_base, pte_count, false)) {
        if (has_vnode(ptable, table_base, pte_count, true)) {
            printf("page already exists in 0x%"
                   PRIxGENVADDR"--0x%"PRIxGENVADDR"\n", vaddr, vend);
            return LIB_ERR_PMAP_EXISTING_MAPPING;
        } else {
            // Clean out empty page tables. We do this here because we
            // benefit from having the page tables in place when doing lots
            // of small mappings.
            remove_empty_vnodes(pmap, ptable, table_base, pte_count);
        }
    }

    // setup userspace mapping
    struct vnode *page = slab_alloc(&pmap->slab);
    assert(page);
    page->is_vnode = false;
    page->entry = table_base;
    page->next = ptable->u.vnode.children;
    ptable->u.vnode.children = page;
    page->u.frame.cap = frame;
    page->u.frame.offset = offset;
    page->u.frame.flags = flags;
    page->u.frame.pte_count = pte_count;

    err = pmap->p.slot_alloc->alloc(pmap->p.slot_alloc, &page->mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC);
    }

    // do map
    assert(!capref_is_null(ptable->u.vnode.invokable));
    err = vnode_map(ptable->u.vnode.invokable, frame, table_base,
                    pmap_flags, offset, pte_count, page->mapping);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VNODE_MAP);
    }

    return SYS_ERR_OK;
}
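/*
 * Resource accounting for do_single_map(): each call consumes one slab
 * (struct vnode) for the page metadata and one capability slot for the
 * mapping cap, plus whatever get_ptable()/get_pdir()/get_pdpt() had to
 * allocate for missing intermediate tables. This is the coupling with
 * slot_alloc/ mentioned in the header comment of this file.
 */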
/**
 * \brief Called when enough slabs exist for the given mapping
 */
static errval_t do_map(struct pmap_x86 *pmap, genvaddr_t vaddr,
                       struct capref frame, size_t offset, size_t size,
                       vregion_flags_t flags, size_t *retoff, size_t *retsize)
{
    errval_t err;

    // determine page size and relevant address part
    size_t page_size = X86_64_BASE_PAGE_SIZE;
    size_t table_base = X86_64_PTABLE_BASE(vaddr);
    uint8_t map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
    bool debug_out = false;

    // get base address and size of frame
    struct frame_identity fi;
    err = frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_DO_MAP);
    }

    if ((flags & VREGION_FLAGS_HUGE) &&
        (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
        fi.bytes >= X86_64_HUGE_PAGE_SIZE &&
        ((fi.base & X86_64_HUGE_PAGE_MASK) == 0))
    {
        // huge page branch (1GB)
        page_size = X86_64_HUGE_PAGE_SIZE;
        table_base = X86_64_PDPT_BASE(vaddr);
        map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out = false;
        // remove large flag, if we're doing a huge mapping
        flags &= ~VREGION_FLAGS_LARGE;
    } else if ((flags & VREGION_FLAGS_LARGE) &&
               (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
               fi.bytes >= X86_64_LARGE_PAGE_SIZE &&
               ((fi.base & X86_64_LARGE_PAGE_MASK) == 0))
    {
        // large page branch (2MB)
        page_size = X86_64_LARGE_PAGE_SIZE;
        table_base = X86_64_PDIR_BASE(vaddr);
        map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
        debug_out = false;
    } else {
        // remove large/huge flags
        flags &= ~(VREGION_FLAGS_LARGE|VREGION_FLAGS_HUGE);
    }

    // round to the next full page and calculate end address and #ptes
    size = ROUND_UP(size, page_size);
    size_t pte_count = DIVIDE_ROUND_UP(size, page_size);
    genvaddr_t vend = vaddr + size;

    if (offset+size > fi.bytes) {
        debug_printf("do_map: offset=%zu; size=%zu; frame size=%zu\n",
                     offset, size, fi.bytes);
        return LIB_ERR_PMAP_FRAME_SIZE;
    }

#if 0
    if (true || debug_out) {
        genpaddr_t paddr = fi.base + offset;

        debug_printf("do_map: 0x%"
                PRIxGENVADDR"--0x%"PRIxGENVADDR" -> 0x%"PRIxGENPADDR
                "; pte_count = %zd; frame bytes = 0x%zx; page size = 0x%zx\n",
                vaddr, vend, paddr, pte_count, fi.bytes, page_size);
    }
#endif

    // all mappings on one leaf table?
    if (is_same_pdir(vaddr, vend) ||
        (flags & VREGION_FLAGS_LARGE && is_same_pdpt(vaddr, vend)) ||
        (flags & VREGION_FLAGS_HUGE && is_same_pml4(vaddr, vend))) {
        // fast path
        if (debug_out) {
            debug_printf(" do_map: fast path: %zd\n", pte_count);
        }
        err = do_single_map(pmap, vaddr, vend, frame, offset, pte_count, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }
    }
    else { // multiple leaf page tables
        // first leaf
        uint32_t c = X86_64_PTABLE_SIZE - table_base;
        if (debug_out) {
            debug_printf(" do_map: slow path: first leaf %"PRIu32"\n", c);
        }
        genvaddr_t temp_end = vaddr + c * page_size;
        err = do_single_map(pmap, vaddr, temp_end, frame, offset, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        // map full leaves
        while (get_addr_prefix(temp_end, map_bits) <
               get_addr_prefix(vend, map_bits))
        {
            // update vars
            vaddr = temp_end;
            temp_end = vaddr + X86_64_PTABLE_SIZE * page_size;
            offset += c * page_size;
            c = X86_64_PTABLE_SIZE;

            // do mapping
            if (debug_out) {
                debug_printf(" do_map: slow path: full leaf\n");
            }
            err = do_single_map(pmap, vaddr, temp_end, frame, offset,
                                X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }

        // map remaining part
        offset += c * page_size;

        // calculate remaining pages (subtract ptable bits from map_bits to
        // get #ptes of the last level instead of the second-to-last)
        c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) -
            get_addr_prefix(temp_end, map_bits-X86_64_PTABLE_BITS);

        if (c) {
            // do mapping
            if (debug_out) {
                debug_printf("do_map: slow path: last leaf %"PRIu32"\n", c);
            }
            err = do_single_map(pmap, temp_end, vend, frame, offset, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_DO_MAP);
            }
        }
    }

    if (retoff) {
        *retoff = offset;
    }
    if (retsize) {
        *retsize = size;
    }
    return SYS_ERR_OK;
}

/// Compute an upper limit on the number of slabs required to perform a mapping
static size_t max_slabs_for_mapping(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_BASE_PAGE_SIZE);
    size_t max_ptable = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdir = DIVIDE_ROUND_UP(max_ptable, X86_64_PTABLE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages + max_ptable + max_pdir + max_pdpt;
}

static size_t max_slabs_for_mapping_large(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_LARGE_PAGE_SIZE);
    size_t max_pdir = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE);
    return max_pages + max_pdir + max_pdpt;
}

static size_t max_slabs_for_mapping_huge(size_t bytes)
{
    size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_HUGE_PAGE_SIZE);
    size_t max_pdpt = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE);
    return max_pages + max_pdpt;
}
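/*
 * Worked example for the 4 KiB case: mapping 8 MiB gives
 *   max_pages  = 8 MiB / 4 KiB    = 2048
 *   max_ptable = ceil(2048 / 512) = 4
 *   max_pdir   = ceil(4 / 512)    = 1
 *   max_pdpt   = ceil(1 / 512)    = 1
 * so max_slabs_for_mapping(8 MiB) = 2054. Counting one slab per page as well
 * as per page table is a safe overestimate: do_single_map() allocates only
 * one struct vnode per contiguous run of ptes in a leaf table.
 */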
/**
 * \brief Refill slabs used for metadata
 *
 * \param pmap    The pmap to refill in
 * \param request The number of slabs the allocator must have
 *                when the function returns
 *
 * When the current pmap is initialized,
 * it reserves some virtual address space for metadata.
 * This reserved address space is used here.
 *
 * Can only be called for the current pmap.
 * Will recursively call into itself until it has enough slabs.
 */
static errval_t refill_slabs(struct pmap_x86 *pmap, size_t request)
{
    errval_t err;

    /* Keep looping until we have #request slabs */
    while (slab_freecount(&pmap->slab) < request) {
        // Amount of bytes required for #request
        size_t bytes = SLAB_STATIC_SIZE(request - slab_freecount(&pmap->slab),
                                        sizeof(struct vnode));

        /* Get a frame of that size */
        struct capref cap;
        err = frame_alloc(&cap, bytes, &bytes);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_FRAME_ALLOC);
        }

        /* If we do not have enough slabs to map the frame in, recurse */
        size_t required_slabs_for_frame = max_slabs_for_mapping(bytes);
        if (slab_freecount(&pmap->slab) < required_slabs_for_frame) {
            // If we recurse, we require more slabs than are needed to map a
            // single page
            assert(required_slabs_for_frame > 4);

            err = refill_slabs(pmap, required_slabs_for_frame);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        }

        /* Perform mapping */
        genvaddr_t genvaddr = pmap->vregion_offset;
        pmap->vregion_offset += (genvaddr_t)bytes;
        assert(pmap->vregion_offset < vregion_get_base_addr(&pmap->vregion) +
               vregion_get_size(&pmap->vregion));

        err = do_map(pmap, genvaddr, cap, 0, bytes,
                     VREGION_FLAGS_READ_WRITE, NULL, NULL);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_DO_MAP);
        }

        /* Grow the slab */
        lvaddr_t buf = vspace_genvaddr_to_lvaddr(genvaddr);
        slab_grow(&pmap->slab, (void*)buf, bytes);
    }

    return SYS_ERR_OK;
}

/// Minimally refill the slab allocator
static errval_t min_refill_slabs(struct pmap_x86 *pmap)
{
    return refill_slabs(pmap, 5);
}
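/*
 * Why refill_slabs() terminates: the frame allocated for N slabs needs on
 * the order of N/512 page tables to map it, so each recursive call requests
 * far fewer slabs than its caller, and the recursion quickly reaches a
 * request that the current free count can satisfy.
 */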
/**
 * \brief Create page mappings
 *
 * \param pmap    The pmap object
 * \param vaddr   The virtual address to create the mapping for
 * \param frame   The frame cap to map in
 * \param offset  Offset into the frame cap
 * \param size    Size of the mapping
 * \param flags   Flags for the mapping
 * \param retoff  If non-NULL, filled in with adjusted offset of mapped region
 * \param retsize If non-NULL, filled in with adjusted size of mapped region
 */
static errval_t map(struct pmap *pmap, genvaddr_t vaddr, struct capref frame,
                    size_t offset, size_t size, vregion_flags_t flags,
                    size_t *retoff, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    struct frame_identity fi;
    err = frame_identify(frame, &fi);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY);
    }

    size_t max_slabs;
    // Adjust the parameters to page boundaries
    // TODO: overestimating the needed slabs shouldn't hurt much in the long
    // run, and would keep the code easier to read and possibly faster due to
    // less branching
    if ((flags & VREGION_FLAGS_LARGE) &&
        (vaddr & X86_64_LARGE_PAGE_MASK) == 0 &&
        (fi.base & X86_64_LARGE_PAGE_MASK) == 0 &&
        fi.bytes >= offset+size) {
        // case large pages (2MB)
        size   += LARGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, LARGE_PAGE_SIZE);
        offset -= LARGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_large(size);
    } else if ((flags & VREGION_FLAGS_HUGE) &&
               (vaddr & X86_64_HUGE_PAGE_MASK) == 0 &&
               (fi.base & X86_64_HUGE_PAGE_MASK) == 0 &&
               fi.bytes >= offset+size) {
        // case huge pages (1GB)
        size   += HUGE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, HUGE_PAGE_SIZE);
        offset -= HUGE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping_huge(size);
    } else {
        // case normal pages (4KB)
        size   += BASE_PAGE_OFFSET(offset);
        size    = ROUND_UP(size, BASE_PAGE_SIZE);
        offset -= BASE_PAGE_OFFSET(offset);
        max_slabs = max_slabs_for_mapping(size);
    }

    // Refill slab allocator if necessary
    size_t slabs_free = slab_freecount(&x86->slab);

    max_slabs += 5; // minimum amount required to map a page
    if (slabs_free < max_slabs) {
        struct pmap *mypmap = get_current_pmap();
        if (pmap == mypmap) {
            err = refill_slabs(x86, max_slabs);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_SLAB_REFILL);
            }
        } else {
            // refill_slabs() only works on the current pmap, so for a
            // foreign pmap we grow the slab allocator from the heap instead
            size_t bytes = SLAB_STATIC_SIZE(max_slabs - slabs_free,
                                            sizeof(struct vnode));
            void *buf = malloc(bytes);
            if (!buf) {
                return LIB_ERR_MALLOC_FAIL;
            }
            slab_grow(&x86->slab, buf, bytes);
        }
    }

    err = do_map(x86, vaddr, frame, offset, size, flags, retoff, retsize);
    return err;
}

struct find_mapping_info {
    struct vnode *page_table;
    struct vnode *page;
    size_t page_size;
    size_t table_base;
    uint8_t map_bits;
};

/**
 * \brief Find the mapping for `vaddr` in `pmap`.
 * \arg pmap the pmap to search in
 * \arg vaddr the virtual address to search for
 * \arg info if non-NULL, filled in with the last-level page table and page
 *      metadata that were found, if any
 * \returns `true` iff we found a mapping for vaddr
 */
static bool find_mapping(struct pmap_x86 *pmap, genvaddr_t vaddr,
                         struct find_mapping_info *info)
{
    struct vnode *pdpt = NULL, *pdir = NULL, *pt = NULL, *page = NULL;

    size_t page_size = 0;
    size_t table_base = 0;
    uint8_t map_bits = 0;

    // find page and last-level page table (can be pdir or pdpt)
    if ((pdpt = find_pdpt(pmap, vaddr)) != NULL) {
        page = find_vnode(pdpt, X86_64_PDPT_BASE(vaddr));
        if (page && page->is_vnode) { // not 1G pages
            pdir = page;
            page = find_vnode(pdir, X86_64_PDIR_BASE(vaddr));
            if (page && page->is_vnode) { // not 2M pages
                pt = page;
                page = find_vnode(pt, X86_64_PTABLE_BASE(vaddr));
                page_size = X86_64_BASE_PAGE_SIZE;
                table_base = X86_64_PTABLE_BASE(vaddr);
                map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS;
            } else if (page) {
                assert(is_large_page(page));
                pt = pdir;
                page_size = X86_64_LARGE_PAGE_SIZE;
                table_base = X86_64_PDIR_BASE(vaddr);
                map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS;
            }
        } else if (page) {
            assert(is_huge_page(page));
            pt = pdpt;
            page_size = X86_64_HUGE_PAGE_SIZE;
            table_base = X86_64_PDPT_BASE(vaddr);
            map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS;
        }
    }
    if (info) {
        info->page_table = pt;
        info->page = page;
        info->page_size = page_size;
        info->table_base = table_base;
        info->map_bits = map_bits;
    }
    if (pt && page) {
        return true;
    } else {
        return false;
    }
}
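/*
 * Interpretation of struct find_mapping_info after a successful
 * find_mapping(): page_table is the vnode of the last-level table (ptable,
 * pdir, or pdpt, depending on page size), and page is the leaf frame
 * metadata. page_size, table_base, and map_bits give, respectively, the
 * mapping granularity, the entry index of vaddr within that table, and the
 * number of address bits covered by one full leaf table.
 */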
static errval_t do_single_unmap(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                size_t pte_count)
{
    errval_t err;
    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }
    assert(info.page_table && info.page_table->is_vnode && info.page && !info.page->is_vnode);

    // Note: this is a no-op if pte_count does not match the pte count of the
    // original mapping; partially unmapping a single mapping is not supported
    if (info.page->u.frame.pte_count == pte_count) {
        err = vnode_unmap(info.page_table->u.vnode.cap, info.page->mapping);
        if (err_is_fail(err)) {
            printf("vnode_unmap returned error: %s (%d)\n",
                   err_getstring(err), err_no(err));
            return err_push(err, LIB_ERR_VNODE_UNMAP);
        }

        // delete & free page->mapping after doing vnode_unmap()
        err = cap_delete(info.page->mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_CAP_DELETE);
        }
        err = pmap->p.slot_alloc->free(pmap->p.slot_alloc, info.page->mapping);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_SLOT_FREE);
        }
        // Free up the resources
        remove_vnode(info.page_table, info.page);
        slab_free(&pmap->slab, info.page);
    }

    return SYS_ERR_OK;
}

/**
 * \brief Remove page mappings
 *
 * \param pmap    The pmap object
 * \param vaddr   The start of the virtual region to remove
 * \param size    The size of the virtual region to remove
 * \param retsize If non-NULL, filled in with the actual size removed
 */
static errval_t unmap(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                      size_t *retsize)
{
    //printf("[unmap] 0x%"PRIxGENVADDR", %zu\n", vaddr, size);
    errval_t err, ret = SYS_ERR_OK;
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    // determine if we are unmapping a larger page
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        // TODO: better error --> LIB_ERR_PMAP_NOT_MAPPED
        return LIB_ERR_PMAP_UNMAP;
    }

    assert(!info.page->is_vnode);

    if (info.page->entry > info.table_base) {
        debug_printf("trying to partially unmap region\n");
        // XXX: error code
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page)))
    {
        // fast path
        err = do_single_unmap(x86, vaddr, size / info.page_size);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error fast path\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }
    }
    else { // slow path
        // unmap first leaf
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;

        err = do_single_unmap(x86, vaddr, c);
        if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
            printf("error first leaf\n");
            return err_push(err, LIB_ERR_PMAP_UNMAP);
        }

        // unmap full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_unmap(x86, vaddr, X86_64_PTABLE_SIZE);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error while loop\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
            vaddr += c * info.page_size;
        }

        // unmap remaining part
        // subtract ptable bits from map_bits to get #ptes in the last-level
        // table instead of the second-to-last
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        assert(c < X86_64_PTABLE_SIZE);
        if (c) {
            err = do_single_unmap(x86, vaddr, c);
            if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) {
                printf("error remaining part\n");
                return err_push(err, LIB_ERR_PMAP_UNMAP);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[unmap] exiting\n");
    return ret;
}
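/*
 * Note on error handling in unmap(): do_single_unmap() returns
 * LIB_ERR_PMAP_FIND_VNODE for stretches of the region that are not backed
 * by any mapping. unmap() deliberately skips over such holes rather than
 * failing, so removing a sparsely populated region still succeeds.
 */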
static errval_t do_single_modify_flags(struct pmap_x86 *pmap, genvaddr_t vaddr,
                                       size_t pages, vregion_flags_t flags)
{
    errval_t err = SYS_ERR_OK;

    struct find_mapping_info info;

    if (!find_mapping(pmap, vaddr, &info)) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    assert(info.page_table && info.page_table->is_vnode && info.page && !info.page->is_vnode);

    if (inside_region(info.page_table, info.table_base, pages)) {
        // We're modifying part of a valid mapped region.
        // Arguments to the invocation: the mapping cap, the first affected
        // page (as offset from the first page in the mapping), the number of
        // affected pages, and the new flags. The invocation masks the flags
        // based on the capability's access permissions.
        size_t off = info.table_base - info.page->entry;
        paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags);
        // calculate TLB flushing hint
        genvaddr_t va_hint = 0;
        if (pages == 1) {
            // do assisted selective flush for a single page
            va_hint = vaddr & ~(info.page_size - 1);
        }
        err = invoke_mapping_modify_flags(info.page->mapping, off, pages,
                                          pmap_flags, va_hint);
        return err;
    } else {
        // overlaps some region border
        // XXX: need better error
        return LIB_ERR_PMAP_EXISTING_MAPPING;
    }
}
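/*
 * TLB flushing hint: a non-zero va_hint lets the kernel flush only the
 * translation for that one page rather than the whole TLB; presumably a
 * va_hint of 0 requests a full flush. The selective flush is only used for
 * the single-page case, hence the pages == 1 test above.
 */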
/**
 * \brief Modify page mapping
 *
 * \param pmap    The pmap object
 * \param vaddr   The first virtual address for which to change the flags
 * \param size    The length of the region to change in bytes
 * \param flags   New flags for the mapping
 * \param retsize If non-NULL, filled in with the actual size modified
 */
static errval_t modify_flags(struct pmap *pmap, genvaddr_t vaddr, size_t size,
                             vregion_flags_t flags, size_t *retsize)
{
    errval_t err;
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    // determine if we are modifying a larger page
    struct find_mapping_info info;

    if (!find_mapping(x86, vaddr, &info)) {
        return LIB_ERR_PMAP_NOT_MAPPED;
    }

    assert(info.page && !info.page->is_vnode);

    // TODO: match new policy of map when implemented
    size = ROUND_UP(size, info.page_size);
    genvaddr_t vend = vaddr + size;

    size_t pages = size / info.page_size;

    // vaddr and vend specify the begin and end of the region (inside a
    // mapping) that should receive the new set of flags
    if (is_same_pdir(vaddr, vend) ||
        (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) ||
        (is_same_pml4(vaddr, vend) && is_huge_page(info.page))) {
        // fast path
        err = do_single_modify_flags(x86, vaddr, pages, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }
    }
    else { // slow path
        // modify first part
        uint32_t c = X86_64_PTABLE_SIZE - info.table_base;
        err = do_single_modify_flags(x86, vaddr, c, flags);
        if (err_is_fail(err)) {
            return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
        }

        // modify full leaves
        vaddr += c * info.page_size;
        while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) {
            c = X86_64_PTABLE_SIZE;
            err = do_single_modify_flags(x86, vaddr, X86_64_PTABLE_SIZE, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
            vaddr += c * info.page_size;
        }

        // modify remaining part
        c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) -
            get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS);
        if (c) {
            err = do_single_modify_flags(x86, vaddr, c, flags);
            if (err_is_fail(err)) {
                return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS);
            }
        }
    }

    if (retsize) {
        *retsize = size;
    }

    //printf("[modify_flags] exiting\n");
    return SYS_ERR_OK;
}

/**
 * \brief Query existing page mapping
 *
 * \param pmap  The pmap object
 * \param vaddr The virtual address to query
 * \param info  If non-NULL, filled in with the base virtual address, size,
 *              mapped cap, offset within the cap, flags, and mapping cap of
 *              the page backing vaddr
 */
static errval_t lookup(struct pmap *pmap, genvaddr_t vaddr,
                       struct pmap_mapping_info *info)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct find_mapping_info find_info;
    bool found = find_mapping(x86, vaddr, &find_info);

    if (!found) {
        return LIB_ERR_PMAP_FIND_VNODE;
    }

    if (info) {
        info->vaddr = vaddr & ~(genvaddr_t)(find_info.page_size - 1);
        info->size = find_info.page_size;
        info->cap = find_info.page->u.frame.cap;
        info->offset = find_info.page->u.frame.offset;
        info->flags = find_info.page->u.frame.flags;
        info->mapping = find_info.page->mapping;
    }
    return SYS_ERR_OK;
}
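/*
 * Example: for an address inside a region mapped with 2 MiB pages, lookup()
 * reports vaddr rounded down to the containing 2 MiB boundary and size ==
 * X86_64_LARGE_PAGE_SIZE, i.e. the page granularity, not the extent of the
 * whole original mapping.
 */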
/**
 * \brief Dump the mapped pages of a pmap into a caller-provided buffer
 */
static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;
    struct pmap_dump_info *buf_ = buf;

    struct vnode *pml4 = &x86->root;
    struct vnode *pdpt, *pdir, *pt, *frame;
    assert(pml4 != NULL);

    *items_written = 0;

    // iterate over PML4 entries
    size_t pml4_index, pdpt_index, pdir_index;
    for (pdpt = pml4->u.vnode.children; pdpt != NULL; pdpt = pdpt->next) {
        pml4_index = pdpt->entry;
        // iterate over pdpt entries
        for (pdir = pdpt->u.vnode.children; pdir != NULL; pdir = pdir->next) {
            pdpt_index = pdir->entry;
            // iterate over pdir entries
            for (pt = pdir->u.vnode.children; pt != NULL; pt = pt->next) {
                pdir_index = pt->entry;
                // iterate over pt entries
                for (frame = pt->u.vnode.children; frame != NULL; frame = frame->next) {
                    if (*items_written < buflen) {
                        buf_->pml4_index = pml4_index;
                        buf_->pdpt_index = pdpt_index;
                        buf_->pdir_index = pdir_index;
                        buf_->pt_index = frame->entry;
                        buf_->cap = frame->u.frame.cap;
                        buf_->offset = frame->u.frame.offset;
                        buf_->flags = frame->u.frame.flags;
                        buf_++;
                        (*items_written)++;
                    }
                }
            }
        }
    }
    return SYS_ERR_OK;
}

static errval_t determine_addr_raw(struct pmap *pmap, size_t size,
                                   size_t alignment, genvaddr_t *retvaddr)
{
    struct pmap_x86 *x86 = (struct pmap_x86 *)pmap;

    struct vnode *walk_pml4 = x86->root.u.vnode.children;
    assert(walk_pml4 != NULL); // assume there's always at least one existing entry

    if (alignment == 0) {
        alignment = BASE_PAGE_SIZE;
    } else {
        alignment = ROUND_UP(alignment, BASE_PAGE_SIZE);
    }
    size = ROUND_UP(size, alignment);
    assert(size < 512ul * 1024 * 1024 * 1024); // size mapped by one PML4 entry

    // mark the PML4 entries that are already in use
    bool f[512];
    for (int i = 0; i < 512; i++) {
        f[i] = true;
    }
    //debug_printf("entry: %d\n", walk_pml4->entry);
    f[walk_pml4->entry] = false;
    while (walk_pml4) {
        //debug_printf("looping over pml4 entries\n");
        assert(walk_pml4->is_vnode);
        f[walk_pml4->entry] = false;
        walk_pml4 = walk_pml4->next;
    }
    // find a free entry, skipping the first 16 to leave the lower part of
    // the address space alone
    genvaddr_t first_free = 16;
    for (; first_free < 512; first_free++) {
        //debug_printf("f[%"PRIuGENVADDR"] = %d\n", first_free, f[first_free]);
        if (f[first_free]) {
            break;
        }
    }
    //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
    if (first_free < 512) {
        //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free);
        *retvaddr = first_free << 39; // each PML4 entry covers 2^39 bytes
        return SYS_ERR_OK;
    } else {
        return LIB_ERR_OUT_OF_VIRTUAL_ADDR;
    }
}

static struct pmap_funcs pmap_funcs = {
    .determine_addr = pmap_x86_determine_addr,
    .determine_addr_raw = determine_addr_raw,
    .map = map,
    .unmap = unmap,
    .lookup = lookup,
    .modify_flags = modify_flags,
    .serialise = pmap_x86_serialise,
    .deserialise = pmap_x86_deserialise,
    .dump = dump,
};

/**
 * \brief Initialize an x86 pmap object
 *
 * \param pmap           Pmap object of type x86
 * \param vspace         The vspace this pmap belongs to
 * \param vnode          Capability to the root page table (PML4)
 * \param opt_slot_alloc Optional slot allocator; the dispatcher's default
 *                       allocator is used if NULL
 */
errval_t pmap_x86_64_init(struct pmap *pmap, struct vspace *vspace,
                          struct capref vnode,
                          struct slot_allocator *opt_slot_alloc)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)pmap;

    /* Generic portion */
    pmap->f = pmap_funcs;
    pmap->vspace = vspace;

    if (opt_slot_alloc != NULL) {
        pmap->slot_alloc = opt_slot_alloc;
    } else { /* use default allocator for this dispatcher */
        pmap->slot_alloc = get_default_slot_allocator();
    }

    /* x86-specific portion */
    slab_init(&x86->slab, sizeof(struct vnode), NULL);
    slab_grow(&x86->slab, x86->slab_buffer,
              sizeof(x86->slab_buffer));
    x86->refill_slabs = min_refill_slabs;

    x86->root.is_vnode = true;
    x86->root.u.vnode.cap = vnode;
    x86->root.u.vnode.invokable = vnode;
    if (get_croot_addr(vnode) != CPTR_ROOTCN) {
        errval_t err = slot_alloc(&x86->root.u.vnode.invokable);
        assert(err_is_ok(err));
        err = cap_copy(x86->root.u.vnode.invokable, vnode);
        assert(err_is_ok(err));
    }
    assert(!capref_is_null(x86->root.u.vnode.cap));
    assert(!capref_is_null(x86->root.u.vnode.invokable));
    x86->root.u.vnode.children = NULL;
    x86->root.next = NULL;

    // choose a minimum mappable VA for most domains; enough to catch NULL
    // pointer derefs with suitably large offsets
    x86->min_mappable_va = 64 * 1024;

    // maximum mappable VA is derived from X86_64_MEMORY_OFFSET in the kernel
    x86->max_mappable_va = (genvaddr_t)0xffffff8000000000;

    return SYS_ERR_OK;
}
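/*
 * Note on the invokable copy in pmap_x86_64_init(): a capability can only be
 * invoked if it is addressable in the caller's own root CNode. When the
 * supplied PML4 cap lives elsewhere (get_croot_addr(vnode) != CPTR_ROOTCN),
 * the function therefore installs a local copy and invokes that one.
 */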
/**
 * \brief Initialize the current pmap. Reserve space for metadata
 *
 * This code is coupled with #vspace_current_init()
 */
errval_t pmap_x86_64_current_init(bool init_domain)
{
    struct pmap_x86 *x86 = (struct pmap_x86*)get_current_pmap();

    // To reserve a block of virtual address space,
    // a vregion representing the address space is required.
    // We construct a minimal one here and add it to the vregion list.
    struct vregion *vregion = &x86->vregion;
    vregion->vspace = NULL;
    vregion->memobj = NULL;
    vregion->base   = META_DATA_RESERVED_BASE;
    vregion->offset = 0;
    vregion->size   = META_DATA_RESERVED_SIZE;
    vregion->flags  = 0;
    vregion->next   = NULL;

    struct vspace *vspace = x86->p.vspace;
    assert(!vspace->head);
    vspace->head = vregion;

    x86->vregion_offset = x86->vregion.base;

    // We don't know the vnode layout for the first part of our address space
    // (which was set up by the kernel), so we avoid mapping there until we
    // are told about it.
    x86->min_mappable_va = META_DATA_RESERVED_BASE;

    return SYS_ERR_OK;
}