1/** 2 * \file 3 * \brief pmap management 4 * 5 * x86_64 specific management of page tables 6 * 7 * Warning: This code is coupled with the code in slot_alloc/. and pinned.c 8 * 9 * The maximum number of slots required to map a BASE_PAGE_SIZE 10 * sized page is the number of page table levels + 1. 11 * The sum for x86_64 is 4. 12 * 13 * Warning: Additional slots will be required to map a BASE_PAGE_SIZE size page, 14 * if we also track the actual frames that are mapped. 15 * Currently this is not the case. 16 */ 17 18/* 19 * Copyright (c) 2009-2013 ETH Zurich. 20 * Copyright (c) 2014 HP Labs. 21 * All rights reserved. 22 * 23 * This file is distributed under the terms in the attached LICENSE file. 24 * If you do not find this file, copies can be found by writing to: 25 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group. 26 */ 27 28#include <barrelfish/barrelfish.h> 29#include <barrelfish/dispatch.h> 30#include "target/x86/pmap_x86.h" 31#include <stdio.h> 32#include <barrelfish/cap_predicates.h> 33#include <pmap_priv.h> 34#include <pmap_ds.h> // pull in selected pmap datastructure implementation 35 36// For tracing 37#include <trace/trace.h> 38#include <trace_definitions/trace_defs.h> 39 40/** 41 * \brief Translate generic vregion flags to architecture specific pmap flags 42 */ 43static paging_x86_64_flags_t vregion_to_pmap_flag(vregion_flags_t vregion_flags) 44{ 45 paging_x86_64_flags_t pmap_flags = 46 PTABLE_USER_SUPERVISOR | PTABLE_EXECUTE_DISABLE; 47 48 if (!(vregion_flags & VREGION_FLAGS_GUARD)) { 49 if (vregion_flags & VREGION_FLAGS_WRITE) { 50 pmap_flags |= PTABLE_READ_WRITE; 51 } 52 if (vregion_flags & VREGION_FLAGS_EXECUTE) { 53 pmap_flags &= ~PTABLE_EXECUTE_DISABLE; 54 } 55 if (vregion_flags & VREGION_FLAGS_NOCACHE) { 56 pmap_flags |= PTABLE_CACHE_DISABLED; 57 } 58 else if (vregion_flags & VREGION_FLAGS_WRITE_COMBINING) { 59 // PA4 is configured as write-combining 60 pmap_flags |= PTABLE_ATTR_INDEX; 61 } 62 } 63 64 return pmap_flags; 65} 66 67// returns whether va1 and va2 share a page directory entry 68// not using X86_64_PDIR_BASE() macro as this would give false positives (same 69// entry in different directories) 70static inline bool is_same_pdir(genvaddr_t va1, genvaddr_t va2) 71{ 72 return (va1>>X86_64_LARGE_PAGE_BITS) == ((va2-1)>>X86_64_LARGE_PAGE_BITS); 73} 74// returns whether va1 and va2 share a page directory pointer table entry 75static inline bool is_same_pdpt(genvaddr_t va1, genvaddr_t va2) 76{ 77 return (va1>>X86_64_HUGE_PAGE_BITS) == ((va2-1)>>X86_64_HUGE_PAGE_BITS); 78} 79// returns whether va1 and va2 share a page map level 4 entry 80static inline bool is_same_pml4(genvaddr_t va1, genvaddr_t va2) 81{ 82 // the base macros work here as we only have one pml4. 83 return X86_64_PML4_BASE(va1) == X86_64_PML4_BASE(va2-1); 84} 85// size indicates how many bits to shift 86static inline genvaddr_t get_addr_prefix(genvaddr_t va, uint8_t size) 87{ 88 return va >> size; 89} 90 91static inline bool is_large_page(struct vnode *p) 92{ 93 return !p->v.is_vnode && p->v.u.frame.flags & VREGION_FLAGS_LARGE; 94} 95 96static inline bool is_huge_page(struct vnode *p) 97{ 98 return !p->v.is_vnode && p->v.u.frame.flags & VREGION_FLAGS_HUGE; 99} 100 101/** 102 * \brief Returns the vnode for the pdpt mapping a given vspace address 103 */ 104errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base, 105 struct vnode **pdpt); 106errval_t get_pdpt(struct pmap_x86 *pmap, genvaddr_t base, 107 struct vnode **pdpt) 108{ 109 errval_t err; 110 struct vnode *root = &pmap->root; 111 assert(root != NULL); 112 113 // PML4 mapping 114 if((*pdpt = pmap_find_vnode(root, X86_64_PML4_BASE(base))) == NULL) { 115 enum objtype type = type_is_ept(pmap->root.v.type) ? 116 ObjType_VNode_x86_64_ept_pdpt : 117 ObjType_VNode_x86_64_pdpt; 118 err = alloc_vnode(pmap, root, type, X86_64_PML4_BASE(base), 119 pdpt, base); 120 errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP); 121 if (err == expected_concurrent) { 122 if ((*pdpt = pmap_find_vnode(root, X86_64_PML4_BASE(base))) != NULL) { 123 return SYS_ERR_OK; 124 } 125 } 126 if (err_is_fail(err)) { 127 DEBUG_ERR(err, "alloc_vnode for pdpt"); 128 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 129 } 130 } 131 132 return SYS_ERR_OK; 133} 134 135/** 136 * \brief Returns the vnode for the page directory mapping a given vspace 137 * address 138 */ 139errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base, 140 struct vnode **pdir); 141errval_t get_pdir(struct pmap_x86 *pmap, genvaddr_t base, 142 struct vnode **pdir) 143{ 144 errval_t err; 145 struct vnode *pdpt; 146 err = get_pdpt(pmap, base, &pdpt); 147 if (err_is_fail(err)) { 148 return err; 149 } 150 assert(pdpt != NULL); 151 152 // PDPT mapping 153 if((*pdir = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base))) == NULL) { 154 enum objtype type = type_is_ept(pmap->root.v.type) ? 155 ObjType_VNode_x86_64_ept_pdir : 156 ObjType_VNode_x86_64_pdir; 157 err = alloc_vnode(pmap, pdpt, type, 158 X86_64_PDPT_BASE(base), pdir, base); 159 errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP); 160 if (err == expected_concurrent) { 161 if ((*pdir = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base))) != NULL) { 162 return SYS_ERR_OK; 163 } 164 } 165 if (err_is_fail(err)) { 166 DEBUG_ERR(err, "alloc_vnode for pdir"); 167 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 168 } 169 } 170 171 return SYS_ERR_OK; 172} 173 174/** 175 * \brief Returns the vnode for the pagetable mapping a given vspace address 176 */ 177 errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base, 178 struct vnode **ptable); 179errval_t get_ptable(struct pmap_x86 *pmap, genvaddr_t base, 180 struct vnode **ptable) 181{ 182 errval_t err; 183 struct vnode *pdir; 184 err = get_pdir(pmap, base, &pdir); 185 if (err_is_fail(err)) { 186 return err; 187 } 188 assert(pdir != NULL); 189 190 // PDIR mapping 191 if((*ptable = pmap_find_vnode(pdir, X86_64_PDIR_BASE(base))) == NULL) { 192 enum objtype type = type_is_ept(pmap->root.v.type) ? 193 ObjType_VNode_x86_64_ept_ptable : 194 ObjType_VNode_x86_64_ptable; 195 err = alloc_vnode(pmap, pdir, type, 196 X86_64_PDIR_BASE(base), ptable, base); 197 errval_t expected_concurrent = err_push(SYS_ERR_VNODE_SLOT_INUSE, LIB_ERR_VNODE_MAP); 198 if (err == expected_concurrent) { 199 if ((*ptable = pmap_find_vnode(pdir, X86_64_PDIR_BASE(base))) != NULL) { 200 return SYS_ERR_OK; 201 } 202 } 203 if (err_is_fail(err)) { 204 DEBUG_ERR(err, "alloc_vnode for ptable"); 205 return err_push(err, LIB_ERR_PMAP_ALLOC_VNODE); 206 } 207 } 208 209 return SYS_ERR_OK; 210} 211 212/** 213 * \brief Returns the vnode for the page directory pointer table mapping for a 214 * given vspace address 215 */ 216static inline struct vnode *find_pdpt(struct pmap_x86 *pmap, genvaddr_t base) 217{ 218 struct vnode *root = &pmap->root; 219 assert(root != NULL); 220 221 // PDPT mapping 222 return pmap_find_vnode(root, X86_64_PML4_BASE(base)); 223} 224 225/** 226 * \brief Returns the vnode for the page directory mapping a given vspace 227 * address, without performing allocations as get_pdir() does 228 */ 229static inline struct vnode *find_pdir(struct pmap_x86 *pmap, genvaddr_t base) 230{ 231 struct vnode *pdpt = find_pdpt(pmap, base); 232 233 if (pdpt) { 234 // PDPT mapping 235 return pmap_find_vnode(pdpt, X86_64_PDPT_BASE(base)); 236 } else { 237 return NULL; 238 } 239} 240 241/** 242 * \brief Returns the vnode for the pagetable mapping a given vspace address, 243 * without performing allocations as get_ptable() does 244 */ 245static inline struct vnode *find_ptable(struct pmap_x86 *pmap, genvaddr_t base) 246{ 247 struct vnode *pdir = find_pdir(pmap, base); 248 249 if (pdir) { 250 // PDIR mapping 251 return pmap_find_vnode(pdir, X86_64_PDIR_BASE(base)); 252 } else { 253 return NULL; 254 } 255} 256 257// TODO: documentation for this feature! -SG,2018-10-18 258size_t ALL_THE_VNODES_MAX_ENTRIES = (15*4096); 259struct vnode **ALL_THE_VNODES = NULL; 260size_t all_the_vnodes_cnt = 0; 261 262static errval_t do_single_map(struct pmap_x86 *pmap, genvaddr_t vaddr, 263 genvaddr_t vend, struct capref frame, 264 size_t offset, size_t pte_count, 265 vregion_flags_t flags) 266{ 267 if (pte_count == 0) { 268 debug_printf("do_single_map: pte_count == 0, called from %p\n", 269 __builtin_return_address(0)); 270 return SYS_ERR_OK; 271 } 272 assert(pte_count > 0); 273 // translate flags 274 paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags); 275 276 // Get the paging structure and set paging relevant parameters 277 struct vnode *ptable = NULL; 278 errval_t err; 279 size_t table_base; 280 281 // get the right paging table and address part 282 if (flags & VREGION_FLAGS_LARGE) { 283 //large 2M pages, mapped into pdir 284 err = get_pdir(pmap, vaddr, &ptable); 285 table_base = X86_64_PDIR_BASE(vaddr); 286 } else if (flags & VREGION_FLAGS_HUGE) { 287 //huge 1GB pages, mapped into pdpt 288 err = get_pdpt(pmap, vaddr, &ptable); 289 table_base = X86_64_PDPT_BASE(vaddr); 290 } else { 291 //normal 4K pages, mapped into ptable 292 err = get_ptable(pmap, vaddr, &ptable); 293 table_base = X86_64_PTABLE_BASE(vaddr); 294 if (ALL_THE_VNODES && (all_the_vnodes_cnt+1) < ALL_THE_VNODES_MAX_ENTRIES) { 295 ALL_THE_VNODES[all_the_vnodes_cnt++] = ptable; 296 } 297 } 298 if (err_is_fail(err)) { 299 return err_push(err, LIB_ERR_PMAP_GET_PTABLE); 300 } 301 assert(ptable->v.is_vnode); 302 303 // check if there is an overlapping mapping 304 if (has_vnode(ptable, table_base, pte_count, false)) { 305 if (has_vnode(ptable, table_base, pte_count, true)) { 306 printf("page already exists in 0x%" 307 PRIxGENVADDR"--0x%"PRIxGENVADDR"\n", vaddr, vend); 308 return LIB_ERR_PMAP_EXISTING_MAPPING; 309 } else { 310 // clean out empty page tables. We do this here because we benefit 311 // from having the page tables in place when doing lots of small 312 // mappings 313 remove_empty_vnodes(pmap, ptable, table_base, pte_count); 314 } 315 } 316 317 // setup userspace mapping 318 struct vnode *page = slab_alloc(&pmap->p.m.slab); 319 assert(page); 320 page->v.is_vnode = false; 321 page->is_cloned = false; 322 page->v.entry = table_base; 323 page->v.cap = frame; 324 page->v.u.frame.offset = offset; 325 page->v.u.frame.flags = flags; 326 page->v.u.frame.pte_count = pte_count; 327 page->u.frame.vaddr = vaddr; 328 page->u.frame.cloned_count = 0; 329 330 // only insert after vnode fully initialized 331 pmap_vnode_insert_child(ptable, page); 332 333 set_mapping_cap(&pmap->p, page, ptable, table_base); 334 pmap->used_cap_slots ++; 335 336 // do map 337 assert(!capref_is_null(ptable->v.u.vnode.invokable)); 338 assert(!capref_is_null(page->v.mapping)); 339 err = vnode_map(ptable->v.u.vnode.invokable, frame, table_base, 340 pmap_flags, offset, pte_count, page->v.mapping); 341 if (err_is_fail(err)) { 342 return err_push(err, LIB_ERR_VNODE_MAP); 343 } 344 345 return SYS_ERR_OK; 346} 347 348/** 349 * \brief Called when enough slabs exist for the given mapping 350 */ 351errval_t do_map(struct pmap *pmap_gen, genvaddr_t vaddr, 352 struct capref frame, size_t offset, size_t size, 353 vregion_flags_t flags, size_t *retoff, size_t *retsize) 354{ 355 struct pmap_x86 *pmap = (struct pmap_x86 *)pmap_gen; 356 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 0); 357 errval_t err; 358 359 // determine page size and relevant address part 360 size_t page_size = X86_64_BASE_PAGE_SIZE; 361 size_t table_base = X86_64_PTABLE_BASE(vaddr); 362 uint8_t map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS; 363 bool debug_out = false; 364 365 // get base address and size of frame 366 struct frame_identity fi; 367 err = cap_identify_mappable(frame, &fi); 368 if (err_is_fail(err)) { 369 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 370 return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY); 371 } 372 373 if ((flags & VREGION_FLAGS_HUGE) && 374 (vaddr & X86_64_HUGE_PAGE_MASK) == 0 && 375 fi.bytes >= X86_64_HUGE_PAGE_SIZE && 376 ((fi.base & X86_64_HUGE_PAGE_MASK) == 0)) 377 { 378 // huge page branch (1GB) 379 page_size = X86_64_HUGE_PAGE_SIZE; 380 table_base = X86_64_PDPT_BASE(vaddr); 381 map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS; 382 debug_out = false; 383 // remove large flag, if we're doing huge mapping 384 flags &= ~VREGION_FLAGS_LARGE; 385 } else if ((flags & VREGION_FLAGS_LARGE) && 386 (vaddr & X86_64_LARGE_PAGE_MASK) == 0 && 387 fi.bytes >= X86_64_LARGE_PAGE_SIZE && 388 ((fi.base & X86_64_LARGE_PAGE_MASK) == 0)) 389 { 390 // large page branch (2MB) 391 page_size = X86_64_LARGE_PAGE_SIZE; 392 table_base = X86_64_PDIR_BASE(vaddr); 393 map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS; 394 debug_out = false; 395 } else { 396 // remove large/huge flags 397 flags &= ~(VREGION_FLAGS_LARGE|VREGION_FLAGS_HUGE); 398 } 399 400 // round to the next full page and calculate end address and #ptes 401 size = ROUND_UP(size, page_size); 402 size_t pte_count = DIVIDE_ROUND_UP(size, page_size); 403 genvaddr_t vend = vaddr + size; 404 405 if (offset+size > fi.bytes) { 406 debug_printf("do_map: offset=%zu; size=%zu; frame size=%zu\n", 407 offset, size, fi.bytes); 408 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 409 return LIB_ERR_PMAP_FRAME_SIZE; 410 } 411 412#if 0 413 if (true || debug_out) { 414 genpaddr_t paddr = fi.base + offset; 415 416 debug_printf("do_map: 0x%" 417 PRIxGENVADDR"--0x%"PRIxGENVADDR" -> 0x%"PRIxGENPADDR 418 "; pte_count = %zd; frame bytes = 0x%zx; page size = 0x%zx\n", 419 vaddr, vend, paddr, pte_count, fi.bytes, page_size); 420 } 421#endif 422 423 // all mapping on one leaf table? 424 if (is_same_pdir(vaddr, vend) || 425 (flags & VREGION_FLAGS_LARGE && is_same_pdpt(vaddr, vend)) || 426 (flags & VREGION_FLAGS_HUGE && is_same_pml4(vaddr, vend))) { 427 // fast path 428 if (debug_out) { 429 debug_printf(" do_map: fast path: %zd\n", pte_count); 430 } 431 err = do_single_map(pmap, vaddr, vend, frame, offset, pte_count, flags); 432 if (err_is_fail(err)) { 433 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 434 return err_push(err, LIB_ERR_PMAP_DO_MAP); 435 } 436 } 437 else { // multiple leaf page tables 438 // first leaf 439 uint32_t c = X86_64_PTABLE_SIZE - table_base; 440 if (debug_out) { 441 debug_printf(" do_map: slow path: first leaf %"PRIu32"\n", c); 442 } 443 genvaddr_t temp_end = vaddr + c * page_size; 444 err = do_single_map(pmap, vaddr, temp_end, frame, offset, c, flags); 445 if (err_is_fail(err)) { 446 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 447 return err_push(err, LIB_ERR_PMAP_DO_MAP); 448 } 449 450 // map full leaves 451 while (get_addr_prefix(temp_end, map_bits) < 452 get_addr_prefix(vend, map_bits)) 453 { 454 // update vars 455 vaddr = temp_end; 456 temp_end = vaddr + X86_64_PTABLE_SIZE * page_size; 457 offset += c * page_size; 458 c = X86_64_PTABLE_SIZE; 459 460 // do mapping 461 if (debug_out) { 462 debug_printf(" do_map: slow path: full leaf\n"); 463 } 464 err = do_single_map(pmap, vaddr, temp_end, frame, offset, 465 X86_64_PTABLE_SIZE, flags); 466 if (err_is_fail(err)) { 467 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 468 return err_push(err, LIB_ERR_PMAP_DO_MAP); 469 } 470 } 471 472 // map remaining part 473 offset += c * page_size; 474 475 // calculate remaining pages (subtract ptable bits from map_bits to 476 // get #ptes of last-level instead of 2nd-to-last). 477 c = get_addr_prefix(vend, map_bits-X86_64_PTABLE_BITS) - 478 get_addr_prefix(temp_end, map_bits-X86_64_PTABLE_BITS); 479 480 if (c) { 481 // do mapping 482 if (debug_out) { 483 debug_printf("do_map: slow path: last leaf %"PRIu32"\n", c); 484 } 485 err = do_single_map(pmap, temp_end, vend, frame, offset, c, flags); 486 if (err_is_fail(err)) { 487 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 488 return err_push(err, LIB_ERR_PMAP_DO_MAP); 489 } 490 } 491 } 492 493 if (retoff) { 494 *retoff = offset; 495 } 496 if (retsize) { 497 *retsize = size; 498 } 499 500 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_DO_MAP, 1); 501 return SYS_ERR_OK; 502} 503 504/// Computer upper limit on number of slabs required to perform a mapping 505static size_t max_slabs_for_mapping(size_t bytes) 506{ 507 size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_BASE_PAGE_SIZE); 508 size_t max_ptable = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE); 509 size_t max_pdir = DIVIDE_ROUND_UP(max_ptable, X86_64_PTABLE_SIZE); 510 size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE); 511 // Worst case, our mapping spans over two pdpts 512 return 2 * (max_ptable + max_pdir + max_pdpt); 513} 514 515static size_t max_slabs_for_mapping_large(size_t bytes) 516{ 517 size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_LARGE_PAGE_SIZE); 518 size_t max_pdir = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE); 519 size_t max_pdpt = DIVIDE_ROUND_UP(max_pdir, X86_64_PTABLE_SIZE); 520 // Worst case, our mapping spans over two pdpts 521 return 2 * (max_pdir + max_pdpt); 522} 523 524static size_t max_slabs_for_mapping_huge(size_t bytes) 525{ 526 size_t max_pages = DIVIDE_ROUND_UP(bytes, X86_64_HUGE_PAGE_SIZE); 527 size_t max_pdpt = DIVIDE_ROUND_UP(max_pages, X86_64_PTABLE_SIZE); 528 // Worst case, our mapping spans over two pdpts 529 return 2 * max_pdpt; 530} 531 532size_t max_slabs_required(size_t bytes) 533{ 534 return max_slabs_for_mapping(bytes); 535} 536 537/** 538 * \brief Create page mappings 539 * 540 * \param pmap The pmap object 541 * \param vaddr The virtual address to create the mapping for 542 * \param frame The frame cap to map in 543 * \param offset Offset into the frame cap 544 * \param size Size of the mapping 545 * \param flags Flags for the mapping 546 * \param retoff If non-NULL, filled in with adjusted offset of mapped region 547 * \param retsize If non-NULL, filled in with adjusted size of mapped region 548 */ 549static errval_t map(struct pmap *pmap, genvaddr_t vaddr, struct capref frame, 550 size_t offset, size_t size, vregion_flags_t flags, 551 size_t *retoff, size_t *retsize) 552{ 553 errval_t err; 554 555 struct capability cap; 556 err = cap_direct_identify(frame, &cap); 557 if (err_is_fail(err)) { 558 return err_push(err, LIB_ERR_PMAP_FRAME_IDENTIFY); 559 } 560 struct frame_identity fi; 561 fi.base = get_address(&cap); 562 fi.bytes = get_size(&cap); 563 564 size_t max_slabs; 565 // Adjust the parameters to page boundaries 566 // TODO: overestimating needed slabs shouldn't hurt much in the long run, 567 // and would keep the code easier to read and possibly faster due to less 568 // branching 569 if ((flags & VREGION_FLAGS_LARGE) && 570 (vaddr & X86_64_LARGE_PAGE_MASK) == 0 && 571 (fi.base & X86_64_LARGE_PAGE_MASK) == 0 && 572 fi.bytes >= offset+size) { 573 //case large pages (2MB) 574 size += LARGE_PAGE_OFFSET(offset); 575 size = ROUND_UP(size, LARGE_PAGE_SIZE); 576 offset -= LARGE_PAGE_OFFSET(offset); 577 max_slabs = max_slabs_for_mapping_large(size); 578 } else if ((flags & VREGION_FLAGS_HUGE) && 579 (vaddr & X86_64_HUGE_PAGE_MASK) == 0 && 580 (fi.base & X86_64_HUGE_PAGE_MASK) == 0 && 581 fi.bytes >= offset+size) { 582 // case huge pages (1GB) 583 size += HUGE_PAGE_OFFSET(offset); 584 size = ROUND_UP(size, HUGE_PAGE_SIZE); 585 offset -= HUGE_PAGE_OFFSET(offset); 586 max_slabs = max_slabs_for_mapping_huge(size); 587 } else { 588 //case normal pages (4KB) 589 size += BASE_PAGE_OFFSET(offset); 590 size = ROUND_UP(size, BASE_PAGE_SIZE); 591 offset -= BASE_PAGE_OFFSET(offset); 592 max_slabs = max_slabs_for_mapping(size); 593 } 594 595 max_slabs += 6; // minimum amount required to map a region spanning 2 ptables 596 597 err = pmap_refill_slabs(pmap, max_slabs); 598 if (err_is_fail(err)) { 599 return err; 600 } 601 602 err = do_map(pmap, vaddr, frame, offset, size, flags, retoff, retsize); 603 return err; 604} 605 606struct find_mapping_info { 607 struct vnode *page_table; 608 struct vnode *page; 609 size_t page_size; 610 size_t table_base; 611 uint8_t map_bits; 612}; 613 614/** 615 * \brief Find mapping for `vaddr` in `pmap`. 616 * \arg pmap the pmap to search in 617 * \arg vaddr the virtual address to search for 618 * \arg pt the last-level page table meta-data we found if any 619 * \arg page the page meta-data we found if any 620 * \returns `true` iff we found a mapping for vaddr 621 */ 622static bool find_mapping(struct pmap_x86 *pmap, genvaddr_t vaddr, 623 struct find_mapping_info *info) 624{ 625 struct vnode *pdpt = NULL, *pdir = NULL, *pt = NULL, *page = NULL; 626 627 size_t page_size = 0; 628 size_t table_base = 0; 629 uint8_t map_bits = 0; 630 631 // find page and last-level page table (can be pdir or pdpt) 632 if ((pdpt = find_pdpt(pmap, vaddr)) != NULL) { 633 page = pmap_find_vnode(pdpt, X86_64_PDPT_BASE(vaddr)); 634 if (page && page->v.is_vnode) { // not 1G pages 635 pdir = page; 636 page = pmap_find_vnode(pdir, X86_64_PDIR_BASE(vaddr)); 637 if (page && page->v.is_vnode) { // not 2M pages 638 pt = page; 639 page = pmap_find_vnode(pt, X86_64_PTABLE_BASE(vaddr)); 640 page_size = X86_64_BASE_PAGE_SIZE; 641 table_base = X86_64_PTABLE_BASE(vaddr); 642 map_bits = X86_64_BASE_PAGE_BITS + X86_64_PTABLE_BITS; 643 } else if (page) { 644 assert(is_large_page(page)); 645 pt = pdir; 646 page_size = X86_64_LARGE_PAGE_SIZE; 647 table_base = X86_64_PDIR_BASE(vaddr); 648 map_bits = X86_64_LARGE_PAGE_BITS + X86_64_PTABLE_BITS; 649 } 650 } else if (page) { 651 assert(is_huge_page(page)); 652 pt = pdpt; 653 page_size = X86_64_HUGE_PAGE_SIZE; 654 table_base = X86_64_PDPT_BASE(vaddr); 655 map_bits = X86_64_HUGE_PAGE_BITS + X86_64_PTABLE_BITS; 656 } 657 } 658 if (info) { 659 info->page_table = pt; 660 info->page = page; 661 info->page_size = page_size; 662 info->table_base = table_base; 663 info->map_bits = map_bits; 664 } 665 if (pt && page) { 666 return true; 667 } else { 668 return false; 669 } 670} 671 672static errval_t do_single_unmap(struct pmap_x86 *pmap, genvaddr_t vaddr, 673 size_t pte_count) 674{ 675 errval_t err; 676 struct find_mapping_info info; 677 678 if (!find_mapping(pmap, vaddr, &info)) { 679 return LIB_ERR_PMAP_FIND_VNODE; 680 } 681 assert(info.page_table && info.page_table->v.is_vnode && info.page && !info.page->v.is_vnode); 682 683 if (info.page->v.u.frame.pte_count == pte_count) { 684 err = vnode_unmap(info.page_table->v.cap, info.page->v.mapping); 685 if (err_is_fail(err)) { 686 debug_printf("vnode_unmap returned error: %s (%d)\n", 687 err_getstring(err), err_no(err)); 688 return err_push(err, LIB_ERR_VNODE_UNMAP); 689 } 690 691 // delete&free page->v.mapping after doing vnode_unmap() 692 err = cap_delete(info.page->v.mapping); 693 if (err_is_fail(err)) { 694 return err_push(err, LIB_ERR_CAP_DELETE); 695 } 696#ifndef GLOBAL_MCN 697 err = pmap->p.slot_alloc->free(pmap->p.slot_alloc, info.page->v.mapping); 698 if (err_is_fail(err)) { 699 debug_printf("remove_empty_vnodes: slot_free (mapping): %s\n", 700 err_getstring(err)); 701 } 702#endif 703 assert(pmap->used_cap_slots > 0); 704 pmap->used_cap_slots --; 705 // Free up the resources 706 pmap_remove_vnode(info.page_table, info.page); 707 slab_free(&pmap->p.m.slab, info.page); 708 } 709 710 return SYS_ERR_OK; 711} 712 713/** 714 * \brief Remove page mappings 715 * 716 * \param pmap The pmap object 717 * \param vaddr The start of the virtual region to remove 718 * \param size The size of virtual region to remove 719 * \param retsize If non-NULL, filled in with the actual size removed 720 */ 721static errval_t unmap(struct pmap *pmap, genvaddr_t vaddr, size_t size, 722 size_t *retsize) 723{ 724 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 0); 725 //printf("[unmap] 0x%"PRIxGENVADDR", %zu\n", vaddr, size); 726 errval_t err, ret = SYS_ERR_OK; 727 struct pmap_x86 *x86 = (struct pmap_x86*)pmap; 728 729 //determine if we unmap a larger page 730 struct find_mapping_info info; 731 732 if (!find_mapping(x86, vaddr, &info)) { 733 //TODO: better error --> LIB_ERR_PMAP_NOT_MAPPED 734 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 735 return LIB_ERR_PMAP_UNMAP; 736 } 737 738 assert(!info.page->v.is_vnode); 739 740 if (info.page->v.entry > info.table_base) { 741 debug_printf("trying to partially unmap region\n"); 742 // XXX: error code 743 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 744 return LIB_ERR_PMAP_FIND_VNODE; 745 } 746 747 // TODO: match new policy of map when implemented 748 size = ROUND_UP(size, info.page_size); 749 genvaddr_t vend = vaddr + size; 750 751 if (is_same_pdir(vaddr, vend) || 752 (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) || 753 (is_same_pml4(vaddr, vend) && is_huge_page(info.page))) 754 { 755 // fast path 756 err = do_single_unmap(x86, vaddr, size / info.page_size); 757 if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) { 758 printf("error fast path\n"); 759 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 760 return err_push(err, LIB_ERR_PMAP_UNMAP); 761 } 762 } 763 else { // slow path 764 // unmap first leaf 765 uint32_t c = X86_64_PTABLE_SIZE - info.table_base; 766 767 err = do_single_unmap(x86, vaddr, c); 768 if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) { 769 printf("error first leaf\n"); 770 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 771 return err_push(err, LIB_ERR_PMAP_UNMAP); 772 } 773 774 // unmap full leaves 775 vaddr += c * info.page_size; 776 while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) { 777 c = X86_64_PTABLE_SIZE; 778 err = do_single_unmap(x86, vaddr, X86_64_PTABLE_SIZE); 779 if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) { 780 printf("error while loop\n"); 781 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 782 return err_push(err, LIB_ERR_PMAP_UNMAP); 783 } 784 vaddr += c * info.page_size; 785 } 786 787 // unmap remaining part 788 // subtracting ptable bits from map_bits to get #ptes in last-level table 789 // instead of 2nd-to-last. 790 c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) - 791 get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS); 792 assert(c < X86_64_PTABLE_SIZE); 793 if (c) { 794 err = do_single_unmap(x86, vaddr, c); 795 if (err_is_fail(err) && err_no(err) != LIB_ERR_PMAP_FIND_VNODE) { 796 printf("error remaining part\n"); 797 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 798 return err_push(err, LIB_ERR_PMAP_UNMAP); 799 } 800 } 801 } 802 803 if (retsize) { 804 *retsize = size; 805 } 806 807 //printf("[unmap] exiting\n"); 808 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_UNMAP, 1); 809 return ret; 810} 811 812int pmap_selective_flush = 0; 813static errval_t do_single_modify_flags(struct pmap_x86 *pmap, genvaddr_t vaddr, 814 size_t pages, vregion_flags_t flags) 815{ 816 errval_t err = SYS_ERR_OK; 817 818 struct find_mapping_info info; 819 820 if (!find_mapping(pmap, vaddr, &info)) { 821 return LIB_ERR_PMAP_FIND_VNODE; 822 } 823 824 assert(info.page_table && info.page_table->v.is_vnode && info.page && !info.page->v.is_vnode); 825 assert(pages <= PTABLE_SIZE); 826 827 if (pmap_inside_region(info.page_table, info.table_base, pages)) { 828 // we're modifying part of a valid mapped region 829 // arguments to invocation: invoke frame cap, first affected 830 // page (as offset from first page in mapping), #affected 831 // pages, new flags. Invocation mask flags based on capability 832 // access permissions. 833 size_t off = info.table_base - info.page->v.entry; 834 paging_x86_64_flags_t pmap_flags = vregion_to_pmap_flag(flags); 835 // calculate TLB flushing hint 836 genvaddr_t va_hint = 0; 837 if (pmap_selective_flush == 3) { 838 // always do full flush 839 va_hint = 0; 840 } else if (pmap_selective_flush == 2) { 841 // always do assisted selective flush 842 va_hint = vaddr & ~(info.page_size - 1); 843 } else if (pmap_selective_flush == 1) { 844 // always do computed selective flush 845 va_hint = 1; 846 } else { 847 /* 848 * default strategy is to only use selective flushing for single page 849 */ 850 if (pages == 1) { 851 // do assisted selective flush for single page 852 va_hint = vaddr & ~(info.page_size - 1); 853 } 854 } 855 err = invoke_mapping_modify_flags(info.page->v.mapping, off, pages, 856 pmap_flags, va_hint); 857 return err; 858 } else { 859 // overlaps some region border 860 // XXX: need better error 861 return LIB_ERR_PMAP_EXISTING_MAPPING; 862 } 863 864 return SYS_ERR_OK; 865} 866 867 868/** 869 * \brief Modify page mapping 870 * 871 * \param pmap The pmap object 872 * \param vaddr The first virtual address for which to change the flags 873 * \param size The length of the region to change in bytes 874 * \param flags New flags for the mapping 875 * \param retsize If non-NULL, filled in with the actual size modified 876 */ 877static errval_t modify_flags(struct pmap *pmap, genvaddr_t vaddr, size_t size, 878 vregion_flags_t flags, size_t *retsize) 879{ 880 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 0); 881 errval_t err; 882 struct pmap_x86 *x86 = (struct pmap_x86 *)pmap; 883 884 //determine if we unmap a larger page 885 struct find_mapping_info info; 886 887 if (!find_mapping(x86, vaddr, &info)) { 888 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 889 return LIB_ERR_PMAP_NOT_MAPPED; 890 } 891 892 assert(info.page && !info.page->v.is_vnode); 893 // XXX: be more graceful about size == 0? -SG, 2017-11-28. 894 assert(size > 0); 895 896 // TODO: match new policy of map when implemented 897 size = ROUND_UP(size, info.page_size); 898 genvaddr_t vend = vaddr + size; 899 900 size_t pages = size / info.page_size; 901 902 // vaddr and vend specify begin and end of the region (inside a mapping) 903 // that should receive the new set of flags 904 if (is_same_pdir(vaddr, vend) || 905 (is_same_pdpt(vaddr, vend) && is_large_page(info.page)) || 906 (is_same_pml4(vaddr, vend) && is_huge_page(info.page))) { 907 // fast path 908 assert(pages <= PTABLE_SIZE); 909 err = do_single_modify_flags(x86, vaddr, pages, flags); 910 if (err_is_fail(err)) { 911 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 912 return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS); 913 } 914 } 915 else { // slow path 916 // modify first part 917 uint32_t c = X86_64_PTABLE_SIZE - info.table_base; 918 assert(c <= PTABLE_SIZE); 919 err = do_single_modify_flags(x86, vaddr, c, flags); 920 if (err_is_fail(err)) { 921 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 922 return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS); 923 } 924 925 // modify full leaves 926 vaddr += c * info.page_size; 927 while (get_addr_prefix(vaddr, info.map_bits) < get_addr_prefix(vend, info.map_bits)) { 928 c = X86_64_PTABLE_SIZE; 929 err = do_single_modify_flags(x86, vaddr, X86_64_PTABLE_SIZE, flags); 930 if (err_is_fail(err)) { 931 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 932 return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS); 933 } 934 vaddr += c * info.page_size; 935 } 936 937 // modify remaining part 938 c = get_addr_prefix(vend, info.map_bits - X86_64_PTABLE_BITS) - 939 get_addr_prefix(vaddr, info.map_bits - X86_64_PTABLE_BITS); 940 if (c) { 941 assert(c <= PTABLE_SIZE); 942 err = do_single_modify_flags(x86, vaddr, c, flags); 943 if (err_is_fail(err)) { 944 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 945 return err_push(err, LIB_ERR_PMAP_MODIFY_FLAGS); 946 } 947 } 948 } 949 950 if (retsize) { 951 *retsize = size; 952 } 953 954 //printf("[modify_flags] exiting\n"); 955 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_MODIFY, 1); 956 return SYS_ERR_OK; 957} 958 959/** 960 * \brief Query existing page mapping 961 * 962 * \param pmap The pmap object 963 * \param vaddr The virtual address to query 964 * \param retvaddr Returns the base virtual address of the mapping 965 * \param retsize Returns the actual size of the mapping 966 * \param retcap Returns the cap mapped at this address 967 * \param retoffset Returns the offset within the cap that is mapped 968 * \param retflags Returns the flags for this mapping 969 * 970 * All of the ret parameters are optional. 971 */ 972static errval_t lookup(struct pmap *pmap, genvaddr_t vaddr, 973 struct pmap_mapping_info *info) 974{ 975 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 0); 976 struct pmap_x86 *x86 = (struct pmap_x86 *)pmap; 977 978 struct find_mapping_info find_info; 979 bool found = find_mapping(x86, vaddr, &find_info); 980 981 if (!found) { 982 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 1); 983 return LIB_ERR_PMAP_FIND_VNODE; 984 } 985 986 if (info) { 987 info->vaddr = find_info.page->u.frame.vaddr; 988 info->size = find_info.page_size * find_info.page->v.u.frame.pte_count; 989 info->cap = find_info.page->v.cap; 990 info->offset = find_info.page->v.u.frame.offset; 991 info->flags = find_info.page->v.u.frame.flags; 992 info->mapping = find_info.page->v.mapping; 993 } 994 trace_event(TRACE_SUBSYS_MEMORY, TRACE_EVENT_MEMORY_LOOKUP, 1); 995 return SYS_ERR_OK; 996} 997 998 999 1000#if defined(PMAP_LL) 1001static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written) 1002{ 1003 struct pmap_x86 *x86 = (struct pmap_x86 *)pmap; 1004 struct pmap_dump_info *buf_ = buf; 1005 1006 struct vnode *pml4 = &x86->root; 1007 struct vnode *pdpt, *pdir, *pt, *frame; 1008 assert(pml4 != NULL); 1009 1010 *items_written = 0; 1011 1012 // iterate over PML4 entries 1013 size_t pml4_index, pdpt_index, pdir_index; 1014 for (pdpt = pml4->v.u.vnode.children; pdpt != NULL; pdpt = pdpt->v.meta.next) { 1015 pml4_index = pdpt->v.entry; 1016 // iterate over pdpt entries 1017 for (pdir = pdpt->v.u.vnode.children; pdir != NULL; pdir = pdir->v.meta.next) { 1018 pdpt_index = pdir->v.entry; 1019 // iterate over pdir entries 1020 for (pt = pdir->v.u.vnode.children; pt != NULL; pt = pt->v.meta.next) { 1021 pdir_index = pt->v.entry; 1022 // iterate over pt entries 1023 for (frame = pt->v.u.vnode.children; frame != NULL; frame = frame->v.meta.next) { 1024 if (*items_written < buflen) { 1025 buf_->pml4_index = pml4_index; 1026 buf_->pdpt_index = pdpt_index; 1027 buf_->pdir_index = pdir_index; 1028 buf_->pt_index = frame->v.entry; 1029 buf_->cap = frame->v.cap; 1030 buf_->offset = frame->v.u.frame.offset; 1031 buf_->flags = frame->v.u.frame.flags; 1032 buf_++; 1033 (*items_written)++; 1034 } 1035 } 1036 } 1037 } 1038 } 1039 return SYS_ERR_OK; 1040} 1041#elif defined(PMAP_ARRAY) 1042static errval_t dump(struct pmap *pmap, struct pmap_dump_info *buf, size_t buflen, size_t *items_written) 1043{ 1044 struct pmap_x86 *x86 = (struct pmap_x86 *)pmap; 1045 struct pmap_dump_info *buf_ = buf; 1046 1047 struct vnode *pml4 = &x86->root; 1048 struct vnode *pdpt, *pdir, *pt, *frame; 1049 assert(pml4 != NULL); 1050 1051 *items_written = 0; 1052 1053 // iterate over PML4 entries 1054 size_t pml4_index, pdpt_index, pdir_index, pt_index; 1055 for (pml4_index = 0; pml4_index < X86_64_PTABLE_SIZE; pml4_index++) { 1056 if (!(pdpt = pml4->v.u.vnode.children[pml4_index])) { 1057 // skip empty entries 1058 continue; 1059 } 1060 // iterate over pdpt entries 1061 for (pdpt_index = 0; pdpt_index < X86_64_PTABLE_SIZE; pdpt_index++) { 1062 if (!(pdir = pdpt->v.u.vnode.children[pdpt_index])) { 1063 // skip empty entries 1064 continue; 1065 } 1066 // iterate over pdir entries 1067 for (pdir_index = 0; pdir_index < X86_64_PTABLE_SIZE; pdir_index++) { 1068 if (!(pt = pdir->v.u.vnode.children[pdir_index])) { 1069 // skip empty entries 1070 continue; 1071 } 1072 // iterate over pt entries 1073 for (pt_index = 0; pt_index < X86_64_PTABLE_SIZE; pt_index++) { 1074 if (!(frame = pt->v.u.vnode.children[pt_index])) { 1075 // skip empty entries 1076 continue; 1077 } 1078 if (*items_written < buflen) { 1079 buf_->pml4_index = pml4_index; 1080 buf_->pdpt_index = pdpt_index; 1081 buf_->pdir_index = pdir_index; 1082 buf_->pt_index = pt_index; 1083 buf_->cap = frame->v.cap; 1084 buf_->offset = frame->v.u.frame.offset; 1085 buf_->flags = frame->v.u.frame.flags; 1086 buf_++; 1087 (*items_written)++; 1088 } 1089 } 1090 } 1091 } 1092 } 1093 return SYS_ERR_OK; 1094} 1095#else 1096#error Invalid pmap datastructure 1097#endif 1098 1099 1100/* 1101 * creates pinned page table entries 1102 */ 1103static errval_t create_pts_pinned(struct pmap *pmap, genvaddr_t vaddr, size_t bytes, 1104 vregion_flags_t flags) 1105{ 1106 errval_t err = SYS_ERR_OK; 1107 struct pmap_x86 *x86 = (struct pmap_x86*)pmap; 1108 1109 size_t pagesize; 1110 1111 /* work out the number of vnodes we may need and grow the slabs*/ 1112 size_t max_slabs; 1113 if ((flags & VREGION_FLAGS_LARGE)) { 1114 assert(!(vaddr & (LARGE_PAGE_SIZE -1))); 1115 assert(!(bytes & (LARGE_PAGE_SIZE -1))); 1116 pagesize = HUGE_PAGE_SIZE; 1117 max_slabs = max_slabs_for_mapping_huge(bytes); 1118 } else if ((flags & VREGION_FLAGS_HUGE)) { 1119 // case huge pages (1GB) 1120 assert(!(vaddr & (HUGE_PAGE_SIZE -1))); 1121 assert(!(bytes & (HUGE_PAGE_SIZE -1))); 1122 pagesize = HUGE_PAGE_SIZE * 512UL; 1123 max_slabs = (bytes / HUGE_PAGE_SIZE) + 1; 1124 } else { 1125 //case normal pages (4KB) 1126 assert(!(vaddr & (BASE_PAGE_SIZE -1))); 1127 assert(!(bytes & (BASE_PAGE_SIZE -1))); 1128 pagesize = LARGE_PAGE_SIZE; 1129 max_slabs = max_slabs_for_mapping_large(bytes); 1130 } 1131 1132 max_slabs += 6; // minimum amount required to map a region spanning 2 ptables 1133 1134 // Refill slab allocator if necessary 1135 err = pmap_refill_slabs(pmap, max_slabs); 1136 if (err_is_fail(err)) { 1137 return err; 1138 } 1139 1140 /* do the actual creation of the page tables */ 1141 for (size_t va = vaddr; va < (vaddr + bytes); va += pagesize) { 1142 struct vnode *vnode; 1143 if ((flags & VREGION_FLAGS_LARGE)) { 1144 err = get_pdir(x86, va, &vnode); 1145 } else if ((flags & VREGION_FLAGS_HUGE)) { 1146 err = get_pdpt(x86, va, &vnode); 1147 } else { 1148 err = get_ptable(x86, va, &vnode); 1149 } 1150 if (err_is_fail(err)) { 1151 return err; 1152 } 1153 1154 /* map the page-table read only for access to status bits */ 1155 genvaddr_t genvaddr = pmap->m.vregion_offset; 1156 pmap->m.vregion_offset += (genvaddr_t)4096; 1157 1158 assert(pmap->m.vregion_offset < vregion_get_base_addr(&pmap->m.vregion) + 1159 vregion_get_size(&pmap->m.vregion)); 1160 1161 /* copy the page-table capability */ 1162 /* XXX: this should be somewhere in struct vnode */ 1163 struct capref slot; 1164 err = x86->p.slot_alloc->alloc(x86->p.slot_alloc, &slot); 1165 if (err_is_fail(err)) { 1166 return err_push(err, LIB_ERR_SLOT_ALLOC); 1167 } 1168 1169 err = cap_copy(slot, vnode->v.cap); 1170 if (err_is_fail(err)) { 1171 x86->p.slot_alloc->free(x86->p.slot_alloc, slot); 1172 return err; 1173 } 1174 1175 /* get slot for mapping */ 1176 /* XXX: this should be in struct vnode somewhere! */ 1177 struct capref mapping; 1178 err = x86->p.slot_alloc->alloc(x86->p.slot_alloc, &mapping); 1179 if (err_is_fail(err)) { 1180 return err_push(err, LIB_ERR_SLOT_ALLOC); 1181 } 1182 1183 /* get the page table of the reserved range and map the PT */ 1184 struct vnode *ptable; 1185 err = get_ptable(x86, genvaddr, &ptable); 1186 err = vnode_map(ptable->v.cap, slot, X86_64_PTABLE_BASE(genvaddr), 1187 vregion_to_pmap_flag(VREGION_FLAGS_READ), 0, 1, mapping); 1188 1189 if (err_is_fail(err)) { 1190 return err_push(err, LIB_ERR_PMAP_DO_MAP); 1191 } 1192 1193 /* update the vnode structure */ 1194 vnode->is_pinned = 1; 1195 vnode->u.vnode.virt_base = genvaddr; 1196 } 1197 1198 return err; 1199} 1200 1201 1202/* 1203 * returns the virtual address of the leaf pagetable for a mapping 1204 */ 1205static errval_t get_leaf_pt(struct pmap *pmap, genvaddr_t vaddr, lvaddr_t *ret_va) 1206{ 1207 assert(ret_va); 1208 1209 /* walk down the pt hierarchy and stop at the leaf */ 1210 1211 struct vnode *parent = NULL, *current = NULL; 1212 // find page and last-level page table (can be pdir or pdpt) 1213 if ((current = find_pdpt((struct pmap_x86 *)pmap, vaddr)) == NULL) { 1214 return -1; 1215 } 1216 1217 parent = current; 1218 if ((current = pmap_find_vnode(parent, X86_64_PDPT_BASE(vaddr))) == NULL) { 1219 current = parent; 1220 goto out; 1221 } 1222 1223 parent = current; 1224 if ((current = pmap_find_vnode(parent, X86_64_PDIR_BASE(vaddr))) == NULL) { 1225 current = parent; 1226 goto out; 1227 } 1228 1229out: 1230 assert(current && current->v.is_vnode); 1231 1232 *ret_va = current->u.vnode.virt_base; 1233 return SYS_ERR_OK; 1234} 1235 1236static errval_t determine_addr_raw(struct pmap *pmap, size_t size, 1237 size_t alignment, genvaddr_t *retvaddr) 1238{ 1239 struct pmap_x86 *x86 = (struct pmap_x86 *)pmap; 1240 1241 if (alignment == 0) { 1242 alignment = BASE_PAGE_SIZE; 1243 } else { 1244 alignment = ROUND_UP(alignment, BASE_PAGE_SIZE); 1245 } 1246 size = ROUND_UP(size, alignment); 1247 assert(size < 512ul * 1024 * 1024 * 1024); // pml4 size 1248 1249#if defined(PMAP_LL) 1250 struct vnode *walk_pml4 = x86->root.v.u.vnode.children; 1251 assert(walk_pml4 != NULL); // assume there's always at least one existing entry 1252 1253 // try to find free pml4 entry 1254 bool f[512]; 1255 for (int i = 0; i < 512; i++) { 1256 f[i] = true; 1257 } 1258 //debug_printf("entry: %d\n", walk_pml4->entry); 1259 f[walk_pml4->v.entry] = false; 1260 while (walk_pml4) { 1261 //debug_printf("looping over pml4 entries\n"); 1262 assert(walk_pml4->v.is_vnode); 1263 f[walk_pml4->v.entry] = false; 1264 walk_pml4 = walk_pml4->v.meta.next; 1265 } 1266 genvaddr_t first_free = 16; 1267 for (; first_free < 512; first_free++) { 1268 //debug_printf("f[%"PRIuGENVADDR"] = %d\n", first_free, f[first_free]); 1269 if (f[first_free]) { 1270 break; 1271 } 1272 } 1273#elif defined(PMAP_ARRAY) 1274 genvaddr_t first_free = 16; 1275 for (; first_free < X86_64_PTABLE_SIZE; first_free++) { 1276 if (!x86->root.v.u.vnode.children[first_free]) { 1277 break; 1278 } 1279 } 1280#else 1281#error Invalid pmap datastructure 1282#endif 1283 //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free); 1284 if (first_free < X86_64_PTABLE_SIZE) { 1285 //debug_printf("first_free: %"PRIuGENVADDR"\n", first_free); 1286 *retvaddr = first_free << 39; 1287 return SYS_ERR_OK; 1288 } else { 1289 return LIB_ERR_OUT_OF_VIRTUAL_ADDR; 1290 } 1291} 1292 1293static struct pmap_funcs pmap_funcs = { 1294 .determine_addr = pmap_x86_determine_addr, 1295 .determine_addr_raw = determine_addr_raw, 1296 .map = map, 1297 .unmap = unmap, 1298 .lookup = lookup, 1299 .modify_flags = modify_flags, 1300 .serialise = pmap_serialise, 1301 .deserialise = pmap_deserialise, 1302 .dump = dump, 1303 .create_pts_pinned = create_pts_pinned, 1304 .get_leaf_pt = get_leaf_pt, 1305 .measure_res = pmap_x86_measure_res, 1306}; 1307 1308/** 1309 * \brief Initialize a x86 pmap object 1310 * 1311 * \param pmap Pmap object of type x86 1312 */ 1313errval_t pmap_x86_64_init(struct pmap *pmap, struct vspace *vspace, 1314 struct capref vnode, 1315 struct slot_allocator *opt_slot_alloc) 1316{ 1317 struct pmap_x86 *x86 = (struct pmap_x86*)pmap; 1318 1319 /* Generic portion */ 1320 pmap->f = pmap_funcs; 1321 pmap->vspace = vspace; 1322 1323 if (opt_slot_alloc != NULL) { 1324 pmap->slot_alloc = opt_slot_alloc; 1325 } else { /* use default allocator for this dispatcher */ 1326 pmap->slot_alloc = get_default_slot_allocator(); 1327 } 1328 x86->used_cap_slots = 0; 1329 1330 errval_t err; 1331 err = pmap_vnode_mgmt_init(pmap); 1332 if (err_is_fail(err)) { 1333 return err_push(err, LIB_ERR_PMAP_INIT); 1334 } 1335 1336 x86->root.v.type = ObjType_VNode_x86_64_pml4; 1337 x86->root.v.is_vnode = true; 1338 x86->root.v.cap = vnode; 1339 x86->root.v.u.vnode.invokable = vnode; 1340 if (get_croot_addr(vnode) != CPTR_ROOTCN) { 1341 err = slot_alloc(&x86->root.v.u.vnode.invokable); 1342 assert(err_is_ok(err)); 1343 x86->used_cap_slots ++; 1344 err = cap_copy(x86->root.v.u.vnode.invokable, vnode); 1345 assert(err_is_ok(err)); 1346 } 1347 assert(!capref_is_null(x86->root.v.cap)); 1348 assert(!capref_is_null(x86->root.v.u.vnode.invokable)); 1349 pmap_vnode_init(pmap, &x86->root); 1350 x86->root.u.vnode.virt_base = 0; 1351 x86->root.u.vnode.page_table_frame = NULL_CAP; 1352 1353#ifdef GLOBAL_MCN 1354 if (pmap == get_current_pmap()) { 1355 /* 1356 * for now, for our own pmap, we use the left over slot allocator cnode to 1357 * provide the mapping cnode for the first half of the root page table as 1358 * we cannot allocate CNodes before establishing a connection to the 1359 * memory server! 1360 */ 1361 x86->root.u.vnode.mcn[0].cnode = cnode_root; 1362 x86->root.u.vnode.mcn[0].slot = ROOTCN_SLOT_ROOT_MAPPING; 1363 x86->root.u.vnode.mcnode[0].croot = CPTR_ROOTCN; 1364 x86->root.u.vnode.mcnode[0].cnode = ROOTCN_SLOT_ADDR(ROOTCN_SLOT_ROOT_MAPPING); 1365 x86->root.u.vnode.mcnode[0].level = CNODE_TYPE_OTHER; 1366 } else { 1367 err = cnode_create_l2(&x86->root.u.vnode.mcn[0], &x86->root.u.vnode.mcnode[0]); 1368 if (err_is_fail(err)) { 1369 return err_push(err, LIB_ERR_PMAP_ALLOC_CNODE); 1370 } 1371 } 1372#endif 1373 1374 // choose a minimum mappable VA for most domains; enough to catch NULL 1375 // pointer derefs with suitably large offsets 1376 x86->min_mappable_va = 64 * 1024; 1377 1378 // maximum mappable VA is derived from X86_64_MEMORY_OFFSET in kernel 1379 x86->max_mappable_va = (genvaddr_t)0xffffff8000000000; 1380 1381 return SYS_ERR_OK; 1382} 1383 1384errval_t pmap_x86_64_init_ept(struct pmap *pmap, struct vspace *vspace, 1385 struct capref vnode, 1386 struct slot_allocator *opt_slot_alloc) 1387{ 1388 errval_t err; 1389 err = pmap_x86_64_init(pmap, vspace, vnode, opt_slot_alloc); 1390 struct pmap_x86 *x86 = (struct pmap_x86*)pmap; 1391 1392 x86->root.v.type = ObjType_VNode_x86_64_ept_pml4; 1393 1394 return err; 1395} 1396 1397/** 1398 * \brief Initialize the current pmap. Reserve space for metadata 1399 * 1400 * This code is coupled with #vspace_current_init() 1401 */ 1402errval_t pmap_x86_64_current_init(bool init_domain) 1403{ 1404 struct pmap_x86 *x86 = (struct pmap_x86*)get_current_pmap(); 1405 1406 pmap_vnode_mgmt_current_init((struct pmap *)x86); 1407 1408 // We don't know the vnode layout for the first part of our address space 1409 // (which was setup by the kernel), so we avoid mapping there until told it. 1410 x86->min_mappable_va = x86->p.m.vregion.base; 1411 1412 return SYS_ERR_OK; 1413} 1414