/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
	#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#else
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
	#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
	#define PT_MAX_FULL_LEVELS 2
#else
	#error Invalid PTTYPE value
#endif
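/*
 * A minimal sketch of how this template is meant to be instantiated.  The
 * actual include site lives in the companion mmu code (mmu.c); the lines
 * below are an illustration based on the macros above, not copied from
 * that file:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * Each inclusion produces a full set of paging64_*() or paging32_*()
 * functions via the FNAME() macro.
 */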
/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t *table;
	pt_element_t *ptep;
	pt_element_t inherited_ar;
	gfn_t gfn;
	u32 error_code;
};

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr,
			    int write_fault, int user_fault, int fetch_fault)
{
	hpa_t hpa;
	struct kvm_memory_slot *slot;
	pt_element_t *ptep;
	pt_element_t root;
	gfn_t table_gfn;

	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
	walker->level = vcpu->mmu.root_level;
	walker->table = NULL;
	root = vcpu->cr3;
#if PTTYPE == 64
	if (!is_long_mode(vcpu)) {
		walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
		root = *walker->ptep;
		if (!(root & PT_PRESENT_MASK))
			goto not_present;
		--walker->level;
	}
#endif
	table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
	walker->table_gfn[walker->level - 1] = table_gfn;
	pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
		 walker->level - 1, table_gfn);
	slot = gfn_to_memslot(vcpu->kvm, table_gfn);
	hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
	walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);

	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);

	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;

	for (;;) {
		int index = PT_INDEX(addr, walker->level);
		hpa_t paddr;

		ptep = &walker->table[index];
		ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
		       ((unsigned long)ptep & PAGE_MASK));

		if (!is_present_pte(*ptep))
			goto not_present;

		if (write_fault && !is_writeble_pte(*ptep))
			if (user_fault || is_write_protection(vcpu))
				goto access_error;

		if (user_fault && !(*ptep & PT_USER_MASK))
			goto access_error;

#if PTTYPE == 64
		if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
			goto access_error;
#endif

		if (!(*ptep & PT_ACCESSED_MASK)) {
			mark_page_dirty(vcpu->kvm, table_gfn);
			*ptep |= PT_ACCESSED_MASK;
		}

		if (walker->level == PT_PAGE_TABLE_LEVEL) {
			walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			break;
		}

		if (walker->level == PT_DIRECTORY_LEVEL
		    && (*ptep & PT_PAGE_SIZE_MASK)
		    && (PTTYPE == 64 || is_pse(vcpu))) {
			walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
			break;
		}

		walker->inherited_ar &= walker->table[index];
		table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
		paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
		kunmap_atomic(walker->table, KM_USER0);
		walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
					    KM_USER0);
		--walker->level;
		walker->table_gfn[walker->level - 1] = table_gfn;
		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
			 walker->level - 1, table_gfn);
	}
	walker->ptep = ptep;
	pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
	return 1;

not_present:
	walker->error_code = 0;
	goto err;

access_error:
	walker->error_code = PFERR_PRESENT_MASK;

err:
	if (write_fault)
		walker->error_code |= PFERR_WRITE_MASK;
	if (user_fault)
		walker->error_code |= PFERR_USER_MASK;
	if (fetch_fault)
		walker->error_code |= PFERR_FETCH_MASK;
	return 0;
}
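/*
 * The PFERR_*_MASK values used above are defined elsewhere in the kvm
 * headers.  As a rough guide (an assumption about those definitions, based
 * on the x86 architectural page-fault error code layout, not copied from
 * the headers):
 *
 *	PFERR_PRESENT_MASK	(1 << 0)   fault on a present pte
 *	PFERR_WRITE_MASK	(1 << 1)   fault caused by a write
 *	PFERR_USER_MASK		(1 << 2)   fault while in user mode
 *	PFERR_FETCH_MASK	(1 << 4)   fault on an instruction fetch
 *
 * walk_addr() assembles the error code the guest's own page tables would
 * have produced, so that page_fault() can inject it back into the guest.
 */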
static void FNAME(release_walker)(struct guest_walker *walker)
{
	if (walker->table)
		kunmap_atomic(walker->table, KM_USER0);
}

static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
					struct guest_walker *walker)
{
	mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
}

static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
			   u64 *shadow_pte, u64 access_bits, gfn_t gfn)
{
	ASSERT(*shadow_pte == 0);
	access_bits &= guest_pte;
	*shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
	set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
		       guest_pte & PT_DIRTY_MASK, access_bits, gfn);
}

static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
			   u64 *shadow_pte, u64 access_bits, gfn_t gfn)
{
	gpa_t gaddr;

	ASSERT(*shadow_pte == 0);
	access_bits &= guest_pde;
	gaddr = (gpa_t)gfn << PAGE_SHIFT;
	if (PTTYPE == 32 && is_cpuid_PSE36())
		gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
			(32 - PT32_DIR_PSE36_SHIFT);
	*shadow_pte = guest_pde & PT_PTE_COPY_MASK;
	set_pte_common(vcpu, shadow_pte, gaddr,
		       guest_pde & PT_DIRTY_MASK, access_bits, gfn);
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *walker)
{
	hpa_t shadow_addr;
	int level;
	u64 *prev_shadow_ent = NULL;
	pt_element_t *guest_ent = walker->ptep;

	if (!is_present_pte(*guest_ent))
		return NULL;

	shadow_addr = vcpu->mmu.root_hpa;
	level = vcpu->mmu.shadow_root_level;
	if (level == PT32E_ROOT_LEVEL) {
		shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
		shadow_addr &= PT64_BASE_ADDR_MASK;
		--level;
	}

	for (; ; level--) {
		u32 index = SHADOW_PT_INDEX(addr, level);
		u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
		struct kvm_mmu_page *shadow_page;
		u64 shadow_pte;
		int metaphysical;
		gfn_t table_gfn;
		unsigned hugepage_access = 0;

		if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
			if (level == PT_PAGE_TABLE_LEVEL)
				return shadow_ent;
			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
			prev_shadow_ent = shadow_ent;
			continue;
		}

		if (level == PT_PAGE_TABLE_LEVEL) {

			if (walker->level == PT_DIRECTORY_LEVEL) {
				if (prev_shadow_ent)
					*prev_shadow_ent |= PT_SHADOW_PS_MARK;
				FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
					       walker->inherited_ar,
					       walker->gfn);
			} else {
				ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
				FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
					       walker->inherited_ar,
					       walker->gfn);
			}
			return shadow_ent;
		}

		if (level - 1 == PT_PAGE_TABLE_LEVEL
		    && walker->level == PT_DIRECTORY_LEVEL) {
			metaphysical = 1;
			hugepage_access = *guest_ent;
			hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
			hugepage_access >>= PT_WRITABLE_SHIFT;
			table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
		} else {
			metaphysical = 0;
			table_gfn = walker->table_gfn[level - 2];
		}
		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
					       metaphysical, hugepage_access,
					       shadow_ent);
		shadow_addr = shadow_page->page_hpa;
		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
			| PT_WRITABLE_MASK | PT_USER_MASK;
		*shadow_ent = shadow_pte;
		prev_shadow_ent = shadow_ent;
	}
}
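/*
 * For reference, the shadow walk above always uses the 64-bit pte format,
 * so SHADOW_PT_INDEX() extracts nine index bits per level from the faulting
 * address.  This is the standard x86 4-level decomposition (the actual
 * PT64_INDEX() definition lives with the rest of the mmu code, so treat
 * this as an illustration rather than the definitive macro expansion):
 *
 *	level 4 (PML4)	addr bits 47:39
 *	level 3 (PDPT)	addr bits 38:30
 *	level 2 (PD)	addr bits 29:21
 *	level 1 (PT)	addr bits 20:12
 *
 * PT_INDEX(), by contrast, follows the guest's own pte size: nine bits per
 * level for 64-bit/PAE guests, ten bits per level for 32-bit non-PAE guests.
 */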
/*
 * The guest faulted for write.  We need to
 *
 * - check write permissions
 * - update the guest pte dirty bit
 * - update our own dirty page tracking structures
 */
static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
			       u64 *shadow_ent,
			       struct guest_walker *walker,
			       gva_t addr,
			       int user,
			       int *write_pt)
{
	pt_element_t *guest_ent;
	int writable_shadow;
	gfn_t gfn;
	struct kvm_mmu_page *page;

	if (is_writeble_pte(*shadow_ent))
		return !user || (*shadow_ent & PT_USER_MASK);

	writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
	if (user) {
		/*
		 * User mode access.  Fail if it's a kernel page or a
		 * read-only page.
		 */
		if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
			return 0;
		ASSERT(*shadow_ent & PT_USER_MASK);
	} else
		/*
		 * Kernel mode access.  Fail if it's a read-only page and
		 * supervisor write protection is enabled.
		 */
		if (!writable_shadow) {
			if (is_write_protection(vcpu))
				return 0;
			*shadow_ent &= ~PT_USER_MASK;
		}

	guest_ent = walker->ptep;

	if (!is_present_pte(*guest_ent)) {
		*shadow_ent = 0;
		return 0;
	}

	gfn = walker->gfn;

	if (user) {
		/*
		 * Usermode page faults won't be for page table updates.
		 */
		while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
			pgprintk("%s: zap %lx %x\n",
				 __FUNCTION__, gfn, page->role.word);
			kvm_mmu_zap_page(vcpu, page);
		}
	} else if (kvm_mmu_lookup_page(vcpu, gfn)) {
		pgprintk("%s: found shadow page for %lx, marking ro\n",
			 __FUNCTION__, gfn);
		mark_page_dirty(vcpu->kvm, gfn);
		FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
		*guest_ent |= PT_DIRTY_MASK;
		*write_pt = 1;
		return 0;
	}
	mark_page_dirty(vcpu->kvm, gfn);
	*shadow_ent |= PT_WRITABLE_MASK;
	FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
	*guest_ent |= PT_DIRTY_MASK;
	rmap_add(vcpu, shadow_ent);

	return 1;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 * - there is no shadow pte for the guest pte
 * - write access through a shadow pte marked read only so that we can set
 *   the dirty bit
 * - write access to a shadow pte marked read only so we can update the page
 *   dirty bitmap, when userspace requests it
 * - mmio access; in this case we will never install a present shadow pte
 * - normal guest page fault due to the guest pte marked not present, not
 *   writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
			     u32 error_code)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	int fetch_fault = error_code & PFERR_FETCH_MASK;
	struct guest_walker walker;
	u64 *shadow_pte;
	int fixed;
	int write_pt = 0;
	int r;

	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
	kvm_mmu_audit(vcpu, "pre page fault");

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the shadow pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
			     fetch_fault);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __FUNCTION__);
		inject_page_fault(vcpu, addr, walker.error_code);
		FNAME(release_walker)(&walker);
		return 0;
	}

	shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
	pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
		 shadow_pte, *shadow_pte);

	/*
	 * Update the shadow pte.
	 */
	if (write_fault)
		fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
					    user_fault, &write_pt);
	else
		fixed = fix_read_pf(shadow_pte);

	pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
		 shadow_pte, *shadow_pte);

	FNAME(release_walker)(&walker);

	/*
	 * mmio: emulate if accessible, otherwise it's a guest fault.
	 */
	if (is_io_pte(*shadow_pte))
		return 1;

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, "post page fault (fixed)");

	return write_pt;
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);

	if (r) {
		gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
		gpa |= vaddr & ~PAGE_MASK;
	}

	FNAME(release_walker)(&walker);
	return gpa;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_MAX_FULL_LEVELS
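/*
 * After each inclusion, the generated entry points are hooked into the
 * per-vcpu mmu context by the initialization code in mmu.c.  A minimal
 * sketch, assuming the usual kvm_mmu callback fields (not copied verbatim
 * from that file):
 *
 *	context->page_fault = paging64_page_fault;
 *	context->gva_to_gpa = paging64_gva_to_gpa;
 *
 * or the paging32_*() variants for a 32-bit non-PAE guest; the rest of kvm
 * only ever calls these functions through those callbacks.
 */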