1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 *
12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com>
14 * Avi Kivity <avi@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 */
20
21#include "mmu.h"
22#include "x86.h"
23#include "kvm_cache_regs.h"
24
25#include <linux/kvm_host.h>
26#include <linux/types.h>
27#include <linux/string.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/module.h>
31#include <linux/swap.h>
32#include <linux/hugetlb.h>
33#include <linux/compiler.h>
34#include <linux/srcu.h>
35#include <linux/slab.h>
36#include <linux/uaccess.h>
37
38#include <asm/page.h>
39#include <asm/cmpxchg.h>
40#include <asm/io.h>
41#include <asm/vmx.h>
42
43/*
44 * Setting this variable to true enables Two-Dimensional Paging (TDP),
45 * where the hardware walks 2 page tables:
46 * 1. the guest-virtual to guest-physical
47 * 2. while doing 1., it also walks guest-physical to host-physical
48 * If the hardware supports that, we don't need to do shadow paging.
49 */
50bool tdp_enabled = false;
51
52#undef MMU_DEBUG
53
54#undef AUDIT
55
56#ifdef AUDIT
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61
62#ifdef MMU_DEBUG
63
64#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
65#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
66
67#else
68
69#define pgprintk(x...) do { } while (0)
70#define rmap_printk(x...) 
do { } while (0) 71 72#endif 73 74#if defined(MMU_DEBUG) || defined(AUDIT) 75static int dbg = 0; 76module_param(dbg, bool, 0644); 77#endif 78 79static int oos_shadow = 1; 80module_param(oos_shadow, bool, 0644); 81 82#ifndef MMU_DEBUG 83#define ASSERT(x) do { } while (0) 84#else 85#define ASSERT(x) \ 86 if (!(x)) { \ 87 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ 88 __FILE__, __LINE__, #x); \ 89 } 90#endif 91 92#define PT_FIRST_AVAIL_BITS_SHIFT 9 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 94 95#define PT64_LEVEL_BITS 9 96 97#define PT64_LEVEL_SHIFT(level) \ 98 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 99 100#define PT64_LEVEL_MASK(level) \ 101 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) 102 103#define PT64_INDEX(address, level)\ 104 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 105 106 107#define PT32_LEVEL_BITS 10 108 109#define PT32_LEVEL_SHIFT(level) \ 110 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 111 112#define PT32_LEVEL_MASK(level) \ 113 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) 114#define PT32_LVL_OFFSET_MASK(level) \ 115 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 116 * PT32_LEVEL_BITS))) - 1)) 117 118#define PT32_INDEX(address, level)\ 119 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 120 121 122#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 123#define PT64_DIR_BASE_ADDR_MASK \ 124 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 125#define PT64_LVL_ADDR_MASK(level) \ 126 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 127 * PT64_LEVEL_BITS))) - 1)) 128#define PT64_LVL_OFFSET_MASK(level) \ 129 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 130 * PT64_LEVEL_BITS))) - 1)) 131 132#define PT32_BASE_ADDR_MASK PAGE_MASK 133#define PT32_DIR_BASE_ADDR_MASK \ 134 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) 135#define PT32_LVL_ADDR_MASK(level) \ 136 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 137 * PT32_LEVEL_BITS))) - 1)) 138 139#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 140 | PT64_NX_MASK) 141 142#define RMAP_EXT 4 143 144#define ACC_EXEC_MASK 1 145#define ACC_WRITE_MASK PT_WRITABLE_MASK 146#define ACC_USER_MASK PT_USER_MASK 147#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 148 149#include <trace/events/kvm.h> 150 151#define CREATE_TRACE_POINTS 152#include "mmutrace.h" 153 154#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 155 156#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 157 158struct kvm_rmap_desc { 159 u64 *sptes[RMAP_EXT]; 160 struct kvm_rmap_desc *more; 161}; 162 163struct kvm_shadow_walk_iterator { 164 u64 addr; 165 hpa_t shadow_addr; 166 int level; 167 u64 *sptep; 168 unsigned index; 169}; 170 171#define for_each_shadow_entry(_vcpu, _addr, _walker) \ 172 for (shadow_walk_init(&(_walker), _vcpu, _addr); \ 173 shadow_walk_okay(&(_walker)); \ 174 shadow_walk_next(&(_walker))) 175 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 177 178static struct kmem_cache *pte_chain_cache; 179static struct kmem_cache *rmap_desc_cache; 180static struct kmem_cache *mmu_page_header_cache; 181 182static u64 __read_mostly shadow_trap_nonpresent_pte; 183static u64 __read_mostly shadow_notrap_nonpresent_pte; 184static u64 __read_mostly shadow_base_present_pte; 185static u64 __read_mostly shadow_nx_mask; 186static u64 __read_mostly shadow_x_mask; /* mutual exclusive 
with nx_mask */ 187static u64 __read_mostly shadow_user_mask; 188static u64 __read_mostly shadow_accessed_mask; 189static u64 __read_mostly shadow_dirty_mask; 190 191static inline u64 rsvd_bits(int s, int e) 192{ 193 return ((1ULL << (e - s + 1)) - 1) << s; 194} 195 196void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 197{ 198 shadow_trap_nonpresent_pte = trap_pte; 199 shadow_notrap_nonpresent_pte = notrap_pte; 200} 201EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 202 203void kvm_mmu_set_base_ptes(u64 base_pte) 204{ 205 shadow_base_present_pte = base_pte; 206} 207EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 208 209void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 210 u64 dirty_mask, u64 nx_mask, u64 x_mask) 211{ 212 shadow_user_mask = user_mask; 213 shadow_accessed_mask = accessed_mask; 214 shadow_dirty_mask = dirty_mask; 215 shadow_nx_mask = nx_mask; 216 shadow_x_mask = x_mask; 217} 218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 219 220static bool is_write_protection(struct kvm_vcpu *vcpu) 221{ 222 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 223} 224 225static int is_cpuid_PSE36(void) 226{ 227 return 1; 228} 229 230static int is_nx(struct kvm_vcpu *vcpu) 231{ 232 return vcpu->arch.efer & EFER_NX; 233} 234 235static int is_shadow_present_pte(u64 pte) 236{ 237 return pte != shadow_trap_nonpresent_pte 238 && pte != shadow_notrap_nonpresent_pte; 239} 240 241static int is_large_pte(u64 pte) 242{ 243 return pte & PT_PAGE_SIZE_MASK; 244} 245 246static int is_writable_pte(unsigned long pte) 247{ 248 return pte & PT_WRITABLE_MASK; 249} 250 251static int is_dirty_gpte(unsigned long pte) 252{ 253 return pte & PT_DIRTY_MASK; 254} 255 256static int is_rmap_spte(u64 pte) 257{ 258 return is_shadow_present_pte(pte); 259} 260 261static int is_last_spte(u64 pte, int level) 262{ 263 if (level == PT_PAGE_TABLE_LEVEL) 264 return 1; 265 if (is_large_pte(pte)) 266 return 1; 267 return 0; 268} 269 270static pfn_t spte_to_pfn(u64 pte) 271{ 272 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 273} 274 275static gfn_t pse36_gfn_delta(u32 gpte) 276{ 277 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; 278 279 return (gpte & PT32_DIR_PSE36_MASK) << shift; 280} 281 282static void __set_spte(u64 *sptep, u64 spte) 283{ 284 set_64bit(sptep, spte); 285} 286 287static u64 __xchg_spte(u64 *sptep, u64 new_spte) 288{ 289#ifdef CONFIG_X86_64 290 return xchg(sptep, new_spte); 291#else 292 u64 old_spte; 293 294 do { 295 old_spte = *sptep; 296 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 297 298 return old_spte; 299#endif 300} 301 302static void update_spte(u64 *sptep, u64 new_spte) 303{ 304 u64 old_spte; 305 306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 307 !is_rmap_spte(*sptep)) 308 __set_spte(sptep, new_spte); 309 else { 310 old_spte = __xchg_spte(sptep, new_spte); 311 if (old_spte & shadow_accessed_mask) 312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 313 } 314} 315 316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 317 struct kmem_cache *base_cache, int min) 318{ 319 void *obj; 320 321 if (cache->nobjs >= min) 322 return 0; 323 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 324 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); 325 if (!obj) 326 return -ENOMEM; 327 cache->objects[cache->nobjs++] = obj; 328 } 329 return 0; 330} 331 332static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, 333 struct kmem_cache *cache) 334{ 335 while (mc->nobjs) 336 kmem_cache_free(cache, mc->objects[--mc->nobjs]); 337} 338 
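/*
 * Note on the memory caches above (a summary of how this file uses them;
 * see the fault paths below). mmu_topup_memory_caches() pre-fills the
 * per-vcpu object caches with GFP_KERNEL allocations while sleeping is
 * still allowed, and the fault handlers top up before taking mmu_lock,
 * roughly:
 *
 *	r = mmu_topup_memory_caches(vcpu);	/- may sleep -/
 *	spin_lock(&vcpu->kvm->mmu_lock);
 *	... kvm_mmu_get_page() -> mmu_memory_cache_alloc() ...	/- must not sleep -/
 *	spin_unlock(&vcpu->kvm->mmu_lock);
 *
 * so mmu_memory_cache_alloc() can hand out objects under the lock without
 * sleeping or failing. The helper below does the same for whole pages,
 * which back the shadow page tables themselves (and the gfns array) and
 * are handed out via page_address().
 */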
339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 340 int min) 341{ 342 struct page *page; 343 344 if (cache->nobjs >= min) 345 return 0; 346 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 347 page = alloc_page(GFP_KERNEL); 348 if (!page) 349 return -ENOMEM; 350 cache->objects[cache->nobjs++] = page_address(page); 351 } 352 return 0; 353} 354 355static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) 356{ 357 while (mc->nobjs) 358 free_page((unsigned long)mc->objects[--mc->nobjs]); 359} 360 361static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) 362{ 363 int r; 364 365 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, 366 pte_chain_cache, 4); 367 if (r) 368 goto out; 369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 370 rmap_desc_cache, 4); 371 if (r) 372 goto out; 373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 374 if (r) 375 goto out; 376 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 377 mmu_page_header_cache, 4); 378out: 379 return r; 380} 381 382static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 383{ 384 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 385 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 386 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 387 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 388 mmu_page_header_cache); 389} 390 391static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 392 size_t size) 393{ 394 void *p; 395 396 BUG_ON(!mc->nobjs); 397 p = mc->objects[--mc->nobjs]; 398 return p; 399} 400 401static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) 402{ 403 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, 404 sizeof(struct kvm_pte_chain)); 405} 406 407static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 408{ 409 kmem_cache_free(pte_chain_cache, pc); 410} 411 412static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 413{ 414 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, 415 sizeof(struct kvm_rmap_desc)); 416} 417 418static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 419{ 420 kmem_cache_free(rmap_desc_cache, rd); 421} 422 423static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 424{ 425 if (!sp->role.direct) 426 return sp->gfns[index]; 427 428 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); 429} 430 431static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) 432{ 433 if (sp->role.direct) 434 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); 435 else 436 sp->gfns[index] = gfn; 437} 438 439/* 440 * Return the pointer to the largepage write count for a given 441 * gfn, handling slots that are not large page aligned. 
442 */
443static int *slot_largepage_idx(gfn_t gfn,
444 struct kvm_memory_slot *slot,
445 int level)
446{
447 unsigned long idx;
448
449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451 return &slot->lpage_info[level - 2][idx].write_count;
452}
453
454static void account_shadowed(struct kvm *kvm, gfn_t gfn)
455{
456 struct kvm_memory_slot *slot;
457 int *write_count;
458 int i;
459
460 slot = gfn_to_memslot(kvm, gfn);
461 for (i = PT_DIRECTORY_LEVEL;
462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
463 write_count = slot_largepage_idx(gfn, slot, i);
464 *write_count += 1;
465 }
466}
467
468static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
469{
470 struct kvm_memory_slot *slot;
471 int *write_count;
472 int i;
473
474 slot = gfn_to_memslot(kvm, gfn);
475 for (i = PT_DIRECTORY_LEVEL;
476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
477 write_count = slot_largepage_idx(gfn, slot, i);
478 *write_count -= 1;
479 WARN_ON(*write_count < 0);
480 }
481}
482
483static int has_wrprotected_page(struct kvm *kvm,
484 gfn_t gfn,
485 int level)
486{
487 struct kvm_memory_slot *slot;
488 int *largepage_idx;
489
490 slot = gfn_to_memslot(kvm, gfn);
491 if (slot) {
492 largepage_idx = slot_largepage_idx(gfn, slot, level);
493 return *largepage_idx;
494 }
495
496 return 1;
497}
498
499static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
500{
501 unsigned long page_size;
502 int i, ret = 0;
503
504 page_size = kvm_host_page_size(kvm, gfn);
505
506 for (i = PT_PAGE_TABLE_LEVEL;
507 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
508 if (page_size >= KVM_HPAGE_SIZE(i))
509 ret = i;
510 else
511 break;
512 }
513
514 return ret;
515}
516
517static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
518{
519 struct kvm_memory_slot *slot;
520 int host_level, level, max_level;
521
522 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
523 if (slot && slot->dirty_bitmap)
524 return PT_PAGE_TABLE_LEVEL;
525
526 host_level = host_mapping_level(vcpu->kvm, large_gfn);
527
528 if (host_level == PT_PAGE_TABLE_LEVEL)
529 return host_level;
530
531 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
532 kvm_x86_ops->get_lpage_level() : host_level;
533
534 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
535 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
536 break;
537
538 return level - 1;
539}
540
541/*
542 * Take gfn and return the reverse mapping to it.
543 */
544
545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546{
547 struct kvm_memory_slot *slot;
548 unsigned long idx;
549
550 slot = gfn_to_memslot(kvm, gfn);
551 if (likely(level == PT_PAGE_TABLE_LEVEL))
552 return &slot->rmap[gfn - slot->base_gfn];
553
554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
556
557 return &slot->lpage_info[level - 2][idx].rmap_pde;
558}
559
560/*
561 * Reverse mapping data structures:
562 *
563 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
564 * that points to page_address(page).
565 *
566 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
567 * containing more mappings.
568 *
569 * Returns the number of rmap entries before the spte was added or zero if
570 * the spte was not added.
571 * 572 */ 573static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 574{ 575 struct kvm_mmu_page *sp; 576 struct kvm_rmap_desc *desc; 577 unsigned long *rmapp; 578 int i, count = 0; 579 580 if (!is_rmap_spte(*spte)) 581 return count; 582 sp = page_header(__pa(spte)); 583 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 584 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 585 if (!*rmapp) { 586 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 587 *rmapp = (unsigned long)spte; 588 } else if (!(*rmapp & 1)) { 589 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); 590 desc = mmu_alloc_rmap_desc(vcpu); 591 desc->sptes[0] = (u64 *)*rmapp; 592 desc->sptes[1] = spte; 593 *rmapp = (unsigned long)desc | 1; 594 } else { 595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 597 while (desc->sptes[RMAP_EXT-1] && desc->more) { 598 desc = desc->more; 599 count += RMAP_EXT; 600 } 601 if (desc->sptes[RMAP_EXT-1]) { 602 desc->more = mmu_alloc_rmap_desc(vcpu); 603 desc = desc->more; 604 } 605 for (i = 0; desc->sptes[i]; ++i) 606 ; 607 desc->sptes[i] = spte; 608 } 609 return count; 610} 611 612static void rmap_desc_remove_entry(unsigned long *rmapp, 613 struct kvm_rmap_desc *desc, 614 int i, 615 struct kvm_rmap_desc *prev_desc) 616{ 617 int j; 618 619 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) 620 ; 621 desc->sptes[i] = desc->sptes[j]; 622 desc->sptes[j] = NULL; 623 if (j != 0) 624 return; 625 if (!prev_desc && !desc->more) 626 *rmapp = (unsigned long)desc->sptes[0]; 627 else 628 if (prev_desc) 629 prev_desc->more = desc->more; 630 else 631 *rmapp = (unsigned long)desc->more | 1; 632 mmu_free_rmap_desc(desc); 633} 634 635static void rmap_remove(struct kvm *kvm, u64 *spte) 636{ 637 struct kvm_rmap_desc *desc; 638 struct kvm_rmap_desc *prev_desc; 639 struct kvm_mmu_page *sp; 640 gfn_t gfn; 641 unsigned long *rmapp; 642 int i; 643 644 sp = page_header(__pa(spte)); 645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 647 if (!*rmapp) { 648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 649 BUG(); 650 } else if (!(*rmapp & 1)) { 651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 652 if ((u64 *)*rmapp != spte) { 653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 654 spte, *spte); 655 BUG(); 656 } 657 *rmapp = 0; 658 } else { 659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 661 prev_desc = NULL; 662 while (desc) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) 664 if (desc->sptes[i] == spte) { 665 rmap_desc_remove_entry(rmapp, 666 desc, i, 667 prev_desc); 668 return; 669 } 670 prev_desc = desc; 671 desc = desc->more; 672 } 673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 674 BUG(); 675 } 676} 677 678static void set_spte_track_bits(u64 *sptep, u64 new_spte) 679{ 680 pfn_t pfn; 681 u64 old_spte = *sptep; 682 683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 684 old_spte & shadow_accessed_mask) { 685 __set_spte(sptep, new_spte); 686 } else 687 old_spte = __xchg_spte(sptep, new_spte); 688 689 if (!is_rmap_spte(old_spte)) 690 return; 691 pfn = spte_to_pfn(old_spte); 692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 693 kvm_set_pfn_accessed(pfn); 694 if (is_writable_pte(old_spte)) 695 kvm_set_pfn_dirty(pfn); 696} 697 698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 699{ 700 set_spte_track_bits(sptep, 
new_spte); 701 rmap_remove(kvm, sptep); 702} 703 704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 705{ 706 struct kvm_rmap_desc *desc; 707 u64 *prev_spte; 708 int i; 709 710 if (!*rmapp) 711 return NULL; 712 else if (!(*rmapp & 1)) { 713 if (!spte) 714 return (u64 *)*rmapp; 715 return NULL; 716 } 717 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 718 prev_spte = NULL; 719 while (desc) { 720 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 721 if (prev_spte == spte) 722 return desc->sptes[i]; 723 prev_spte = desc->sptes[i]; 724 } 725 desc = desc->more; 726 } 727 return NULL; 728} 729 730static int rmap_write_protect(struct kvm *kvm, u64 gfn) 731{ 732 unsigned long *rmapp; 733 u64 *spte; 734 int i, write_protected = 0; 735 736 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 737 738 spte = rmap_next(kvm, rmapp, NULL); 739 while (spte) { 740 BUG_ON(!spte); 741 BUG_ON(!(*spte & PT_PRESENT_MASK)); 742 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 743 if (is_writable_pte(*spte)) { 744 update_spte(spte, *spte & ~PT_WRITABLE_MASK); 745 write_protected = 1; 746 } 747 spte = rmap_next(kvm, rmapp, spte); 748 } 749 if (write_protected) { 750 pfn_t pfn; 751 752 spte = rmap_next(kvm, rmapp, NULL); 753 pfn = spte_to_pfn(*spte); 754 kvm_set_pfn_dirty(pfn); 755 } 756 757 /* check for huge page mappings */ 758 for (i = PT_DIRECTORY_LEVEL; 759 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 760 rmapp = gfn_to_rmap(kvm, gfn, i); 761 spte = rmap_next(kvm, rmapp, NULL); 762 while (spte) { 763 BUG_ON(!spte); 764 BUG_ON(!(*spte & PT_PRESENT_MASK)); 765 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 766 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 767 if (is_writable_pte(*spte)) { 768 drop_spte(kvm, spte, 769 shadow_trap_nonpresent_pte); 770 --kvm->stat.lpages; 771 spte = NULL; 772 write_protected = 1; 773 } 774 spte = rmap_next(kvm, rmapp, spte); 775 } 776 } 777 778 return write_protected; 779} 780 781static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 782 unsigned long data) 783{ 784 u64 *spte; 785 int need_tlb_flush = 0; 786 787 while ((spte = rmap_next(kvm, rmapp, NULL))) { 788 BUG_ON(!(*spte & PT_PRESENT_MASK)); 789 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 790 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 791 need_tlb_flush = 1; 792 } 793 return need_tlb_flush; 794} 795 796static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 797 unsigned long data) 798{ 799 int need_flush = 0; 800 u64 *spte, new_spte; 801 pte_t *ptep = (pte_t *)data; 802 pfn_t new_pfn; 803 804 WARN_ON(pte_huge(*ptep)); 805 new_pfn = pte_pfn(*ptep); 806 spte = rmap_next(kvm, rmapp, NULL); 807 while (spte) { 808 BUG_ON(!is_shadow_present_pte(*spte)); 809 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 810 need_flush = 1; 811 if (pte_write(*ptep)) { 812 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 813 spte = rmap_next(kvm, rmapp, NULL); 814 } else { 815 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 816 new_spte |= (u64)new_pfn << PAGE_SHIFT; 817 818 new_spte &= ~PT_WRITABLE_MASK; 819 new_spte &= ~SPTE_HOST_WRITEABLE; 820 new_spte &= ~shadow_accessed_mask; 821 set_spte_track_bits(spte, new_spte); 822 spte = rmap_next(kvm, rmapp, spte); 823 } 824 } 825 if (need_flush) 826 kvm_flush_remote_tlbs(kvm); 827 828 return 0; 829} 830 831static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 832 unsigned long data, 833 int (*handler)(struct kvm 
*kvm, unsigned long *rmapp, 834 unsigned long data)) 835{ 836 int i, j; 837 int ret; 838 int retval = 0; 839 struct kvm_memslots *slots; 840 841 slots = kvm_memslots(kvm); 842 843 for (i = 0; i < slots->nmemslots; i++) { 844 struct kvm_memory_slot *memslot = &slots->memslots[i]; 845 unsigned long start = memslot->userspace_addr; 846 unsigned long end; 847 848 end = start + (memslot->npages << PAGE_SHIFT); 849 if (hva >= start && hva < end) { 850 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 851 852 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 853 854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 855 unsigned long idx; 856 int sh; 857 858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 859 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 860 (memslot->base_gfn >> sh); 861 ret |= handler(kvm, 862 &memslot->lpage_info[j][idx].rmap_pde, 863 data); 864 } 865 trace_kvm_age_page(hva, memslot, ret); 866 retval |= ret; 867 } 868 } 869 870 return retval; 871} 872 873int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 874{ 875 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 876} 877 878void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 879{ 880 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 881} 882 883static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 884 unsigned long data) 885{ 886 u64 *spte; 887 int young = 0; 888 889 /* 890 * Emulate the accessed bit for EPT, by checking if this page has 891 * an EPT mapping, and clearing it if it does. On the next access, 892 * a new EPT mapping will be established. 893 * This has some overhead, but not as much as the cost of swapping 894 * out actively used pages or breaking up actively used hugepages. 895 */ 896 if (!shadow_accessed_mask) 897 return kvm_unmap_rmapp(kvm, rmapp, data); 898 899 spte = rmap_next(kvm, rmapp, NULL); 900 while (spte) { 901 int _young; 902 u64 _spte = *spte; 903 BUG_ON(!(_spte & PT_PRESENT_MASK)); 904 _young = _spte & PT_ACCESSED_MASK; 905 if (_young) { 906 young = 1; 907 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 908 } 909 spte = rmap_next(kvm, rmapp, spte); 910 } 911 return young; 912} 913 914#define RMAP_RECYCLE_THRESHOLD 1000 915 916static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 917{ 918 unsigned long *rmapp; 919 struct kvm_mmu_page *sp; 920 921 sp = page_header(__pa(spte)); 922 923 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 924 925 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 926 kvm_flush_remote_tlbs(vcpu->kvm); 927} 928 929int kvm_age_hva(struct kvm *kvm, unsigned long hva) 930{ 931 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 932} 933 934#ifdef MMU_DEBUG 935static int is_empty_shadow_page(u64 *spt) 936{ 937 u64 *pos; 938 u64 *end; 939 940 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 941 if (is_shadow_present_pte(*pos)) { 942 printk(KERN_ERR "%s: %p %llx\n", __func__, 943 pos, *pos); 944 return 0; 945 } 946 return 1; 947} 948#endif 949 950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 951{ 952 ASSERT(is_empty_shadow_page(sp->spt)); 953 hlist_del(&sp->hash_link); 954 list_del(&sp->link); 955 __free_page(virt_to_page(sp->spt)); 956 if (!sp->role.direct) 957 __free_page(virt_to_page(sp->gfns)); 958 kmem_cache_free(mmu_page_header_cache, sp); 959 ++kvm->arch.n_free_mmu_pages; 960} 961 962static unsigned kvm_page_table_hashfn(gfn_t gfn) 963{ 964 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 965} 966 967static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 968 
u64 *parent_pte, int direct) 969{ 970 struct kvm_mmu_page *sp; 971 972 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 973 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 974 if (!direct) 975 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 976 PAGE_SIZE); 977 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 978 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 980 sp->multimapped = 0; 981 sp->parent_pte = parent_pte; 982 --vcpu->kvm->arch.n_free_mmu_pages; 983 return sp; 984} 985 986static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 987 struct kvm_mmu_page *sp, u64 *parent_pte) 988{ 989 struct kvm_pte_chain *pte_chain; 990 struct hlist_node *node; 991 int i; 992 993 if (!parent_pte) 994 return; 995 if (!sp->multimapped) { 996 u64 *old = sp->parent_pte; 997 998 if (!old) { 999 sp->parent_pte = parent_pte; 1000 return; 1001 } 1002 sp->multimapped = 1; 1003 pte_chain = mmu_alloc_pte_chain(vcpu); 1004 INIT_HLIST_HEAD(&sp->parent_ptes); 1005 hlist_add_head(&pte_chain->link, &sp->parent_ptes); 1006 pte_chain->parent_ptes[0] = old; 1007 } 1008 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { 1009 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) 1010 continue; 1011 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) 1012 if (!pte_chain->parent_ptes[i]) { 1013 pte_chain->parent_ptes[i] = parent_pte; 1014 return; 1015 } 1016 } 1017 pte_chain = mmu_alloc_pte_chain(vcpu); 1018 BUG_ON(!pte_chain); 1019 hlist_add_head(&pte_chain->link, &sp->parent_ptes); 1020 pte_chain->parent_ptes[0] = parent_pte; 1021} 1022 1023static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1024 u64 *parent_pte) 1025{ 1026 struct kvm_pte_chain *pte_chain; 1027 struct hlist_node *node; 1028 int i; 1029 1030 if (!sp->multimapped) { 1031 BUG_ON(sp->parent_pte != parent_pte); 1032 sp->parent_pte = NULL; 1033 return; 1034 } 1035 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1036 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1037 if (!pte_chain->parent_ptes[i]) 1038 break; 1039 if (pte_chain->parent_ptes[i] != parent_pte) 1040 continue; 1041 while (i + 1 < NR_PTE_CHAIN_ENTRIES 1042 && pte_chain->parent_ptes[i + 1]) { 1043 pte_chain->parent_ptes[i] 1044 = pte_chain->parent_ptes[i + 1]; 1045 ++i; 1046 } 1047 pte_chain->parent_ptes[i] = NULL; 1048 if (i == 0) { 1049 hlist_del(&pte_chain->link); 1050 mmu_free_pte_chain(pte_chain); 1051 if (hlist_empty(&sp->parent_ptes)) { 1052 sp->multimapped = 0; 1053 sp->parent_pte = NULL; 1054 } 1055 } 1056 return; 1057 } 1058 BUG(); 1059} 1060 1061static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1062{ 1063 struct kvm_pte_chain *pte_chain; 1064 struct hlist_node *node; 1065 struct kvm_mmu_page *parent_sp; 1066 int i; 1067 1068 if (!sp->multimapped && sp->parent_pte) { 1069 parent_sp = page_header(__pa(sp->parent_pte)); 1070 fn(parent_sp, sp->parent_pte); 1071 return; 1072 } 1073 1074 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1075 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1076 u64 *spte = pte_chain->parent_ptes[i]; 1077 1078 if (!spte) 1079 break; 1080 parent_sp = page_header(__pa(spte)); 1081 fn(parent_sp, spte); 1082 } 1083} 1084 1085static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1086static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1087{ 1088 mmu_parent_walk(sp, mark_unsync); 1089} 1090 1091static void mark_unsync(struct 
kvm_mmu_page *sp, u64 *spte) 1092{ 1093 unsigned int index; 1094 1095 index = spte - sp->spt; 1096 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1097 return; 1098 if (sp->unsync_children++) 1099 return; 1100 kvm_mmu_mark_parents_unsync(sp); 1101} 1102 1103static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1104 struct kvm_mmu_page *sp) 1105{ 1106 int i; 1107 1108 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 1109 sp->spt[i] = shadow_trap_nonpresent_pte; 1110} 1111 1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1113 struct kvm_mmu_page *sp, bool clear_unsync) 1114{ 1115 return 1; 1116} 1117 1118static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 1119{ 1120} 1121 1122#define KVM_PAGE_ARRAY_NR 16 1123 1124struct kvm_mmu_pages { 1125 struct mmu_page_and_offset { 1126 struct kvm_mmu_page *sp; 1127 unsigned int idx; 1128 } page[KVM_PAGE_ARRAY_NR]; 1129 unsigned int nr; 1130}; 1131 1132#define for_each_unsync_children(bitmap, idx) \ 1133 for (idx = find_first_bit(bitmap, 512); \ 1134 idx < 512; \ 1135 idx = find_next_bit(bitmap, 512, idx+1)) 1136 1137static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 1138 int idx) 1139{ 1140 int i; 1141 1142 if (sp->unsync) 1143 for (i=0; i < pvec->nr; i++) 1144 if (pvec->page[i].sp == sp) 1145 return 0; 1146 1147 pvec->page[pvec->nr].sp = sp; 1148 pvec->page[pvec->nr].idx = idx; 1149 pvec->nr++; 1150 return (pvec->nr == KVM_PAGE_ARRAY_NR); 1151} 1152 1153static int __mmu_unsync_walk(struct kvm_mmu_page *sp, 1154 struct kvm_mmu_pages *pvec) 1155{ 1156 int i, ret, nr_unsync_leaf = 0; 1157 1158 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1159 struct kvm_mmu_page *child; 1160 u64 ent = sp->spt[i]; 1161 1162 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) 1163 goto clear_child_bitmap; 1164 1165 child = page_header(ent & PT64_BASE_ADDR_MASK); 1166 1167 if (child->unsync_children) { 1168 if (mmu_pages_add(pvec, child, i)) 1169 return -ENOSPC; 1170 1171 ret = __mmu_unsync_walk(child, pvec); 1172 if (!ret) 1173 goto clear_child_bitmap; 1174 else if (ret > 0) 1175 nr_unsync_leaf += ret; 1176 else 1177 return ret; 1178 } else if (child->unsync) { 1179 nr_unsync_leaf++; 1180 if (mmu_pages_add(pvec, child, i)) 1181 return -ENOSPC; 1182 } else 1183 goto clear_child_bitmap; 1184 1185 continue; 1186 1187clear_child_bitmap: 1188 __clear_bit(i, sp->unsync_child_bitmap); 1189 sp->unsync_children--; 1190 WARN_ON((int)sp->unsync_children < 0); 1191 } 1192 1193 1194 return nr_unsync_leaf; 1195} 1196 1197static int mmu_unsync_walk(struct kvm_mmu_page *sp, 1198 struct kvm_mmu_pages *pvec) 1199{ 1200 if (!sp->unsync_children) 1201 return 0; 1202 1203 mmu_pages_add(pvec, sp, 0); 1204 return __mmu_unsync_walk(sp, pvec); 1205} 1206 1207static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1208{ 1209 WARN_ON(!sp->unsync); 1210 trace_kvm_mmu_sync_page(sp); 1211 sp->unsync = 0; 1212 --kvm->stat.mmu_unsync; 1213} 1214 1215static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1216 struct list_head *invalid_list); 1217static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1218 struct list_head *invalid_list); 1219 1220#define for_each_gfn_sp(kvm, sp, gfn, pos) \ 1221 hlist_for_each_entry(sp, pos, \ 1222 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1223 if ((sp)->gfn != (gfn)) {} else 1224 1225#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ 1226 hlist_for_each_entry(sp, pos, \ 1227 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], 
hash_link) \ 1228 if ((sp)->gfn != (gfn) || (sp)->role.direct || \ 1229 (sp)->role.invalid) {} else 1230 1231/* @sp->gfn should be write-protected at the call site */ 1232static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1233 struct list_head *invalid_list, bool clear_unsync) 1234{ 1235 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1236 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1237 return 1; 1238 } 1239 1240 if (clear_unsync) 1241 kvm_unlink_unsync_page(vcpu->kvm, sp); 1242 1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1245 return 1; 1246 } 1247 1248 kvm_mmu_flush_tlb(vcpu); 1249 return 0; 1250} 1251 1252static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, 1253 struct kvm_mmu_page *sp) 1254{ 1255 LIST_HEAD(invalid_list); 1256 int ret; 1257 1258 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); 1259 if (ret) 1260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1261 1262 return ret; 1263} 1264 1265static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1266 struct list_head *invalid_list) 1267{ 1268 return __kvm_sync_page(vcpu, sp, invalid_list, true); 1269} 1270 1271/* @gfn should be write-protected at the call site */ 1272static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1273{ 1274 struct kvm_mmu_page *s; 1275 struct hlist_node *node; 1276 LIST_HEAD(invalid_list); 1277 bool flush = false; 1278 1279 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1280 if (!s->unsync) 1281 continue; 1282 1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1287 continue; 1288 } 1289 kvm_unlink_unsync_page(vcpu->kvm, s); 1290 flush = true; 1291 } 1292 1293 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1294 if (flush) 1295 kvm_mmu_flush_tlb(vcpu); 1296} 1297 1298struct mmu_page_path { 1299 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1300 unsigned int idx[PT64_ROOT_LEVEL-1]; 1301}; 1302 1303#define for_each_sp(pvec, sp, parents, i) \ 1304 for (i = mmu_pages_next(&pvec, &parents, -1), \ 1305 sp = pvec.page[i].sp; \ 1306 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ 1307 i = mmu_pages_next(&pvec, &parents, i)) 1308 1309static int mmu_pages_next(struct kvm_mmu_pages *pvec, 1310 struct mmu_page_path *parents, 1311 int i) 1312{ 1313 int n; 1314 1315 for (n = i+1; n < pvec->nr; n++) { 1316 struct kvm_mmu_page *sp = pvec->page[n].sp; 1317 1318 if (sp->role.level == PT_PAGE_TABLE_LEVEL) { 1319 parents->idx[0] = pvec->page[n].idx; 1320 return n; 1321 } 1322 1323 parents->parent[sp->role.level-2] = sp; 1324 parents->idx[sp->role.level-1] = pvec->page[n].idx; 1325 } 1326 1327 return n; 1328} 1329 1330static void mmu_pages_clear_parents(struct mmu_page_path *parents) 1331{ 1332 struct kvm_mmu_page *sp; 1333 unsigned int level = 0; 1334 1335 do { 1336 unsigned int idx = parents->idx[level]; 1337 1338 sp = parents->parent[level]; 1339 if (!sp) 1340 return; 1341 1342 --sp->unsync_children; 1343 WARN_ON((int)sp->unsync_children < 0); 1344 __clear_bit(idx, sp->unsync_child_bitmap); 1345 level++; 1346 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); 1347} 1348 1349static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, 1350 struct mmu_page_path *parents, 1351 struct kvm_mmu_pages *pvec) 1352{ 1353 parents->parent[parent->role.level-1] = NULL; 1354 pvec->nr = 0; 1355} 1356 1357static void 
mmu_sync_children(struct kvm_vcpu *vcpu, 1358 struct kvm_mmu_page *parent) 1359{ 1360 int i; 1361 struct kvm_mmu_page *sp; 1362 struct mmu_page_path parents; 1363 struct kvm_mmu_pages pages; 1364 LIST_HEAD(invalid_list); 1365 1366 kvm_mmu_pages_init(parent, &parents, &pages); 1367 while (mmu_unsync_walk(parent, &pages)) { 1368 int protected = 0; 1369 1370 for_each_sp(pages, sp, parents, i) 1371 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 1372 1373 if (protected) 1374 kvm_flush_remote_tlbs(vcpu->kvm); 1375 1376 for_each_sp(pages, sp, parents, i) { 1377 kvm_sync_page(vcpu, sp, &invalid_list); 1378 mmu_pages_clear_parents(&parents); 1379 } 1380 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1381 cond_resched_lock(&vcpu->kvm->mmu_lock); 1382 kvm_mmu_pages_init(parent, &parents, &pages); 1383 } 1384} 1385 1386static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1387 gfn_t gfn, 1388 gva_t gaddr, 1389 unsigned level, 1390 int direct, 1391 unsigned access, 1392 u64 *parent_pte) 1393{ 1394 union kvm_mmu_page_role role; 1395 unsigned quadrant; 1396 struct kvm_mmu_page *sp; 1397 struct hlist_node *node; 1398 bool need_sync = false; 1399 1400 role = vcpu->arch.mmu.base_role; 1401 role.level = level; 1402 role.direct = direct; 1403 if (role.direct) 1404 role.cr4_pae = 0; 1405 role.access = access; 1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1409 role.quadrant = quadrant; 1410 } 1411 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { 1412 if (!need_sync && sp->unsync) 1413 need_sync = true; 1414 1415 if (sp->role.word != role.word) 1416 continue; 1417 1418 if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) 1419 break; 1420 1421 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1422 if (sp->unsync_children) { 1423 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 1424 kvm_mmu_mark_parents_unsync(sp); 1425 } else if (sp->unsync) 1426 kvm_mmu_mark_parents_unsync(sp); 1427 1428 trace_kvm_mmu_get_page(sp, false); 1429 return sp; 1430 } 1431 ++vcpu->kvm->stat.mmu_cache_miss; 1432 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); 1433 if (!sp) 1434 return sp; 1435 sp->gfn = gfn; 1436 sp->role = role; 1437 hlist_add_head(&sp->hash_link, 1438 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 1439 if (!direct) { 1440 if (rmap_write_protect(vcpu->kvm, gfn)) 1441 kvm_flush_remote_tlbs(vcpu->kvm); 1442 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 1443 kvm_sync_pages(vcpu, gfn); 1444 1445 account_shadowed(vcpu->kvm, gfn); 1446 } 1447 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1448 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1449 else 1450 nonpaging_prefetch_page(vcpu, sp); 1451 trace_kvm_mmu_get_page(sp, true); 1452 return sp; 1453} 1454 1455static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 1456 struct kvm_vcpu *vcpu, u64 addr) 1457{ 1458 iterator->addr = addr; 1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1462 iterator->shadow_addr 1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1464 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; 1465 --iterator->level; 1466 if (!iterator->shadow_addr) 1467 iterator->level = 0; 1468 } 1469} 1470 1471static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 1472{ 1473 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1474 return false; 1475 
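	/*
	 * iterator->sptep still points at the entry chosen on the previous
	 * (higher) level here; a large spte there maps the final page
	 * directly, so there is nothing further to walk at the 4k level.
	 */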
1476 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1477 if (is_large_pte(*iterator->sptep))
1478 return false;
1479
1480 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1481 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1482 return true;
1483}
1484
1485static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1486{
1487 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1488 --iterator->level;
1489}
1490
1491static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1492{
1493 u64 spte;
1494
1495 spte = __pa(sp->spt)
1496 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1497 | PT_WRITABLE_MASK | PT_USER_MASK;
1498 __set_spte(sptep, spte);
1499}
1500
1501static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1502{
1503 if (is_large_pte(*sptep)) {
1504 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1505 kvm_flush_remote_tlbs(vcpu->kvm);
1506 }
1507}
1508
1509static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1510 unsigned direct_access)
1511{
1512 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1513 struct kvm_mmu_page *child;
1514
1515 /*
1516 * For a direct sp, if the guest pte's dirty bit
1517 * changed from clean to dirty, it would corrupt the
1518 * sp's access: it would allow writes through a
1519 * read-only sp, so we should update the spte at this
1520 * point to get a new sp with the correct access.
1521 */
1522 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1523 if (child->role.access == direct_access)
1524 return;
1525
1526 mmu_page_remove_parent_pte(child, sptep);
1527 __set_spte(sptep, shadow_trap_nonpresent_pte);
1528 kvm_flush_remote_tlbs(vcpu->kvm);
1529 }
1530}
1531
1532static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1533 struct kvm_mmu_page *sp)
1534{
1535 unsigned i;
1536 u64 *pt;
1537 u64 ent;
1538
1539 pt = sp->spt;
1540
1541 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1542 ent = pt[i];
1543
1544 if (is_shadow_present_pte(ent)) {
1545 if (!is_last_spte(ent, sp->role.level)) {
1546 ent &= PT64_BASE_ADDR_MASK;
1547 mmu_page_remove_parent_pte(page_header(ent),
1548 &pt[i]);
1549 } else {
1550 if (is_large_pte(ent))
1551 --kvm->stat.lpages;
1552 drop_spte(kvm, &pt[i],
1553 shadow_trap_nonpresent_pte);
1554 }
1555 }
1556 pt[i] = shadow_trap_nonpresent_pte;
1557 }
1558}
1559
1560static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1561{
1562 mmu_page_remove_parent_pte(sp, parent_pte);
1563}
1564
1565static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1566{
1567 int i;
1568 struct kvm_vcpu *vcpu;
1569
1570 kvm_for_each_vcpu(i, vcpu, kvm)
1571 vcpu->arch.last_pte_updated = NULL;
1572}
1573
1574static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1575{
1576 u64 *parent_pte;
1577
1578 while (sp->multimapped || sp->parent_pte) {
1579 if (!sp->multimapped)
1580 parent_pte = sp->parent_pte;
1581 else {
1582 struct kvm_pte_chain *chain;
1583
1584 chain = container_of(sp->parent_ptes.first,
1585 struct kvm_pte_chain, link);
1586 parent_pte = chain->parent_ptes[0];
1587 }
1588 BUG_ON(!parent_pte);
1589 kvm_mmu_put_page(sp, parent_pte);
1590 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1591 }
1592}
1593
1594static int mmu_zap_unsync_children(struct kvm *kvm,
1595 struct kvm_mmu_page *parent,
1596 struct list_head *invalid_list)
1597{
1598 int i, zapped = 0;
1599 struct mmu_page_path parents;
1600 struct kvm_mmu_pages pages;
1601
1602 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1603 return 0;
1604
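	/*
	 * Collect and zap any unsync children reachable from this parent
	 * before the parent itself is zapped; mmu_unsync_walk() gathers a
	 * batch of such pages per pass, so loop until it finds no more.
	 */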
1605 kvm_mmu_pages_init(parent, &parents, &pages);
1606 while (mmu_unsync_walk(parent, &pages)) {
1607 struct kvm_mmu_page *sp;
1608
1609 for_each_sp(pages, sp, parents, i) {
1610 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1611 mmu_pages_clear_parents(&parents);
1612 zapped++;
1613 }
1614 kvm_mmu_pages_init(parent, &parents, &pages);
1615 }
1616
1617 return zapped;
1618}
1619
1620static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1621 struct list_head *invalid_list)
1622{
1623 int ret;
1624
1625 trace_kvm_mmu_prepare_zap_page(sp);
1626 ++kvm->stat.mmu_shadow_zapped;
1627 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1628 kvm_mmu_page_unlink_children(kvm, sp);
1629 kvm_mmu_unlink_parents(kvm, sp);
1630 if (!sp->role.invalid && !sp->role.direct)
1631 unaccount_shadowed(kvm, sp->gfn);
1632 if (sp->unsync)
1633 kvm_unlink_unsync_page(kvm, sp);
1634 if (!sp->root_count) {
1635 /* Count self */
1636 ret++;
1637 list_move(&sp->link, invalid_list);
1638 } else {
1639 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1640 kvm_reload_remote_mmus(kvm);
1641 }
1642
1643 sp->role.invalid = 1;
1644 kvm_mmu_reset_last_pte_updated(kvm);
1645 return ret;
1646}
1647
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649 struct list_head *invalid_list)
1650{
1651 struct kvm_mmu_page *sp;
1652
1653 if (list_empty(invalid_list))
1654 return;
1655
1656 kvm_flush_remote_tlbs(kvm);
1657
1658 do {
1659 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1660 WARN_ON(!sp->role.invalid || sp->root_count);
1661 kvm_mmu_free_page(kvm, sp);
1662 } while (!list_empty(invalid_list));
1663
1664}
1665
1666/*
1667 * Change the number of mmu pages allocated to the vm.
1668 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
1669 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1671{
1672 int used_pages;
1673 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /*
1679 * If we set the number of mmu pages to be smaller than the
1680 * number of active pages, we must free some mmu pages before we
1681 * change the value.
1682 */
1683
1684 if (used_pages > kvm_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page;
1688
1689 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1692 &invalid_list);
1693 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1703}
1704
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1706{
1707 struct kvm_mmu_page *sp;
1708 struct hlist_node *node;
1709 LIST_HEAD(invalid_list);
1710 int r;
1711
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1713 r = 0;
1714
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1717 sp->role.word);
1718 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1720 }
1721 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1722 return r;
1723}
1724
1725static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1726{
1727 struct kvm_mmu_page *sp;
1728 struct hlist_node *node;
1729 LIST_HEAD(invalid_list);
1730
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n",
1733 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 }
1736 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1737}
1738
1739static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1740{
1741 int slot = memslot_id(kvm, gfn);
1742 struct kvm_mmu_page *sp = page_header(__pa(pte));
1743
1744 __set_bit(slot, sp->slot_bitmap);
1745}
1746
1747static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1748{
1749 int i;
1750 u64 *pt = sp->spt;
1751
1752 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1753 return;
1754
1755 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1756 if (pt[i] == shadow_notrap_nonpresent_pte)
1757 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1758 }
1759}
1760
1761/*
1762 * The function is based on mtrr_type_lookup() in
1763 * arch/x86/kernel/cpu/mtrr/generic.c
1764 */
1765static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1766 u64 start, u64 end)
1767{
1768 int i;
1769 u64 base, mask;
1770 u8 prev_match, curr_match;
1771 int num_var_ranges = KVM_NR_VAR_MTRR;
1772
1773 if (!mtrr_state->enabled)
1774 return 0xFF;
1775
1776 /* Make end inclusive, instead of exclusive */
1777 end--;
1778
1779 /* Look in fixed ranges. Just return the type as per start */
1780 if (mtrr_state->have_fixed && (start < 0x100000)) {
1781 int idx;
1782
1783 if (start < 0x80000) {
1784 idx = 0;
1785 idx += (start >> 16);
1786 return mtrr_state->fixed_ranges[idx];
1787 } else if (start < 0xC0000) {
1788 idx = 1 * 8;
1789 idx += ((start - 0x80000) >> 14);
1790 return mtrr_state->fixed_ranges[idx];
1791 } else if (start < 0x1000000) {
1792 idx = 3 * 8;
1793 idx += ((start - 0xC0000) >> 12);
1794 return mtrr_state->fixed_ranges[idx];
1795 }
1796 }
1797
1798 /*
1799 * Look in variable ranges.
1800 * Look for multiple ranges matching this address and pick the type
1801 * as per MTRR precedence.
1802 */
1803 if (!(mtrr_state->enabled & 2))
1804 return mtrr_state->def_type;
1805
1806 prev_match = 0xFF;
1807 for (i = 0; i < num_var_ranges; ++i) {
1808 unsigned short start_state, end_state;
1809
1810 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1811 continue;
1812
1813 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1814 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1815 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1816 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1817
1818 start_state = ((start & mask) == (base & mask));
1819 end_state = ((end & mask) == (base & mask));
1820 if (start_state != end_state)
1821 return 0xFE;
1822
1823 if ((start & mask) != (base & mask))
1824 continue;
1825
1826 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1827 if (prev_match == 0xFF) {
1828 prev_match = curr_match;
1829 continue;
1830 }
1831
1832 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1833 curr_match == MTRR_TYPE_UNCACHABLE)
1834 return MTRR_TYPE_UNCACHABLE;
1835
1836 if ((prev_match == MTRR_TYPE_WRBACK &&
1837 curr_match == MTRR_TYPE_WRTHROUGH) ||
1838 (prev_match == MTRR_TYPE_WRTHROUGH &&
1839 curr_match == MTRR_TYPE_WRBACK)) {
1840 prev_match = MTRR_TYPE_WRTHROUGH;
1841 curr_match = MTRR_TYPE_WRTHROUGH;
1842 }
1843
1844 if (prev_match != curr_match)
1845 return MTRR_TYPE_UNCACHABLE;
1846 }
1847
1848 if (prev_match != 0xFF)
1849 return prev_match;
1850
1851 return mtrr_state->def_type;
1852}
1853
1854u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1855{
1856 u8 mtrr;
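	/*
	 * Derive the effective memory type for this gfn from the guest's
	 * MTRR state; the "no match" / "straddles ranges" results
	 * (0xff / 0xfe) are treated as write-back below.
	 */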
1857 1858 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, 1859 (gfn << PAGE_SHIFT) + PAGE_SIZE); 1860 if (mtrr == 0xfe || mtrr == 0xff) 1861 mtrr = MTRR_TYPE_WRBACK; 1862 return mtrr; 1863} 1864EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1865 1866static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1867{ 1868 trace_kvm_mmu_unsync_page(sp); 1869 ++vcpu->kvm->stat.mmu_unsync; 1870 sp->unsync = 1; 1871 1872 kvm_mmu_mark_parents_unsync(sp); 1873 mmu_convert_notrap(sp); 1874} 1875 1876static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1877{ 1878 struct kvm_mmu_page *s; 1879 struct hlist_node *node; 1880 1881 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1882 if (s->unsync) 1883 continue; 1884 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1885 __kvm_unsync_page(vcpu, s); 1886 } 1887} 1888 1889static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1890 bool can_unsync) 1891{ 1892 struct kvm_mmu_page *s; 1893 struct hlist_node *node; 1894 bool need_unsync = false; 1895 1896 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1897 if (!can_unsync) 1898 return 1; 1899 1900 if (s->role.level != PT_PAGE_TABLE_LEVEL) 1901 return 1; 1902 1903 if (!need_unsync && !s->unsync) { 1904 if (!oos_shadow) 1905 return 1; 1906 need_unsync = true; 1907 } 1908 } 1909 if (need_unsync) 1910 kvm_unsync_pages(vcpu, gfn); 1911 return 0; 1912} 1913 1914static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1915 unsigned pte_access, int user_fault, 1916 int write_fault, int dirty, int level, 1917 gfn_t gfn, pfn_t pfn, bool speculative, 1918 bool can_unsync, bool reset_host_protection) 1919{ 1920 u64 spte; 1921 int ret = 0; 1922 1923 /* 1924 * We don't set the accessed bit, since we sometimes want to see 1925 * whether the guest actually used the pte (in order to detect 1926 * demand paging). 1927 */ 1928 spte = shadow_base_present_pte | shadow_dirty_mask; 1929 if (!speculative) 1930 spte |= shadow_accessed_mask; 1931 if (!dirty) 1932 pte_access &= ~ACC_WRITE_MASK; 1933 if (pte_access & ACC_EXEC_MASK) 1934 spte |= shadow_x_mask; 1935 else 1936 spte |= shadow_nx_mask; 1937 if (pte_access & ACC_USER_MASK) 1938 spte |= shadow_user_mask; 1939 if (level > PT_PAGE_TABLE_LEVEL) 1940 spte |= PT_PAGE_SIZE_MASK; 1941 if (tdp_enabled) 1942 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1943 kvm_is_mmio_pfn(pfn)); 1944 1945 if (reset_host_protection) 1946 spte |= SPTE_HOST_WRITEABLE; 1947 1948 spte |= (u64)pfn << PAGE_SHIFT; 1949 1950 if ((pte_access & ACC_WRITE_MASK) 1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1952 && !user_fault)) { 1953 1954 if (level > PT_PAGE_TABLE_LEVEL && 1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1956 ret = 1; 1957 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1958 goto done; 1959 } 1960 1961 spte |= PT_WRITABLE_MASK; 1962 1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 1964 spte &= ~PT_USER_MASK; 1965 1966 /* 1967 * Optimization: for pte sync, if spte was writable the hash 1968 * lookup is unnecessary (and expensive). Write protection 1969 * is responsibility of mmu_get_page / kvm_sync_page. 1970 * Same reasoning can be applied to dirty page accounting. 
1971 */ 1972 if (!can_unsync && is_writable_pte(*sptep)) 1973 goto set_pte; 1974 1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 1977 __func__, gfn); 1978 ret = 1; 1979 pte_access &= ~ACC_WRITE_MASK; 1980 if (is_writable_pte(spte)) 1981 spte &= ~PT_WRITABLE_MASK; 1982 } 1983 } 1984 1985 if (pte_access & ACC_WRITE_MASK) 1986 mark_page_dirty(vcpu->kvm, gfn); 1987 1988set_pte: 1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte)) 1990 kvm_set_pfn_dirty(pfn); 1991 update_spte(sptep, spte); 1992done: 1993 return ret; 1994} 1995 1996static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1997 unsigned pt_access, unsigned pte_access, 1998 int user_fault, int write_fault, int dirty, 1999 int *ptwrite, int level, gfn_t gfn, 2000 pfn_t pfn, bool speculative, 2001 bool reset_host_protection) 2002{ 2003 int was_rmapped = 0; 2004 int rmap_count; 2005 2006 pgprintk("%s: spte %llx access %x write_fault %d" 2007 " user_fault %d gfn %lx\n", 2008 __func__, *sptep, pt_access, 2009 write_fault, user_fault, gfn); 2010 2011 if (is_rmap_spte(*sptep)) { 2012 /* 2013 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 2014 * the parent of the now unreachable PTE. 2015 */ 2016 if (level > PT_PAGE_TABLE_LEVEL && 2017 !is_large_pte(*sptep)) { 2018 struct kvm_mmu_page *child; 2019 u64 pte = *sptep; 2020 2021 child = page_header(pte & PT64_BASE_ADDR_MASK); 2022 mmu_page_remove_parent_pte(child, sptep); 2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2024 kvm_flush_remote_tlbs(vcpu->kvm); 2025 } else if (pfn != spte_to_pfn(*sptep)) { 2026 pgprintk("hfn old %lx new %lx\n", 2027 spte_to_pfn(*sptep), pfn); 2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2029 kvm_flush_remote_tlbs(vcpu->kvm); 2030 } else 2031 was_rmapped = 1; 2032 } 2033 2034 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2035 dirty, level, gfn, pfn, speculative, true, 2036 reset_host_protection)) { 2037 if (write_fault) 2038 *ptwrite = 1; 2039 kvm_mmu_flush_tlb(vcpu); 2040 } 2041 2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2044 is_large_pte(*sptep)? 
"2MB" : "4kB", 2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2046 *sptep, sptep); 2047 if (!was_rmapped && is_large_pte(*sptep)) 2048 ++vcpu->kvm->stat.lpages; 2049 2050 page_header_update_slot(vcpu->kvm, sptep, gfn); 2051 if (!was_rmapped) { 2052 rmap_count = rmap_add(vcpu, sptep, gfn); 2053 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2054 rmap_recycle(vcpu, sptep, gfn); 2055 } 2056 kvm_release_pfn_clean(pfn); 2057 if (speculative) { 2058 vcpu->arch.last_pte_updated = sptep; 2059 vcpu->arch.last_pte_gfn = gfn; 2060 } 2061} 2062 2063static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2064{ 2065} 2066 2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2068 int level, gfn_t gfn, pfn_t pfn) 2069{ 2070 struct kvm_shadow_walk_iterator iterator; 2071 struct kvm_mmu_page *sp; 2072 int pt_write = 0; 2073 gfn_t pseudo_gfn; 2074 2075 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2076 if (iterator.level == level) { 2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2078 0, write, 1, &pt_write, 2079 level, gfn, pfn, false, true); 2080 ++vcpu->stat.pf_fixed; 2081 break; 2082 } 2083 2084 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2085 u64 base_addr = iterator.addr; 2086 2087 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2088 pseudo_gfn = base_addr >> PAGE_SHIFT; 2089 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2090 iterator.level - 1, 2091 1, ACC_ALL, iterator.sptep); 2092 if (!sp) { 2093 pgprintk("nonpaging_map: ENOMEM\n"); 2094 kvm_release_pfn_clean(pfn); 2095 return -ENOMEM; 2096 } 2097 2098 __set_spte(iterator.sptep, 2099 __pa(sp->spt) 2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2101 | shadow_user_mask | shadow_x_mask); 2102 } 2103 } 2104 return pt_write; 2105} 2106 2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2108{ 2109 char buf[1]; 2110 void __user *hva; 2111 int r; 2112 2113 /* Touch the page, so send SIGBUS */ 2114 hva = (void __user *)gfn_to_hva(kvm, gfn); 2115 r = copy_from_user(buf, hva, 1); 2116} 2117 2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2119{ 2120 kvm_release_pfn_clean(pfn); 2121 if (is_hwpoison_pfn(pfn)) { 2122 kvm_send_hwpoison_signal(kvm, gfn); 2123 return 0; 2124 } else if (is_fault_pfn(pfn)) 2125 return -EFAULT; 2126 2127 return 1; 2128} 2129 2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2131{ 2132 int r; 2133 int level; 2134 pfn_t pfn; 2135 unsigned long mmu_seq; 2136 2137 level = mapping_level(vcpu, gfn); 2138 2139 /* 2140 * This path builds a PAE pagetable - so we can map 2mb pages at 2141 * maximum. Therefore check if the level is larger than that. 
2142 */ 2143 if (level > PT_DIRECTORY_LEVEL) 2144 level = PT_DIRECTORY_LEVEL; 2145 2146 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2147 2148 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2149 smp_rmb(); 2150 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2151 2152 /* mmio */ 2153 if (is_error_pfn(pfn)) 2154 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2155 2156 spin_lock(&vcpu->kvm->mmu_lock); 2157 if (mmu_notifier_retry(vcpu, mmu_seq)) 2158 goto out_unlock; 2159 kvm_mmu_free_some_pages(vcpu); 2160 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2161 spin_unlock(&vcpu->kvm->mmu_lock); 2162 2163 2164 return r; 2165 2166out_unlock: 2167 spin_unlock(&vcpu->kvm->mmu_lock); 2168 kvm_release_pfn_clean(pfn); 2169 return 0; 2170} 2171 2172 2173static void mmu_free_roots(struct kvm_vcpu *vcpu) 2174{ 2175 int i; 2176 struct kvm_mmu_page *sp; 2177 LIST_HEAD(invalid_list); 2178 2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2180 return; 2181 spin_lock(&vcpu->kvm->mmu_lock); 2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2184 2185 sp = page_header(root); 2186 --sp->root_count; 2187 if (!sp->root_count && sp->role.invalid) { 2188 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 2189 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2190 } 2191 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2192 spin_unlock(&vcpu->kvm->mmu_lock); 2193 return; 2194 } 2195 for (i = 0; i < 4; ++i) { 2196 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2197 2198 if (root) { 2199 root &= PT64_BASE_ADDR_MASK; 2200 sp = page_header(root); 2201 --sp->root_count; 2202 if (!sp->root_count && sp->role.invalid) 2203 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2204 &invalid_list); 2205 } 2206 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2207 } 2208 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2209 spin_unlock(&vcpu->kvm->mmu_lock); 2210 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2211} 2212 2213static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 2214{ 2215 int ret = 0; 2216 2217 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2218 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2219 ret = 1; 2220 } 2221 2222 return ret; 2223} 2224 2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2226{ 2227 int i; 2228 gfn_t root_gfn; 2229 struct kvm_mmu_page *sp; 2230 int direct = 0; 2231 u64 pdptr; 2232 2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 2234 2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2237 2238 ASSERT(!VALID_PAGE(root)); 2239 if (mmu_check_root(vcpu, root_gfn)) 2240 return 1; 2241 if (tdp_enabled) { 2242 direct = 1; 2243 root_gfn = 0; 2244 } 2245 spin_lock(&vcpu->kvm->mmu_lock); 2246 kvm_mmu_free_some_pages(vcpu); 2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2248 PT64_ROOT_LEVEL, direct, 2249 ACC_ALL, NULL); 2250 root = __pa(sp->spt); 2251 ++sp->root_count; 2252 spin_unlock(&vcpu->kvm->mmu_lock); 2253 vcpu->arch.mmu.root_hpa = root; 2254 return 0; 2255 } 2256 direct = !is_paging(vcpu); 2257 2258 if (mmu_check_root(vcpu, root_gfn)) 2259 return 1; 2260 2261 for (i = 0; i < 4; ++i) { 2262 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2263 2264 ASSERT(!VALID_PAGE(root)); 2265 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2266 pdptr = kvm_pdptr_read(vcpu, i); 2267 if (!is_present_gpte(pdptr)) { 2268 vcpu->arch.mmu.pae_root[i] = 0; 2269 continue; 2270 } 2271 root_gfn = pdptr >> PAGE_SHIFT; 2272 if (mmu_check_root(vcpu, root_gfn)) 2273 return 1; 2274 } else if (vcpu->arch.mmu.root_level == 0) 2275 root_gfn = 0; 2276 if 
(tdp_enabled) { 2277 direct = 1; 2278 root_gfn = i << (30 - PAGE_SHIFT); 2279 } 2280 spin_lock(&vcpu->kvm->mmu_lock); 2281 kvm_mmu_free_some_pages(vcpu); 2282 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2283 PT32_ROOT_LEVEL, direct, 2284 ACC_ALL, NULL); 2285 root = __pa(sp->spt); 2286 ++sp->root_count; 2287 spin_unlock(&vcpu->kvm->mmu_lock); 2288 2289 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2290 } 2291 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2292 return 0; 2293} 2294 2295static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2296{ 2297 int i; 2298 struct kvm_mmu_page *sp; 2299 2300 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2301 return; 2302 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2303 hpa_t root = vcpu->arch.mmu.root_hpa; 2304 sp = page_header(root); 2305 mmu_sync_children(vcpu, sp); 2306 return; 2307 } 2308 for (i = 0; i < 4; ++i) { 2309 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2310 2311 if (root && VALID_PAGE(root)) { 2312 root &= PT64_BASE_ADDR_MASK; 2313 sp = page_header(root); 2314 mmu_sync_children(vcpu, sp); 2315 } 2316 } 2317} 2318 2319void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2320{ 2321 spin_lock(&vcpu->kvm->mmu_lock); 2322 mmu_sync_roots(vcpu); 2323 spin_unlock(&vcpu->kvm->mmu_lock); 2324} 2325 2326static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2327 u32 access, u32 *error) 2328{ 2329 if (error) 2330 *error = 0; 2331 return vaddr; 2332} 2333 2334static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2335 u32 error_code) 2336{ 2337 gfn_t gfn; 2338 int r; 2339 2340 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2341 r = mmu_topup_memory_caches(vcpu); 2342 if (r) 2343 return r; 2344 2345 ASSERT(vcpu); 2346 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2347 2348 gfn = gva >> PAGE_SHIFT; 2349 2350 return nonpaging_map(vcpu, gva & PAGE_MASK, 2351 error_code & PFERR_WRITE_MASK, gfn); 2352} 2353 2354static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2355 u32 error_code) 2356{ 2357 pfn_t pfn; 2358 int r; 2359 int level; 2360 gfn_t gfn = gpa >> PAGE_SHIFT; 2361 unsigned long mmu_seq; 2362 2363 ASSERT(vcpu); 2364 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2365 2366 r = mmu_topup_memory_caches(vcpu); 2367 if (r) 2368 return r; 2369 2370 level = mapping_level(vcpu, gfn); 2371 2372 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2373 2374 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2375 smp_rmb(); 2376 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2377 if (is_error_pfn(pfn)) 2378 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2379 spin_lock(&vcpu->kvm->mmu_lock); 2380 if (mmu_notifier_retry(vcpu, mmu_seq)) 2381 goto out_unlock; 2382 kvm_mmu_free_some_pages(vcpu); 2383 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2384 level, gfn, pfn); 2385 spin_unlock(&vcpu->kvm->mmu_lock); 2386 2387 return r; 2388 2389out_unlock: 2390 spin_unlock(&vcpu->kvm->mmu_lock); 2391 kvm_release_pfn_clean(pfn); 2392 return 0; 2393} 2394 2395static void nonpaging_free(struct kvm_vcpu *vcpu) 2396{ 2397 mmu_free_roots(vcpu); 2398} 2399 2400static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2401{ 2402 struct kvm_mmu *context = &vcpu->arch.mmu; 2403 2404 context->new_cr3 = nonpaging_new_cr3; 2405 context->page_fault = nonpaging_page_fault; 2406 context->gva_to_gpa = nonpaging_gva_to_gpa; 2407 context->free = nonpaging_free; 2408 context->prefetch_page = nonpaging_prefetch_page; 2409 context->sync_page = nonpaging_sync_page; 2410 context->invlpg = nonpaging_invlpg; 2411 context->root_level = 0; 2412 
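	/*
	 * Note: a guest that has paging disabled is still shadowed with a
	 * 3-level PAE page table built on pae_root (see mmu_alloc_roots()),
	 * which is also why nonpaging_map() caps large mappings at
	 * PT_DIRECTORY_LEVEL.
	 */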
context->shadow_root_level = PT32E_ROOT_LEVEL; 2413 context->root_hpa = INVALID_PAGE; 2414 return 0; 2415} 2416 2417void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2418{ 2419 ++vcpu->stat.tlb_flush; 2420 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2421} 2422 2423static void paging_new_cr3(struct kvm_vcpu *vcpu) 2424{ 2425 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2426 mmu_free_roots(vcpu); 2427} 2428 2429static void inject_page_fault(struct kvm_vcpu *vcpu, 2430 u64 addr, 2431 u32 err_code) 2432{ 2433 kvm_inject_page_fault(vcpu, addr, err_code); 2434} 2435 2436static void paging_free(struct kvm_vcpu *vcpu) 2437{ 2438 nonpaging_free(vcpu); 2439} 2440 2441static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2442{ 2443 int bit7; 2444 2445 bit7 = (gpte >> 7) & 1; 2446 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2447} 2448 2449#define PTTYPE 64 2450#include "paging_tmpl.h" 2451#undef PTTYPE 2452 2453#define PTTYPE 32 2454#include "paging_tmpl.h" 2455#undef PTTYPE 2456 2457static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2458{ 2459 struct kvm_mmu *context = &vcpu->arch.mmu; 2460 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2461 u64 exb_bit_rsvd = 0; 2462 2463 if (!is_nx(vcpu)) 2464 exb_bit_rsvd = rsvd_bits(63, 63); 2465 switch (level) { 2466 case PT32_ROOT_LEVEL: 2467 /* no rsvd bits for 2 level 4K page table entries */ 2468 context->rsvd_bits_mask[0][1] = 0; 2469 context->rsvd_bits_mask[0][0] = 0; 2470 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2471 2472 if (!is_pse(vcpu)) { 2473 context->rsvd_bits_mask[1][1] = 0; 2474 break; 2475 } 2476 2477 if (is_cpuid_PSE36()) 2478 /* 36bits PSE 4MB page */ 2479 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2480 else 2481 /* 32 bits PSE 4MB page */ 2482 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2483 break; 2484 case PT32E_ROOT_LEVEL: 2485 context->rsvd_bits_mask[0][2] = 2486 rsvd_bits(maxphyaddr, 63) | 2487 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ 2488 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 2489 rsvd_bits(maxphyaddr, 62); /* PDE */ 2490 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2491 rsvd_bits(maxphyaddr, 62); /* PTE */ 2492 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2493 rsvd_bits(maxphyaddr, 62) | 2494 rsvd_bits(13, 20); /* large page */ 2495 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2496 break; 2497 case PT64_ROOT_LEVEL: 2498 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2499 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 2500 context->rsvd_bits_mask[0][2] = exb_bit_rsvd | 2501 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 2502 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 2503 rsvd_bits(maxphyaddr, 51); 2504 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2505 rsvd_bits(maxphyaddr, 51); 2506 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 2507 context->rsvd_bits_mask[1][2] = exb_bit_rsvd | 2508 rsvd_bits(maxphyaddr, 51) | 2509 rsvd_bits(13, 29); 2510 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2511 rsvd_bits(maxphyaddr, 51) | 2512 rsvd_bits(13, 20); /* large page */ 2513 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; 2514 break; 2515 } 2516} 2517 2518static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2519{ 2520 struct kvm_mmu *context = &vcpu->arch.mmu; 2521 2522 ASSERT(is_pae(vcpu)); 2523 context->new_cr3 = paging_new_cr3; 2524 context->page_fault = paging64_page_fault; 2525 context->gva_to_gpa = paging64_gva_to_gpa; 2526 context->prefetch_page = 
paging64_prefetch_page; 2527 context->sync_page = paging64_sync_page; 2528 context->invlpg = paging64_invlpg; 2529 context->free = paging_free; 2530 context->root_level = level; 2531 context->shadow_root_level = level; 2532 context->root_hpa = INVALID_PAGE; 2533 return 0; 2534} 2535 2536static int paging64_init_context(struct kvm_vcpu *vcpu) 2537{ 2538 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2539 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2540} 2541 2542static int paging32_init_context(struct kvm_vcpu *vcpu) 2543{ 2544 struct kvm_mmu *context = &vcpu->arch.mmu; 2545 2546 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2547 context->new_cr3 = paging_new_cr3; 2548 context->page_fault = paging32_page_fault; 2549 context->gva_to_gpa = paging32_gva_to_gpa; 2550 context->free = paging_free; 2551 context->prefetch_page = paging32_prefetch_page; 2552 context->sync_page = paging32_sync_page; 2553 context->invlpg = paging32_invlpg; 2554 context->root_level = PT32_ROOT_LEVEL; 2555 context->shadow_root_level = PT32E_ROOT_LEVEL; 2556 context->root_hpa = INVALID_PAGE; 2557 return 0; 2558} 2559 2560static int paging32E_init_context(struct kvm_vcpu *vcpu) 2561{ 2562 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2563 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2564} 2565 2566static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2567{ 2568 struct kvm_mmu *context = &vcpu->arch.mmu; 2569 2570 context->new_cr3 = nonpaging_new_cr3; 2571 context->page_fault = tdp_page_fault; 2572 context->free = nonpaging_free; 2573 context->prefetch_page = nonpaging_prefetch_page; 2574 context->sync_page = nonpaging_sync_page; 2575 context->invlpg = nonpaging_invlpg; 2576 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2577 context->root_hpa = INVALID_PAGE; 2578 2579 if (!is_paging(vcpu)) { 2580 context->gva_to_gpa = nonpaging_gva_to_gpa; 2581 context->root_level = 0; 2582 } else if (is_long_mode(vcpu)) { 2583 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2584 context->gva_to_gpa = paging64_gva_to_gpa; 2585 context->root_level = PT64_ROOT_LEVEL; 2586 } else if (is_pae(vcpu)) { 2587 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2588 context->gva_to_gpa = paging64_gva_to_gpa; 2589 context->root_level = PT32E_ROOT_LEVEL; 2590 } else { 2591 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2592 context->gva_to_gpa = paging32_gva_to_gpa; 2593 context->root_level = PT32_ROOT_LEVEL; 2594 } 2595 2596 return 0; 2597} 2598 2599static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2600{ 2601 int r; 2602 2603 ASSERT(vcpu); 2604 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2605 2606 if (!is_paging(vcpu)) 2607 r = nonpaging_init_context(vcpu); 2608 else if (is_long_mode(vcpu)) 2609 r = paging64_init_context(vcpu); 2610 else if (is_pae(vcpu)) 2611 r = paging32E_init_context(vcpu); 2612 else 2613 r = paging32_init_context(vcpu); 2614 2615 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2616 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2617 2618 return r; 2619} 2620 2621static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2622{ 2623 vcpu->arch.update_pte.pfn = bad_pfn; 2624 2625 if (tdp_enabled) 2626 return init_kvm_tdp_mmu(vcpu); 2627 else 2628 return init_kvm_softmmu(vcpu); 2629} 2630 2631static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2632{ 2633 ASSERT(vcpu); 2634 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2635 /* mmu.free() should set root_hpa = INVALID_PAGE */ 2636 vcpu->arch.mmu.free(vcpu); 2637} 2638 2639int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2640{ 2641 
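	/*
	 * Tear down the current mmu context and re-initialize it for the
	 * vcpu's current paging mode.  The new context starts with
	 * root_hpa == INVALID_PAGE; fresh roots are allocated on the next
	 * kvm_mmu_load().
	 */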
destroy_kvm_mmu(vcpu); 2642 return init_kvm_mmu(vcpu); 2643} 2644EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 2645 2646int kvm_mmu_load(struct kvm_vcpu *vcpu) 2647{ 2648 int r; 2649 2650 r = mmu_topup_memory_caches(vcpu); 2651 if (r) 2652 goto out; 2653 r = mmu_alloc_roots(vcpu); 2654 spin_lock(&vcpu->kvm->mmu_lock); 2655 mmu_sync_roots(vcpu); 2656 spin_unlock(&vcpu->kvm->mmu_lock); 2657 if (r) 2658 goto out; 2659 /* set_cr3() should ensure TLB has been flushed */ 2660 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2661out: 2662 return r; 2663} 2664EXPORT_SYMBOL_GPL(kvm_mmu_load); 2665 2666void kvm_mmu_unload(struct kvm_vcpu *vcpu) 2667{ 2668 mmu_free_roots(vcpu); 2669} 2670 2671static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2672 struct kvm_mmu_page *sp, 2673 u64 *spte) 2674{ 2675 u64 pte; 2676 struct kvm_mmu_page *child; 2677 2678 pte = *spte; 2679 if (is_shadow_present_pte(pte)) { 2680 if (is_last_spte(pte, sp->role.level)) 2681 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); 2682 else { 2683 child = page_header(pte & PT64_BASE_ADDR_MASK); 2684 mmu_page_remove_parent_pte(child, spte); 2685 } 2686 } 2687 __set_spte(spte, shadow_trap_nonpresent_pte); 2688 if (is_large_pte(pte)) 2689 --vcpu->kvm->stat.lpages; 2690} 2691 2692static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 2693 struct kvm_mmu_page *sp, 2694 u64 *spte, 2695 const void *new) 2696{ 2697 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 2698 ++vcpu->kvm->stat.mmu_pde_zapped; 2699 return; 2700 } 2701 2702 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 2703 return; 2704 2705 ++vcpu->kvm->stat.mmu_pte_updated; 2706 if (!sp->role.cr4_pae) 2707 paging32_update_pte(vcpu, sp, spte, new); 2708 else 2709 paging64_update_pte(vcpu, sp, spte, new); 2710} 2711 2712static bool need_remote_flush(u64 old, u64 new) 2713{ 2714 if (!is_shadow_present_pte(old)) 2715 return false; 2716 if (!is_shadow_present_pte(new)) 2717 return true; 2718 if ((old ^ new) & PT64_BASE_ADDR_MASK) 2719 return true; 2720 old ^= PT64_NX_MASK; 2721 new ^= PT64_NX_MASK; 2722 return (old & ~new & PT64_PERM_MASK) != 0; 2723} 2724 2725static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, 2726 bool remote_flush, bool local_flush) 2727{ 2728 if (zap_page) 2729 return; 2730 2731 if (remote_flush) 2732 kvm_flush_remote_tlbs(vcpu->kvm); 2733 else if (local_flush) 2734 kvm_mmu_flush_tlb(vcpu); 2735} 2736 2737static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) 2738{ 2739 u64 *spte = vcpu->arch.last_pte_updated; 2740 2741 return !!(spte && (*spte & shadow_accessed_mask)); 2742} 2743 2744static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2745 u64 gpte) 2746{ 2747 gfn_t gfn; 2748 pfn_t pfn; 2749 2750 if (!is_present_gpte(gpte)) 2751 return; 2752 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2753 2754 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2755 smp_rmb(); 2756 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2757 2758 if (is_error_pfn(pfn)) { 2759 kvm_release_pfn_clean(pfn); 2760 return; 2761 } 2762 vcpu->arch.update_pte.gfn = gfn; 2763 vcpu->arch.update_pte.pfn = pfn; 2764} 2765 2766static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2767{ 2768 u64 *spte = vcpu->arch.last_pte_updated; 2769 2770 if (spte 2771 && vcpu->arch.last_pte_gfn == gfn 2772 && shadow_accessed_mask 2773 && !(*spte & shadow_accessed_mask) 2774 && is_shadow_present_pte(*spte)) 2775 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 2776} 2777 2778void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t 
gpa, 2779 const u8 *new, int bytes, 2780 bool guest_initiated) 2781{ 2782 gfn_t gfn = gpa >> PAGE_SHIFT; 2783 union kvm_mmu_page_role mask = { .word = 0 }; 2784 struct kvm_mmu_page *sp; 2785 struct hlist_node *node; 2786 LIST_HEAD(invalid_list); 2787 u64 entry, gentry; 2788 u64 *spte; 2789 unsigned offset = offset_in_page(gpa); 2790 unsigned pte_size; 2791 unsigned page_offset; 2792 unsigned misaligned; 2793 unsigned quadrant; 2794 int level; 2795 int flooded = 0; 2796 int npte; 2797 int r; 2798 int invlpg_counter; 2799 bool remote_flush, local_flush, zap_page; 2800 2801 zap_page = remote_flush = local_flush = false; 2802 2803 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2804 2805 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); 2806 2807 /* 2808 * Assume that the pte write is on a page table of the same type 2809 * as the current vcpu paging mode. This is nearly always true 2810 * (might be false while changing modes). Note it is verified later 2811 * by update_pte(). 2812 */ 2813 if ((is_pae(vcpu) && bytes == 4) || !new) { 2814 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 2815 if (is_pae(vcpu)) { 2816 gpa &= ~(gpa_t)7; 2817 bytes = 8; 2818 } 2819 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); 2820 if (r) 2821 gentry = 0; 2822 new = (const u8 *)&gentry; 2823 } 2824 2825 switch (bytes) { 2826 case 4: 2827 gentry = *(const u32 *)new; 2828 break; 2829 case 8: 2830 gentry = *(const u64 *)new; 2831 break; 2832 default: 2833 gentry = 0; 2834 break; 2835 } 2836 2837 mmu_guess_page_from_pte_write(vcpu, gpa, gentry); 2838 spin_lock(&vcpu->kvm->mmu_lock); 2839 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 2840 gentry = 0; 2841 kvm_mmu_access_page(vcpu, gfn); 2842 kvm_mmu_free_some_pages(vcpu); 2843 ++vcpu->kvm->stat.mmu_pte_write; 2844 kvm_mmu_audit(vcpu, "pre pte write"); 2845 if (guest_initiated) { 2846 if (gfn == vcpu->arch.last_pt_write_gfn 2847 && !last_updated_pte_accessed(vcpu)) { 2848 ++vcpu->arch.last_pt_write_count; 2849 if (vcpu->arch.last_pt_write_count >= 3) 2850 flooded = 1; 2851 } else { 2852 vcpu->arch.last_pt_write_gfn = gfn; 2853 vcpu->arch.last_pt_write_count = 1; 2854 vcpu->arch.last_pte_updated = NULL; 2855 } 2856 } 2857 2858 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; 2859 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 2860 pte_size = sp->role.cr4_pae ? 8 : 4; 2861 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2862 misaligned |= bytes < 4; 2863 if (misaligned || flooded) { 2864 /* 2865 * Misaligned accesses are too much trouble to fix 2866 * up; also, they usually indicate a page is not used 2867 * as a page table. 2868 * 2869 * If we're seeing too many writes to a page, 2870 * it may no longer be a page table, or we may be 2871 * forking, in which case it is better to unmap the 2872 * page. 2873 */ 2874 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2875 gpa, bytes, sp->role.word); 2876 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2877 &invalid_list); 2878 ++vcpu->kvm->stat.mmu_flooded; 2879 continue; 2880 } 2881 page_offset = offset; 2882 level = sp->role.level; 2883 npte = 1; 2884 if (!sp->role.cr4_pae) { 2885 page_offset <<= 1; /* 32->64 */ 2886 /* 2887 * A 32-bit pde maps 4MB while the shadow pdes map 2888 * only 2MB. So we need to double the offset again 2889 * and zap two pdes instead of one. 
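				 * For example (illustrative): a guest write to the
				 * 4-byte pde at offset 0x400 of a non-PAE page
				 * directory (entry 256, mapping gva 0x40000000)
				 * ends up with page_offset 0x1000 after the two
				 * shifts, so quadrant 1 is selected and the first
				 * two shadow pdes (indices 0 and 1) of that shadow
				 * page are zapped.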
2890 */ 2891 if (level == PT32_ROOT_LEVEL) { 2892 page_offset &= ~7; /* kill rounding error */ 2893 page_offset <<= 1; 2894 npte = 2; 2895 } 2896 quadrant = page_offset >> PAGE_SHIFT; 2897 page_offset &= ~PAGE_MASK; 2898 if (quadrant != sp->role.quadrant) 2899 continue; 2900 } 2901 local_flush = true; 2902 spte = &sp->spt[page_offset / sizeof(*spte)]; 2903 while (npte--) { 2904 entry = *spte; 2905 mmu_pte_write_zap_pte(vcpu, sp, spte); 2906 if (gentry && 2907 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 2908 & mask.word)) 2909 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2910 if (!remote_flush && need_remote_flush(entry, *spte)) 2911 remote_flush = true; 2912 ++spte; 2913 } 2914 } 2915 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 2916 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2917 kvm_mmu_audit(vcpu, "post pte write"); 2918 spin_unlock(&vcpu->kvm->mmu_lock); 2919 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2920 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 2921 vcpu->arch.update_pte.pfn = bad_pfn; 2922 } 2923} 2924 2925int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2926{ 2927 gpa_t gpa; 2928 int r; 2929 2930 if (tdp_enabled) 2931 return 0; 2932 2933 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2934 2935 spin_lock(&vcpu->kvm->mmu_lock); 2936 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2937 spin_unlock(&vcpu->kvm->mmu_lock); 2938 return r; 2939} 2940EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 2941 2942void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2943{ 2944 int free_pages; 2945 LIST_HEAD(invalid_list); 2946 2947 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 2948 while (free_pages < KVM_REFILL_PAGES && 2949 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2950 struct kvm_mmu_page *sp; 2951 2952 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2953 struct kvm_mmu_page, link); 2954 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2955 &invalid_list); 2956 ++vcpu->kvm->stat.mmu_recycled; 2957 } 2958 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2959} 2960 2961int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2962{ 2963 int r; 2964 enum emulation_result er; 2965 2966 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 2967 if (r < 0) 2968 goto out; 2969 2970 if (!r) { 2971 r = 1; 2972 goto out; 2973 } 2974 2975 r = mmu_topup_memory_caches(vcpu); 2976 if (r) 2977 goto out; 2978 2979 er = emulate_instruction(vcpu, cr2, error_code, 0); 2980 2981 switch (er) { 2982 case EMULATE_DONE: 2983 return 1; 2984 case EMULATE_DO_MMIO: 2985 ++vcpu->stat.mmio_exits; 2986 /* fall through */ 2987 case EMULATE_FAIL: 2988 return 0; 2989 default: 2990 BUG(); 2991 } 2992out: 2993 return r; 2994} 2995EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2996 2997void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 2998{ 2999 vcpu->arch.mmu.invlpg(vcpu, gva); 3000 kvm_mmu_flush_tlb(vcpu); 3001 ++vcpu->stat.invlpg; 3002} 3003EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 3004 3005void kvm_enable_tdp(void) 3006{ 3007 tdp_enabled = true; 3008} 3009EXPORT_SYMBOL_GPL(kvm_enable_tdp); 3010 3011void kvm_disable_tdp(void) 3012{ 3013 tdp_enabled = false; 3014} 3015EXPORT_SYMBOL_GPL(kvm_disable_tdp); 3016 3017static void free_mmu_pages(struct kvm_vcpu *vcpu) 3018{ 3019 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3020} 3021 3022static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3023{ 3024 struct page *page; 3025 int i; 3026 3027 ASSERT(vcpu); 3028 3029 /* 3030 * When emulating 32-bit mode, cr3 is only 32 bits 
even on x86_64. 3031 * Therefore we need to allocate shadow page tables in the first 3032 * 4GB of memory, which happens to fit the DMA32 zone. 3033 */ 3034 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 3035 if (!page) 3036 return -ENOMEM; 3037 3038 vcpu->arch.mmu.pae_root = page_address(page); 3039 for (i = 0; i < 4; ++i) 3040 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 3041 3042 return 0; 3043} 3044 3045int kvm_mmu_create(struct kvm_vcpu *vcpu) 3046{ 3047 ASSERT(vcpu); 3048 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3049 3050 return alloc_mmu_pages(vcpu); 3051} 3052 3053int kvm_mmu_setup(struct kvm_vcpu *vcpu) 3054{ 3055 ASSERT(vcpu); 3056 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3057 3058 return init_kvm_mmu(vcpu); 3059} 3060 3061void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3062{ 3063 ASSERT(vcpu); 3064 3065 destroy_kvm_mmu(vcpu); 3066 free_mmu_pages(vcpu); 3067 mmu_free_memory_caches(vcpu); 3068} 3069 3070void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3071{ 3072 struct kvm_mmu_page *sp; 3073 3074 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 3075 int i; 3076 u64 *pt; 3077 3078 if (!test_bit(slot, sp->slot_bitmap)) 3079 continue; 3080 3081 pt = sp->spt; 3082 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3083 /* avoid RMW */ 3084 if (is_writable_pte(pt[i])) 3085 pt[i] &= ~PT_WRITABLE_MASK; 3086 } 3087 kvm_flush_remote_tlbs(kvm); 3088} 3089 3090void kvm_mmu_zap_all(struct kvm *kvm) 3091{ 3092 struct kvm_mmu_page *sp, *node; 3093 LIST_HEAD(invalid_list); 3094 3095 spin_lock(&kvm->mmu_lock); 3096restart: 3097 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3098 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) 3099 goto restart; 3100 3101 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3102 spin_unlock(&kvm->mmu_lock); 3103} 3104 3105static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 3106 struct list_head *invalid_list) 3107{ 3108 struct kvm_mmu_page *page; 3109 3110 page = container_of(kvm->arch.active_mmu_pages.prev, 3111 struct kvm_mmu_page, link); 3112 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3113} 3114 3115static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3116{ 3117 struct kvm *kvm; 3118 struct kvm *kvm_freed = NULL; 3119 int cache_count = 0; 3120 3121 spin_lock(&kvm_lock); 3122 3123 list_for_each_entry(kvm, &vm_list, vm_list) { 3124 int npages, idx, freed_pages; 3125 LIST_HEAD(invalid_list); 3126 3127 idx = srcu_read_lock(&kvm->srcu); 3128 spin_lock(&kvm->mmu_lock); 3129 npages = kvm->arch.n_alloc_mmu_pages - 3130 kvm->arch.n_free_mmu_pages; 3131 cache_count += npages; 3132 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3133 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3134 &invalid_list); 3135 cache_count -= freed_pages; 3136 kvm_freed = kvm; 3137 } 3138 nr_to_scan--; 3139 3140 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3141 spin_unlock(&kvm->mmu_lock); 3142 srcu_read_unlock(&kvm->srcu, idx); 3143 } 3144 if (kvm_freed) 3145 list_move_tail(&kvm_freed->vm_list, &vm_list); 3146 3147 spin_unlock(&kvm_lock); 3148 3149 return cache_count; 3150} 3151 3152static struct shrinker mmu_shrinker = { 3153 .shrink = mmu_shrink, 3154 .seeks = DEFAULT_SEEKS * 10, 3155}; 3156 3157static void mmu_destroy_caches(void) 3158{ 3159 if (pte_chain_cache) 3160 kmem_cache_destroy(pte_chain_cache); 3161 if (rmap_desc_cache) 3162 kmem_cache_destroy(rmap_desc_cache); 3163 if (mmu_page_header_cache) 3164 kmem_cache_destroy(mmu_page_header_cache); 3165} 3166 3167void 
kvm_mmu_module_exit(void) 3168{ 3169 mmu_destroy_caches(); 3170 unregister_shrinker(&mmu_shrinker); 3171} 3172 3173int kvm_mmu_module_init(void) 3174{ 3175 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3176 sizeof(struct kvm_pte_chain), 3177 0, 0, NULL); 3178 if (!pte_chain_cache) 3179 goto nomem; 3180 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", 3181 sizeof(struct kvm_rmap_desc), 3182 0, 0, NULL); 3183 if (!rmap_desc_cache) 3184 goto nomem; 3185 3186 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3187 sizeof(struct kvm_mmu_page), 3188 0, 0, NULL); 3189 if (!mmu_page_header_cache) 3190 goto nomem; 3191 3192 register_shrinker(&mmu_shrinker); 3193 3194 return 0; 3195 3196nomem: 3197 mmu_destroy_caches(); 3198 return -ENOMEM; 3199} 3200 3201/* 3202 * Calculate mmu pages needed for kvm. 3203 */ 3204unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) 3205{ 3206 int i; 3207 unsigned int nr_mmu_pages; 3208 unsigned int nr_pages = 0; 3209 struct kvm_memslots *slots; 3210 3211 slots = kvm_memslots(kvm); 3212 3213 for (i = 0; i < slots->nmemslots; i++) 3214 nr_pages += slots->memslots[i].npages; 3215 3216 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3217 nr_mmu_pages = max(nr_mmu_pages, 3218 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); 3219 3220 return nr_mmu_pages; 3221} 3222 3223static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, 3224 unsigned len) 3225{ 3226 if (len > buffer->len) 3227 return NULL; 3228 return buffer->ptr; 3229} 3230 3231static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, 3232 unsigned len) 3233{ 3234 void *ret; 3235 3236 ret = pv_mmu_peek_buffer(buffer, len); 3237 if (!ret) 3238 return ret; 3239 buffer->ptr += len; 3240 buffer->len -= len; 3241 buffer->processed += len; 3242 return ret; 3243} 3244 3245static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, 3246 gpa_t addr, gpa_t value) 3247{ 3248 int bytes = 8; 3249 int r; 3250 3251 if (!is_long_mode(vcpu) && !is_pae(vcpu)) 3252 bytes = 4; 3253 3254 r = mmu_topup_memory_caches(vcpu); 3255 if (r) 3256 return r; 3257 3258 if (!emulator_write_phys(vcpu, addr, &value, bytes)) 3259 return -EFAULT; 3260 3261 return 1; 3262} 3263 3264static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3265{ 3266 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3267 return 1; 3268} 3269 3270static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) 3271{ 3272 spin_lock(&vcpu->kvm->mmu_lock); 3273 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); 3274 spin_unlock(&vcpu->kvm->mmu_lock); 3275 return 1; 3276} 3277 3278static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, 3279 struct kvm_pv_mmu_op_buffer *buffer) 3280{ 3281 struct kvm_mmu_op_header *header; 3282 3283 header = pv_mmu_peek_buffer(buffer, sizeof *header); 3284 if (!header) 3285 return 0; 3286 switch (header->op) { 3287 case KVM_MMU_OP_WRITE_PTE: { 3288 struct kvm_mmu_op_write_pte *wpte; 3289 3290 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); 3291 if (!wpte) 3292 return 0; 3293 return kvm_pv_mmu_write(vcpu, wpte->pte_phys, 3294 wpte->pte_val); 3295 } 3296 case KVM_MMU_OP_FLUSH_TLB: { 3297 struct kvm_mmu_op_flush_tlb *ftlb; 3298 3299 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); 3300 if (!ftlb) 3301 return 0; 3302 return kvm_pv_mmu_flush_tlb(vcpu); 3303 } 3304 case KVM_MMU_OP_RELEASE_PT: { 3305 struct kvm_mmu_op_release_pt *rpt; 3306 3307 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); 3308 if (!rpt) 3309 return 0; 3310 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); 3311 } 3312 default: return 0; 3313 } 3314} 3315 
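/*
 * Process a batch of paravirtual mmu operations: copy up to
 * sizeof(buffer->buf) bytes of kvm_mmu_op_* requests from guest memory at
 * @addr, hand each one to kvm_pv_mmu_op_one() until the buffer is exhausted
 * or an op is rejected, and report the number of bytes consumed via @ret.
 */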
3316int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 3317 gpa_t addr, unsigned long *ret) 3318{ 3319 int r; 3320 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; 3321 3322 buffer->ptr = buffer->buf; 3323 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); 3324 buffer->processed = 0; 3325 3326 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); 3327 if (r) 3328 goto out; 3329 3330 while (buffer->len) { 3331 r = kvm_pv_mmu_op_one(vcpu, buffer); 3332 if (r < 0) 3333 goto out; 3334 if (r == 0) 3335 break; 3336 } 3337 3338 r = 1; 3339out: 3340 *ret = buffer->processed; 3341 return r; 3342} 3343 3344int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 3345{ 3346 struct kvm_shadow_walk_iterator iterator; 3347 int nr_sptes = 0; 3348 3349 spin_lock(&vcpu->kvm->mmu_lock); 3350 for_each_shadow_entry(vcpu, addr, iterator) { 3351 sptes[iterator.level-1] = *iterator.sptep; 3352 nr_sptes++; 3353 if (!is_shadow_present_pte(*iterator.sptep)) 3354 break; 3355 } 3356 spin_unlock(&vcpu->kvm->mmu_lock); 3357 3358 return nr_sptes; 3359} 3360EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3361 3362#ifdef AUDIT 3363 3364static const char *audit_msg; 3365 3366static gva_t canonicalize(gva_t gva) 3367{ 3368#ifdef CONFIG_X86_64 3369 gva = (long long)(gva << 16) >> 16; 3370#endif 3371 return gva; 3372} 3373 3374 3375typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); 3376 3377static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3378 inspect_spte_fn fn) 3379{ 3380 int i; 3381 3382 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 3383 u64 ent = sp->spt[i]; 3384 3385 if (is_shadow_present_pte(ent)) { 3386 if (!is_last_spte(ent, sp->role.level)) { 3387 struct kvm_mmu_page *child; 3388 child = page_header(ent & PT64_BASE_ADDR_MASK); 3389 __mmu_spte_walk(kvm, child, fn); 3390 } else 3391 fn(kvm, &sp->spt[i]); 3392 } 3393 } 3394} 3395 3396static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) 3397{ 3398 int i; 3399 struct kvm_mmu_page *sp; 3400 3401 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3402 return; 3403 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3404 hpa_t root = vcpu->arch.mmu.root_hpa; 3405 sp = page_header(root); 3406 __mmu_spte_walk(vcpu->kvm, sp, fn); 3407 return; 3408 } 3409 for (i = 0; i < 4; ++i) { 3410 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3411 3412 if (root && VALID_PAGE(root)) { 3413 root &= PT64_BASE_ADDR_MASK; 3414 sp = page_header(root); 3415 __mmu_spte_walk(vcpu->kvm, sp, fn); 3416 } 3417 } 3418 return; 3419} 3420 3421static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, 3422 gva_t va, int level) 3423{ 3424 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); 3425 int i; 3426 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); 3427 3428 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { 3429 u64 ent = pt[i]; 3430 3431 if (ent == shadow_trap_nonpresent_pte) 3432 continue; 3433 3434 va = canonicalize(va); 3435 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3436 audit_mappings_page(vcpu, ent, va, level - 1); 3437 else { 3438 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); 3439 gfn_t gfn = gpa >> PAGE_SHIFT; 3440 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3441 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3442 3443 if (is_error_pfn(pfn)) { 3444 kvm_release_pfn_clean(pfn); 3445 continue; 3446 } 3447 3448 if (is_shadow_present_pte(ent) 3449 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3450 printk(KERN_ERR "xx audit error: (%s) levels %d" 3451 " gva %lx gpa %llx 
hpa %llx ent %llx %d\n", 3452 audit_msg, vcpu->arch.mmu.root_level, 3453 va, gpa, hpa, ent, 3454 is_shadow_present_pte(ent)); 3455 else if (ent == shadow_notrap_nonpresent_pte 3456 && !is_error_hpa(hpa)) 3457 printk(KERN_ERR "audit: (%s) notrap shadow," 3458 " valid guest gva %lx\n", audit_msg, va); 3459 kvm_release_pfn_clean(pfn); 3460 3461 } 3462 } 3463} 3464 3465static void audit_mappings(struct kvm_vcpu *vcpu) 3466{ 3467 unsigned i; 3468 3469 if (vcpu->arch.mmu.root_level == 4) 3470 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); 3471 else 3472 for (i = 0; i < 4; ++i) 3473 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) 3474 audit_mappings_page(vcpu, 3475 vcpu->arch.mmu.pae_root[i], 3476 i << 30, 3477 2); 3478} 3479 3480static int count_rmaps(struct kvm_vcpu *vcpu) 3481{ 3482 struct kvm *kvm = vcpu->kvm; 3483 struct kvm_memslots *slots; 3484 int nmaps = 0; 3485 int i, j, k, idx; 3486 3487 idx = srcu_read_lock(&kvm->srcu); 3488 slots = kvm_memslots(kvm); 3489 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3490 struct kvm_memory_slot *m = &slots->memslots[i]; 3491 struct kvm_rmap_desc *d; 3492 3493 for (j = 0; j < m->npages; ++j) { 3494 unsigned long *rmapp = &m->rmap[j]; 3495 3496 if (!*rmapp) 3497 continue; 3498 if (!(*rmapp & 1)) { 3499 ++nmaps; 3500 continue; 3501 } 3502 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 3503 while (d) { 3504 for (k = 0; k < RMAP_EXT; ++k) 3505 if (d->sptes[k]) 3506 ++nmaps; 3507 else 3508 break; 3509 d = d->more; 3510 } 3511 } 3512 } 3513 srcu_read_unlock(&kvm->srcu, idx); 3514 return nmaps; 3515} 3516 3517void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 3518{ 3519 unsigned long *rmapp; 3520 struct kvm_mmu_page *rev_sp; 3521 gfn_t gfn; 3522 3523 if (is_writable_pte(*sptep)) { 3524 rev_sp = page_header(__pa(sptep)); 3525 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 3526 3527 if (!gfn_to_memslot(kvm, gfn)) { 3528 if (!printk_ratelimit()) 3529 return; 3530 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3531 audit_msg, gfn); 3532 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3533 audit_msg, (long int)(sptep - rev_sp->spt), 3534 rev_sp->gfn); 3535 dump_stack(); 3536 return; 3537 } 3538 3539 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 3540 if (!*rmapp) { 3541 if (!printk_ratelimit()) 3542 return; 3543 printk(KERN_ERR "%s: no rmap for writable spte %llx\n", 3544 audit_msg, *sptep); 3545 dump_stack(); 3546 } 3547 } 3548 3549} 3550 3551void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) 3552{ 3553 mmu_spte_walk(vcpu, inspect_spte_has_rmap); 3554} 3555 3556static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) 3557{ 3558 struct kvm_mmu_page *sp; 3559 int i; 3560 3561 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3562 u64 *pt = sp->spt; 3563 3564 if (sp->role.level != PT_PAGE_TABLE_LEVEL) 3565 continue; 3566 3567 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 3568 u64 ent = pt[i]; 3569 3570 if (!(ent & PT_PRESENT_MASK)) 3571 continue; 3572 if (!is_writable_pte(ent)) 3573 continue; 3574 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3575 } 3576 } 3577 return; 3578} 3579 3580static void audit_rmap(struct kvm_vcpu *vcpu) 3581{ 3582 check_writable_mappings_rmap(vcpu); 3583 count_rmaps(vcpu); 3584} 3585 3586static void audit_write_protection(struct kvm_vcpu *vcpu) 3587{ 3588 struct kvm_mmu_page *sp; 3589 struct kvm_memory_slot *slot; 3590 unsigned long *rmapp; 3591 u64 *spte; 3592 gfn_t gfn; 3593 3594 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3595 if (sp->role.direct) 3596 
continue; 3597 if (sp->unsync) 3598 continue; 3599 3600 slot = gfn_to_memslot(vcpu->kvm, sp->gfn); 3601 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3602 3603 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3604 while (spte) { 3605 if (is_writable_pte(*spte)) 3606 printk(KERN_ERR "%s: (%s) shadow page has " 3607 "writable mappings: gfn %lx role %x\n", 3608 __func__, audit_msg, sp->gfn, 3609 sp->role.word); 3610 spte = rmap_next(vcpu->kvm, rmapp, spte); 3611 } 3612 } 3613} 3614 3615static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) 3616{ 3617 int olddbg = dbg; 3618 3619 dbg = 0; 3620 audit_msg = msg; 3621 audit_rmap(vcpu); 3622 audit_write_protection(vcpu); 3623 if (strcmp("pre pte write", audit_msg) != 0) 3624 audit_mappings(vcpu); 3625 audit_writable_sptes_have_rmaps(vcpu); 3626 dbg = olddbg; 3627} 3628 3629#endif 3630