1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 *
12 * Authors:
13 *   Yaniv Kamay  <yaniv@qumranet.com>
14 *   Avi Kivity   <avi@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2.  See
17 * the COPYING file in the top-level directory.
18 *
19 */
20
21#include "mmu.h"
22#include "x86.h"
23#include "kvm_cache_regs.h"
24
25#include <linux/kvm_host.h>
26#include <linux/types.h>
27#include <linux/string.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/module.h>
31#include <linux/swap.h>
32#include <linux/hugetlb.h>
33#include <linux/compiler.h>
34#include <linux/srcu.h>
35#include <linux/slab.h>
36#include <linux/uaccess.h>
37
38#include <asm/page.h>
39#include <asm/cmpxchg.h>
40#include <asm/io.h>
41#include <asm/vmx.h>
42
43/*
44 * When this variable is set to true it enables Two-Dimensional Paging,
45 * where the hardware walks 2 page tables:
46 * 1. the guest-virtual to guest-physical
47 * 2. while doing 1. it also walks guest-physical to host-physical
48 * If the hardware supports that, we don't need to do shadow paging.
49 */
50bool tdp_enabled = false;
51
52#undef MMU_DEBUG
53
54#undef AUDIT
55
56#ifdef AUDIT
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61
62#ifdef MMU_DEBUG
63
64#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
65#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
66
67#else
68
69#define pgprintk(x...) do { } while (0)
70#define rmap_printk(x...) do { } while (0)
71
72#endif
73
74#if defined(MMU_DEBUG) || defined(AUDIT)
75static int dbg = 0;
76module_param(dbg, bool, 0644);
77#endif
78
79static int oos_shadow = 1;
80module_param(oos_shadow, bool, 0644);
81
82#ifndef MMU_DEBUG
83#define ASSERT(x) do { } while (0)
84#else
85#define ASSERT(x)							\
86	if (!(x)) {							\
87		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
88		       __FILE__, __LINE__, #x);				\
89	}
90#endif
91
92#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94
95#define PT64_LEVEL_BITS 9
96
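/*
 * With 4 KiB pages (PAGE_SHIFT == 12) and 9 bits per level, the shift
 * below evaluates to 12, 21, 30 and 39 for levels 1 to 4.
 */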
97#define PT64_LEVEL_SHIFT(level) \
98		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
99
100#define PT64_LEVEL_MASK(level) \
101		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
102
103#define PT64_INDEX(address, level)\
104	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
105
106
107#define PT32_LEVEL_BITS 10
108
109#define PT32_LEVEL_SHIFT(level) \
110		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
111
112#define PT32_LEVEL_MASK(level) \
113		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114#define PT32_LVL_OFFSET_MASK(level) \
115	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
116						* PT32_LEVEL_BITS))) - 1))
117
118#define PT32_INDEX(address, level)\
119	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
120
121
122#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
123#define PT64_DIR_BASE_ADDR_MASK \
124	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
125#define PT64_LVL_ADDR_MASK(level) \
126	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
127						* PT64_LEVEL_BITS))) - 1))
128#define PT64_LVL_OFFSET_MASK(level) \
129	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
130						* PT64_LEVEL_BITS))) - 1))
131
132#define PT32_BASE_ADDR_MASK PAGE_MASK
133#define PT32_DIR_BASE_ADDR_MASK \
134	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
135#define PT32_LVL_ADDR_MASK(level) \
136	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
137					    * PT32_LEVEL_BITS))) - 1))
138
139#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
140			| PT64_NX_MASK)
141
142#define RMAP_EXT 4
143
144#define ACC_EXEC_MASK    1
145#define ACC_WRITE_MASK   PT_WRITABLE_MASK
146#define ACC_USER_MASK    PT_USER_MASK
147#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
148
149#include <trace/events/kvm.h>
150
151#define CREATE_TRACE_POINTS
152#include "mmutrace.h"
153
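/* software-available spte bit (bit 9): set when the host mapping is writable */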
154#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
155
156#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
157
158struct kvm_rmap_desc {
159	u64 *sptes[RMAP_EXT];
160	struct kvm_rmap_desc *more;
161};
162
163struct kvm_shadow_walk_iterator {
164	u64 addr;
165	hpa_t shadow_addr;
166	int level;
167	u64 *sptep;
168	unsigned index;
169};
170
171#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
172	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
173	     shadow_walk_okay(&(_walker));			\
174	     shadow_walk_next(&(_walker)))
175
176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177
178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache;
181
182static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte;
184static u64 __read_mostly shadow_base_present_pte;
185static u64 __read_mostly shadow_nx_mask;
186static u64 __read_mostly shadow_x_mask;	/* mutually exclusive with nx_mask */
187static u64 __read_mostly shadow_user_mask;
188static u64 __read_mostly shadow_accessed_mask;
189static u64 __read_mostly shadow_dirty_mask;
190
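/*
 * Mask with bits s..e (inclusive) set,
 * e.g. rsvd_bits(52, 62) == 0x7ff0000000000000ULL.
 */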
191static inline u64 rsvd_bits(int s, int e)
192{
193	return ((1ULL << (e - s + 1)) - 1) << s;
194}
195
196void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
197{
198	shadow_trap_nonpresent_pte = trap_pte;
199	shadow_notrap_nonpresent_pte = notrap_pte;
200}
201EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
202
203void kvm_mmu_set_base_ptes(u64 base_pte)
204{
205	shadow_base_present_pte = base_pte;
206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
208
209void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
210		u64 dirty_mask, u64 nx_mask, u64 x_mask)
211{
212	shadow_user_mask = user_mask;
213	shadow_accessed_mask = accessed_mask;
214	shadow_dirty_mask = dirty_mask;
215	shadow_nx_mask = nx_mask;
216	shadow_x_mask = x_mask;
217}
218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
219
220static bool is_write_protection(struct kvm_vcpu *vcpu)
221{
222	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
223}
224
225static int is_cpuid_PSE36(void)
226{
227	return 1;
228}
229
230static int is_nx(struct kvm_vcpu *vcpu)
231{
232	return vcpu->arch.efer & EFER_NX;
233}
234
235static int is_shadow_present_pte(u64 pte)
236{
237	return pte != shadow_trap_nonpresent_pte
238		&& pte != shadow_notrap_nonpresent_pte;
239}
240
241static int is_large_pte(u64 pte)
242{
243	return pte & PT_PAGE_SIZE_MASK;
244}
245
246static int is_writable_pte(unsigned long pte)
247{
248	return pte & PT_WRITABLE_MASK;
249}
250
251static int is_dirty_gpte(unsigned long pte)
252{
253	return pte & PT_DIRTY_MASK;
254}
255
256static int is_rmap_spte(u64 pte)
257{
258	return is_shadow_present_pte(pte);
259}
260
261static int is_last_spte(u64 pte, int level)
262{
263	if (level == PT_PAGE_TABLE_LEVEL)
264		return 1;
265	if (is_large_pte(pte))
266		return 1;
267	return 0;
268}
269
270static pfn_t spte_to_pfn(u64 pte)
271{
272	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
273}
274
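/*
 * PSE-36: the PT32_DIR_PSE36_MASK bits of a 4 MB guest pde supply
 * physical address bits 32 and up; turn them into a gfn offset.
 */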
275static gfn_t pse36_gfn_delta(u32 gpte)
276{
277	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
278
279	return (gpte & PT32_DIR_PSE36_MASK) << shift;
280}
281
282static void __set_spte(u64 *sptep, u64 spte)
283{
284	set_64bit(sptep, spte);
285}
286
287static u64 __xchg_spte(u64 *sptep, u64 new_spte)
288{
289#ifdef CONFIG_X86_64
290	return xchg(sptep, new_spte);
291#else
292	u64 old_spte;
293
294	do {
295		old_spte = *sptep;
296	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
297
298	return old_spte;
299#endif
300}
301
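/*
 * Write a new spte.  If the update could clear an accessed bit that the
 * CPU may be setting concurrently, use an atomic exchange and propagate
 * the old accessed bit to the backing page.
 */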
302static void update_spte(u64 *sptep, u64 new_spte)
303{
304	u64 old_spte;
305
306	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
307	      !is_rmap_spte(*sptep))
308		__set_spte(sptep, new_spte);
309	else {
310		old_spte = __xchg_spte(sptep, new_spte);
311		if (old_spte & shadow_accessed_mask)
312			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
313	}
314}
315
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
317				  struct kmem_cache *base_cache, int min)
318{
319	void *obj;
320
321	if (cache->nobjs >= min)
322		return 0;
323	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
324		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
325		if (!obj)
326			return -ENOMEM;
327		cache->objects[cache->nobjs++] = obj;
328	}
329	return 0;
330}
331
332static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
333				  struct kmem_cache *cache)
334{
335	while (mc->nobjs)
336		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
337}
338
339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
340				       int min)
341{
342	struct page *page;
343
344	if (cache->nobjs >= min)
345		return 0;
346	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
347		page = alloc_page(GFP_KERNEL);
348		if (!page)
349			return -ENOMEM;
350		cache->objects[cache->nobjs++] = page_address(page);
351	}
352	return 0;
353}
354
355static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
356{
357	while (mc->nobjs)
358		free_page((unsigned long)mc->objects[--mc->nobjs]);
359}
360
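/*
 * Pre-fill the per-vcpu object caches (pte chains, rmap descriptors,
 * shadow pages, page headers) so that the page fault path never has to
 * allocate memory while holding mmu_lock.
 */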
361static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
362{
363	int r;
364
365	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
366				   pte_chain_cache, 4);
367	if (r)
368		goto out;
369	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370				   rmap_desc_cache, 4);
371	if (r)
372		goto out;
373	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
374	if (r)
375		goto out;
376	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
377				   mmu_page_header_cache, 4);
378out:
379	return r;
380}
381
382static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
383{
384	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
385	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
386	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
387	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
388				mmu_page_header_cache);
389}
390
391static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
392				    size_t size)
393{
394	void *p;
395
396	BUG_ON(!mc->nobjs);
397	p = mc->objects[--mc->nobjs];
398	return p;
399}
400
401static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
402{
403	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
404				      sizeof(struct kvm_pte_chain));
405}
406
407static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
408{
409	kmem_cache_free(pte_chain_cache, pc);
410}
411
412static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
413{
414	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
415				      sizeof(struct kvm_rmap_desc));
416}
417
418static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
419{
420	kmem_cache_free(rmap_desc_cache, rd);
421}
422
423static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
424{
425	if (!sp->role.direct)
426		return sp->gfns[index];
427
428	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
429}
430
431static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
432{
433	if (sp->role.direct)
434		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
435	else
436		sp->gfns[index] = gfn;
437}
438
439/*
440 * Return the pointer to the largepage write count for a given
441 * gfn, handling slots that are not large page aligned.
442 */
443static int *slot_largepage_idx(gfn_t gfn,
444			       struct kvm_memory_slot *slot,
445			       int level)
446{
447	unsigned long idx;
448
449	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
450	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451	return &slot->lpage_info[level - 2][idx].write_count;
452}
453
454static void account_shadowed(struct kvm *kvm, gfn_t gfn)
455{
456	struct kvm_memory_slot *slot;
457	int *write_count;
458	int i;
459
460	slot = gfn_to_memslot(kvm, gfn);
461	for (i = PT_DIRECTORY_LEVEL;
462	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
463		write_count   = slot_largepage_idx(gfn, slot, i);
464		*write_count += 1;
465	}
466}
467
468static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
469{
470	struct kvm_memory_slot *slot;
471	int *write_count;
472	int i;
473
474	slot = gfn_to_memslot(kvm, gfn);
475	for (i = PT_DIRECTORY_LEVEL;
476	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
477		write_count   = slot_largepage_idx(gfn, slot, i);
478		*write_count -= 1;
479		WARN_ON(*write_count < 0);
480	}
481}
482
483static int has_wrprotected_page(struct kvm *kvm,
484				gfn_t gfn,
485				int level)
486{
487	struct kvm_memory_slot *slot;
488	int *largepage_idx;
489
490	slot = gfn_to_memslot(kvm, gfn);
491	if (slot) {
492		largepage_idx = slot_largepage_idx(gfn, slot, level);
493		return *largepage_idx;
494	}
495
496	return 1;
497}
498
499static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
500{
501	unsigned long page_size;
502	int i, ret = 0;
503
504	page_size = kvm_host_page_size(kvm, gfn);
505
506	for (i = PT_PAGE_TABLE_LEVEL;
507	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
508		if (page_size >= KVM_HPAGE_SIZE(i))
509			ret = i;
510		else
511			break;
512	}
513
514	return ret;
515}
516
517static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
518{
519	struct kvm_memory_slot *slot;
520	int host_level, level, max_level;
521
522	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
523	if (slot && slot->dirty_bitmap)
524		return PT_PAGE_TABLE_LEVEL;
525
526	host_level = host_mapping_level(vcpu->kvm, large_gfn);
527
528	if (host_level == PT_PAGE_TABLE_LEVEL)
529		return host_level;
530
531	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
532		kvm_x86_ops->get_lpage_level() : host_level;
533
534	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
535		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
536			break;
537
538	return level - 1;
539}
540
541/*
542 * Take gfn and return the reverse mapping to it.
543 */
544
545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546{
547	struct kvm_memory_slot *slot;
548	unsigned long idx;
549
550	slot = gfn_to_memslot(kvm, gfn);
551	if (likely(level == PT_PAGE_TABLE_LEVEL))
552		return &slot->rmap[gfn - slot->base_gfn];
553
554	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
555		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
556
557	return &slot->lpage_info[level - 2][idx].rmap_pde;
558}
559
560/*
561 * Reverse mapping data structures:
562 *
563 * If rmapp bit zero is zero, then rmapp points to the shadow page table
564 * entry that points to page_address(page).
565 *
566 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
567 * containing more mappings.
568 *
569 * Returns the number of rmap entries before the spte was added or zero if
570 * the spte was not added.
571 *
572 */
573static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
574{
575	struct kvm_mmu_page *sp;
576	struct kvm_rmap_desc *desc;
577	unsigned long *rmapp;
578	int i, count = 0;
579
580	if (!is_rmap_spte(*spte))
581		return count;
582	sp = page_header(__pa(spte));
583	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
584	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
585	if (!*rmapp) {
586		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
587		*rmapp = (unsigned long)spte;
588	} else if (!(*rmapp & 1)) {
589		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
590		desc = mmu_alloc_rmap_desc(vcpu);
591		desc->sptes[0] = (u64 *)*rmapp;
592		desc->sptes[1] = spte;
593		*rmapp = (unsigned long)desc | 1;
594	} else {
595		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
597		while (desc->sptes[RMAP_EXT-1] && desc->more) {
598			desc = desc->more;
599			count += RMAP_EXT;
600		}
601		if (desc->sptes[RMAP_EXT-1]) {
602			desc->more = mmu_alloc_rmap_desc(vcpu);
603			desc = desc->more;
604		}
605		for (i = 0; desc->sptes[i]; ++i)
606			;
607		desc->sptes[i] = spte;
608	}
609	return count;
610}
611
612static void rmap_desc_remove_entry(unsigned long *rmapp,
613				   struct kvm_rmap_desc *desc,
614				   int i,
615				   struct kvm_rmap_desc *prev_desc)
616{
617	int j;
618
619	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
620		;
621	desc->sptes[i] = desc->sptes[j];
622	desc->sptes[j] = NULL;
623	if (j != 0)
624		return;
625	if (!prev_desc && !desc->more)
626		*rmapp = (unsigned long)desc->sptes[0];
627	else
628		if (prev_desc)
629			prev_desc->more = desc->more;
630		else
631			*rmapp = (unsigned long)desc->more | 1;
632	mmu_free_rmap_desc(desc);
633}
634
635static void rmap_remove(struct kvm *kvm, u64 *spte)
636{
637	struct kvm_rmap_desc *desc;
638	struct kvm_rmap_desc *prev_desc;
639	struct kvm_mmu_page *sp;
640	gfn_t gfn;
641	unsigned long *rmapp;
642	int i;
643
644	sp = page_header(__pa(spte));
645	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647	if (!*rmapp) {
648		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
649		BUG();
650	} else if (!(*rmapp & 1)) {
651		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
652		if ((u64 *)*rmapp != spte) {
653			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
654			       spte, *spte);
655			BUG();
656		}
657		*rmapp = 0;
658	} else {
659		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
660		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661		prev_desc = NULL;
662		while (desc) {
663			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
664				if (desc->sptes[i] == spte) {
665					rmap_desc_remove_entry(rmapp,
666							       desc, i,
667							       prev_desc);
668					return;
669				}
670			prev_desc = desc;
671			desc = desc->more;
672		}
673		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
674		BUG();
675	}
676}
677
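/*
 * Replace the spte, transferring the old accessed bit (and writability,
 * used as a dirty hint) to the backing pfn before the mapping goes away.
 */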
678static void set_spte_track_bits(u64 *sptep, u64 new_spte)
679{
680	pfn_t pfn;
681	u64 old_spte = *sptep;
682
683	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
684	      old_spte & shadow_accessed_mask) {
685		__set_spte(sptep, new_spte);
686	} else
687		old_spte = __xchg_spte(sptep, new_spte);
688
689	if (!is_rmap_spte(old_spte))
690		return;
691	pfn = spte_to_pfn(old_spte);
692	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693		kvm_set_pfn_accessed(pfn);
694	if (is_writable_pte(old_spte))
695		kvm_set_pfn_dirty(pfn);
696}
697
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{
700	set_spte_track_bits(sptep, new_spte);
701	rmap_remove(kvm, sptep);
702}
703
704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
705{
706	struct kvm_rmap_desc *desc;
707	u64 *prev_spte;
708	int i;
709
710	if (!*rmapp)
711		return NULL;
712	else if (!(*rmapp & 1)) {
713		if (!spte)
714			return (u64 *)*rmapp;
715		return NULL;
716	}
717	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
718	prev_spte = NULL;
719	while (desc) {
720		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
721			if (prev_spte == spte)
722				return desc->sptes[i];
723			prev_spte = desc->sptes[i];
724		}
725		desc = desc->more;
726	}
727	return NULL;
728}
729
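/*
 * Remove write access from every spte mapping @gfn; huge-page sptes are
 * dropped entirely.  Returns non-zero if anything changed so the caller
 * can flush remote TLBs.
 */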
730static int rmap_write_protect(struct kvm *kvm, u64 gfn)
731{
732	unsigned long *rmapp;
733	u64 *spte;
734	int i, write_protected = 0;
735
736	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
737
738	spte = rmap_next(kvm, rmapp, NULL);
739	while (spte) {
740		BUG_ON(!spte);
741		BUG_ON(!(*spte & PT_PRESENT_MASK));
742		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
743		if (is_writable_pte(*spte)) {
744			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
745			write_protected = 1;
746		}
747		spte = rmap_next(kvm, rmapp, spte);
748	}
749	if (write_protected) {
750		pfn_t pfn;
751
752		spte = rmap_next(kvm, rmapp, NULL);
753		pfn = spte_to_pfn(*spte);
754		kvm_set_pfn_dirty(pfn);
755	}
756
757	/* check for huge page mappings */
758	for (i = PT_DIRECTORY_LEVEL;
759	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
760		rmapp = gfn_to_rmap(kvm, gfn, i);
761		spte = rmap_next(kvm, rmapp, NULL);
762		while (spte) {
763			BUG_ON(!spte);
764			BUG_ON(!(*spte & PT_PRESENT_MASK));
765			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
766			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
767			if (is_writable_pte(*spte)) {
768				drop_spte(kvm, spte,
769					  shadow_trap_nonpresent_pte);
770				--kvm->stat.lpages;
771				spte = NULL;
772				write_protected = 1;
773			}
774			spte = rmap_next(kvm, rmapp, spte);
775		}
776	}
777
778	return write_protected;
779}
780
781static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
782			   unsigned long data)
783{
784	u64 *spte;
785	int need_tlb_flush = 0;
786
787	while ((spte = rmap_next(kvm, rmapp, NULL))) {
788		BUG_ON(!(*spte & PT_PRESENT_MASK));
789		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
790		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
791		need_tlb_flush = 1;
792	}
793	return need_tlb_flush;
794}
795
796static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
797			     unsigned long data)
798{
799	int need_flush = 0;
800	u64 *spte, new_spte;
801	pte_t *ptep = (pte_t *)data;
802	pfn_t new_pfn;
803
804	WARN_ON(pte_huge(*ptep));
805	new_pfn = pte_pfn(*ptep);
806	spte = rmap_next(kvm, rmapp, NULL);
807	while (spte) {
808		BUG_ON(!is_shadow_present_pte(*spte));
809		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
810		need_flush = 1;
811		if (pte_write(*ptep)) {
812			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
813			spte = rmap_next(kvm, rmapp, NULL);
814		} else {
815			new_spte = *spte & ~PT64_BASE_ADDR_MASK;
816			new_spte |= (u64)new_pfn << PAGE_SHIFT;
817
818			new_spte &= ~PT_WRITABLE_MASK;
819			new_spte &= ~SPTE_HOST_WRITEABLE;
820			new_spte &= ~shadow_accessed_mask;
821			set_spte_track_bits(spte, new_spte);
822			spte = rmap_next(kvm, rmapp, spte);
823		}
824	}
825	if (need_flush)
826		kvm_flush_remote_tlbs(kvm);
827
828	return 0;
829}
830
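/*
 * Apply @handler to the rmap chains of every page size in each memslot
 * that contains @hva; this backs the mmu notifier entry points below.
 */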
831static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
832			  unsigned long data,
833			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
834					 unsigned long data))
835{
836	int i, j;
837	int ret;
838	int retval = 0;
839	struct kvm_memslots *slots;
840
841	slots = kvm_memslots(kvm);
842
843	for (i = 0; i < slots->nmemslots; i++) {
844		struct kvm_memory_slot *memslot = &slots->memslots[i];
845		unsigned long start = memslot->userspace_addr;
846		unsigned long end;
847
848		end = start + (memslot->npages << PAGE_SHIFT);
849		if (hva >= start && hva < end) {
850			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
851
852			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
853
854			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
855				unsigned long idx;
856				int sh;
857
858				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
859				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
860					(memslot->base_gfn >> sh);
861				ret |= handler(kvm,
862					&memslot->lpage_info[j][idx].rmap_pde,
863					data);
864			}
865			trace_kvm_age_page(hva, memslot, ret);
866			retval |= ret;
867		}
868	}
869
870	return retval;
871}
872
873int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
874{
875	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
876}
877
878void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
879{
880	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
881}
882
883static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
884			 unsigned long data)
885{
886	u64 *spte;
887	int young = 0;
888
889	/*
890	 * Emulate the accessed bit for EPT by checking if this page has
891	 * an EPT mapping, and clearing it if it does. On the next access,
892	 * a new EPT mapping will be established.
893	 * This has some overhead, but not as much as the cost of swapping
894	 * out actively used pages or breaking up actively used hugepages.
895	 */
896	if (!shadow_accessed_mask)
897		return kvm_unmap_rmapp(kvm, rmapp, data);
898
899	spte = rmap_next(kvm, rmapp, NULL);
900	while (spte) {
901		int _young;
902		u64 _spte = *spte;
903		BUG_ON(!(_spte & PT_PRESENT_MASK));
904		_young = _spte & PT_ACCESSED_MASK;
905		if (_young) {
906			young = 1;
907			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
908		}
909		spte = rmap_next(kvm, rmapp, spte);
910	}
911	return young;
912}
913
914#define RMAP_RECYCLE_THRESHOLD 1000
915
916static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
917{
918	unsigned long *rmapp;
919	struct kvm_mmu_page *sp;
920
921	sp = page_header(__pa(spte));
922
923	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
924
925	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
926	kvm_flush_remote_tlbs(vcpu->kvm);
927}
928
929int kvm_age_hva(struct kvm *kvm, unsigned long hva)
930{
931	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
932}
933
934#ifdef MMU_DEBUG
935static int is_empty_shadow_page(u64 *spt)
936{
937	u64 *pos;
938	u64 *end;
939
940	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
941		if (is_shadow_present_pte(*pos)) {
942			printk(KERN_ERR "%s: %p %llx\n", __func__,
943			       pos, *pos);
944			return 0;
945		}
946	return 1;
947}
948#endif
949
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{
952	ASSERT(is_empty_shadow_page(sp->spt));
953	hlist_del(&sp->hash_link);
954	list_del(&sp->link);
955	__free_page(virt_to_page(sp->spt));
956	if (!sp->role.direct)
957		__free_page(virt_to_page(sp->gfns));
958	kmem_cache_free(mmu_page_header_cache, sp);
959	++kvm->arch.n_free_mmu_pages;
960}
961
962static unsigned kvm_page_table_hashfn(gfn_t gfn)
963{
964	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
965}
966
967static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
968					       u64 *parent_pte, int direct)
969{
970	struct kvm_mmu_page *sp;
971
972	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
973	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
974	if (!direct)
975		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
976						  PAGE_SIZE);
977	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
978	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
979	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980	sp->multimapped = 0;
981	sp->parent_pte = parent_pte;
982	--vcpu->kvm->arch.n_free_mmu_pages;
983	return sp;
984}
985
986static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
987				    struct kvm_mmu_page *sp, u64 *parent_pte)
988{
989	struct kvm_pte_chain *pte_chain;
990	struct hlist_node *node;
991	int i;
992
993	if (!parent_pte)
994		return;
995	if (!sp->multimapped) {
996		u64 *old = sp->parent_pte;
997
998		if (!old) {
999			sp->parent_pte = parent_pte;
1000			return;
1001		}
1002		sp->multimapped = 1;
1003		pte_chain = mmu_alloc_pte_chain(vcpu);
1004		INIT_HLIST_HEAD(&sp->parent_ptes);
1005		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1006		pte_chain->parent_ptes[0] = old;
1007	}
1008	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1009		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1010			continue;
1011		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1012			if (!pte_chain->parent_ptes[i]) {
1013				pte_chain->parent_ptes[i] = parent_pte;
1014				return;
1015			}
1016	}
1017	pte_chain = mmu_alloc_pte_chain(vcpu);
1018	BUG_ON(!pte_chain);
1019	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1020	pte_chain->parent_ptes[0] = parent_pte;
1021}
1022
1023static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1024				       u64 *parent_pte)
1025{
1026	struct kvm_pte_chain *pte_chain;
1027	struct hlist_node *node;
1028	int i;
1029
1030	if (!sp->multimapped) {
1031		BUG_ON(sp->parent_pte != parent_pte);
1032		sp->parent_pte = NULL;
1033		return;
1034	}
1035	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1036		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1037			if (!pte_chain->parent_ptes[i])
1038				break;
1039			if (pte_chain->parent_ptes[i] != parent_pte)
1040				continue;
1041			while (i + 1 < NR_PTE_CHAIN_ENTRIES
1042				&& pte_chain->parent_ptes[i + 1]) {
1043				pte_chain->parent_ptes[i]
1044					= pte_chain->parent_ptes[i + 1];
1045				++i;
1046			}
1047			pte_chain->parent_ptes[i] = NULL;
1048			if (i == 0) {
1049				hlist_del(&pte_chain->link);
1050				mmu_free_pte_chain(pte_chain);
1051				if (hlist_empty(&sp->parent_ptes)) {
1052					sp->multimapped = 0;
1053					sp->parent_pte = NULL;
1054				}
1055			}
1056			return;
1057		}
1058	BUG();
1059}
1060
1061static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1062{
1063	struct kvm_pte_chain *pte_chain;
1064	struct hlist_node *node;
1065	struct kvm_mmu_page *parent_sp;
1066	int i;
1067
1068	if (!sp->multimapped && sp->parent_pte) {
1069		parent_sp = page_header(__pa(sp->parent_pte));
1070		fn(parent_sp, sp->parent_pte);
1071		return;
1072	}
1073
1074	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1075		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1076			u64 *spte = pte_chain->parent_ptes[i];
1077
1078			if (!spte)
1079				break;
1080			parent_sp = page_header(__pa(spte));
1081			fn(parent_sp, spte);
1082		}
1083}
1084
1085static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1086static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1087{
1088	mmu_parent_walk(sp, mark_unsync);
1089}
1090
1091static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1092{
1093	unsigned int index;
1094
1095	index = spte - sp->spt;
1096	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1097		return;
1098	if (sp->unsync_children++)
1099		return;
1100	kvm_mmu_mark_parents_unsync(sp);
1101}
1102
1103static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1104				    struct kvm_mmu_page *sp)
1105{
1106	int i;
1107
1108	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1109		sp->spt[i] = shadow_trap_nonpresent_pte;
1110}
1111
1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1113			       struct kvm_mmu_page *sp, bool clear_unsync)
1114{
1115	return 1;
1116}
1117
1118static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1119{
1120}
1121
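/*
 * A batch of shadow pages collected by mmu_unsync_walk(), each recorded
 * together with its index in the parent's unsync_child_bitmap.
 */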
1122#define KVM_PAGE_ARRAY_NR 16
1123
1124struct kvm_mmu_pages {
1125	struct mmu_page_and_offset {
1126		struct kvm_mmu_page *sp;
1127		unsigned int idx;
1128	} page[KVM_PAGE_ARRAY_NR];
1129	unsigned int nr;
1130};
1131
1132#define for_each_unsync_children(bitmap, idx)		\
1133	for (idx = find_first_bit(bitmap, 512);		\
1134	     idx < 512;					\
1135	     idx = find_next_bit(bitmap, 512, idx+1))
1136
1137static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1138			 int idx)
1139{
1140	int i;
1141
1142	if (sp->unsync)
1143		for (i = 0; i < pvec->nr; i++)
1144			if (pvec->page[i].sp == sp)
1145				return 0;
1146
1147	pvec->page[pvec->nr].sp = sp;
1148	pvec->page[pvec->nr].idx = idx;
1149	pvec->nr++;
1150	return (pvec->nr == KVM_PAGE_ARRAY_NR);
1151}
1152
1153static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1154			   struct kvm_mmu_pages *pvec)
1155{
1156	int i, ret, nr_unsync_leaf = 0;
1157
1158	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1159		struct kvm_mmu_page *child;
1160		u64 ent = sp->spt[i];
1161
1162		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1163			goto clear_child_bitmap;
1164
1165		child = page_header(ent & PT64_BASE_ADDR_MASK);
1166
1167		if (child->unsync_children) {
1168			if (mmu_pages_add(pvec, child, i))
1169				return -ENOSPC;
1170
1171			ret = __mmu_unsync_walk(child, pvec);
1172			if (!ret)
1173				goto clear_child_bitmap;
1174			else if (ret > 0)
1175				nr_unsync_leaf += ret;
1176			else
1177				return ret;
1178		} else if (child->unsync) {
1179			nr_unsync_leaf++;
1180			if (mmu_pages_add(pvec, child, i))
1181				return -ENOSPC;
1182		} else
1183			 goto clear_child_bitmap;
1184
1185		continue;
1186
1187clear_child_bitmap:
1188		__clear_bit(i, sp->unsync_child_bitmap);
1189		sp->unsync_children--;
1190		WARN_ON((int)sp->unsync_children < 0);
1191	}
1192
1193
1194	return nr_unsync_leaf;
1195}
1196
1197static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1198			   struct kvm_mmu_pages *pvec)
1199{
1200	if (!sp->unsync_children)
1201		return 0;
1202
1203	mmu_pages_add(pvec, sp, 0);
1204	return __mmu_unsync_walk(sp, pvec);
1205}
1206
1207static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1208{
1209	WARN_ON(!sp->unsync);
1210	trace_kvm_mmu_sync_page(sp);
1211	sp->unsync = 0;
1212	--kvm->stat.mmu_unsync;
1213}
1214
1215static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1216				    struct list_head *invalid_list);
1217static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1218				    struct list_head *invalid_list);
1219
1220#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
1221  hlist_for_each_entry(sp, pos,						\
1222   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
1223	if ((sp)->gfn != (gfn)) {} else
1224
1225#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
1226  hlist_for_each_entry(sp, pos,						\
1227   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
1228		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
1229			(sp)->role.invalid) {} else
1230
1231/* @sp->gfn should be write-protected at the call site */
1232static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1233			   struct list_head *invalid_list, bool clear_unsync)
1234{
1235	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1236		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1237		return 1;
1238	}
1239
1240	if (clear_unsync)
1241		kvm_unlink_unsync_page(vcpu->kvm, sp);
1242
1243	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1244		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1245		return 1;
1246	}
1247
1248	kvm_mmu_flush_tlb(vcpu);
1249	return 0;
1250}
1251
1252static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1253				   struct kvm_mmu_page *sp)
1254{
1255	LIST_HEAD(invalid_list);
1256	int ret;
1257
1258	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1259	if (ret)
1260		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1261
1262	return ret;
1263}
1264
1265static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1266			 struct list_head *invalid_list)
1267{
1268	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1269}
1270
1271/* @gfn should be write-protected at the call site */
1272static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1273{
1274	struct kvm_mmu_page *s;
1275	struct hlist_node *node;
1276	LIST_HEAD(invalid_list);
1277	bool flush = false;
1278
1279	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1280		if (!s->unsync)
1281			continue;
1282
1283		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1284		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1286			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287			continue;
1288		}
1289		kvm_unlink_unsync_page(vcpu->kvm, s);
1290		flush = true;
1291	}
1292
1293	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1294	if (flush)
1295		kvm_mmu_flush_tlb(vcpu);
1296}
1297
1298struct mmu_page_path {
1299	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1300	unsigned int idx[PT64_ROOT_LEVEL-1];
1301};
1302
1303#define for_each_sp(pvec, sp, parents, i)			\
1304		for (i = mmu_pages_next(&pvec, &parents, -1),	\
1305			sp = pvec.page[i].sp;			\
1306			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
1307			i = mmu_pages_next(&pvec, &parents, i))
1308
1309static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1310			  struct mmu_page_path *parents,
1311			  int i)
1312{
1313	int n;
1314
1315	for (n = i+1; n < pvec->nr; n++) {
1316		struct kvm_mmu_page *sp = pvec->page[n].sp;
1317
1318		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1319			parents->idx[0] = pvec->page[n].idx;
1320			return n;
1321		}
1322
1323		parents->parent[sp->role.level-2] = sp;
1324		parents->idx[sp->role.level-1] = pvec->page[n].idx;
1325	}
1326
1327	return n;
1328}
1329
1330static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1331{
1332	struct kvm_mmu_page *sp;
1333	unsigned int level = 0;
1334
1335	do {
1336		unsigned int idx = parents->idx[level];
1337
1338		sp = parents->parent[level];
1339		if (!sp)
1340			return;
1341
1342		--sp->unsync_children;
1343		WARN_ON((int)sp->unsync_children < 0);
1344		__clear_bit(idx, sp->unsync_child_bitmap);
1345		level++;
1346	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1347}
1348
1349static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1350			       struct mmu_page_path *parents,
1351			       struct kvm_mmu_pages *pvec)
1352{
1353	parents->parent[parent->role.level-1] = NULL;
1354	pvec->nr = 0;
1355}
1356
1357static void mmu_sync_children(struct kvm_vcpu *vcpu,
1358			      struct kvm_mmu_page *parent)
1359{
1360	int i;
1361	struct kvm_mmu_page *sp;
1362	struct mmu_page_path parents;
1363	struct kvm_mmu_pages pages;
1364	LIST_HEAD(invalid_list);
1365
1366	kvm_mmu_pages_init(parent, &parents, &pages);
1367	while (mmu_unsync_walk(parent, &pages)) {
1368		int protected = 0;
1369
1370		for_each_sp(pages, sp, parents, i)
1371			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1372
1373		if (protected)
1374			kvm_flush_remote_tlbs(vcpu->kvm);
1375
1376		for_each_sp(pages, sp, parents, i) {
1377			kvm_sync_page(vcpu, sp, &invalid_list);
1378			mmu_pages_clear_parents(&parents);
1379		}
1380		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1381		cond_resched_lock(&vcpu->kvm->mmu_lock);
1382		kvm_mmu_pages_init(parent, &parents, &pages);
1383	}
1384}
1385
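/*
 * Look up the shadow page for (gfn, role) in the hash table, reusing
 * (and possibly syncing) an existing page; otherwise allocate a new one,
 * write-protect the gfn for indirect pages and account it as shadowed.
 */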
1386static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1387					     gfn_t gfn,
1388					     gva_t gaddr,
1389					     unsigned level,
1390					     int direct,
1391					     unsigned access,
1392					     u64 *parent_pte)
1393{
1394	union kvm_mmu_page_role role;
1395	unsigned quadrant;
1396	struct kvm_mmu_page *sp;
1397	struct hlist_node *node;
1398	bool need_sync = false;
1399
1400	role = vcpu->arch.mmu.base_role;
1401	role.level = level;
1402	role.direct = direct;
1403	if (role.direct)
1404		role.cr4_pae = 0;
1405	role.access = access;
1406	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409		role.quadrant = quadrant;
1410	}
1411	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1412		if (!need_sync && sp->unsync)
1413			need_sync = true;
1414
1415		if (sp->role.word != role.word)
1416			continue;
1417
1418		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1419			break;
1420
1421		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1422		if (sp->unsync_children) {
1423			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1424			kvm_mmu_mark_parents_unsync(sp);
1425		} else if (sp->unsync)
1426			kvm_mmu_mark_parents_unsync(sp);
1427
1428		trace_kvm_mmu_get_page(sp, false);
1429		return sp;
1430	}
1431	++vcpu->kvm->stat.mmu_cache_miss;
1432	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1433	if (!sp)
1434		return sp;
1435	sp->gfn = gfn;
1436	sp->role = role;
1437	hlist_add_head(&sp->hash_link,
1438		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1439	if (!direct) {
1440		if (rmap_write_protect(vcpu->kvm, gfn))
1441			kvm_flush_remote_tlbs(vcpu->kvm);
1442		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1443			kvm_sync_pages(vcpu, gfn);
1444
1445		account_shadowed(vcpu->kvm, gfn);
1446	}
1447	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1448		vcpu->arch.mmu.prefetch_page(vcpu, sp);
1449	else
1450		nonpaging_prefetch_page(vcpu, sp);
1451	trace_kvm_mmu_get_page(sp, true);
1452	return sp;
1453}
1454
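/*
 * Set up the iterator used by for_each_shadow_entry(): start at the
 * shadow root (or the pae_root entry selected by bits 31:30 of the
 * address) and walk one level down per step, exposing the current
 * sptep and level.
 */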
1455static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1456			     struct kvm_vcpu *vcpu, u64 addr)
1457{
1458	iterator->addr = addr;
1459	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460	iterator->level = vcpu->arch.mmu.shadow_root_level;
1461	if (iterator->level == PT32E_ROOT_LEVEL) {
1462		iterator->shadow_addr
1463			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1464		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1465		--iterator->level;
1466		if (!iterator->shadow_addr)
1467			iterator->level = 0;
1468	}
1469}
1470
1471static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1472{
1473	if (iterator->level < PT_PAGE_TABLE_LEVEL)
1474		return false;
1475
1476	if (iterator->level == PT_PAGE_TABLE_LEVEL)
1477		if (is_large_pte(*iterator->sptep))
1478			return false;
1479
1480	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1481	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1482	return true;
1483}
1484
1485static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1486{
1487	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1488	--iterator->level;
1489}
1490
1491static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1492{
1493	u64 spte;
1494
1495	spte = __pa(sp->spt)
1496		| PT_PRESENT_MASK | PT_ACCESSED_MASK
1497		| PT_WRITABLE_MASK | PT_USER_MASK;
1498	__set_spte(sptep, spte);
1499}
1500
1501static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1502{
1503	if (is_large_pte(*sptep)) {
1504		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1505		kvm_flush_remote_tlbs(vcpu->kvm);
1506	}
1507}
1508
1509static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1510				   unsigned direct_access)
1511{
1512	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1513		struct kvm_mmu_page *child;
1514
1515		/*
1516		 * For the direct sp, if the guest pte's dirty bit
1517		 * changed from clean to dirty, it will corrupt the
1518		 * sp's access: it would allow writes through a read-only sp,
1519		 * so we should update the spte at this point to get
1520		 * a new sp with the correct access.
1521		 */
1522		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1523		if (child->role.access == direct_access)
1524			return;
1525
1526		mmu_page_remove_parent_pte(child, sptep);
1527		__set_spte(sptep, shadow_trap_nonpresent_pte);
1528		kvm_flush_remote_tlbs(vcpu->kvm);
1529	}
1530}
1531
1532static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1533					 struct kvm_mmu_page *sp)
1534{
1535	unsigned i;
1536	u64 *pt;
1537	u64 ent;
1538
1539	pt = sp->spt;
1540
1541	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1542		ent = pt[i];
1543
1544		if (is_shadow_present_pte(ent)) {
1545			if (!is_last_spte(ent, sp->role.level)) {
1546				ent &= PT64_BASE_ADDR_MASK;
1547				mmu_page_remove_parent_pte(page_header(ent),
1548							   &pt[i]);
1549			} else {
1550				if (is_large_pte(ent))
1551					--kvm->stat.lpages;
1552				drop_spte(kvm, &pt[i],
1553					  shadow_trap_nonpresent_pte);
1554			}
1555		}
1556		pt[i] = shadow_trap_nonpresent_pte;
1557	}
1558}
1559
1560static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1561{
1562	mmu_page_remove_parent_pte(sp, parent_pte);
1563}
1564
1565static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1566{
1567	int i;
1568	struct kvm_vcpu *vcpu;
1569
1570	kvm_for_each_vcpu(i, vcpu, kvm)
1571		vcpu->arch.last_pte_updated = NULL;
1572}
1573
1574static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1575{
1576	u64 *parent_pte;
1577
1578	while (sp->multimapped || sp->parent_pte) {
1579		if (!sp->multimapped)
1580			parent_pte = sp->parent_pte;
1581		else {
1582			struct kvm_pte_chain *chain;
1583
1584			chain = container_of(sp->parent_ptes.first,
1585					     struct kvm_pte_chain, link);
1586			parent_pte = chain->parent_ptes[0];
1587		}
1588		BUG_ON(!parent_pte);
1589		kvm_mmu_put_page(sp, parent_pte);
1590		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1591	}
1592}
1593
1594static int mmu_zap_unsync_children(struct kvm *kvm,
1595				   struct kvm_mmu_page *parent,
1596				   struct list_head *invalid_list)
1597{
1598	int i, zapped = 0;
1599	struct mmu_page_path parents;
1600	struct kvm_mmu_pages pages;
1601
1602	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1603		return 0;
1604
1605	kvm_mmu_pages_init(parent, &parents, &pages);
1606	while (mmu_unsync_walk(parent, &pages)) {
1607		struct kvm_mmu_page *sp;
1608
1609		for_each_sp(pages, sp, parents, i) {
1610			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1611			mmu_pages_clear_parents(&parents);
1612			zapped++;
1613		}
1614		kvm_mmu_pages_init(parent, &parents, &pages);
1615	}
1616
1617	return zapped;
1618}
1619
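/*
 * Unlink @sp (and, recursively, its unsync children) from the mmu.
 * Pages with no root references are moved to @invalid_list and freed by
 * kvm_mmu_commit_zap_page() after a remote TLB flush.
 */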
1620static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1621				    struct list_head *invalid_list)
1622{
1623	int ret;
1624
1625	trace_kvm_mmu_prepare_zap_page(sp);
1626	++kvm->stat.mmu_shadow_zapped;
1627	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1628	kvm_mmu_page_unlink_children(kvm, sp);
1629	kvm_mmu_unlink_parents(kvm, sp);
1630	if (!sp->role.invalid && !sp->role.direct)
1631		unaccount_shadowed(kvm, sp->gfn);
1632	if (sp->unsync)
1633		kvm_unlink_unsync_page(kvm, sp);
1634	if (!sp->root_count) {
1635		/* Count self */
1636		ret++;
1637		list_move(&sp->link, invalid_list);
1638	} else {
1639		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1640		kvm_reload_remote_mmus(kvm);
1641	}
1642
1643	sp->role.invalid = 1;
1644	kvm_mmu_reset_last_pte_updated(kvm);
1645	return ret;
1646}
1647
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649				    struct list_head *invalid_list)
1650{
1651	struct kvm_mmu_page *sp;
1652
1653	if (list_empty(invalid_list))
1654		return;
1655
1656	kvm_flush_remote_tlbs(kvm);
1657
1658	do {
1659		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1660		WARN_ON(!sp->role.invalid || sp->root_count);
1661		kvm_mmu_free_page(kvm, sp);
1662	} while (!list_empty(invalid_list));
1663
1664}
1665
1666/*
1667 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
1669 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1671{
1672	int used_pages;
1673	LIST_HEAD(invalid_list);
1674
1675	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676	used_pages = max(0, used_pages);
1677
1678	/*
1679	 * If we set the number of mmu pages to be smaller than the
1680	 * number of active pages, we must free some mmu pages before we
1681	 * change the value
1682	 */
1683
1684	if (used_pages > kvm_nr_mmu_pages) {
1685		while (used_pages > kvm_nr_mmu_pages &&
1686			!list_empty(&kvm->arch.active_mmu_pages)) {
1687			struct kvm_mmu_page *page;
1688
1689			page = container_of(kvm->arch.active_mmu_pages.prev,
1690					    struct kvm_mmu_page, link);
1691			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1692							       &invalid_list);
1693		}
1694		kvm_mmu_commit_zap_page(kvm, &invalid_list);
1695		kvm_nr_mmu_pages = used_pages;
1696		kvm->arch.n_free_mmu_pages = 0;
1697	}
1698	else
1699		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700					 - kvm->arch.n_alloc_mmu_pages;
1701
1702	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1703}
1704
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1706{
1707	struct kvm_mmu_page *sp;
1708	struct hlist_node *node;
1709	LIST_HEAD(invalid_list);
1710	int r;
1711
1712	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1713	r = 0;
1714
1715	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1717			 sp->role.word);
1718		r = 1;
1719		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1720	}
1721	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1722	return r;
1723}
1724
1725static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1726{
1727	struct kvm_mmu_page *sp;
1728	struct hlist_node *node;
1729	LIST_HEAD(invalid_list);
1730
1731	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732		pgprintk("%s: zap %lx %x\n",
1733			 __func__, gfn, sp->role.word);
1734		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735	}
1736	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1737}
1738
1739static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1740{
1741	int slot = memslot_id(kvm, gfn);
1742	struct kvm_mmu_page *sp = page_header(__pa(pte));
1743
1744	__set_bit(slot, sp->slot_bitmap);
1745}
1746
1747static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1748{
1749	int i;
1750	u64 *pt = sp->spt;
1751
1752	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1753		return;
1754
1755	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1756		if (pt[i] == shadow_notrap_nonpresent_pte)
1757			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1758	}
1759}
1760
1761/*
1762 * The function is based on mtrr_type_lookup() in
1763 * arch/x86/kernel/cpu/mtrr/generic.c
1764 */
1765static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1766			 u64 start, u64 end)
1767{
1768	int i;
1769	u64 base, mask;
1770	u8 prev_match, curr_match;
1771	int num_var_ranges = KVM_NR_VAR_MTRR;
1772
1773	if (!mtrr_state->enabled)
1774		return 0xFF;
1775
1776	/* Make end inclusive instead of exclusive */
1777	end--;
1778
1779	/* Look in fixed ranges. Just return the type as per start */
1780	if (mtrr_state->have_fixed && (start < 0x100000)) {
1781		int idx;
1782
1783		if (start < 0x80000) {
1784			idx = 0;
1785			idx += (start >> 16);
1786			return mtrr_state->fixed_ranges[idx];
1787		} else if (start < 0xC0000) {
1788			idx = 1 * 8;
1789			idx += ((start - 0x80000) >> 14);
1790			return mtrr_state->fixed_ranges[idx];
1791		} else if (start < 0x1000000) {
1792			idx = 3 * 8;
1793			idx += ((start - 0xC0000) >> 12);
1794			return mtrr_state->fixed_ranges[idx];
1795		}
1796	}
1797
1798	/*
1799	 * Look in variable ranges
1800	 * Look for multiple ranges matching this address and pick the type
1801	 * as per MTRR precedence
1802	 */
1803	if (!(mtrr_state->enabled & 2))
1804		return mtrr_state->def_type;
1805
1806	prev_match = 0xFF;
1807	for (i = 0; i < num_var_ranges; ++i) {
1808		unsigned short start_state, end_state;
1809
1810		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1811			continue;
1812
1813		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1814		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1815		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1816		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1817
1818		start_state = ((start & mask) == (base & mask));
1819		end_state = ((end & mask) == (base & mask));
1820		if (start_state != end_state)
1821			return 0xFE;
1822
1823		if ((start & mask) != (base & mask))
1824			continue;
1825
1826		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1827		if (prev_match == 0xFF) {
1828			prev_match = curr_match;
1829			continue;
1830		}
1831
1832		if (prev_match == MTRR_TYPE_UNCACHABLE ||
1833		    curr_match == MTRR_TYPE_UNCACHABLE)
1834			return MTRR_TYPE_UNCACHABLE;
1835
1836		if ((prev_match == MTRR_TYPE_WRBACK &&
1837		     curr_match == MTRR_TYPE_WRTHROUGH) ||
1838		    (prev_match == MTRR_TYPE_WRTHROUGH &&
1839		     curr_match == MTRR_TYPE_WRBACK)) {
1840			prev_match = MTRR_TYPE_WRTHROUGH;
1841			curr_match = MTRR_TYPE_WRTHROUGH;
1842		}
1843
1844		if (prev_match != curr_match)
1845			return MTRR_TYPE_UNCACHABLE;
1846	}
1847
1848	if (prev_match != 0xFF)
1849		return prev_match;
1850
1851	return mtrr_state->def_type;
1852}
1853
1854u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1855{
1856	u8 mtrr;
1857
1858	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1859			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
1860	if (mtrr == 0xfe || mtrr == 0xff)
1861		mtrr = MTRR_TYPE_WRBACK;
1862	return mtrr;
1863}
1864EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1865
1866static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1867{
1868	trace_kvm_mmu_unsync_page(sp);
1869	++vcpu->kvm->stat.mmu_unsync;
1870	sp->unsync = 1;
1871
1872	kvm_mmu_mark_parents_unsync(sp);
1873	mmu_convert_notrap(sp);
1874}
1875
1876static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1877{
1878	struct kvm_mmu_page *s;
1879	struct hlist_node *node;
1880
1881	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1882		if (s->unsync)
1883			continue;
1884		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1885		__kvm_unsync_page(vcpu, s);
1886	}
1887}
1888
1889static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1890				  bool can_unsync)
1891{
1892	struct kvm_mmu_page *s;
1893	struct hlist_node *node;
1894	bool need_unsync = false;
1895
1896	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1897		if (!can_unsync)
1898			return 1;
1899
1900		if (s->role.level != PT_PAGE_TABLE_LEVEL)
1901			return 1;
1902
1903		if (!need_unsync && !s->unsync) {
1904			if (!oos_shadow)
1905				return 1;
1906			need_unsync = true;
1907		}
1908	}
1909	if (need_unsync)
1910		kvm_unsync_pages(vcpu, gfn);
1911	return 0;
1912}
1913
1914static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1915		    unsigned pte_access, int user_fault,
1916		    int write_fault, int dirty, int level,
1917		    gfn_t gfn, pfn_t pfn, bool speculative,
1918		    bool can_unsync, bool reset_host_protection)
1919{
1920	u64 spte;
1921	int ret = 0;
1922
1923	/*
1924	 * We don't set the accessed bit, since we sometimes want to see
1925	 * whether the guest actually used the pte (in order to detect
1926	 * demand paging).
1927	 */
1928	spte = shadow_base_present_pte | shadow_dirty_mask;
1929	if (!speculative)
1930		spte |= shadow_accessed_mask;
1931	if (!dirty)
1932		pte_access &= ~ACC_WRITE_MASK;
1933	if (pte_access & ACC_EXEC_MASK)
1934		spte |= shadow_x_mask;
1935	else
1936		spte |= shadow_nx_mask;
1937	if (pte_access & ACC_USER_MASK)
1938		spte |= shadow_user_mask;
1939	if (level > PT_PAGE_TABLE_LEVEL)
1940		spte |= PT_PAGE_SIZE_MASK;
1941	if (tdp_enabled)
1942		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1943			kvm_is_mmio_pfn(pfn));
1944
1945	if (reset_host_protection)
1946		spte |= SPTE_HOST_WRITEABLE;
1947
1948	spte |= (u64)pfn << PAGE_SHIFT;
1949
1950	if ((pte_access & ACC_WRITE_MASK)
1951	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1952		&& !user_fault)) {
1953
1954		if (level > PT_PAGE_TABLE_LEVEL &&
1955		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
1956			ret = 1;
1957			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1958			goto done;
1959		}
1960
1961		spte |= PT_WRITABLE_MASK;
1962
1963		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1964			spte &= ~PT_USER_MASK;
1965
1966		/*
1967		 * Optimization: for pte sync, if spte was writable the hash
1968		 * lookup is unnecessary (and expensive). Write protection
1969		 * is the responsibility of mmu_get_page / kvm_sync_page.
1970		 * Same reasoning can be applied to dirty page accounting.
1971		 */
1972		if (!can_unsync && is_writable_pte(*sptep))
1973			goto set_pte;
1974
1975		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976			pgprintk("%s: found shadow page for %lx, marking ro\n",
1977				 __func__, gfn);
1978			ret = 1;
1979			pte_access &= ~ACC_WRITE_MASK;
1980			if (is_writable_pte(spte))
1981				spte &= ~PT_WRITABLE_MASK;
1982		}
1983	}
1984
1985	if (pte_access & ACC_WRITE_MASK)
1986		mark_page_dirty(vcpu->kvm, gfn);
1987
1988set_pte:
1989	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990		kvm_set_pfn_dirty(pfn);
1991	update_spte(sptep, spte);
1992done:
1993	return ret;
1994}
1995
1996static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1997			 unsigned pt_access, unsigned pte_access,
1998			 int user_fault, int write_fault, int dirty,
1999			 int *ptwrite, int level, gfn_t gfn,
2000			 pfn_t pfn, bool speculative,
2001			 bool reset_host_protection)
2002{
2003	int was_rmapped = 0;
2004	int rmap_count;
2005
2006	pgprintk("%s: spte %llx access %x write_fault %d"
2007		 " user_fault %d gfn %lx\n",
2008		 __func__, *sptep, pt_access,
2009		 write_fault, user_fault, gfn);
2010
2011	if (is_rmap_spte(*sptep)) {
2012		/*
2013		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2014		 * the parent of the now unreachable PTE.
2015		 */
2016		if (level > PT_PAGE_TABLE_LEVEL &&
2017		    !is_large_pte(*sptep)) {
2018			struct kvm_mmu_page *child;
2019			u64 pte = *sptep;
2020
2021			child = page_header(pte & PT64_BASE_ADDR_MASK);
2022			mmu_page_remove_parent_pte(child, sptep);
2023			__set_spte(sptep, shadow_trap_nonpresent_pte);
2024			kvm_flush_remote_tlbs(vcpu->kvm);
2025		} else if (pfn != spte_to_pfn(*sptep)) {
2026			pgprintk("hfn old %lx new %lx\n",
2027				 spte_to_pfn(*sptep), pfn);
2028			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029			kvm_flush_remote_tlbs(vcpu->kvm);
2030		} else
2031			was_rmapped = 1;
2032	}
2033
2034	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2035		      dirty, level, gfn, pfn, speculative, true,
2036		      reset_host_protection)) {
2037		if (write_fault)
2038			*ptwrite = 1;
2039		kvm_mmu_flush_tlb(vcpu);
2040	}
2041
2042	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
2044		 is_large_pte(*sptep) ? "2MB" : "4kB",
2045		 *sptep & PT_PRESENT_MASK ? "RW" : "R", gfn,
2046		 *sptep, sptep);
2047	if (!was_rmapped && is_large_pte(*sptep))
2048		++vcpu->kvm->stat.lpages;
2049
2050	page_header_update_slot(vcpu->kvm, sptep, gfn);
2051	if (!was_rmapped) {
2052		rmap_count = rmap_add(vcpu, sptep, gfn);
2053		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2054			rmap_recycle(vcpu, sptep, gfn);
2055	}
2056	kvm_release_pfn_clean(pfn);
2057	if (speculative) {
2058		vcpu->arch.last_pte_updated = sptep;
2059		vcpu->arch.last_pte_gfn = gfn;
2060	}
2061}
2062
2063static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{
2065}
2066
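/*
 * Walk the shadow page table for a direct mapping of gfn and install the
 * final spte at @level, allocating intermediate shadow pages on the way
 * down.  Returns the pt_write result from mmu_set_spte(), or -ENOMEM if a
 * shadow page could not be allocated.
 */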
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068			int level, gfn_t gfn, pfn_t pfn)
2069{
2070	struct kvm_shadow_walk_iterator iterator;
2071	struct kvm_mmu_page *sp;
2072	int pt_write = 0;
2073	gfn_t pseudo_gfn;
2074
2075	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2076		if (iterator.level == level) {
2077			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2078				     0, write, 1, &pt_write,
2079				     level, gfn, pfn, false, true);
2080			++vcpu->stat.pf_fixed;
2081			break;
2082		}
2083
2084		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2085			u64 base_addr = iterator.addr;
2086
2087			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2088			pseudo_gfn = base_addr >> PAGE_SHIFT;
2089			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2090					      iterator.level - 1,
2091					      1, ACC_ALL, iterator.sptep);
2092			if (!sp) {
2093				pgprintk("nonpaging_map: ENOMEM\n");
2094				kvm_release_pfn_clean(pfn);
2095				return -ENOMEM;
2096			}
2097
2098			__set_spte(iterator.sptep,
2099				   __pa(sp->spt)
2100				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101				   | shadow_user_mask | shadow_x_mask);
2102		}
2103	}
2104	return pt_write;
2105}
2106
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2108{
2109	char buf[1];
2110	void __user *hva;
2111	int r;
2112
2113	/* Touch the page, so that a SIGBUS is sent if it is hwpoisoned */
2114	hva = (void __user *)gfn_to_hva(kvm, gfn);
2115	r = copy_from_user(buf, hva, 1);
2116}
2117
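/*
 * Handle an error pfn returned by gfn_to_pfn(): send SIGBUS for a
 * hardware-poisoned page, return -EFAULT for a plain fault pfn, and
 * return 1 for anything else.
 */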
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{
2120	kvm_release_pfn_clean(pfn);
2121	if (is_hwpoison_pfn(pfn)) {
2122		kvm_send_hwpoison_signal(kvm, gfn);
2123		return 0;
2124	} else if (is_fault_pfn(pfn))
2125		return -EFAULT;
2126
2127	return 1;
2128}
2129
2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2131{
2132	int r;
2133	int level;
2134	pfn_t pfn;
2135	unsigned long mmu_seq;
2136
2137	level = mapping_level(vcpu, gfn);
2138
2139	/*
2140	 * This path builds a PAE pagetable - so we can map 2mb pages at
2141	 * maximum. Therefore check if the level is larger than that.
2142	 */
2143	if (level > PT_DIRECTORY_LEVEL)
2144		level = PT_DIRECTORY_LEVEL;
2145
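	/*
	 * Align gfn to the start of the host huge page backing it; e.g. for
	 * a 2MB mapping (level == PT_DIRECTORY_LEVEL with 4K base pages,
	 * KVM_PAGES_PER_HPAGE(level) == 512) the low 9 bits are cleared.
	 */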
2146	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2147
2148	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2149	smp_rmb();
2150	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2151
2152	/* mmio */
2153	if (is_error_pfn(pfn))
2154		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2155
2156	spin_lock(&vcpu->kvm->mmu_lock);
2157	if (mmu_notifier_retry(vcpu, mmu_seq))
2158		goto out_unlock;
2159	kvm_mmu_free_some_pages(vcpu);
2160	r = __direct_map(vcpu, v, write, level, gfn, pfn);
2161	spin_unlock(&vcpu->kvm->mmu_lock);
2162
2163
2164	return r;
2165
2166out_unlock:
2167	spin_unlock(&vcpu->kvm->mmu_lock);
2168	kvm_release_pfn_clean(pfn);
2169	return 0;
2170}
2171
2172
2173static void mmu_free_roots(struct kvm_vcpu *vcpu)
2174{
2175	int i;
2176	struct kvm_mmu_page *sp;
2177	LIST_HEAD(invalid_list);
2178
2179	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180		return;
2181	spin_lock(&vcpu->kvm->mmu_lock);
2182	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2183		hpa_t root = vcpu->arch.mmu.root_hpa;
2184
2185		sp = page_header(root);
2186		--sp->root_count;
2187		if (!sp->root_count && sp->role.invalid) {
2188			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2189			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2190		}
2191		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2192		spin_unlock(&vcpu->kvm->mmu_lock);
2193		return;
2194	}
2195	for (i = 0; i < 4; ++i) {
2196		hpa_t root = vcpu->arch.mmu.pae_root[i];
2197
2198		if (root) {
2199			root &= PT64_BASE_ADDR_MASK;
2200			sp = page_header(root);
2201			--sp->root_count;
2202			if (!sp->root_count && sp->role.invalid)
2203				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2204							 &invalid_list);
2205		}
2206		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2207	}
2208	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2209	spin_unlock(&vcpu->kvm->mmu_lock);
2210	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2211}
2212
2213static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2214{
2215	int ret = 0;
2216
2217	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2218		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2219		ret = 1;
2220	}
2221
2222	return ret;
2223}
2224
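/*
 * Allocate the shadow root(s) for the current mode: a single root when the
 * shadow paging level is PT64_ROOT_LEVEL, otherwise four PAE roots.
 * Returns 1 if a root gfn is not backed by a visible memslot (a triple
 * fault has already been requested), 0 on success.
 */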
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2226{
2227	int i;
2228	gfn_t root_gfn;
2229	struct kvm_mmu_page *sp;
2230	int direct = 0;
2231	u64 pdptr;
2232
2233	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234
2235	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2236		hpa_t root = vcpu->arch.mmu.root_hpa;
2237
2238		ASSERT(!VALID_PAGE(root));
2239		if (mmu_check_root(vcpu, root_gfn))
2240			return 1;
2241		if (tdp_enabled) {
2242			direct = 1;
2243			root_gfn = 0;
2244		}
2245		spin_lock(&vcpu->kvm->mmu_lock);
2246		kvm_mmu_free_some_pages(vcpu);
2247		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2248				      PT64_ROOT_LEVEL, direct,
2249				      ACC_ALL, NULL);
2250		root = __pa(sp->spt);
2251		++sp->root_count;
2252		spin_unlock(&vcpu->kvm->mmu_lock);
2253		vcpu->arch.mmu.root_hpa = root;
2254		return 0;
2255	}
2256	direct = !is_paging(vcpu);
2257
2258	if (mmu_check_root(vcpu, root_gfn))
2259		return 1;
2260
2261	for (i = 0; i < 4; ++i) {
2262		hpa_t root = vcpu->arch.mmu.pae_root[i];
2263
2264		ASSERT(!VALID_PAGE(root));
2265		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2266			pdptr = kvm_pdptr_read(vcpu, i);
2267			if (!is_present_gpte(pdptr)) {
2268				vcpu->arch.mmu.pae_root[i] = 0;
2269				continue;
2270			}
2271			root_gfn = pdptr >> PAGE_SHIFT;
2272			if (mmu_check_root(vcpu, root_gfn))
2273				return 1;
2274		} else if (vcpu->arch.mmu.root_level == 0)
2275			root_gfn = 0;
2276		if (tdp_enabled) {
2277			direct = 1;
2278			root_gfn = i << (30 - PAGE_SHIFT);
2279		}
2280		spin_lock(&vcpu->kvm->mmu_lock);
2281		kvm_mmu_free_some_pages(vcpu);
2282		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2283				      PT32_ROOT_LEVEL, direct,
2284				      ACC_ALL, NULL);
2285		root = __pa(sp->spt);
2286		++sp->root_count;
2287		spin_unlock(&vcpu->kvm->mmu_lock);
2288
2289		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2290	}
2291	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2292	return 0;
2293}
2294
2295static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2296{
2297	int i;
2298	struct kvm_mmu_page *sp;
2299
2300	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2301		return;
2302	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2303		hpa_t root = vcpu->arch.mmu.root_hpa;
2304		sp = page_header(root);
2305		mmu_sync_children(vcpu, sp);
2306		return;
2307	}
2308	for (i = 0; i < 4; ++i) {
2309		hpa_t root = vcpu->arch.mmu.pae_root[i];
2310
2311		if (root && VALID_PAGE(root)) {
2312			root &= PT64_BASE_ADDR_MASK;
2313			sp = page_header(root);
2314			mmu_sync_children(vcpu, sp);
2315		}
2316	}
2317}
2318
2319void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2320{
2321	spin_lock(&vcpu->kvm->mmu_lock);
2322	mmu_sync_roots(vcpu);
2323	spin_unlock(&vcpu->kvm->mmu_lock);
2324}
2325
2326static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2327				  u32 access, u32 *error)
2328{
2329	if (error)
2330		*error = 0;
2331	return vaddr;
2332}
2333
2334static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2335				u32 error_code)
2336{
2337	gfn_t gfn;
2338	int r;
2339
2340	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2341	r = mmu_topup_memory_caches(vcpu);
2342	if (r)
2343		return r;
2344
2345	ASSERT(vcpu);
2346	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2347
2348	gfn = gva >> PAGE_SHIFT;
2349
2350	return nonpaging_map(vcpu, gva & PAGE_MASK,
2351			     error_code & PFERR_WRITE_MASK, gfn);
2352}
2353
2354static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2355				u32 error_code)
2356{
2357	pfn_t pfn;
2358	int r;
2359	int level;
2360	gfn_t gfn = gpa >> PAGE_SHIFT;
2361	unsigned long mmu_seq;
2362
2363	ASSERT(vcpu);
2364	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2365
2366	r = mmu_topup_memory_caches(vcpu);
2367	if (r)
2368		return r;
2369
2370	level = mapping_level(vcpu, gfn);
2371
2372	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2373
2374	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2375	smp_rmb();
2376	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2377	if (is_error_pfn(pfn))
2378		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2379	spin_lock(&vcpu->kvm->mmu_lock);
2380	if (mmu_notifier_retry(vcpu, mmu_seq))
2381		goto out_unlock;
2382	kvm_mmu_free_some_pages(vcpu);
2383	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2384			 level, gfn, pfn);
2385	spin_unlock(&vcpu->kvm->mmu_lock);
2386
2387	return r;
2388
2389out_unlock:
2390	spin_unlock(&vcpu->kvm->mmu_lock);
2391	kvm_release_pfn_clean(pfn);
2392	return 0;
2393}
2394
2395static void nonpaging_free(struct kvm_vcpu *vcpu)
2396{
2397	mmu_free_roots(vcpu);
2398}
2399
2400static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2401{
2402	struct kvm_mmu *context = &vcpu->arch.mmu;
2403
2404	context->new_cr3 = nonpaging_new_cr3;
2405	context->page_fault = nonpaging_page_fault;
2406	context->gva_to_gpa = nonpaging_gva_to_gpa;
2407	context->free = nonpaging_free;
2408	context->prefetch_page = nonpaging_prefetch_page;
2409	context->sync_page = nonpaging_sync_page;
2410	context->invlpg = nonpaging_invlpg;
2411	context->root_level = 0;
2412	context->shadow_root_level = PT32E_ROOT_LEVEL;
2413	context->root_hpa = INVALID_PAGE;
2414	return 0;
2415}
2416
2417void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2418{
2419	++vcpu->stat.tlb_flush;
2420	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2421}
2422
2423static void paging_new_cr3(struct kvm_vcpu *vcpu)
2424{
2425	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2426	mmu_free_roots(vcpu);
2427}
2428
2429static void inject_page_fault(struct kvm_vcpu *vcpu,
2430			      u64 addr,
2431			      u32 err_code)
2432{
2433	kvm_inject_page_fault(vcpu, addr, err_code);
2434}
2435
2436static void paging_free(struct kvm_vcpu *vcpu)
2437{
2438	nonpaging_free(vcpu);
2439}
2440
2441static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2442{
2443	int bit7;
2444
2445	bit7 = (gpte >> 7) & 1;
2446	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2447}
2448
2449#define PTTYPE 64
2450#include "paging_tmpl.h"
2451#undef PTTYPE
2452
2453#define PTTYPE 32
2454#include "paging_tmpl.h"
2455#undef PTTYPE
2456
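/*
 * Precompute, per guest paging mode, which gpte bits are reserved at each
 * level so the page table walkers can flag reserved-bit faults.
 * rsvd_bits(s, e) builds a mask with bits s through e set; for example
 * rsvd_bits(13, 20) marks bits 13-20 of a PAE large-page pde as reserved.
 */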
2457static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2458{
2459	struct kvm_mmu *context = &vcpu->arch.mmu;
2460	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2461	u64 exb_bit_rsvd = 0;
2462
2463	if (!is_nx(vcpu))
2464		exb_bit_rsvd = rsvd_bits(63, 63);
2465	switch (level) {
2466	case PT32_ROOT_LEVEL:
2467		/* no rsvd bits for 2 level 4K page table entries */
2468		context->rsvd_bits_mask[0][1] = 0;
2469		context->rsvd_bits_mask[0][0] = 0;
2470		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2471
2472		if (!is_pse(vcpu)) {
2473			context->rsvd_bits_mask[1][1] = 0;
2474			break;
2475		}
2476
2477		if (is_cpuid_PSE36())
2478			/* 36bits PSE 4MB page */
2479			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2480		else
2481			/* 32 bits PSE 4MB page */
2482			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2483		break;
2484	case PT32E_ROOT_LEVEL:
2485		context->rsvd_bits_mask[0][2] =
2486			rsvd_bits(maxphyaddr, 63) |
2487			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
2488		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2489			rsvd_bits(maxphyaddr, 62);	/* PDE */
2490		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2491			rsvd_bits(maxphyaddr, 62); 	/* PTE */
2492		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2493			rsvd_bits(maxphyaddr, 62) |
2494			rsvd_bits(13, 20);		/* large page */
2495		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2496		break;
2497	case PT64_ROOT_LEVEL:
2498		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2499			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2500		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2501			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2502		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2503			rsvd_bits(maxphyaddr, 51);
2504		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2505			rsvd_bits(maxphyaddr, 51);
2506		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2507		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2508			rsvd_bits(maxphyaddr, 51) |
2509			rsvd_bits(13, 29);
2510		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2511			rsvd_bits(maxphyaddr, 51) |
2512			rsvd_bits(13, 20);		/* large page */
2513		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2514		break;
2515	}
2516}
2517
2518static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2519{
2520	struct kvm_mmu *context = &vcpu->arch.mmu;
2521
2522	ASSERT(is_pae(vcpu));
2523	context->new_cr3 = paging_new_cr3;
2524	context->page_fault = paging64_page_fault;
2525	context->gva_to_gpa = paging64_gva_to_gpa;
2526	context->prefetch_page = paging64_prefetch_page;
2527	context->sync_page = paging64_sync_page;
2528	context->invlpg = paging64_invlpg;
2529	context->free = paging_free;
2530	context->root_level = level;
2531	context->shadow_root_level = level;
2532	context->root_hpa = INVALID_PAGE;
2533	return 0;
2534}
2535
2536static int paging64_init_context(struct kvm_vcpu *vcpu)
2537{
2538	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2539	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2540}
2541
2542static int paging32_init_context(struct kvm_vcpu *vcpu)
2543{
2544	struct kvm_mmu *context = &vcpu->arch.mmu;
2545
2546	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2547	context->new_cr3 = paging_new_cr3;
2548	context->page_fault = paging32_page_fault;
2549	context->gva_to_gpa = paging32_gva_to_gpa;
2550	context->free = paging_free;
2551	context->prefetch_page = paging32_prefetch_page;
2552	context->sync_page = paging32_sync_page;
2553	context->invlpg = paging32_invlpg;
2554	context->root_level = PT32_ROOT_LEVEL;
2555	context->shadow_root_level = PT32E_ROOT_LEVEL;
2556	context->root_hpa = INVALID_PAGE;
2557	return 0;
2558}
2559
2560static int paging32E_init_context(struct kvm_vcpu *vcpu)
2561{
2562	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2563	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2564}
2565
2566static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2567{
2568	struct kvm_mmu *context = &vcpu->arch.mmu;
2569
2570	context->new_cr3 = nonpaging_new_cr3;
2571	context->page_fault = tdp_page_fault;
2572	context->free = nonpaging_free;
2573	context->prefetch_page = nonpaging_prefetch_page;
2574	context->sync_page = nonpaging_sync_page;
2575	context->invlpg = nonpaging_invlpg;
2576	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2577	context->root_hpa = INVALID_PAGE;
2578
2579	if (!is_paging(vcpu)) {
2580		context->gva_to_gpa = nonpaging_gva_to_gpa;
2581		context->root_level = 0;
2582	} else if (is_long_mode(vcpu)) {
2583		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2584		context->gva_to_gpa = paging64_gva_to_gpa;
2585		context->root_level = PT64_ROOT_LEVEL;
2586	} else if (is_pae(vcpu)) {
2587		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2588		context->gva_to_gpa = paging64_gva_to_gpa;
2589		context->root_level = PT32E_ROOT_LEVEL;
2590	} else {
2591		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2592		context->gva_to_gpa = paging32_gva_to_gpa;
2593		context->root_level = PT32_ROOT_LEVEL;
2594	}
2595
2596	return 0;
2597}
2598
2599static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2600{
2601	int r;
2602
2603	ASSERT(vcpu);
2604	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2605
2606	if (!is_paging(vcpu))
2607		r = nonpaging_init_context(vcpu);
2608	else if (is_long_mode(vcpu))
2609		r = paging64_init_context(vcpu);
2610	else if (is_pae(vcpu))
2611		r = paging32E_init_context(vcpu);
2612	else
2613		r = paging32_init_context(vcpu);
2614
2615	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2616	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2617
2618	return r;
2619}
2620
2621static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2622{
2623	vcpu->arch.update_pte.pfn = bad_pfn;
2624
2625	if (tdp_enabled)
2626		return init_kvm_tdp_mmu(vcpu);
2627	else
2628		return init_kvm_softmmu(vcpu);
2629}
2630
2631static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2632{
2633	ASSERT(vcpu);
2634	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2635		/* mmu.free() should set root_hpa = INVALID_PAGE */
2636		vcpu->arch.mmu.free(vcpu);
2637}
2638
2639int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
2640{
2641	destroy_kvm_mmu(vcpu);
2642	return init_kvm_mmu(vcpu);
2643}
2644EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
2645
2646int kvm_mmu_load(struct kvm_vcpu *vcpu)
2647{
2648	int r;
2649
2650	r = mmu_topup_memory_caches(vcpu);
2651	if (r)
2652		goto out;
2653	r = mmu_alloc_roots(vcpu);
2654	spin_lock(&vcpu->kvm->mmu_lock);
2655	mmu_sync_roots(vcpu);
2656	spin_unlock(&vcpu->kvm->mmu_lock);
2657	if (r)
2658		goto out;
2659	/* set_cr3() should ensure TLB has been flushed */
2660	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2661out:
2662	return r;
2663}
2664EXPORT_SYMBOL_GPL(kvm_mmu_load);
2665
2666void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2667{
2668	mmu_free_roots(vcpu);
2669}
2670
2671static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2672				  struct kvm_mmu_page *sp,
2673				  u64 *spte)
2674{
2675	u64 pte;
2676	struct kvm_mmu_page *child;
2677
2678	pte = *spte;
2679	if (is_shadow_present_pte(pte)) {
2680		if (is_last_spte(pte, sp->role.level))
2681			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2682		else {
2683			child = page_header(pte & PT64_BASE_ADDR_MASK);
2684			mmu_page_remove_parent_pte(child, spte);
2685		}
2686	}
2687	__set_spte(spte, shadow_trap_nonpresent_pte);
2688	if (is_large_pte(pte))
2689		--vcpu->kvm->stat.lpages;
2690}
2691
2692static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2693				  struct kvm_mmu_page *sp,
2694				  u64 *spte,
2695				  const void *new)
2696{
2697	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2698		++vcpu->kvm->stat.mmu_pde_zapped;
2699		return;
2700	}
2701
2702	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2703		return;
2704
2705	++vcpu->kvm->stat.mmu_pte_updated;
2706	if (!sp->role.cr4_pae)
2707		paging32_update_pte(vcpu, sp, spte, new);
2708	else
2709		paging64_update_pte(vcpu, sp, spte, new);
2710}
2711
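/*
 * Decide whether changing a spte from @old to @new requires flushing the
 * TLBs of other vcpus: only when the old spte was present and the new one
 * removes it, changes the target frame, or revokes a permission (the NX
 * bit is inverted first so that losing execute permission also counts).
 */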
2712static bool need_remote_flush(u64 old, u64 new)
2713{
2714	if (!is_shadow_present_pte(old))
2715		return false;
2716	if (!is_shadow_present_pte(new))
2717		return true;
2718	if ((old ^ new) & PT64_BASE_ADDR_MASK)
2719		return true;
2720	old ^= PT64_NX_MASK;
2721	new ^= PT64_NX_MASK;
2722	return (old & ~new & PT64_PERM_MASK) != 0;
2723}
2724
2725static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2726				    bool remote_flush, bool local_flush)
2727{
2728	if (zap_page)
2729		return;
2730
2731	if (remote_flush)
2732		kvm_flush_remote_tlbs(vcpu->kvm);
2733	else if (local_flush)
2734		kvm_mmu_flush_tlb(vcpu);
2735}
2736
2737static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2738{
2739	u64 *spte = vcpu->arch.last_pte_updated;
2740
2741	return !!(spte && (*spte & shadow_accessed_mask));
2742}
2743
2744static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2745					  u64 gpte)
2746{
2747	gfn_t gfn;
2748	pfn_t pfn;
2749
2750	if (!is_present_gpte(gpte))
2751		return;
2752	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2753
2754	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2755	smp_rmb();
2756	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2757
2758	if (is_error_pfn(pfn)) {
2759		kvm_release_pfn_clean(pfn);
2760		return;
2761	}
2762	vcpu->arch.update_pte.gfn = gfn;
2763	vcpu->arch.update_pte.pfn = pfn;
2764}
2765
2766static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2767{
2768	u64 *spte = vcpu->arch.last_pte_updated;
2769
2770	if (spte
2771	    && vcpu->arch.last_pte_gfn == gfn
2772	    && shadow_accessed_mask
2773	    && !(*spte & shadow_accessed_mask)
2774	    && is_shadow_present_pte(*spte))
2775		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2776}
2777
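/*
 * Called when the guest writes to a gfn that is shadowed as a page table.
 * Decode the written gpte, detect write flooding and misaligned writes
 * (which cause the shadow page to be zapped rather than updated), and
 * otherwise update the affected sptes in place.
 */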
2778void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2779		       const u8 *new, int bytes,
2780		       bool guest_initiated)
2781{
2782	gfn_t gfn = gpa >> PAGE_SHIFT;
2783	union kvm_mmu_page_role mask = { .word = 0 };
2784	struct kvm_mmu_page *sp;
2785	struct hlist_node *node;
2786	LIST_HEAD(invalid_list);
2787	u64 entry, gentry;
2788	u64 *spte;
2789	unsigned offset = offset_in_page(gpa);
2790	unsigned pte_size;
2791	unsigned page_offset;
2792	unsigned misaligned;
2793	unsigned quadrant;
2794	int level;
2795	int flooded = 0;
2796	int npte;
2797	int r;
2798	int invlpg_counter;
2799	bool remote_flush, local_flush, zap_page;
2800
2801	zap_page = remote_flush = local_flush = false;
2802
2803	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2804
2805	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2806
2807	/*
2808	 * Assume that the pte write is on a page table of the same type
2809	 * as the current vcpu paging mode.  This is nearly always true
2810	 * (might be false while changing modes).  Note it is verified later
2811	 * by update_pte().
2812	 */
2813	if ((is_pae(vcpu) && bytes == 4) || !new) {
2814		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2815		if (is_pae(vcpu)) {
2816			gpa &= ~(gpa_t)7;
2817			bytes = 8;
2818		}
2819		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2820		if (r)
2821			gentry = 0;
2822		new = (const u8 *)&gentry;
2823	}
2824
2825	switch (bytes) {
2826	case 4:
2827		gentry = *(const u32 *)new;
2828		break;
2829	case 8:
2830		gentry = *(const u64 *)new;
2831		break;
2832	default:
2833		gentry = 0;
2834		break;
2835	}
2836
2837	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2838	spin_lock(&vcpu->kvm->mmu_lock);
2839	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2840		gentry = 0;
2841	kvm_mmu_access_page(vcpu, gfn);
2842	kvm_mmu_free_some_pages(vcpu);
2843	++vcpu->kvm->stat.mmu_pte_write;
2844	kvm_mmu_audit(vcpu, "pre pte write");
2845	if (guest_initiated) {
2846		if (gfn == vcpu->arch.last_pt_write_gfn
2847		    && !last_updated_pte_accessed(vcpu)) {
2848			++vcpu->arch.last_pt_write_count;
2849			if (vcpu->arch.last_pt_write_count >= 3)
2850				flooded = 1;
2851		} else {
2852			vcpu->arch.last_pt_write_gfn = gfn;
2853			vcpu->arch.last_pt_write_count = 1;
2854			vcpu->arch.last_pte_updated = NULL;
2855		}
2856	}
2857
2858	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
2859	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2860		pte_size = sp->role.cr4_pae ? 8 : 4;
2861		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2862		misaligned |= bytes < 4;
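		/*
		 * A write is misaligned if it straddles a gpte boundary or is
		 * smaller than 4 bytes; e.g. with pte_size == 8, a 4-byte
		 * write at offset 6 gives (6 ^ 9) & ~7 == 8, i.e. misaligned.
		 */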
2863		if (misaligned || flooded) {
2864			/*
2865			 * Misaligned accesses are too much trouble to fix
2866			 * up; also, they usually indicate a page is not used
2867			 * as a page table.
2868			 *
2869			 * If we're seeing too many writes to a page,
2870			 * it may no longer be a page table, or we may be
2871			 * forking, in which case it is better to unmap the
2872			 * page.
2873			 */
2874			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2875				 gpa, bytes, sp->role.word);
2876			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2877						     &invalid_list);
2878			++vcpu->kvm->stat.mmu_flooded;
2879			continue;
2880		}
2881		page_offset = offset;
2882		level = sp->role.level;
2883		npte = 1;
2884		if (!sp->role.cr4_pae) {
2885			page_offset <<= 1;	/* 32->64 */
2886			/*
2887			 * A 32-bit pde maps 4MB while the shadow pdes map
2888			 * only 2MB.  So we need to double the offset again
2889			 * and zap two pdes instead of one.
2890			 */
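			/*
			 * For example, a write to the guest pde at offset
			 * 0x100 (entry 64, mapping 256MB-260MB) is translated
			 * below to shadow entries 128 and 129 of quadrant 0.
			 */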
2891			if (level == PT32_ROOT_LEVEL) {
2892				page_offset &= ~7; /* kill rounding error */
2893				page_offset <<= 1;
2894				npte = 2;
2895			}
2896			quadrant = page_offset >> PAGE_SHIFT;
2897			page_offset &= ~PAGE_MASK;
2898			if (quadrant != sp->role.quadrant)
2899				continue;
2900		}
2901		local_flush = true;
2902		spte = &sp->spt[page_offset / sizeof(*spte)];
2903		while (npte--) {
2904			entry = *spte;
2905			mmu_pte_write_zap_pte(vcpu, sp, spte);
2906			if (gentry &&
2907			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
2908			      & mask.word))
2909				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2910			if (!remote_flush && need_remote_flush(entry, *spte))
2911				remote_flush = true;
2912			++spte;
2913		}
2914	}
2915	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2916	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2917	kvm_mmu_audit(vcpu, "post pte write");
2918	spin_unlock(&vcpu->kvm->mmu_lock);
2919	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2920		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2921		vcpu->arch.update_pte.pfn = bad_pfn;
2922	}
2923}
2924
2925int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2926{
2927	gpa_t gpa;
2928	int r;
2929
2930	if (tdp_enabled)
2931		return 0;
2932
2933	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2934
2935	spin_lock(&vcpu->kvm->mmu_lock);
2936	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2937	spin_unlock(&vcpu->kvm->mmu_lock);
2938	return r;
2939}
2940EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2941
2942void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2943{
2944	int free_pages;
2945	LIST_HEAD(invalid_list);
2946
2947	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2948	while (free_pages < KVM_REFILL_PAGES &&
2949	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2950		struct kvm_mmu_page *sp;
2951
2952		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2953				  struct kvm_mmu_page, link);
2954		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2955						       &invalid_list);
2956		++vcpu->kvm->stat.mmu_recycled;
2957	}
2958	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2959}
2960
2961int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2962{
2963	int r;
2964	enum emulation_result er;
2965
2966	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2967	if (r < 0)
2968		goto out;
2969
2970	if (!r) {
2971		r = 1;
2972		goto out;
2973	}
2974
2975	r = mmu_topup_memory_caches(vcpu);
2976	if (r)
2977		goto out;
2978
2979	er = emulate_instruction(vcpu, cr2, error_code, 0);
2980
2981	switch (er) {
2982	case EMULATE_DONE:
2983		return 1;
2984	case EMULATE_DO_MMIO:
2985		++vcpu->stat.mmio_exits;
2986		/* fall through */
2987	case EMULATE_FAIL:
2988		return 0;
2989	default:
2990		BUG();
2991	}
2992out:
2993	return r;
2994}
2995EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2996
2997void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2998{
2999	vcpu->arch.mmu.invlpg(vcpu, gva);
3000	kvm_mmu_flush_tlb(vcpu);
3001	++vcpu->stat.invlpg;
3002}
3003EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3004
3005void kvm_enable_tdp(void)
3006{
3007	tdp_enabled = true;
3008}
3009EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3010
3011void kvm_disable_tdp(void)
3012{
3013	tdp_enabled = false;
3014}
3015EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3016
3017static void free_mmu_pages(struct kvm_vcpu *vcpu)
3018{
3019	free_page((unsigned long)vcpu->arch.mmu.pae_root);
3020}
3021
3022static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3023{
3024	struct page *page;
3025	int i;
3026
3027	ASSERT(vcpu);
3028
3029	/*
3030	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
3031	 * Therefore we need to allocate shadow page tables in the first
3032	 * 4GB of memory, which happens to fit the DMA32 zone.
3033	 */
3034	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
3035	if (!page)
3036		return -ENOMEM;
3037
3038	vcpu->arch.mmu.pae_root = page_address(page);
3039	for (i = 0; i < 4; ++i)
3040		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
3041
3042	return 0;
3043}
3044
3045int kvm_mmu_create(struct kvm_vcpu *vcpu)
3046{
3047	ASSERT(vcpu);
3048	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3049
3050	return alloc_mmu_pages(vcpu);
3051}
3052
3053int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054{
3055	ASSERT(vcpu);
3056	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3057
3058	return init_kvm_mmu(vcpu);
3059}
3060
3061void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3062{
3063	ASSERT(vcpu);
3064
3065	destroy_kvm_mmu(vcpu);
3066	free_mmu_pages(vcpu);
3067	mmu_free_memory_caches(vcpu);
3068}
3069
3070void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3071{
3072	struct kvm_mmu_page *sp;
3073
3074	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3075		int i;
3076		u64 *pt;
3077
3078		if (!test_bit(slot, sp->slot_bitmap))
3079			continue;
3080
3081		pt = sp->spt;
3082		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3083			/* avoid RMW */
3084			if (is_writable_pte(pt[i]))
3085				pt[i] &= ~PT_WRITABLE_MASK;
3086	}
3087	kvm_flush_remote_tlbs(kvm);
3088}
3089
3090void kvm_mmu_zap_all(struct kvm *kvm)
3091{
3092	struct kvm_mmu_page *sp, *node;
3093	LIST_HEAD(invalid_list);
3094
3095	spin_lock(&kvm->mmu_lock);
3096restart:
3097	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3098		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3099			goto restart;
3100
3101	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3102	spin_unlock(&kvm->mmu_lock);
3103}
3104
3105static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3106					       struct list_head *invalid_list)
3107{
3108	struct kvm_mmu_page *page;
3109
3110	page = container_of(kvm->arch.active_mmu_pages.prev,
3111			    struct kvm_mmu_page, link);
3112	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3113}
3114
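/*
 * Memory shrinker callback: under host memory pressure, pick one VM and
 * zap some of its shadow pages, then report the total number of allocated
 * shadow pages across all VMs back to the shrinker core.
 */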
3115static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3116{
3117	struct kvm *kvm;
3118	struct kvm *kvm_freed = NULL;
3119	int cache_count = 0;
3120
3121	spin_lock(&kvm_lock);
3122
3123	list_for_each_entry(kvm, &vm_list, vm_list) {
3124		int npages, idx, freed_pages;
3125		LIST_HEAD(invalid_list);
3126
3127		idx = srcu_read_lock(&kvm->srcu);
3128		spin_lock(&kvm->mmu_lock);
3129		npages = kvm->arch.n_alloc_mmu_pages -
3130			 kvm->arch.n_free_mmu_pages;
3131		cache_count += npages;
3132		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3133			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3134							  &invalid_list);
3135			cache_count -= freed_pages;
3136			kvm_freed = kvm;
3137		}
3138		nr_to_scan--;
3139
3140		kvm_mmu_commit_zap_page(kvm, &invalid_list);
3141		spin_unlock(&kvm->mmu_lock);
3142		srcu_read_unlock(&kvm->srcu, idx);
3143	}
3144	if (kvm_freed)
3145		list_move_tail(&kvm_freed->vm_list, &vm_list);
3146
3147	spin_unlock(&kvm_lock);
3148
3149	return cache_count;
3150}
3151
3152static struct shrinker mmu_shrinker = {
3153	.shrink = mmu_shrink,
3154	.seeks = DEFAULT_SEEKS * 10,
3155};
3156
3157static void mmu_destroy_caches(void)
3158{
3159	if (pte_chain_cache)
3160		kmem_cache_destroy(pte_chain_cache);
3161	if (rmap_desc_cache)
3162		kmem_cache_destroy(rmap_desc_cache);
3163	if (mmu_page_header_cache)
3164		kmem_cache_destroy(mmu_page_header_cache);
3165}
3166
3167void kvm_mmu_module_exit(void)
3168{
3169	mmu_destroy_caches();
3170	unregister_shrinker(&mmu_shrinker);
3171}
3172
3173int kvm_mmu_module_init(void)
3174{
3175	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3176					    sizeof(struct kvm_pte_chain),
3177					    0, 0, NULL);
3178	if (!pte_chain_cache)
3179		goto nomem;
3180	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3181					    sizeof(struct kvm_rmap_desc),
3182					    0, 0, NULL);
3183	if (!rmap_desc_cache)
3184		goto nomem;
3185
3186	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3187						  sizeof(struct kvm_mmu_page),
3188						  0, 0, NULL);
3189	if (!mmu_page_header_cache)
3190		goto nomem;
3191
3192	register_shrinker(&mmu_shrinker);
3193
3194	return 0;
3195
3196nomem:
3197	mmu_destroy_caches();
3198	return -ENOMEM;
3199}
3200
3201/*
3202 * Calculate mmu pages needed for kvm.
3203 */
3204unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3205{
3206	int i;
3207	unsigned int nr_mmu_pages;
3208	unsigned int nr_pages = 0;
3209	struct kvm_memslots *slots;
3210
3211	slots = kvm_memslots(kvm);
3212
3213	for (i = 0; i < slots->nmemslots; i++)
3214		nr_pages += slots->memslots[i].npages;
3215
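	/*
	 * Size the shadow page allowance as a fixed permille of the guest's
	 * memory pages, but never below KVM_MIN_ALLOC_MMU_PAGES; assuming
	 * KVM_PERMILLE_MMU_PAGES is 20, that is roughly 2% of guest pages.
	 */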
3216	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3217	nr_mmu_pages = max(nr_mmu_pages,
3218			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3219
3220	return nr_mmu_pages;
3221}
3222
3223static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3224				unsigned len)
3225{
3226	if (len > buffer->len)
3227		return NULL;
3228	return buffer->ptr;
3229}
3230
3231static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3232				unsigned len)
3233{
3234	void *ret;
3235
3236	ret = pv_mmu_peek_buffer(buffer, len);
3237	if (!ret)
3238		return ret;
3239	buffer->ptr += len;
3240	buffer->len -= len;
3241	buffer->processed += len;
3242	return ret;
3243}
3244
3245static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3246			     gpa_t addr, gpa_t value)
3247{
3248	int bytes = 8;
3249	int r;
3250
3251	if (!is_long_mode(vcpu) && !is_pae(vcpu))
3252		bytes = 4;
3253
3254	r = mmu_topup_memory_caches(vcpu);
3255	if (r)
3256		return r;
3257
3258	if (!emulator_write_phys(vcpu, addr, &value, bytes))
3259		return -EFAULT;
3260
3261	return 1;
3262}
3263
3264static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3265{
3266	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3267	return 1;
3268}
3269
3270static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3271{
3272	spin_lock(&vcpu->kvm->mmu_lock);
3273	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3274	spin_unlock(&vcpu->kvm->mmu_lock);
3275	return 1;
3276}
3277
3278static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3279			     struct kvm_pv_mmu_op_buffer *buffer)
3280{
3281	struct kvm_mmu_op_header *header;
3282
3283	header = pv_mmu_peek_buffer(buffer, sizeof *header);
3284	if (!header)
3285		return 0;
3286	switch (header->op) {
3287	case KVM_MMU_OP_WRITE_PTE: {
3288		struct kvm_mmu_op_write_pte *wpte;
3289
3290		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3291		if (!wpte)
3292			return 0;
3293		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3294					wpte->pte_val);
3295	}
3296	case KVM_MMU_OP_FLUSH_TLB: {
3297		struct kvm_mmu_op_flush_tlb *ftlb;
3298
3299		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3300		if (!ftlb)
3301			return 0;
3302		return kvm_pv_mmu_flush_tlb(vcpu);
3303	}
3304	case KVM_MMU_OP_RELEASE_PT: {
3305		struct kvm_mmu_op_release_pt *rpt;
3306
3307		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3308		if (!rpt)
3309			return 0;
3310		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3311	}
3312	default: return 0;
3313	}
3314}
3315
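/*
 * Entry point for the paravirtual MMU hypercall: copy the guest-supplied
 * buffer of MMU ops and process them one by one until the buffer is
 * exhausted or an op fails; *ret reports how many bytes were consumed.
 */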
3316int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3317		  gpa_t addr, unsigned long *ret)
3318{
3319	int r;
3320	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3321
3322	buffer->ptr = buffer->buf;
3323	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3324	buffer->processed = 0;
3325
3326	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3327	if (r)
3328		goto out;
3329
3330	while (buffer->len) {
3331		r = kvm_pv_mmu_op_one(vcpu, buffer);
3332		if (r < 0)
3333			goto out;
3334		if (r == 0)
3335			break;
3336	}
3337
3338	r = 1;
3339out:
3340	*ret = buffer->processed;
3341	return r;
3342}
3343
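/*
 * Record the spte found at each level of the shadow walk for @addr into
 * @sptes (indexed by level - 1) and return how many levels were visited.
 */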
3344int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3345{
3346	struct kvm_shadow_walk_iterator iterator;
3347	int nr_sptes = 0;
3348
3349	spin_lock(&vcpu->kvm->mmu_lock);
3350	for_each_shadow_entry(vcpu, addr, iterator) {
3351		sptes[iterator.level-1] = *iterator.sptep;
3352		nr_sptes++;
3353		if (!is_shadow_present_pte(*iterator.sptep))
3354			break;
3355	}
3356	spin_unlock(&vcpu->kvm->mmu_lock);
3357
3358	return nr_sptes;
3359}
3360EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3361
3362#ifdef AUDIT
3363
3364static const char *audit_msg;
3365
3366static gva_t canonicalize(gva_t gva)
3367{
3368#ifdef CONFIG_X86_64
3369	gva = (long long)(gva << 16) >> 16;
3370#endif
3371	return gva;
3372}
3373
3374
3375typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3376
3377static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3378			    inspect_spte_fn fn)
3379{
3380	int i;
3381
3382	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3383		u64 ent = sp->spt[i];
3384
3385		if (is_shadow_present_pte(ent)) {
3386			if (!is_last_spte(ent, sp->role.level)) {
3387				struct kvm_mmu_page *child;
3388				child = page_header(ent & PT64_BASE_ADDR_MASK);
3389				__mmu_spte_walk(kvm, child, fn);
3390			} else
3391				fn(kvm, &sp->spt[i]);
3392		}
3393	}
3394}
3395
3396static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3397{
3398	int i;
3399	struct kvm_mmu_page *sp;
3400
3401	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3402		return;
3403	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3404		hpa_t root = vcpu->arch.mmu.root_hpa;
3405		sp = page_header(root);
3406		__mmu_spte_walk(vcpu->kvm, sp, fn);
3407		return;
3408	}
3409	for (i = 0; i < 4; ++i) {
3410		hpa_t root = vcpu->arch.mmu.pae_root[i];
3411
3412		if (root && VALID_PAGE(root)) {
3413			root &= PT64_BASE_ADDR_MASK;
3414			sp = page_header(root);
3415			__mmu_spte_walk(vcpu->kvm, sp, fn);
3416		}
3417	}
3418	return;
3419}
3420
3421static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3422				gva_t va, int level)
3423{
3424	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3425	int i;
3426	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3427
3428	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3429		u64 ent = pt[i];
3430
3431		if (ent == shadow_trap_nonpresent_pte)
3432			continue;
3433
3434		va = canonicalize(va);
3435		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3436			audit_mappings_page(vcpu, ent, va, level - 1);
3437		else {
3438			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3439			gfn_t gfn = gpa >> PAGE_SHIFT;
3440			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3441			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3442
3443			if (is_error_pfn(pfn)) {
3444				kvm_release_pfn_clean(pfn);
3445				continue;
3446			}
3447
3448			if (is_shadow_present_pte(ent)
3449			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
3450				printk(KERN_ERR "xx audit error: (%s) levels %d"
3451				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3452				       audit_msg, vcpu->arch.mmu.root_level,
3453				       va, gpa, hpa, ent,
3454				       is_shadow_present_pte(ent));
3455			else if (ent == shadow_notrap_nonpresent_pte
3456				 && !is_error_hpa(hpa))
3457				printk(KERN_ERR "audit: (%s) notrap shadow,"
3458				       " valid guest gva %lx\n", audit_msg, va);
3459			kvm_release_pfn_clean(pfn);
3460
3461		}
3462	}
3463}
3464
3465static void audit_mappings(struct kvm_vcpu *vcpu)
3466{
3467	unsigned i;
3468
3469	if (vcpu->arch.mmu.root_level == 4)
3470		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3471	else
3472		for (i = 0; i < 4; ++i)
3473			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3474				audit_mappings_page(vcpu,
3475						    vcpu->arch.mmu.pae_root[i],
3476						    i << 30,
3477						    2);
3478}
3479
3480static int count_rmaps(struct kvm_vcpu *vcpu)
3481{
3482	struct kvm *kvm = vcpu->kvm;
3483	struct kvm_memslots *slots;
3484	int nmaps = 0;
3485	int i, j, k, idx;
3486
3487	idx = srcu_read_lock(&kvm->srcu);
3488	slots = kvm_memslots(kvm);
3489	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3490		struct kvm_memory_slot *m = &slots->memslots[i];
3491		struct kvm_rmap_desc *d;
3492
3493		for (j = 0; j < m->npages; ++j) {
3494			unsigned long *rmapp = &m->rmap[j];
3495
3496			if (!*rmapp)
3497				continue;
3498			if (!(*rmapp & 1)) {
3499				++nmaps;
3500				continue;
3501			}
3502			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3503			while (d) {
3504				for (k = 0; k < RMAP_EXT; ++k)
3505					if (d->sptes[k])
3506						++nmaps;
3507					else
3508						break;
3509				d = d->more;
3510			}
3511		}
3512	}
3513	srcu_read_unlock(&kvm->srcu, idx);
3514	return nmaps;
3515}
3516
3517void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3518{
3519	unsigned long *rmapp;
3520	struct kvm_mmu_page *rev_sp;
3521	gfn_t gfn;
3522
3523	if (is_writable_pte(*sptep)) {
3524		rev_sp = page_header(__pa(sptep));
3525		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3526
3527		if (!gfn_to_memslot(kvm, gfn)) {
3528			if (!printk_ratelimit())
3529				return;
3530			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3531					 audit_msg, gfn);
3532			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3533			       audit_msg, (long int)(sptep - rev_sp->spt),
3534					rev_sp->gfn);
3535			dump_stack();
3536			return;
3537		}
3538
3539		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3540		if (!*rmapp) {
3541			if (!printk_ratelimit())
3542				return;
3543			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3544					 audit_msg, *sptep);
3545			dump_stack();
3546		}
3547	}
3548
3549}
3550
3551void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3552{
3553	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3554}
3555
3556static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3557{
3558	struct kvm_mmu_page *sp;
3559	int i;
3560
3561	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3562		u64 *pt = sp->spt;
3563
3564		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3565			continue;
3566
3567		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3568			u64 ent = pt[i];
3569
3570			if (!(ent & PT_PRESENT_MASK))
3571				continue;
3572			if (!is_writable_pte(ent))
3573				continue;
3574			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3575		}
3576	}
3577	return;
3578}
3579
3580static void audit_rmap(struct kvm_vcpu *vcpu)
3581{
3582	check_writable_mappings_rmap(vcpu);
3583	count_rmaps(vcpu);
3584}
3585
3586static void audit_write_protection(struct kvm_vcpu *vcpu)
3587{
3588	struct kvm_mmu_page *sp;
3589	struct kvm_memory_slot *slot;
3590	unsigned long *rmapp;
3591	u64 *spte;
3593
3594	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3595		if (sp->role.direct)
3596			continue;
3597		if (sp->unsync)
3598			continue;
3599
3600		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3601		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
3602
3603		spte = rmap_next(vcpu->kvm, rmapp, NULL);
3604		while (spte) {
3605			if (is_writable_pte(*spte))
3606				printk(KERN_ERR "%s: (%s) shadow page has "
3607				"writable mappings: gfn %lx role %x\n",
3608			       __func__, audit_msg, sp->gfn,
3609			       sp->role.word);
3610			spte = rmap_next(vcpu->kvm, rmapp, spte);
3611		}
3612	}
3613}
3614
3615static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3616{
3617	int olddbg = dbg;
3618
3619	dbg = 0;
3620	audit_msg = msg;
3621	audit_rmap(vcpu);
3622	audit_write_protection(vcpu);
3623	if (strcmp("pre pte write", audit_msg) != 0)
3624		audit_mappings(vcpu);
3625	audit_writable_sptes_have_rmaps(vcpu);
3626	dbg = olddbg;
3627}
3628
3629#endif
3630