1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 *   Avi Kivity   <avi@qumranet.com>
13 *   Yaniv Kamay  <yaniv@qumranet.com>
14 */
15
16#include <kvm/iodev.h>
17
18#include <linux/kvm_host.h>
19#include <linux/kvm.h>
20#include <linux/module.h>
21#include <linux/errno.h>
22#include <linux/percpu.h>
23#include <linux/mm.h>
24#include <linux/miscdevice.h>
25#include <linux/vmalloc.h>
26#include <linux/reboot.h>
27#include <linux/debugfs.h>
28#include <linux/highmem.h>
29#include <linux/file.h>
30#include <linux/syscore_ops.h>
31#include <linux/cpu.h>
32#include <linux/sched/signal.h>
33#include <linux/sched/mm.h>
34#include <linux/sched/stat.h>
35#include <linux/cpumask.h>
36#include <linux/smp.h>
37#include <linux/anon_inodes.h>
38#include <linux/profile.h>
39#include <linux/kvm_para.h>
40#include <linux/pagemap.h>
41#include <linux/mman.h>
42#include <linux/swap.h>
43#include <linux/bitops.h>
44#include <linux/spinlock.h>
45#include <linux/compat.h>
46#include <linux/srcu.h>
47#include <linux/hugetlb.h>
48#include <linux/slab.h>
49#include <linux/sort.h>
50#include <linux/bsearch.h>
51#include <linux/io.h>
52#include <linux/lockdep.h>
53#include <linux/kthread.h>
54#include <linux/suspend.h>
55
56#include <asm/processor.h>
57#include <asm/ioctl.h>
58#include <linux/uaccess.h>
59
60#include "coalesced_mmio.h"
61#include "async_pf.h"
62#include "kvm_mm.h"
63#include "vfio.h"
64
65#define CREATE_TRACE_POINTS
66#include <trace/events/kvm.h>
67
68#include <linux/kvm_dirty_ring.h>
69
70/* Worst case buffer size needed for holding an integer. */
71#define ITOA_MAX_LEN 12
72
73MODULE_AUTHOR("Qumranet");
74MODULE_LICENSE("GPL");
75
76/* Architectures should define their poll value according to the halt latency */
77unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
78module_param(halt_poll_ns, uint, 0644);
79EXPORT_SYMBOL_GPL(halt_poll_ns);
80
81/* Default doubles per-vcpu halt_poll_ns. */
82unsigned int halt_poll_ns_grow = 2;
83module_param(halt_poll_ns_grow, uint, 0644);
84EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
85
86/* The start value to grow halt_poll_ns from */
87unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
88module_param(halt_poll_ns_grow_start, uint, 0644);
89EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
90
/* The default (0) resets per-vcpu halt_poll_ns when shrinking. */
92unsigned int halt_poll_ns_shrink;
93module_param(halt_poll_ns_shrink, uint, 0644);
94EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
95
96/*
97 * Ordering of locks:
98 *
99 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
100 */
101
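/*
 * Illustrative sketch only (hypothetical helper, not called anywhere): a path
 * that needed all three locks would nest them per the ordering documented
 * above, i.e. kvm->lock outermost, then kvm->slots_lock, then kvm->irq_lock.
 */
static void __maybe_unused kvm_lock_ordering_example(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->irq_lock);

	/* ... work that needs all three locks ... */

	mutex_unlock(&kvm->irq_lock);
	mutex_unlock(&kvm->slots_lock);
	mutex_unlock(&kvm->lock);
}
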
102DEFINE_MUTEX(kvm_lock);
103static DEFINE_RAW_SPINLOCK(kvm_count_lock);
104LIST_HEAD(vm_list);
105
106static cpumask_var_t cpus_hardware_enabled;
107static int kvm_usage_count;
108static atomic_t hardware_enable_failed;
109
110static struct kmem_cache *kvm_vcpu_cache;
111
112static __read_mostly struct preempt_ops kvm_preempt_ops;
113static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
114
115struct dentry *kvm_debugfs_dir;
116EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
117
118static const struct file_operations stat_fops_per_vm;
119
120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121			   unsigned long arg);
122#ifdef CONFIG_KVM_COMPAT
123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124				  unsigned long arg);
125#define KVM_COMPAT(c)	.compat_ioctl	= (c)
126#else
127/*
128 * For architectures that don't implement a compat infrastructure,
129 * adopt a double line of defense:
130 * - Prevent a compat task from opening /dev/kvm
131 * - If the open has been done by a 64bit task, and the KVM fd
132 *   passed to a compat task, let the ioctls fail.
133 */
134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135				unsigned long arg) { return -EINVAL; }
136
137static int kvm_no_compat_open(struct inode *inode, struct file *file)
138{
139	return is_compat_task() ? -ENODEV : 0;
140}
141#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
142			.open		= kvm_no_compat_open
143#endif
144static int hardware_enable_all(void);
145static void hardware_disable_all(void);
146
147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
148
149__visible bool kvm_rebooting;
150EXPORT_SYMBOL_GPL(kvm_rebooting);
151
152#define KVM_EVENT_CREATE_VM 0
153#define KVM_EVENT_DESTROY_VM 1
154static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
155static unsigned long long kvm_createvm_count;
156static unsigned long long kvm_active_vms;
157
158static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
159
160__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
161						   unsigned long start, unsigned long end)
162{
163}
164
165bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
166{
167	/*
168	 * The metadata used by is_zone_device_page() to determine whether or
169	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
170	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
171	 * page_count() is zero to help detect bad usage of this helper.
172	 */
173	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
174		return false;
175
176	return is_zone_device_page(pfn_to_page(pfn));
177}
178
179bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
180{
181	/*
182	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
183	 * perspective they are "normal" pages, albeit with slightly different
184	 * usage rules.
185	 */
186	if (pfn_valid(pfn))
187		return PageReserved(pfn_to_page(pfn)) &&
188		       !is_zero_pfn(pfn) &&
189		       !kvm_is_zone_device_pfn(pfn);
190
191	return true;
192}
193
/*
 * Switches to the specified vcpu, until a matching vcpu_put().
 */
197void vcpu_load(struct kvm_vcpu *vcpu)
198{
199	int cpu = get_cpu();
200
201	__this_cpu_write(kvm_running_vcpu, vcpu);
202	preempt_notifier_register(&vcpu->preempt_notifier);
203	kvm_arch_vcpu_load(vcpu, cpu);
204	put_cpu();
205}
206EXPORT_SYMBOL_GPL(vcpu_load);
207
208void vcpu_put(struct kvm_vcpu *vcpu)
209{
210	preempt_disable();
211	kvm_arch_vcpu_put(vcpu);
212	preempt_notifier_unregister(&vcpu->preempt_notifier);
213	__this_cpu_write(kvm_running_vcpu, NULL);
214	preempt_enable();
215}
216EXPORT_SYMBOL_GPL(vcpu_put);
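
/*
 * Illustrative sketch only (hypothetical, not called anywhere): the expected
 * vcpu_load()/vcpu_put() pairing around work that requires the vCPU to be
 * loaded, mirroring what the vCPU ioctl path does.
 */
static void __maybe_unused vcpu_load_put_example(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	/* ... arch work that requires the vCPU state to be loaded ... */

	vcpu_put(vcpu);		/* must balance the vcpu_load() above */
}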
217
218/* TODO: merge with kvm_arch_vcpu_should_kick */
219static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
220{
221	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
222
223	/*
224	 * We need to wait for the VCPU to reenable interrupts and get out of
225	 * READING_SHADOW_PAGE_TABLES mode.
226	 */
227	if (req & KVM_REQUEST_WAIT)
228		return mode != OUTSIDE_GUEST_MODE;
229
230	/*
231	 * Need to kick a running VCPU, but otherwise there is nothing to do.
232	 */
233	return mode == IN_GUEST_MODE;
234}
235
236static void ack_flush(void *_completed)
237{
238}
239
240static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
241{
242	if (cpumask_empty(cpus))
243		return false;
244
245	smp_call_function_many(cpus, ack_flush, NULL, wait);
246	return true;
247}
248
249static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
250				  unsigned int req, struct cpumask *tmp,
251				  int current_cpu)
252{
253	int cpu;
254
255	kvm_make_request(req, vcpu);
256
257	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
258		return;
259
260	/*
261	 * Note, the vCPU could get migrated to a different pCPU at any point
262	 * after kvm_request_needs_ipi(), which could result in sending an IPI
263	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
264	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
265	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
266	 * after this point is also OK, as the requirement is only that KVM wait
267	 * for vCPUs that were reading SPTEs _before_ any changes were
268	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
269	 */
270	if (kvm_request_needs_ipi(vcpu, req)) {
271		cpu = READ_ONCE(vcpu->cpu);
272		if (cpu != -1 && cpu != current_cpu)
273			__cpumask_set_cpu(cpu, tmp);
274	}
275}
276
277bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
278				 unsigned long *vcpu_bitmap)
279{
280	struct kvm_vcpu *vcpu;
281	struct cpumask *cpus;
282	int i, me;
283	bool called;
284
285	me = get_cpu();
286
287	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
288	cpumask_clear(cpus);
289
290	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
291		vcpu = kvm_get_vcpu(kvm, i);
292		if (!vcpu)
293			continue;
294		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
295	}
296
297	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
298	put_cpu();
299
300	return called;
301}
302
303bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
304				      struct kvm_vcpu *except)
305{
306	struct kvm_vcpu *vcpu;
307	struct cpumask *cpus;
308	unsigned long i;
309	bool called;
310	int me;
311
312	me = get_cpu();
313
314	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
315	cpumask_clear(cpus);
316
317	kvm_for_each_vcpu(i, vcpu, kvm) {
318		if (vcpu == except)
319			continue;
320		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
321	}
322
323	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
324	put_cpu();
325
326	return called;
327}
328
329bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
330{
331	return kvm_make_all_cpus_request_except(kvm, req, NULL);
332}
333EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
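
/*
 * Illustrative sketch only (hypothetical, not called anywhere): the typical
 * producer/consumer pattern for vCPU requests.  The producer posts a request
 * to all vCPUs (kicking/IPIing them as needed); each vCPU consumes it on its
 * next entry, normally from the arch vcpu_run loop, via kvm_check_request().
 */
static void __maybe_unused vcpu_request_pattern_example(struct kvm *kvm,
							struct kvm_vcpu *vcpu)
{
	/* Producer side: post the request to every vCPU. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);

	/* Consumer side, executed by each vCPU before reentering the guest: */
	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
		/* ... perform the arch-specific TLB flush ... */
	}
}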
334
335#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
336void kvm_flush_remote_tlbs(struct kvm *kvm)
337{
338	++kvm->stat.generic.remote_tlb_flush_requests;
339
340	/*
341	 * We want to publish modifications to the page tables before reading
342	 * mode. Pairs with a memory barrier in arch-specific code.
343	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
344	 * and smp_mb in walk_shadow_page_lockless_begin/end.
345	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
346	 *
347	 * There is already an smp_mb__after_atomic() before
348	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
349	 * barrier here.
350	 */
351	if (!kvm_arch_flush_remote_tlb(kvm)
352	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
353		++kvm->stat.generic.remote_tlb_flush;
354}
355EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
356#endif
357
358void kvm_reload_remote_mmus(struct kvm *kvm)
359{
360	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
361}
362
363#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
364static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
365					       gfp_t gfp_flags)
366{
367	gfp_flags |= mc->gfp_zero;
368
369	if (mc->kmem_cache)
370		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
371	else
372		return (void *)__get_free_page(gfp_flags);
373}
374
375int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
376{
377	void *obj;
378
379	if (mc->nobjs >= min)
380		return 0;
381	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
382		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
383		if (!obj)
384			return mc->nobjs >= min ? 0 : -ENOMEM;
385		mc->objects[mc->nobjs++] = obj;
386	}
387	return 0;
388}
389
390int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
391{
392	return mc->nobjs;
393}
394
395void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
396{
397	while (mc->nobjs) {
398		if (mc->kmem_cache)
399			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
400		else
401			free_page((unsigned long)mc->objects[--mc->nobjs]);
402	}
403}
404
405void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
406{
407	void *p;
408
409	if (WARN_ON(!mc->nobjs))
410		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
411	else
412		p = mc->objects[--mc->nobjs];
413	BUG_ON(!p);
414	return p;
415}
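
/*
 * Illustrative sketch only (hypothetical, not called anywhere): the intended
 * usage pattern for the MMU memory caches.  Arch code tops up the cache in a
 * sleepable context, then pulls objects out with kvm_mmu_memory_cache_alloc()
 * while holding mmu_lock, where allocations must not sleep.
 */
static int __maybe_unused mmu_memory_cache_usage_example(struct kvm *kvm,
							 struct kvm_mmu_memory_cache *mc)
{
	void *obj;
	int r;

	/* Sleepable context: ensure at least one object is available. */
	r = kvm_mmu_topup_memory_cache(mc, 1);
	if (r)
		return r;

	KVM_MMU_LOCK(kvm);
	obj = kvm_mmu_memory_cache_alloc(mc);
	/* ... install @obj into the shadow page tables ... */
	KVM_MMU_UNLOCK(kvm);

	return obj ? 0 : -ENOMEM;
}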
416#endif
417
418static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
419{
420	mutex_init(&vcpu->mutex);
421	vcpu->cpu = -1;
422	vcpu->kvm = kvm;
423	vcpu->vcpu_id = id;
424	vcpu->pid = NULL;
425#ifndef __KVM_HAVE_ARCH_WQP
426	rcuwait_init(&vcpu->wait);
427#endif
428	kvm_async_pf_vcpu_init(vcpu);
429
430	kvm_vcpu_set_in_spin_loop(vcpu, false);
431	kvm_vcpu_set_dy_eligible(vcpu, false);
432	vcpu->preempted = false;
433	vcpu->ready = false;
434	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
435	vcpu->last_used_slot = NULL;
436}
437
438static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
439{
440	kvm_dirty_ring_free(&vcpu->dirty_ring);
441	kvm_arch_vcpu_destroy(vcpu);
442
443	/*
444	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
445	 * the vcpu->pid pointer, and at destruction time all file descriptors
446	 * are already gone.
447	 */
448	put_pid(rcu_dereference_protected(vcpu->pid, 1));
449
450	free_page((unsigned long)vcpu->run);
451	kmem_cache_free(kvm_vcpu_cache, vcpu);
452}
453
454void kvm_destroy_vcpus(struct kvm *kvm)
455{
456	unsigned long i;
457	struct kvm_vcpu *vcpu;
458
459	kvm_for_each_vcpu(i, vcpu, kvm) {
460		kvm_vcpu_destroy(vcpu);
461		xa_erase(&kvm->vcpu_array, i);
462	}
463
464	atomic_set(&kvm->online_vcpus, 0);
465}
466EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
467
468#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
469static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
470{
471	return container_of(mn, struct kvm, mmu_notifier);
472}
473
474static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
475					      struct mm_struct *mm,
476					      unsigned long start, unsigned long end)
477{
478	struct kvm *kvm = mmu_notifier_to_kvm(mn);
479	int idx;
480
481	idx = srcu_read_lock(&kvm->srcu);
482	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
483	srcu_read_unlock(&kvm->srcu, idx);
484}
485
486typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
487
488typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
489			     unsigned long end);
490
491struct kvm_hva_range {
492	unsigned long start;
493	unsigned long end;
494	pte_t pte;
495	hva_handler_t handler;
496	on_lock_fn_t on_lock;
497	bool flush_on_ret;
498	bool may_block;
499};
500
501/*
502 * Use a dedicated stub instead of NULL to indicate that there is no callback
503 * function/handler.  The compiler technically can't guarantee that a real
504 * function will have a non-zero address, and so it will generate code to
505 * check for !NULL, whereas comparing against a stub will be elided at compile
506 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
507 */
508static void kvm_null_fn(void)
509{
510
511}
512#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
513
514/* Iterate over each memslot intersecting [start, last] (inclusive) range */
515#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
516	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
517	     node;							     \
518	     node = interval_tree_iter_next(node, start, last))	     \
519
520static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
521						  const struct kvm_hva_range *range)
522{
523	bool ret = false, locked = false;
524	struct kvm_gfn_range gfn_range;
525	struct kvm_memory_slot *slot;
526	struct kvm_memslots *slots;
527	int i, idx;
528
529	if (WARN_ON_ONCE(range->end <= range->start))
530		return 0;
531
532	/* A null handler is allowed if and only if on_lock() is provided. */
533	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
534			 IS_KVM_NULL_FN(range->handler)))
535		return 0;
536
537	idx = srcu_read_lock(&kvm->srcu);
538
539	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
540		struct interval_tree_node *node;
541
542		slots = __kvm_memslots(kvm, i);
543		kvm_for_each_memslot_in_hva_range(node, slots,
544						  range->start, range->end - 1) {
545			unsigned long hva_start, hva_end;
546
547			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
548			hva_start = max(range->start, slot->userspace_addr);
549			hva_end = min(range->end, slot->userspace_addr +
550						  (slot->npages << PAGE_SHIFT));
551
552			/*
553			 * To optimize for the likely case where the address
554			 * range is covered by zero or one memslots, don't
555			 * bother making these conditional (to avoid writes on
556			 * the second or later invocation of the handler).
557			 */
558			gfn_range.pte = range->pte;
559			gfn_range.may_block = range->may_block;
560
561			/*
562			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
563			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
564			 */
565			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
566			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
567			gfn_range.slot = slot;
568
569			if (!locked) {
570				locked = true;
571				KVM_MMU_LOCK(kvm);
572				if (!IS_KVM_NULL_FN(range->on_lock))
573					range->on_lock(kvm, range->start, range->end);
574				if (IS_KVM_NULL_FN(range->handler))
575					break;
576			}
577			ret |= range->handler(kvm, &gfn_range);
578		}
579	}
580
581	if (range->flush_on_ret && ret)
582		kvm_flush_remote_tlbs(kvm);
583
584	if (locked)
585		KVM_MMU_UNLOCK(kvm);
586
587	srcu_read_unlock(&kvm->srcu, idx);
588
589	/* The notifiers are averse to booleans. :-( */
590	return (int)ret;
591}
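
/*
 * Illustrative sketch only (hypothetical, not wired up anywhere): the shape of
 * an hva_handler_t as consumed by __kvm_handle_hva_range().  Real handlers,
 * e.g. kvm_unmap_gfn_range() and kvm_age_gfn(), live in arch code; this one
 * merely shows the contract: operate on gfns [range->start, range->end) of
 * range->slot and return true if a remote TLB flush is needed.
 */
static bool __maybe_unused example_gfn_range_handler(struct kvm *kvm,
						     struct kvm_gfn_range *range)
{
	bool flush = false;
	gfn_t gfn;

	for (gfn = range->start; gfn < range->end; gfn++) {
		/*
		 * ... zap or age the mapping for @gfn in @range->slot, and set
		 * flush = true if anything was actually modified ...
		 */
	}

	return flush;
}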
592
593static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
594						unsigned long start,
595						unsigned long end,
596						pte_t pte,
597						hva_handler_t handler)
598{
599	struct kvm *kvm = mmu_notifier_to_kvm(mn);
600	const struct kvm_hva_range range = {
601		.start		= start,
602		.end		= end,
603		.pte		= pte,
604		.handler	= handler,
605		.on_lock	= (void *)kvm_null_fn,
606		.flush_on_ret	= true,
607		.may_block	= false,
608	};
609
610	return __kvm_handle_hva_range(kvm, &range);
611}
612
613static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
614							 unsigned long start,
615							 unsigned long end,
616							 hva_handler_t handler)
617{
618	struct kvm *kvm = mmu_notifier_to_kvm(mn);
619	const struct kvm_hva_range range = {
620		.start		= start,
621		.end		= end,
622		.pte		= __pte(0),
623		.handler	= handler,
624		.on_lock	= (void *)kvm_null_fn,
625		.flush_on_ret	= false,
626		.may_block	= false,
627	};
628
629	return __kvm_handle_hva_range(kvm, &range);
630}
631static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
632					struct mm_struct *mm,
633					unsigned long address,
634					pte_t pte)
635{
636	struct kvm *kvm = mmu_notifier_to_kvm(mn);
637
638	trace_kvm_set_spte_hva(address);
639
640	/*
641	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
642	 * If mmu_notifier_count is zero, then no in-progress invalidations,
643	 * including this one, found a relevant memslot at start(); rechecking
644	 * memslots here is unnecessary.  Note, a false positive (count elevated
645	 * by a different invalidation) is sub-optimal but functionally ok.
646	 */
647	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
648	if (!READ_ONCE(kvm->mmu_notifier_count))
649		return;
650
651	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
652}
653
654void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
655				   unsigned long end)
656{
657	/*
658	 * The count increase must become visible at unlock time as no
659	 * spte can be established without taking the mmu_lock and
660	 * count is also read inside the mmu_lock critical section.
661	 */
662	kvm->mmu_notifier_count++;
663	if (likely(kvm->mmu_notifier_count == 1)) {
664		kvm->mmu_notifier_range_start = start;
665		kvm->mmu_notifier_range_end = end;
666	} else {
667		/*
		 * Fully tracking multiple concurrent ranges has diminishing
669		 * returns. Keep things simple and just find the minimal range
670		 * which includes the current and new ranges. As there won't be
671		 * enough information to subtract a range after its invalidate
672		 * completes, any ranges invalidated concurrently will
673		 * accumulate and persist until all outstanding invalidates
674		 * complete.
675		 */
676		kvm->mmu_notifier_range_start =
677			min(kvm->mmu_notifier_range_start, start);
678		kvm->mmu_notifier_range_end =
679			max(kvm->mmu_notifier_range_end, end);
680	}
681}
682
683static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
684					const struct mmu_notifier_range *range)
685{
686	struct kvm *kvm = mmu_notifier_to_kvm(mn);
687	const struct kvm_hva_range hva_range = {
688		.start		= range->start,
689		.end		= range->end,
690		.pte		= __pte(0),
691		.handler	= kvm_unmap_gfn_range,
692		.on_lock	= kvm_inc_notifier_count,
693		.flush_on_ret	= true,
694		.may_block	= mmu_notifier_range_blockable(range),
695	};
696
697	trace_kvm_unmap_hva_range(range->start, range->end);
698
699	/*
700	 * Prevent memslot modification between range_start() and range_end()
701	 * so that conditionally locking provides the same result in both
702	 * functions.  Without that guarantee, the mmu_notifier_count
703	 * adjustments will be imbalanced.
704	 *
705	 * Pairs with the decrement in range_end().
706	 */
707	spin_lock(&kvm->mn_invalidate_lock);
708	kvm->mn_active_invalidate_count++;
709	spin_unlock(&kvm->mn_invalidate_lock);
710
711	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
712					  hva_range.may_block);
713
714	__kvm_handle_hva_range(kvm, &hva_range);
715
716	return 0;
717}
718
719void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
720				   unsigned long end)
721{
722	/*
	 * This sequence increase will notify the kvm page fault handler that
	 * the page which is about to be mapped in the spte could have been
	 * freed.
726	 */
727	kvm->mmu_notifier_seq++;
728	smp_wmb();
729	/*
730	 * The above sequence increase must be visible before the
731	 * below count decrease, which is ensured by the smp_wmb above
732	 * in conjunction with the smp_rmb in mmu_notifier_retry().
733	 */
734	kvm->mmu_notifier_count--;
735}
736
737static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
738					const struct mmu_notifier_range *range)
739{
740	struct kvm *kvm = mmu_notifier_to_kvm(mn);
741	const struct kvm_hva_range hva_range = {
742		.start		= range->start,
743		.end		= range->end,
744		.pte		= __pte(0),
745		.handler	= (void *)kvm_null_fn,
746		.on_lock	= kvm_dec_notifier_count,
747		.flush_on_ret	= false,
748		.may_block	= mmu_notifier_range_blockable(range),
749	};
750	bool wake;
751
752	__kvm_handle_hva_range(kvm, &hva_range);
753
754	/* Pairs with the increment in range_start(). */
755	spin_lock(&kvm->mn_invalidate_lock);
756	wake = (--kvm->mn_active_invalidate_count == 0);
757	spin_unlock(&kvm->mn_invalidate_lock);
758
759	/*
760	 * There can only be one waiter, since the wait happens under
761	 * slots_lock.
762	 */
763	if (wake)
764		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
765
766	BUG_ON(kvm->mmu_notifier_count < 0);
767}
768
769static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
770					      struct mm_struct *mm,
771					      unsigned long start,
772					      unsigned long end)
773{
774	trace_kvm_age_hva(start, end);
775
776	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
777}
778
779static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
780					struct mm_struct *mm,
781					unsigned long start,
782					unsigned long end)
783{
784	trace_kvm_age_hva(start, end);
785
786	/*
787	 * Even though we do not flush TLB, this will still adversely
788	 * affect performance on pre-Haswell Intel EPT, where there is
789	 * no EPT Access Bit to clear so that we have to tear down EPT
790	 * tables instead. If we find this unacceptable, we can always
791	 * add a parameter to kvm_age_hva so that it effectively doesn't
792	 * do anything on clear_young.
793	 *
794	 * Also note that currently we never issue secondary TLB flushes
795	 * from clear_young, leaving this job up to the regular system
796	 * cadence. If we find this inaccurate, we might come up with a
797	 * more sophisticated heuristic later.
798	 */
799	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
800}
801
802static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
803				       struct mm_struct *mm,
804				       unsigned long address)
805{
806	trace_kvm_test_age_hva(address);
807
808	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
809					     kvm_test_age_gfn);
810}
811
812static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
813				     struct mm_struct *mm)
814{
815	struct kvm *kvm = mmu_notifier_to_kvm(mn);
816	int idx;
817
818	idx = srcu_read_lock(&kvm->srcu);
819	kvm_arch_flush_shadow_all(kvm);
820	srcu_read_unlock(&kvm->srcu, idx);
821}
822
823static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
824	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
825	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
826	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
827	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
828	.clear_young		= kvm_mmu_notifier_clear_young,
829	.test_young		= kvm_mmu_notifier_test_young,
830	.change_pte		= kvm_mmu_notifier_change_pte,
831	.release		= kvm_mmu_notifier_release,
832};
833
834static int kvm_init_mmu_notifier(struct kvm *kvm)
835{
836	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
837	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
838}
839
840#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
841
842static int kvm_init_mmu_notifier(struct kvm *kvm)
843{
844	return 0;
845}
846
847#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
848
849#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
850static int kvm_pm_notifier_call(struct notifier_block *bl,
851				unsigned long state,
852				void *unused)
853{
854	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
855
856	return kvm_arch_pm_notifier(kvm, state);
857}
858
859static void kvm_init_pm_notifier(struct kvm *kvm)
860{
861	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
862	/* Suspend KVM before we suspend ftrace, RCU, etc. */
863	kvm->pm_notifier.priority = INT_MAX;
864	register_pm_notifier(&kvm->pm_notifier);
865}
866
867static void kvm_destroy_pm_notifier(struct kvm *kvm)
868{
869	unregister_pm_notifier(&kvm->pm_notifier);
870}
871#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
872static void kvm_init_pm_notifier(struct kvm *kvm)
873{
874}
875
876static void kvm_destroy_pm_notifier(struct kvm *kvm)
877{
878}
879#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
880
881static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
882{
883	if (!memslot->dirty_bitmap)
884		return;
885
886	kvfree(memslot->dirty_bitmap);
887	memslot->dirty_bitmap = NULL;
888}
889
890/* This does not remove the slot from struct kvm_memslots data structures */
891static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
892{
893	kvm_destroy_dirty_bitmap(slot);
894
895	kvm_arch_free_memslot(kvm, slot);
896
897	kfree(slot);
898}
899
900static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
901{
902	struct hlist_node *idnode;
903	struct kvm_memory_slot *memslot;
904	int bkt;
905
906	/*
907	 * The same memslot objects live in both active and inactive sets,
908	 * arbitrarily free using index '1' so the second invocation of this
909	 * function isn't operating over a structure with dangling pointers
910	 * (even though this function isn't actually touching them).
911	 */
912	if (!slots->node_idx)
913		return;
914
915	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
916		kvm_free_memslot(kvm, memslot);
917}
918
919static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
920{
921	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
922	case KVM_STATS_TYPE_INSTANT:
923		return 0444;
924	case KVM_STATS_TYPE_CUMULATIVE:
925	case KVM_STATS_TYPE_PEAK:
926	default:
927		return 0644;
928	}
929}
930
931
932static void kvm_destroy_vm_debugfs(struct kvm *kvm)
933{
934	int i;
935	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
936				      kvm_vcpu_stats_header.num_desc;
937
938	if (!kvm->debugfs_dentry)
939		return;
940
941	debugfs_remove_recursive(kvm->debugfs_dentry);
942
943	if (kvm->debugfs_stat_data) {
944		for (i = 0; i < kvm_debugfs_num_entries; i++)
945			kfree(kvm->debugfs_stat_data[i]);
946		kfree(kvm->debugfs_stat_data);
947	}
948}
949
950static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
951{
952	static DEFINE_MUTEX(kvm_debugfs_lock);
953	struct dentry *dent;
954	char dir_name[ITOA_MAX_LEN * 2];
955	struct kvm_stat_data *stat_data;
956	const struct _kvm_stats_desc *pdesc;
957	int i, ret;
958	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
959				      kvm_vcpu_stats_header.num_desc;
960
961	if (!debugfs_initialized())
962		return 0;
963
964	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
965	mutex_lock(&kvm_debugfs_lock);
966	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
967	if (dent) {
968		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
969		dput(dent);
970		mutex_unlock(&kvm_debugfs_lock);
971		return 0;
972	}
973	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
974	mutex_unlock(&kvm_debugfs_lock);
975	if (IS_ERR(dent))
976		return 0;
977
978	kvm->debugfs_dentry = dent;
979	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
980					 sizeof(*kvm->debugfs_stat_data),
981					 GFP_KERNEL_ACCOUNT);
982	if (!kvm->debugfs_stat_data)
983		return -ENOMEM;
984
985	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
986		pdesc = &kvm_vm_stats_desc[i];
987		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
988		if (!stat_data)
989			return -ENOMEM;
990
991		stat_data->kvm = kvm;
992		stat_data->desc = pdesc;
993		stat_data->kind = KVM_STAT_VM;
994		kvm->debugfs_stat_data[i] = stat_data;
995		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
996				    kvm->debugfs_dentry, stat_data,
997				    &stat_fops_per_vm);
998	}
999
1000	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1001		pdesc = &kvm_vcpu_stats_desc[i];
1002		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1003		if (!stat_data)
1004			return -ENOMEM;
1005
1006		stat_data->kvm = kvm;
1007		stat_data->desc = pdesc;
1008		stat_data->kind = KVM_STAT_VCPU;
1009		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1010		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1011				    kvm->debugfs_dentry, stat_data,
1012				    &stat_fops_per_vm);
1013	}
1014
1015	ret = kvm_arch_create_vm_debugfs(kvm);
1016	if (ret) {
1017		kvm_destroy_vm_debugfs(kvm);
		return ret;
1019	}
1020
1021	return 0;
1022}
1023
1024/*
1025 * Called after the VM is otherwise initialized, but just before adding it to
1026 * the vm_list.
1027 */
1028int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1029{
1030	return 0;
1031}
1032
1033/*
1034 * Called just after removing the VM from the vm_list, but before doing any
1035 * other destruction.
1036 */
1037void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1038{
1039}
1040
1041/*
 * Called after the per-VM debugfs files have been created.  At this point
 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
 * be created under it.  Cleanup is handled automatically and recursively by
 * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
1046 */
1047int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1048{
1049	return 0;
1050}
1051
1052static struct kvm *kvm_create_vm(unsigned long type)
1053{
1054	struct kvm *kvm = kvm_arch_alloc_vm();
1055	struct kvm_memslots *slots;
1056	int r = -ENOMEM;
1057	int i, j;
1058
1059	if (!kvm)
1060		return ERR_PTR(-ENOMEM);
1061
1062	KVM_MMU_LOCK_INIT(kvm);
1063	mmgrab(current->mm);
1064	kvm->mm = current->mm;
1065	kvm_eventfd_init(kvm);
1066	mutex_init(&kvm->lock);
1067	mutex_init(&kvm->irq_lock);
1068	mutex_init(&kvm->slots_lock);
1069	mutex_init(&kvm->slots_arch_lock);
1070	spin_lock_init(&kvm->mn_invalidate_lock);
1071	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1072	xa_init(&kvm->vcpu_array);
1073
1074	INIT_LIST_HEAD(&kvm->gpc_list);
1075	spin_lock_init(&kvm->gpc_lock);
1076
1077	INIT_LIST_HEAD(&kvm->devices);
1078
1079	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1080
1081	if (init_srcu_struct(&kvm->srcu))
1082		goto out_err_no_srcu;
1083	if (init_srcu_struct(&kvm->irq_srcu))
1084		goto out_err_no_irq_srcu;
1085
1086	refcount_set(&kvm->users_count, 1);
1087	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1088		for (j = 0; j < 2; j++) {
1089			slots = &kvm->__memslots[i][j];
1090
1091			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1092			slots->hva_tree = RB_ROOT_CACHED;
1093			slots->gfn_tree = RB_ROOT;
1094			hash_init(slots->id_hash);
1095			slots->node_idx = j;
1096
1097			/* Generations must be different for each address space. */
1098			slots->generation = i;
1099		}
1100
1101		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1102	}
1103
1104	for (i = 0; i < KVM_NR_BUSES; i++) {
1105		rcu_assign_pointer(kvm->buses[i],
1106			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1107		if (!kvm->buses[i])
1108			goto out_err_no_arch_destroy_vm;
1109	}
1110
1111	kvm->max_halt_poll_ns = halt_poll_ns;
1112
1113	r = kvm_arch_init_vm(kvm, type);
1114	if (r)
1115		goto out_err_no_arch_destroy_vm;
1116
1117	r = hardware_enable_all();
1118	if (r)
1119		goto out_err_no_disable;
1120
1121#ifdef CONFIG_HAVE_KVM_IRQFD
1122	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1123#endif
1124
1125	r = kvm_init_mmu_notifier(kvm);
1126	if (r)
1127		goto out_err_no_mmu_notifier;
1128
1129	r = kvm_arch_post_init_vm(kvm);
1130	if (r)
1131		goto out_err;
1132
1133	mutex_lock(&kvm_lock);
1134	list_add(&kvm->vm_list, &vm_list);
1135	mutex_unlock(&kvm_lock);
1136
1137	preempt_notifier_inc();
1138	kvm_init_pm_notifier(kvm);
1139
1140	return kvm;
1141
1142out_err:
1143#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1144	if (kvm->mmu_notifier.ops)
1145		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1146#endif
1147out_err_no_mmu_notifier:
1148	hardware_disable_all();
1149out_err_no_disable:
1150	kvm_arch_destroy_vm(kvm);
1151out_err_no_arch_destroy_vm:
1152	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1153	for (i = 0; i < KVM_NR_BUSES; i++)
1154		kfree(kvm_get_bus(kvm, i));
1155	cleanup_srcu_struct(&kvm->irq_srcu);
1156out_err_no_irq_srcu:
1157	cleanup_srcu_struct(&kvm->srcu);
1158out_err_no_srcu:
1159	kvm_arch_free_vm(kvm);
1160	mmdrop(current->mm);
1161	return ERR_PTR(r);
1162}
1163
1164static void kvm_destroy_devices(struct kvm *kvm)
1165{
1166	struct kvm_device *dev, *tmp;
1167
1168	/*
1169	 * We do not need to take the kvm->lock here, because nobody else
1170	 * has a reference to the struct kvm at this point and therefore
1171	 * cannot access the devices list anyhow.
1172	 */
1173	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1174		list_del(&dev->vm_node);
1175		dev->ops->destroy(dev);
1176	}
1177}
1178
1179static void kvm_destroy_vm(struct kvm *kvm)
1180{
1181	int i;
1182	struct mm_struct *mm = kvm->mm;
1183
1184	kvm_destroy_pm_notifier(kvm);
1185	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1186	kvm_destroy_vm_debugfs(kvm);
1187	kvm_arch_sync_events(kvm);
1188	mutex_lock(&kvm_lock);
1189	list_del(&kvm->vm_list);
1190	mutex_unlock(&kvm_lock);
1191	kvm_arch_pre_destroy_vm(kvm);
1192
1193	kvm_free_irq_routing(kvm);
1194	for (i = 0; i < KVM_NR_BUSES; i++) {
1195		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1196
1197		if (bus)
1198			kvm_io_bus_destroy(bus);
1199		kvm->buses[i] = NULL;
1200	}
1201	kvm_coalesced_mmio_free(kvm);
1202#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1203	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1204	/*
1205	 * At this point, pending calls to invalidate_range_start()
1206	 * have completed but no more MMU notifiers will run, so
1207	 * mn_active_invalidate_count may remain unbalanced.
1208	 * No threads can be waiting in install_new_memslots as the
1209	 * last reference on KVM has been dropped, but freeing
1210	 * memslots would deadlock without this manual intervention.
1211	 */
1212	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1213	kvm->mn_active_invalidate_count = 0;
1214#else
1215	kvm_arch_flush_shadow_all(kvm);
1216#endif
1217	kvm_arch_destroy_vm(kvm);
1218	kvm_destroy_devices(kvm);
1219	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1220		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1221		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1222	}
1223	cleanup_srcu_struct(&kvm->irq_srcu);
1224	cleanup_srcu_struct(&kvm->srcu);
1225	kvm_arch_free_vm(kvm);
1226	preempt_notifier_dec();
1227	hardware_disable_all();
1228	mmdrop(mm);
1229}
1230
1231void kvm_get_kvm(struct kvm *kvm)
1232{
1233	refcount_inc(&kvm->users_count);
1234}
1235EXPORT_SYMBOL_GPL(kvm_get_kvm);
1236
1237/*
 * Like kvm_get_kvm(), but safe against the VM being in the middle of
 * destruction.  Returns true if kvm was referenced successfully, false
 * otherwise.
1240 */
1241bool kvm_get_kvm_safe(struct kvm *kvm)
1242{
1243	return refcount_inc_not_zero(&kvm->users_count);
1244}
1245EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
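
/*
 * Illustrative sketch only (hypothetical, not called anywhere): typical use of
 * kvm_get_kvm_safe() from an asynchronous context, e.g. a worker, that holds a
 * pointer to a VM which may already be on its way to destruction.
 */
static void __maybe_unused kvm_ref_safe_example(struct kvm *kvm)
{
	if (!kvm_get_kvm_safe(kvm))
		return;		/* users_count already hit zero, VM is dying */

	/* ... safely use @kvm here ... */

	kvm_put_kvm(kvm);
}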
1246
1247void kvm_put_kvm(struct kvm *kvm)
1248{
1249	if (refcount_dec_and_test(&kvm->users_count))
1250		kvm_destroy_vm(kvm);
1251}
1252EXPORT_SYMBOL_GPL(kvm_put_kvm);
1253
1254/*
1255 * Used to put a reference that was taken on behalf of an object associated
1256 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1257 * of the new file descriptor fails and the reference cannot be transferred to
1258 * its final owner.  In such cases, the caller is still actively using @kvm and
1259 * will fail miserably if the refcount unexpectedly hits zero.
1260 */
1261void kvm_put_kvm_no_destroy(struct kvm *kvm)
1262{
1263	WARN_ON(refcount_dec_and_test(&kvm->users_count));
1264}
1265EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1266
1267static int kvm_vm_release(struct inode *inode, struct file *filp)
1268{
1269	struct kvm *kvm = filp->private_data;
1270
1271	kvm_irqfd_release(kvm);
1272
1273	kvm_put_kvm(kvm);
1274	return 0;
1275}
1276
1277/*
1278 * Allocation size is twice as large as the actual dirty bitmap size.
1279 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1280 */
1281static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1282{
1283	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1284
1285	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1286	if (!memslot->dirty_bitmap)
1287		return -ENOMEM;
1288
1289	return 0;
1290}
1291
1292static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1293{
1294	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1295	int node_idx_inactive = active->node_idx ^ 1;
1296
1297	return &kvm->__memslots[as_id][node_idx_inactive];
1298}
1299
1300/*
 * Helper to get the address space ID when one of the memslot pointers may be
 * NULL.  This also serves as a sanity check that at least one of the pointers
 * is non-NULL, and that their address space IDs don't diverge.
1304 */
1305static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1306				  struct kvm_memory_slot *b)
1307{
1308	if (WARN_ON_ONCE(!a && !b))
1309		return 0;
1310
1311	if (!a)
1312		return b->as_id;
1313	if (!b)
1314		return a->as_id;
1315
1316	WARN_ON_ONCE(a->as_id != b->as_id);
1317	return a->as_id;
1318}
1319
1320static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1321				struct kvm_memory_slot *slot)
1322{
1323	struct rb_root *gfn_tree = &slots->gfn_tree;
1324	struct rb_node **node, *parent;
1325	int idx = slots->node_idx;
1326
1327	parent = NULL;
1328	for (node = &gfn_tree->rb_node; *node; ) {
1329		struct kvm_memory_slot *tmp;
1330
1331		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1332		parent = *node;
1333		if (slot->base_gfn < tmp->base_gfn)
1334			node = &(*node)->rb_left;
1335		else if (slot->base_gfn > tmp->base_gfn)
1336			node = &(*node)->rb_right;
1337		else
1338			BUG();
1339	}
1340
1341	rb_link_node(&slot->gfn_node[idx], parent, node);
1342	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1343}
1344
1345static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1346			       struct kvm_memory_slot *slot)
1347{
1348	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1349}
1350
1351static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1352				 struct kvm_memory_slot *old,
1353				 struct kvm_memory_slot *new)
1354{
1355	int idx = slots->node_idx;
1356
1357	WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1358
1359	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1360			&slots->gfn_tree);
1361}
1362
1363/*
1364 * Replace @old with @new in the inactive memslots.
1365 *
1366 * With NULL @old this simply adds @new.
1367 * With NULL @new this simply removes @old.
1368 *
1369 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1370 * appropriately.
1371 */
1372static void kvm_replace_memslot(struct kvm *kvm,
1373				struct kvm_memory_slot *old,
1374				struct kvm_memory_slot *new)
1375{
1376	int as_id = kvm_memslots_get_as_id(old, new);
1377	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1378	int idx = slots->node_idx;
1379
1380	if (old) {
1381		hash_del(&old->id_node[idx]);
1382		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1383
1384		if ((long)old == atomic_long_read(&slots->last_used_slot))
1385			atomic_long_set(&slots->last_used_slot, (long)new);
1386
1387		if (!new) {
1388			kvm_erase_gfn_node(slots, old);
1389			return;
1390		}
1391	}
1392
1393	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, as kvm_copy_memslot() deliberately does not touch node data.
1396	 */
1397	new->hva_node[idx].start = new->userspace_addr;
1398	new->hva_node[idx].last = new->userspace_addr +
1399				  (new->npages << PAGE_SHIFT) - 1;
1400
1401	/*
1402	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
1403	 * hva_node needs to be swapped with remove+insert even though hva can't
1404	 * change when replacing an existing slot.
1405	 */
1406	hash_add(slots->id_hash, &new->id_node[idx], new->id);
1407	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1408
1409	/*
1410	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1411	 * switch the node in the gfn tree instead of removing the old and
1412	 * inserting the new as two separate operations. Replacement is a
1413	 * single O(1) operation versus two O(log(n)) operations for
1414	 * remove+insert.
1415	 */
1416	if (old && old->base_gfn == new->base_gfn) {
1417		kvm_replace_gfn_node(slots, old, new);
1418	} else {
1419		if (old)
1420			kvm_erase_gfn_node(slots, old);
1421		kvm_insert_gfn_node(slots, new);
1422	}
1423}
1424
1425static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1426{
1427	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1428
1429#ifdef __KVM_HAVE_READONLY_MEM
1430	valid_flags |= KVM_MEM_READONLY;
1431#endif
1432
1433	if (mem->flags & ~valid_flags)
1434		return -EINVAL;
1435
1436	return 0;
1437}
1438
1439static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1440{
1441	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1442
	/* Grab the generation from the currently active memslots. */
1444	u64 gen = __kvm_memslots(kvm, as_id)->generation;
1445
1446	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1447	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1448
1449	/*
1450	 * Do not store the new memslots while there are invalidations in
1451	 * progress, otherwise the locking in invalidate_range_start and
1452	 * invalidate_range_end will be unbalanced.
1453	 */
1454	spin_lock(&kvm->mn_invalidate_lock);
1455	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1456	while (kvm->mn_active_invalidate_count) {
1457		set_current_state(TASK_UNINTERRUPTIBLE);
1458		spin_unlock(&kvm->mn_invalidate_lock);
1459		schedule();
1460		spin_lock(&kvm->mn_invalidate_lock);
1461	}
1462	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1463	rcu_assign_pointer(kvm->memslots[as_id], slots);
1464	spin_unlock(&kvm->mn_invalidate_lock);
1465
1466	/*
	 * Acquired in kvm_set_memslot().  Must be released before the SRCU
	 * synchronization below in order to avoid deadlocking with another
	 * thread acquiring slots_arch_lock in an SRCU critical section.
1470	 */
1471	mutex_unlock(&kvm->slots_arch_lock);
1472
1473	synchronize_srcu_expedited(&kvm->srcu);
1474
1475	/*
1476	 * Increment the new memslot generation a second time, dropping the
1477	 * update in-progress flag and incrementing the generation based on
1478	 * the number of address spaces.  This provides a unique and easily
1479	 * identifiable generation number while the memslots are in flux.
1480	 */
1481	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1482
1483	/*
1484	 * Generations must be unique even across address spaces.  We do not need
1485	 * a global counter for that, instead the generation space is evenly split
1486	 * across address spaces.  For example, with two address spaces, address
1487	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1488	 * use generations 1, 3, 5, ...
1489	 */
1490	gen += KVM_ADDRESS_SPACE_NUM;
1491
1492	kvm_arch_memslots_updated(kvm, gen);
1493
1494	slots->generation = gen;
1495}
1496
1497static int kvm_prepare_memory_region(struct kvm *kvm,
1498				     const struct kvm_memory_slot *old,
1499				     struct kvm_memory_slot *new,
1500				     enum kvm_mr_change change)
1501{
1502	int r;
1503
1504	/*
1505	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1506	 * will be freed on "commit".  If logging is enabled in both old and
1507	 * new, reuse the existing bitmap.  If logging is enabled only in the
1508	 * new and KVM isn't using a ring buffer, allocate and initialize a
1509	 * new bitmap.
1510	 */
1511	if (change != KVM_MR_DELETE) {
1512		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1513			new->dirty_bitmap = NULL;
1514		else if (old && old->dirty_bitmap)
1515			new->dirty_bitmap = old->dirty_bitmap;
1516		else if (!kvm->dirty_ring_size) {
1517			r = kvm_alloc_dirty_bitmap(new);
1518			if (r)
1519				return r;
1520
1521			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1522				bitmap_set(new->dirty_bitmap, 0, new->npages);
1523		}
1524	}
1525
1526	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1527
1528	/* Free the bitmap on failure if it was allocated above. */
1529	if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap)
1530		kvm_destroy_dirty_bitmap(new);
1531
1532	return r;
1533}
1534
1535static void kvm_commit_memory_region(struct kvm *kvm,
1536				     struct kvm_memory_slot *old,
1537				     const struct kvm_memory_slot *new,
1538				     enum kvm_mr_change change)
1539{
1540	/*
1541	 * Update the total number of memslot pages before calling the arch
1542	 * hook so that architectures can consume the result directly.
1543	 */
1544	if (change == KVM_MR_DELETE)
1545		kvm->nr_memslot_pages -= old->npages;
1546	else if (change == KVM_MR_CREATE)
1547		kvm->nr_memslot_pages += new->npages;
1548
1549	kvm_arch_commit_memory_region(kvm, old, new, change);
1550
1551	switch (change) {
1552	case KVM_MR_CREATE:
1553		/* Nothing more to do. */
1554		break;
1555	case KVM_MR_DELETE:
1556		/* Free the old memslot and all its metadata. */
1557		kvm_free_memslot(kvm, old);
1558		break;
1559	case KVM_MR_MOVE:
1560	case KVM_MR_FLAGS_ONLY:
1561		/*
		 * Free the dirty bitmap as needed; the below check encompasses
		 * both the flags and whether a ring buffer is being used.
1564		 */
1565		if (old->dirty_bitmap && !new->dirty_bitmap)
1566			kvm_destroy_dirty_bitmap(old);
1567
1568		/*
1569		 * The final quirk.  Free the detached, old slot, but only its
1570		 * memory, not any metadata.  Metadata, including arch specific
1571		 * data, may be reused by @new.
1572		 */
1573		kfree(old);
1574		break;
1575	default:
1576		BUG();
1577	}
1578}
1579
1580/*
1581 * Activate @new, which must be installed in the inactive slots by the caller,
1582 * by swapping the active slots and then propagating @new to @old once @old is
1583 * unreachable and can be safely modified.
1584 *
1585 * With NULL @old this simply adds @new to @active (while swapping the sets).
1586 * With NULL @new this simply removes @old from @active and frees it
1587 * (while also swapping the sets).
1588 */
1589static void kvm_activate_memslot(struct kvm *kvm,
1590				 struct kvm_memory_slot *old,
1591				 struct kvm_memory_slot *new)
1592{
1593	int as_id = kvm_memslots_get_as_id(old, new);
1594
1595	kvm_swap_active_memslots(kvm, as_id);
1596
1597	/* Propagate the new memslot to the now inactive memslots. */
1598	kvm_replace_memslot(kvm, old, new);
1599}
1600
1601static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1602			     const struct kvm_memory_slot *src)
1603{
1604	dest->base_gfn = src->base_gfn;
1605	dest->npages = src->npages;
1606	dest->dirty_bitmap = src->dirty_bitmap;
1607	dest->arch = src->arch;
1608	dest->userspace_addr = src->userspace_addr;
1609	dest->flags = src->flags;
1610	dest->id = src->id;
1611	dest->as_id = src->as_id;
1612}
1613
1614static void kvm_invalidate_memslot(struct kvm *kvm,
1615				   struct kvm_memory_slot *old,
1616				   struct kvm_memory_slot *invalid_slot)
1617{
1618	/*
1619	 * Mark the current slot INVALID.  As with all memslot modifications,
1620	 * this must be done on an unreachable slot to avoid modifying the
1621	 * current slot in the active tree.
1622	 */
1623	kvm_copy_memslot(invalid_slot, old);
1624	invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1625	kvm_replace_memslot(kvm, old, invalid_slot);
1626
1627	/*
1628	 * Activate the slot that is now marked INVALID, but don't propagate
1629	 * the slot to the now inactive slots. The slot is either going to be
1630	 * deleted or recreated as a new slot.
1631	 */
1632	kvm_swap_active_memslots(kvm, old->as_id);
1633
1634	/*
1635	 * From this point no new shadow pages pointing to a deleted, or moved,
1636	 * memslot will be created.  Validation of sp->gfn happens in:
1637	 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1638	 *	- kvm_is_visible_gfn (mmu_check_root)
1639	 */
1640	kvm_arch_flush_shadow_memslot(kvm, old);
1641
1642	/* Was released by kvm_swap_active_memslots, reacquire. */
1643	mutex_lock(&kvm->slots_arch_lock);
1644
1645	/*
1646	 * Copy the arch-specific field of the newly-installed slot back to the
1647	 * old slot as the arch data could have changed between releasing
1648	 * slots_arch_lock in install_new_memslots() and re-acquiring the lock
1649	 * above.  Writers are required to retrieve memslots *after* acquiring
1650	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1651	 */
1652	old->arch = invalid_slot->arch;
1653}
1654
1655static void kvm_create_memslot(struct kvm *kvm,
1656			       struct kvm_memory_slot *new)
1657{
1658	/* Add the new memslot to the inactive set and activate. */
1659	kvm_replace_memslot(kvm, NULL, new);
1660	kvm_activate_memslot(kvm, NULL, new);
1661}
1662
1663static void kvm_delete_memslot(struct kvm *kvm,
1664			       struct kvm_memory_slot *old,
1665			       struct kvm_memory_slot *invalid_slot)
1666{
1667	/*
	 * Remove the old memslot (in the inactive memslots) by passing NULL as
	 * the "new" slot, and then do the same for the INVALID version of the
	 * slot in the active memslots.
1670	 */
1671	kvm_replace_memslot(kvm, old, NULL);
1672	kvm_activate_memslot(kvm, invalid_slot, NULL);
1673}
1674
1675static void kvm_move_memslot(struct kvm *kvm,
1676			     struct kvm_memory_slot *old,
1677			     struct kvm_memory_slot *new,
1678			     struct kvm_memory_slot *invalid_slot)
1679{
1680	/*
1681	 * Replace the old memslot in the inactive slots, and then swap slots
1682	 * and replace the current INVALID with the new as well.
1683	 */
1684	kvm_replace_memslot(kvm, old, new);
1685	kvm_activate_memslot(kvm, invalid_slot, new);
1686}
1687
1688static void kvm_update_flags_memslot(struct kvm *kvm,
1689				     struct kvm_memory_slot *old,
1690				     struct kvm_memory_slot *new)
1691{
1692	/*
1693	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1694	 * an intermediate step. Instead, the old memslot is simply replaced
1695	 * with a new, updated copy in both memslot sets.
1696	 */
1697	kvm_replace_memslot(kvm, old, new);
1698	kvm_activate_memslot(kvm, old, new);
1699}
1700
1701static int kvm_set_memslot(struct kvm *kvm,
1702			   struct kvm_memory_slot *old,
1703			   struct kvm_memory_slot *new,
1704			   enum kvm_mr_change change)
1705{
1706	struct kvm_memory_slot *invalid_slot;
1707	int r;
1708
1709	/*
1710	 * Released in kvm_swap_active_memslots.
1711	 *
1712	 * Must be held from before the current memslots are copied until
1713	 * after the new memslots are installed with rcu_assign_pointer,
1714	 * then released before the synchronize srcu in kvm_swap_active_memslots.
1715	 *
1716	 * When modifying memslots outside of the slots_lock, must be held
1717	 * before reading the pointer to the current memslots until after all
1718	 * changes to those memslots are complete.
1719	 *
1720	 * These rules ensure that installing new memslots does not lose
1721	 * changes made to the previous memslots.
1722	 */
1723	mutex_lock(&kvm->slots_arch_lock);
1724
1725	/*
1726	 * Invalidate the old slot if it's being deleted or moved.  This is
1727	 * done prior to actually deleting/moving the memslot to allow vCPUs to
1728	 * continue running by ensuring there are no mappings or shadow pages
1729	 * for the memslot when it is deleted/moved.  Without pre-invalidation
1730	 * (and without a lock), a window would exist between effecting the
1731	 * delete/move and committing the changes in arch code where KVM or a
1732	 * guest could access a non-existent memslot.
1733	 *
1734	 * Modifications are done on a temporary, unreachable slot.  The old
1735	 * slot needs to be preserved in case a later step fails and the
1736	 * invalidation needs to be reverted.
1737	 */
1738	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1739		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1740		if (!invalid_slot) {
1741			mutex_unlock(&kvm->slots_arch_lock);
1742			return -ENOMEM;
1743		}
1744		kvm_invalidate_memslot(kvm, old, invalid_slot);
1745	}
1746
1747	r = kvm_prepare_memory_region(kvm, old, new, change);
1748	if (r) {
1749		/*
1750		 * For DELETE/MOVE, revert the above INVALID change.  No
1751		 * modifications required since the original slot was preserved
		 * in the inactive slots.  Changing the active memslots also
		 * releases slots_arch_lock.
1754		 */
1755		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1756			kvm_activate_memslot(kvm, invalid_slot, old);
1757			kfree(invalid_slot);
1758		} else {
1759			mutex_unlock(&kvm->slots_arch_lock);
1760		}
1761		return r;
1762	}
1763
1764	/*
	 * For DELETE and MOVE, the temporary INVALID copy of the old slot
	 * (@invalid_slot) is now the active slot.  For CREATE, there is no old
	 * slot.  For MOVE, DELETE and FLAGS_ONLY, the old slot has been
	 * detached but is otherwise preserved until "commit".
1770	 */
1771	if (change == KVM_MR_CREATE)
1772		kvm_create_memslot(kvm, new);
1773	else if (change == KVM_MR_DELETE)
1774		kvm_delete_memslot(kvm, old, invalid_slot);
1775	else if (change == KVM_MR_MOVE)
1776		kvm_move_memslot(kvm, old, new, invalid_slot);
1777	else if (change == KVM_MR_FLAGS_ONLY)
1778		kvm_update_flags_memslot(kvm, old, new);
1779	else
1780		BUG();
1781
1782	/* Free the temporary INVALID slot used for DELETE and MOVE. */
1783	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1784		kfree(invalid_slot);
1785
1786	/*
1787	 * No need to refresh new->arch, changes after dropping slots_arch_lock
	 * will directly hit the final, active memslot.  Architectures are
1789	 * responsible for knowing that new->arch may be stale.
1790	 */
1791	kvm_commit_memory_region(kvm, old, new, change);
1792
1793	return 0;
1794}
1795
1796static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1797				      gfn_t start, gfn_t end)
1798{
1799	struct kvm_memslot_iter iter;
1800
1801	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1802		if (iter.slot->id != id)
1803			return true;
1804	}
1805
1806	return false;
1807}
1808
1809/*
1810 * Allocate some memory and give it an address in the guest physical address
1811 * space.
1812 *
1813 * Discontiguous memory is allowed, mostly for framebuffers.
1814 *
1815 * Must be called holding kvm->slots_lock for write.
1816 */
1817int __kvm_set_memory_region(struct kvm *kvm,
1818			    const struct kvm_userspace_memory_region *mem)
1819{
1820	struct kvm_memory_slot *old, *new;
1821	struct kvm_memslots *slots;
1822	enum kvm_mr_change change;
1823	unsigned long npages;
1824	gfn_t base_gfn;
1825	int as_id, id;
1826	int r;
1827
1828	r = check_memory_region_flags(mem);
1829	if (r)
1830		return r;
1831
1832	as_id = mem->slot >> 16;
1833	id = (u16)mem->slot;
1834
1835	/* General sanity checks */
1836	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1837	    (mem->memory_size != (unsigned long)mem->memory_size))
1838		return -EINVAL;
1839	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1840		return -EINVAL;
1841	/* We can read the guest memory with __xxx_user() later on. */
1842	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1843	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1844	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1845			mem->memory_size))
1846		return -EINVAL;
1847	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1848		return -EINVAL;
1849	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1850		return -EINVAL;
1851	if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1852		return -EINVAL;
1853
1854	slots = __kvm_memslots(kvm, as_id);
1855
1856	/*
1857	 * Note, the old memslot (and the pointer itself!) may be invalidated
1858	 * and/or destroyed by kvm_set_memslot().
1859	 */
1860	old = id_to_memslot(slots, id);
1861
1862	if (!mem->memory_size) {
1863		if (!old || !old->npages)
1864			return -EINVAL;
1865
1866		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1867			return -EIO;
1868
1869		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
1870	}
1871
1872	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
1873	npages = (mem->memory_size >> PAGE_SHIFT);
1874
1875	if (!old || !old->npages) {
1876		change = KVM_MR_CREATE;
1877
1878		/*
1879		 * To simplify KVM internals, the total number of pages across
1880		 * all memslots must fit in an unsigned long.
1881		 */
1882		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
1883			return -EINVAL;
1884	} else { /* Modify an existing slot. */
1885		if ((mem->userspace_addr != old->userspace_addr) ||
1886		    (npages != old->npages) ||
1887		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
1888			return -EINVAL;
1889
1890		if (base_gfn != old->base_gfn)
1891			change = KVM_MR_MOVE;
1892		else if (mem->flags != old->flags)
1893			change = KVM_MR_FLAGS_ONLY;
1894		else /* Nothing to change. */
1895			return 0;
1896	}
1897
1898	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
1899	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
1900		return -EEXIST;
1901
1902	/* Allocate a slot that will persist in the memslot. */
1903	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
1904	if (!new)
1905		return -ENOMEM;
1906
1907	new->as_id = as_id;
1908	new->id = id;
1909	new->base_gfn = base_gfn;
1910	new->npages = npages;
1911	new->flags = mem->flags;
1912	new->userspace_addr = mem->userspace_addr;
1913
1914	r = kvm_set_memslot(kvm, old, new, change);
1915	if (r)
1916		kfree(new);
1917	return r;
1918}
1919EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1920
1921int kvm_set_memory_region(struct kvm *kvm,
1922			  const struct kvm_userspace_memory_region *mem)
1923{
1924	int r;
1925
1926	mutex_lock(&kvm->slots_lock);
1927	r = __kvm_set_memory_region(kvm, mem);
1928	mutex_unlock(&kvm->slots_lock);
1929	return r;
1930}
1931EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1932
1933static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1934					  struct kvm_userspace_memory_region *mem)
1935{
1936	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1937		return -EINVAL;
1938
1939	return kvm_set_memory_region(kvm, mem);
1940}
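
/*
 * Illustrative userspace sketch (not part of this file) of the ioctl handled
 * above.  Note how the slot field encodes the address space id in the upper
 * 16 bits and the slot id in the lower 16 bits; "vm_fd", "as_id", "slot_id"
 * and "backing" are hypothetical:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot = (as_id << 16) | slot_id,
 *		.flags = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,
 *		.memory_size = 0x200000,			// page aligned
 *		.userspace_addr = (unsigned long)backing,	// page-aligned mmap()
 *	};
 *
 *	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region))
 *		err(1, "KVM_SET_USER_MEMORY_REGION");
 */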
1941
1942#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1943/**
1944 * kvm_get_dirty_log - get a snapshot of dirty pages
1945 * @kvm:	pointer to kvm instance
1946 * @log:	slot id and address to which we copy the log
1947 * @is_dirty:	set to '1' if any dirty pages were found
1948 * @memslot:	set to the associated memslot, always valid on success
1949 */
1950int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1951		      int *is_dirty, struct kvm_memory_slot **memslot)
1952{
1953	struct kvm_memslots *slots;
1954	int i, as_id, id;
1955	unsigned long n;
1956	unsigned long any = 0;
1957
1958	/* Dirty ring tracking is exclusive to dirty log tracking */
1959	if (kvm->dirty_ring_size)
1960		return -ENXIO;
1961
1962	*memslot = NULL;
1963	*is_dirty = 0;
1964
1965	as_id = log->slot >> 16;
1966	id = (u16)log->slot;
1967	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1968		return -EINVAL;
1969
1970	slots = __kvm_memslots(kvm, as_id);
1971	*memslot = id_to_memslot(slots, id);
1972	if (!(*memslot) || !(*memslot)->dirty_bitmap)
1973		return -ENOENT;
1974
1975	kvm_arch_sync_dirty_log(kvm, *memslot);
1976
1977	n = kvm_dirty_bitmap_bytes(*memslot);
1978
1979	for (i = 0; !any && i < n/sizeof(long); ++i)
1980		any = (*memslot)->dirty_bitmap[i];
1981
1982	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1983		return -EFAULT;
1984
1985	if (any)
1986		*is_dirty = 1;
1987	return 0;
1988}
1989EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1990
1991#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
1992/**
1993 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1994 *	and reenable dirty page tracking for the corresponding pages.
1995 * @kvm:	pointer to kvm instance
1996 * @log:	slot id and address to which we copy the log
1997 *
1998 * We need to keep it in mind that VCPU threads can write to the bitmap
1999 * concurrently. So, to avoid losing track of dirty pages we keep the
2000 * following order:
2001 *
2002 *    1. Take a snapshot of the bit and clear it if needed.
2003 *    2. Write protect the corresponding page.
2004 *    3. Copy the snapshot to the userspace.
2005 *    4. Upon return caller flushes TLB's if needed.
2006 *
2007 * Between 2 and 4, the guest may write to the page using the remaining TLB
2008 * entry.  This is not a problem because the page is reported dirty using
2009 * the snapshot taken before and step 4 ensures that writes done after
2010 * exiting to userspace will be logged for the next call.
2011 *
2012 */
2013static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2014{
2015	struct kvm_memslots *slots;
2016	struct kvm_memory_slot *memslot;
2017	int i, as_id, id;
2018	unsigned long n;
2019	unsigned long *dirty_bitmap;
2020	unsigned long *dirty_bitmap_buffer;
2021	bool flush;
2022
2023	/* Dirty ring tracking is exclusive to dirty log tracking */
2024	if (kvm->dirty_ring_size)
2025		return -ENXIO;
2026
2027	as_id = log->slot >> 16;
2028	id = (u16)log->slot;
2029	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2030		return -EINVAL;
2031
2032	slots = __kvm_memslots(kvm, as_id);
2033	memslot = id_to_memslot(slots, id);
2034	if (!memslot || !memslot->dirty_bitmap)
2035		return -ENOENT;
2036
2037	dirty_bitmap = memslot->dirty_bitmap;
2038
2039	kvm_arch_sync_dirty_log(kvm, memslot);
2040
2041	n = kvm_dirty_bitmap_bytes(memslot);
2042	flush = false;
2043	if (kvm->manual_dirty_log_protect) {
2044		/*
2045		 * Unlike kvm_get_dirty_log, flush is always left false here,
2046		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2047		 * is some code duplication between this function and
2048		 * kvm_get_dirty_log, but hopefully all architectures will
2049		 * transition to kvm_get_dirty_log_protect so that
2050		 * kvm_get_dirty_log can be eliminated.
2051		 */
2052		dirty_bitmap_buffer = dirty_bitmap;
2053	} else {
2054		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2055		memset(dirty_bitmap_buffer, 0, n);
2056
2057		KVM_MMU_LOCK(kvm);
2058		for (i = 0; i < n / sizeof(long); i++) {
2059			unsigned long mask;
2060			gfn_t offset;
2061
2062			if (!dirty_bitmap[i])
2063				continue;
2064
2065			flush = true;
2066			mask = xchg(&dirty_bitmap[i], 0);
2067			dirty_bitmap_buffer[i] = mask;
2068
2069			offset = i * BITS_PER_LONG;
2070			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2071								offset, mask);
2072		}
2073		KVM_MMU_UNLOCK(kvm);
2074	}
2075
2076	if (flush)
2077		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2078
2079	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2080		return -EFAULT;
2081	return 0;
2082}
2083
2084
2085/**
2086 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2087 * @kvm: kvm instance
2088 * @log: slot id and address to which we copy the log
2089 *
2090 * Steps 1-4 below provide general overview of dirty page logging. See
2091 * kvm_get_dirty_log_protect() function description for additional details.
2092 *
2093 * We call kvm_get_dirty_log_protect() to handle steps 1-3, and upon return we
2094 * always flush the TLB (step 4) even if a previous step failed and the dirty
2095 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2096 * API does not preclude a subsequent dirty log read by user space. Flushing
2097 * the TLB ensures that writes will be marked dirty for the next log read.
2098 *
2099 *   1. Take a snapshot of the bit and clear it if needed.
2100 *   2. Write protect the corresponding page.
2101 *   3. Copy the snapshot to the userspace.
2102 *   4. Flush TLB's if needed.
2103 */
2104static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2105				      struct kvm_dirty_log *log)
2106{
2107	int r;
2108
2109	mutex_lock(&kvm->slots_lock);
2110
2111	r = kvm_get_dirty_log_protect(kvm, log);
2112
2113	mutex_unlock(&kvm->slots_lock);
2114	return r;
2115}
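
/*
 * Illustrative userspace sketch (not part of this file) of retrieving the
 * dirty bitmap via KVM_GET_DIRTY_LOG, which lands in the handler above;
 * "vm_fd", "slot" and "bitmap" (caller-allocated, one bit per page of the
 * memslot) are hypothetical:
 *
 *	struct kvm_dirty_log log = {
 *		.slot = slot,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log))
 *		err(1, "KVM_GET_DIRTY_LOG");
 */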
2116
2117/**
2118 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2119 *	and reenable dirty page tracking for the corresponding pages.
2120 * @kvm:	pointer to kvm instance
2121 * @log:	slot id and address from which to fetch the bitmap of dirty pages
2122 */
2123static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2124				       struct kvm_clear_dirty_log *log)
2125{
2126	struct kvm_memslots *slots;
2127	struct kvm_memory_slot *memslot;
2128	int as_id, id;
2129	gfn_t offset;
2130	unsigned long i, n;
2131	unsigned long *dirty_bitmap;
2132	unsigned long *dirty_bitmap_buffer;
2133	bool flush;
2134
2135	/* Dirty ring tracking is exclusive to dirty log tracking */
2136	if (kvm->dirty_ring_size)
2137		return -ENXIO;
2138
2139	as_id = log->slot >> 16;
2140	id = (u16)log->slot;
2141	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2142		return -EINVAL;
2143
2144	if (log->first_page & 63)
2145		return -EINVAL;
2146
2147	slots = __kvm_memslots(kvm, as_id);
2148	memslot = id_to_memslot(slots, id);
2149	if (!memslot || !memslot->dirty_bitmap)
2150		return -ENOENT;
2151
2152	dirty_bitmap = memslot->dirty_bitmap;
2153
2154	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2155
2156	if (log->first_page > memslot->npages ||
2157	    log->num_pages > memslot->npages - log->first_page ||
2158	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2159		return -EINVAL;
2160
2161	kvm_arch_sync_dirty_log(kvm, memslot);
2162
2163	flush = false;
2164	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2165	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2166		return -EFAULT;
2167
2168	KVM_MMU_LOCK(kvm);
2169	for (offset = log->first_page, i = offset / BITS_PER_LONG,
2170		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2171	     i++, offset += BITS_PER_LONG) {
2172		unsigned long mask = *dirty_bitmap_buffer++;
2173		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2174		if (!mask)
2175			continue;
2176
2177		mask &= atomic_long_fetch_andnot(mask, p);
2178
2179		/*
2180		 * mask contains the bits that really have been cleared.  This
2181		 * never includes any bits beyond the length of the memslot (if
2182		 * the length is not aligned to 64 pages), therefore it is not
2183		 * a problem if userspace sets them in log->dirty_bitmap.
2184		*/
2185		if (mask) {
2186			flush = true;
2187			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2188								offset, mask);
2189		}
2190	}
2191	KVM_MMU_UNLOCK(kvm);
2192
2193	if (flush)
2194		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2195
2196	return 0;
2197}
2198
2199static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2200					struct kvm_clear_dirty_log *log)
2201{
2202	int r;
2203
2204	mutex_lock(&kvm->slots_lock);
2205
2206	r = kvm_clear_dirty_log_protect(kvm, log);
2207
2208	mutex_unlock(&kvm->slots_lock);
2209	return r;
2210}
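
/*
 * Illustrative userspace sketch (not part of this file) of re-arming dirty
 * tracking for a 64-page-aligned range via KVM_CLEAR_DIRTY_LOG (requires the
 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 capability to be enabled); "vm_fd",
 * "slot" and "bitmap" are hypothetical:
 *
 *	struct kvm_clear_dirty_log clear = {
 *		.slot = slot,
 *		.first_page = 0,
 *		.num_pages = 256,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear))
 *		err(1, "KVM_CLEAR_DIRTY_LOG");
 */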
2211#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2212
2213struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2214{
2215	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2216}
2217EXPORT_SYMBOL_GPL(gfn_to_memslot);
2218
2219struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2220{
2221	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2222	u64 gen = slots->generation;
2223	struct kvm_memory_slot *slot;
2224
2225	/*
2226	 * This also protects against using a memslot from a different address space,
2227	 * since different address spaces have different generation numbers.
2228	 */
2229	if (unlikely(gen != vcpu->last_used_slot_gen)) {
2230		vcpu->last_used_slot = NULL;
2231		vcpu->last_used_slot_gen = gen;
2232	}
2233
2234	slot = try_get_memslot(vcpu->last_used_slot, gfn);
2235	if (slot)
2236		return slot;
2237
2238	/*
2239	 * Fall back to searching all memslots. We purposely use
2240	 * search_memslots() instead of __gfn_to_memslot() to avoid
2241	 * thrashing the VM-wide last_used_slot in kvm_memslots.
2242	 */
2243	slot = search_memslots(slots, gfn, false);
2244	if (slot) {
2245		vcpu->last_used_slot = slot;
2246		return slot;
2247	}
2248
2249	return NULL;
2250}
2251EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2252
2253bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2254{
2255	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2256
2257	return kvm_is_visible_memslot(memslot);
2258}
2259EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2260
2261bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2262{
2263	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2264
2265	return kvm_is_visible_memslot(memslot);
2266}
2267EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2268
2269unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2270{
2271	struct vm_area_struct *vma;
2272	unsigned long addr, size;
2273
2274	size = PAGE_SIZE;
2275
2276	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2277	if (kvm_is_error_hva(addr))
2278		return PAGE_SIZE;
2279
2280	mmap_read_lock(current->mm);
2281	vma = find_vma(current->mm, addr);
2282	if (!vma)
2283		goto out;
2284
2285	size = vma_kernel_pagesize(vma);
2286
2287out:
2288	mmap_read_unlock(current->mm);
2289
2290	return size;
2291}
2292
2293static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2294{
2295	return slot->flags & KVM_MEM_READONLY;
2296}
2297
2298static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2299				       gfn_t *nr_pages, bool write)
2300{
2301	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2302		return KVM_HVA_ERR_BAD;
2303
2304	if (memslot_is_readonly(slot) && write)
2305		return KVM_HVA_ERR_RO_BAD;
2306
2307	if (nr_pages)
2308		*nr_pages = slot->npages - (gfn - slot->base_gfn);
2309
2310	return __gfn_to_hva_memslot(slot, gfn);
2311}
2312
2313static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2314				     gfn_t *nr_pages)
2315{
2316	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2317}
2318
2319unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2320					gfn_t gfn)
2321{
2322	return gfn_to_hva_many(slot, gfn, NULL);
2323}
2324EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2325
2326unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2327{
2328	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2329}
2330EXPORT_SYMBOL_GPL(gfn_to_hva);
2331
2332unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2333{
2334	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2335}
2336EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2337
2338/*
2339 * Return the hva of a @gfn and the R/W attribute if possible.
2340 *
2341 * @slot: the kvm_memory_slot which contains @gfn
2342 * @gfn: the gfn to be translated
2343 * @writable: used to return the read/write attribute of the @slot if the hva
2344 * is valid and @writable is not NULL
2345 */
2346unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2347				      gfn_t gfn, bool *writable)
2348{
2349	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2350
2351	if (!kvm_is_error_hva(hva) && writable)
2352		*writable = !memslot_is_readonly(slot);
2353
2354	return hva;
2355}
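
/*
 * Minimal in-kernel usage sketch for the helper above; "slot" and "gfn" are
 * assumed to come from the caller:
 *
 *	bool writable;
 *	unsigned long hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
 *
 *	if (kvm_is_error_hva(hva))
 *		return -EFAULT;
 *	// hva is valid here; writable reflects !KVM_MEM_READONLY
 */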
2356
2357unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2358{
2359	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2360
2361	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2362}
2363
2364unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2365{
2366	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2367
2368	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2369}
2370
2371static inline int check_user_page_hwpoison(unsigned long addr)
2372{
2373	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2374
2375	rc = get_user_pages(addr, 1, flags, NULL, NULL);
2376	return rc == -EHWPOISON;
2377}
2378
2379/*
2380 * The fast path to get the writable pfn which will be stored in @pfn.
2381 * true indicates success, otherwise false is returned.  It's also the
2382 * only part that can run in atomic context.
2383 */
2384static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2385			    bool *writable, kvm_pfn_t *pfn)
2386{
2387	struct page *page[1];
2388
2389	/*
2390	 * Fast pin a writable pfn only if it is a write fault request
2391	 * or the caller allows to map a writable pfn for a read fault
2392	 * request.
2393	 */
2394	if (!(write_fault || writable))
2395		return false;
2396
2397	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2398		*pfn = page_to_pfn(page[0]);
2399
2400		if (writable)
2401			*writable = true;
2402		return true;
2403	}
2404
2405	return false;
2406}
2407
2408/*
2409 * The slow path to get the pfn of the specified host virtual address,
2410 * 1 indicates success, -errno is returned if error is detected.
2411 */
2412static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2413			   bool *writable, kvm_pfn_t *pfn)
2414{
2415	unsigned int flags = FOLL_HWPOISON;
2416	struct page *page;
2417	int npages = 0;
2418
2419	might_sleep();
2420
2421	if (writable)
2422		*writable = write_fault;
2423
2424	if (write_fault)
2425		flags |= FOLL_WRITE;
2426	if (async)
2427		flags |= FOLL_NOWAIT;
2428
2429	npages = get_user_pages_unlocked(addr, 1, &page, flags);
2430	if (npages != 1)
2431		return npages;
2432
2433	/* map read fault as writable if possible */
2434	if (unlikely(!write_fault) && writable) {
2435		struct page *wpage;
2436
2437		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2438			*writable = true;
2439			put_page(page);
2440			page = wpage;
2441		}
2442	}
2443	*pfn = page_to_pfn(page);
2444	return npages;
2445}
2446
2447static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2448{
2449	if (unlikely(!(vma->vm_flags & VM_READ)))
2450		return false;
2451
2452	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2453		return false;
2454
2455	return true;
2456}
2457
2458static int kvm_try_get_pfn(kvm_pfn_t pfn)
2459{
2460	if (kvm_is_reserved_pfn(pfn))
2461		return 1;
2462	return get_page_unless_zero(pfn_to_page(pfn));
2463}
2464
2465static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2466			       unsigned long addr, bool *async,
2467			       bool write_fault, bool *writable,
2468			       kvm_pfn_t *p_pfn)
2469{
2470	kvm_pfn_t pfn;
2471	pte_t *ptep;
2472	spinlock_t *ptl;
2473	int r;
2474
2475	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2476	if (r) {
2477		/*
2478		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2479		 * not call the fault handler, so do it here.
2480		 */
2481		bool unlocked = false;
2482		r = fixup_user_fault(current->mm, addr,
2483				     (write_fault ? FAULT_FLAG_WRITE : 0),
2484				     &unlocked);
2485		if (unlocked)
2486			return -EAGAIN;
2487		if (r)
2488			return r;
2489
2490		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2491		if (r)
2492			return r;
2493	}
2494
2495	if (write_fault && !pte_write(*ptep)) {
2496		pfn = KVM_PFN_ERR_RO_FAULT;
2497		goto out;
2498	}
2499
2500	if (writable)
2501		*writable = pte_write(*ptep);
2502	pfn = pte_pfn(*ptep);
2503
2504	/*
2505	 * Get a reference here because callers of *hva_to_pfn* and
2506	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2507	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2508	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2509	 * simply do nothing for reserved pfns.
2510	 *
2511	 * Whoever called remap_pfn_range is also going to call e.g.
2512	 * unmap_mapping_range before the underlying pages are freed,
2513	 * causing a call to our MMU notifier.
2514	 *
2515	 * Certain IO or PFNMAP mappings can be backed with valid
2516	 * struct pages, but be allocated without refcounting e.g.,
2517	 * tail pages of non-compound higher order allocations, which
2518	 * would then underflow the refcount when the caller does the
2519	 * required put_page. Don't allow those pages here.
2520	 */
2521	if (!kvm_try_get_pfn(pfn))
2522		r = -EFAULT;
2523
2524out:
2525	pte_unmap_unlock(ptep, ptl);
2526	*p_pfn = pfn;
2527
2528	return r;
2529}
2530
2531/*
2532 * Pin guest page in memory and return its pfn.
2533 * @addr: host virtual address which maps memory to the guest
2534 * @atomic: whether this call is made in atomic context and thus must not sleep
2535 * @async: whether this function needs to wait for I/O to complete if the
2536 *         host page is not in memory
2537 * @write_fault: whether we should get a writable host page
2538 * @writable: whether to allow mapping a writable host page for !@write_fault
2539 *
2540 * The function will map a writable host page for these two cases:
2541 * 1): @write_fault = true
2542 * 2): @write_fault = false && @writable, @writable will tell the caller
2543 *     whether the mapping is writable.
2544 */
2545kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2546		     bool write_fault, bool *writable)
2547{
2548	struct vm_area_struct *vma;
2549	kvm_pfn_t pfn = 0;
2550	int npages, r;
2551
2552	/* we can do it either atomically or asynchronously, not both */
2553	BUG_ON(atomic && async);
2554
2555	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2556		return pfn;
2557
2558	if (atomic)
2559		return KVM_PFN_ERR_FAULT;
2560
2561	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2562	if (npages == 1)
2563		return pfn;
2564
2565	mmap_read_lock(current->mm);
2566	if (npages == -EHWPOISON ||
2567	      (!async && check_user_page_hwpoison(addr))) {
2568		pfn = KVM_PFN_ERR_HWPOISON;
2569		goto exit;
2570	}
2571
2572retry:
2573	vma = vma_lookup(current->mm, addr);
2574
2575	if (vma == NULL)
2576		pfn = KVM_PFN_ERR_FAULT;
2577	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2578		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2579		if (r == -EAGAIN)
2580			goto retry;
2581		if (r < 0)
2582			pfn = KVM_PFN_ERR_FAULT;
2583	} else {
2584		if (async && vma_is_valid(vma, write_fault))
2585			*async = true;
2586		pfn = KVM_PFN_ERR_FAULT;
2587	}
2588exit:
2589	mmap_read_unlock(current->mm);
2590	return pfn;
2591}
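
/*
 * Minimal usage sketch of the contract described above: a read fault that is
 * opportunistically mapped writable when possible.  "addr" is an assumed,
 * already validated host virtual address:
 *
 *	bool writable;
 *	kvm_pfn_t pfn = hva_to_pfn(addr, false, NULL, false, &writable);
 *
 *	if (is_error_noslot_pfn(pfn))
 *		return -EFAULT;
 *	// writable now says whether the returned pfn was mapped writable
 *	kvm_release_pfn_clean(pfn);
 */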
2592
2593kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2594			       bool atomic, bool *async, bool write_fault,
2595			       bool *writable, hva_t *hva)
2596{
2597	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2598
2599	if (hva)
2600		*hva = addr;
2601
2602	if (addr == KVM_HVA_ERR_RO_BAD) {
2603		if (writable)
2604			*writable = false;
2605		return KVM_PFN_ERR_RO_FAULT;
2606	}
2607
2608	if (kvm_is_error_hva(addr)) {
2609		if (writable)
2610			*writable = false;
2611		return KVM_PFN_NOSLOT;
2612	}
2613
2614	/* Do not map writable pfn in the readonly memslot. */
2615	if (writable && memslot_is_readonly(slot)) {
2616		*writable = false;
2617		writable = NULL;
2618	}
2619
2620	return hva_to_pfn(addr, atomic, async, write_fault,
2621			  writable);
2622}
2623EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2624
2625kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2626		      bool *writable)
2627{
2628	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2629				    write_fault, writable, NULL);
2630}
2631EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2632
2633kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2634{
2635	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2636}
2637EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2638
2639kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2640{
2641	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2642}
2643EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2644
2645kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2646{
2647	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2648}
2649EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2650
2651kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2652{
2653	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2654}
2655EXPORT_SYMBOL_GPL(gfn_to_pfn);
2656
2657kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2658{
2659	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2660}
2661EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2662
2663int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2664			    struct page **pages, int nr_pages)
2665{
2666	unsigned long addr;
2667	gfn_t entry = 0;
2668
2669	addr = gfn_to_hva_many(slot, gfn, &entry);
2670	if (kvm_is_error_hva(addr))
2671		return -1;
2672
2673	if (entry < nr_pages)
2674		return 0;
2675
2676	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2677}
2678EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2679
2680static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2681{
2682	if (is_error_noslot_pfn(pfn))
2683		return KVM_ERR_PTR_BAD_PAGE;
2684
2685	if (kvm_is_reserved_pfn(pfn)) {
2686		WARN_ON(1);
2687		return KVM_ERR_PTR_BAD_PAGE;
2688	}
2689
2690	return pfn_to_page(pfn);
2691}
2692
2693struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2694{
2695	kvm_pfn_t pfn;
2696
2697	pfn = gfn_to_pfn(kvm, gfn);
2698
2699	return kvm_pfn_to_page(pfn);
2700}
2701EXPORT_SYMBOL_GPL(gfn_to_page);
2702
2703void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2704{
2705	if (pfn == 0)
2706		return;
2707
2708	if (dirty)
2709		kvm_release_pfn_dirty(pfn);
2710	else
2711		kvm_release_pfn_clean(pfn);
2712}
2713
2714int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2715{
2716	kvm_pfn_t pfn;
2717	void *hva = NULL;
2718	struct page *page = KVM_UNMAPPED_PAGE;
2719
2720	if (!map)
2721		return -EINVAL;
2722
2723	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2724	if (is_error_noslot_pfn(pfn))
2725		return -EINVAL;
2726
2727	if (pfn_valid(pfn)) {
2728		page = pfn_to_page(pfn);
2729		hva = kmap(page);
2730#ifdef CONFIG_HAS_IOMEM
2731	} else {
2732		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2733#endif
2734	}
2735
2736	if (!hva)
2737		return -EFAULT;
2738
2739	map->page = page;
2740	map->hva = hva;
2741	map->pfn = pfn;
2742	map->gfn = gfn;
2743
2744	return 0;
2745}
2746EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2747
2748void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2749{
2750	if (!map)
2751		return;
2752
2753	if (!map->hva)
2754		return;
2755
2756	if (map->page != KVM_UNMAPPED_PAGE)
2757		kunmap(map->page);
2758#ifdef CONFIG_HAS_IOMEM
2759	else
2760		memunmap(map->hva);
2761#endif
2762
2763	if (dirty)
2764		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2765
2766	kvm_release_pfn(map->pfn, dirty);
2767
2768	map->hva = NULL;
2769	map->page = NULL;
2770}
2771EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
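
/*
 * Minimal usage sketch of the map/unmap pair above; "vcpu" and "gfn" are
 * assumed to come from the caller:
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gfn, &map))
 *		return -EFAULT;
 *	memset(map.hva, 0, PAGE_SIZE);		// any host-side access
 *	kvm_vcpu_unmap(vcpu, &map, true);	// true: the page was dirtied
 */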
2772
2773struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2774{
2775	kvm_pfn_t pfn;
2776
2777	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2778
2779	return kvm_pfn_to_page(pfn);
2780}
2781EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2782
2783void kvm_release_page_clean(struct page *page)
2784{
2785	WARN_ON(is_error_page(page));
2786
2787	kvm_release_pfn_clean(page_to_pfn(page));
2788}
2789EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2790
2791void kvm_release_pfn_clean(kvm_pfn_t pfn)
2792{
2793	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2794		put_page(pfn_to_page(pfn));
2795}
2796EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2797
2798void kvm_release_page_dirty(struct page *page)
2799{
2800	WARN_ON(is_error_page(page));
2801
2802	kvm_release_pfn_dirty(page_to_pfn(page));
2803}
2804EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2805
2806void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2807{
2808	kvm_set_pfn_dirty(pfn);
2809	kvm_release_pfn_clean(pfn);
2810}
2811EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2812
2813void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2814{
2815	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2816		SetPageDirty(pfn_to_page(pfn));
2817}
2818EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2819
2820void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2821{
2822	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2823		mark_page_accessed(pfn_to_page(pfn));
2824}
2825EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2826
2827static int next_segment(unsigned long len, int offset)
2828{
2829	if (len > PAGE_SIZE - offset)
2830		return PAGE_SIZE - offset;
2831	else
2832		return len;
2833}
2834
2835static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2836				 void *data, int offset, int len)
2837{
2838	int r;
2839	unsigned long addr;
2840
2841	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2842	if (kvm_is_error_hva(addr))
2843		return -EFAULT;
2844	r = __copy_from_user(data, (void __user *)addr + offset, len);
2845	if (r)
2846		return -EFAULT;
2847	return 0;
2848}
2849
2850int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2851			int len)
2852{
2853	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2854
2855	return __kvm_read_guest_page(slot, gfn, data, offset, len);
2856}
2857EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2858
2859int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2860			     int offset, int len)
2861{
2862	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2863
2864	return __kvm_read_guest_page(slot, gfn, data, offset, len);
2865}
2866EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2867
2868int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2869{
2870	gfn_t gfn = gpa >> PAGE_SHIFT;
2871	int seg;
2872	int offset = offset_in_page(gpa);
2873	int ret;
2874
2875	while ((seg = next_segment(len, offset)) != 0) {
2876		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2877		if (ret < 0)
2878			return ret;
2879		offset = 0;
2880		len -= seg;
2881		data += seg;
2882		++gfn;
2883	}
2884	return 0;
2885}
2886EXPORT_SYMBOL_GPL(kvm_read_guest);
2887
2888int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2889{
2890	gfn_t gfn = gpa >> PAGE_SHIFT;
2891	int seg;
2892	int offset = offset_in_page(gpa);
2893	int ret;
2894
2895	while ((seg = next_segment(len, offset)) != 0) {
2896		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2897		if (ret < 0)
2898			return ret;
2899		offset = 0;
2900		len -= seg;
2901		data += seg;
2902		++gfn;
2903	}
2904	return 0;
2905}
2906EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2907
2908static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2909			           void *data, int offset, unsigned long len)
2910{
2911	int r;
2912	unsigned long addr;
2913
2914	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2915	if (kvm_is_error_hva(addr))
2916		return -EFAULT;
2917	pagefault_disable();
2918	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2919	pagefault_enable();
2920	if (r)
2921		return -EFAULT;
2922	return 0;
2923}
2924
2925int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2926			       void *data, unsigned long len)
2927{
2928	gfn_t gfn = gpa >> PAGE_SHIFT;
2929	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2930	int offset = offset_in_page(gpa);
2931
2932	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2933}
2934EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2935
2936static int __kvm_write_guest_page(struct kvm *kvm,
2937				  struct kvm_memory_slot *memslot, gfn_t gfn,
2938			          const void *data, int offset, int len)
2939{
2940	int r;
2941	unsigned long addr;
2942
2943	addr = gfn_to_hva_memslot(memslot, gfn);
2944	if (kvm_is_error_hva(addr))
2945		return -EFAULT;
2946	r = __copy_to_user((void __user *)addr + offset, data, len);
2947	if (r)
2948		return -EFAULT;
2949	mark_page_dirty_in_slot(kvm, memslot, gfn);
2950	return 0;
2951}
2952
2953int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2954			 const void *data, int offset, int len)
2955{
2956	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2957
2958	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2959}
2960EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2961
2962int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2963			      const void *data, int offset, int len)
2964{
2965	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2966
2967	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2968}
2969EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2970
2971int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2972		    unsigned long len)
2973{
2974	gfn_t gfn = gpa >> PAGE_SHIFT;
2975	int seg;
2976	int offset = offset_in_page(gpa);
2977	int ret;
2978
2979	while ((seg = next_segment(len, offset)) != 0) {
2980		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2981		if (ret < 0)
2982			return ret;
2983		offset = 0;
2984		len -= seg;
2985		data += seg;
2986		++gfn;
2987	}
2988	return 0;
2989}
2990EXPORT_SYMBOL_GPL(kvm_write_guest);
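
/*
 * Minimal usage sketch of the read/write helpers above, copying a small
 * structure at an assumed guest physical address "gpa"; "struct foo" is
 * hypothetical:
 *
 *	struct foo val;
 *
 *	if (kvm_read_guest(kvm, gpa, &val, sizeof(val)))
 *		return -EFAULT;
 *	val.flags |= 1;
 *	if (kvm_write_guest(kvm, gpa, &val, sizeof(val)))
 *		return -EFAULT;
 */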
2991
2992int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2993		         unsigned long len)
2994{
2995	gfn_t gfn = gpa >> PAGE_SHIFT;
2996	int seg;
2997	int offset = offset_in_page(gpa);
2998	int ret;
2999
3000	while ((seg = next_segment(len, offset)) != 0) {
3001		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3002		if (ret < 0)
3003			return ret;
3004		offset = 0;
3005		len -= seg;
3006		data += seg;
3007		++gfn;
3008	}
3009	return 0;
3010}
3011EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3012
3013static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3014				       struct gfn_to_hva_cache *ghc,
3015				       gpa_t gpa, unsigned long len)
3016{
3017	int offset = offset_in_page(gpa);
3018	gfn_t start_gfn = gpa >> PAGE_SHIFT;
3019	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3020	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3021	gfn_t nr_pages_avail;
3022
3023	/* Update ghc->generation before performing any error checks. */
3024	ghc->generation = slots->generation;
3025
3026	if (start_gfn > end_gfn) {
3027		ghc->hva = KVM_HVA_ERR_BAD;
3028		return -EINVAL;
3029	}
3030
3031	/*
3032	 * If the requested region crosses two memslots, we still
3033	 * verify that the entire region is valid here.
3034	 */
3035	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3036		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3037		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3038					   &nr_pages_avail);
3039		if (kvm_is_error_hva(ghc->hva))
3040			return -EFAULT;
3041	}
3042
3043	/* Use the slow path for cross page reads and writes. */
3044	if (nr_pages_needed == 1)
3045		ghc->hva += offset;
3046	else
3047		ghc->memslot = NULL;
3048
3049	ghc->gpa = gpa;
3050	ghc->len = len;
3051	return 0;
3052}
3053
3054int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3055			      gpa_t gpa, unsigned long len)
3056{
3057	struct kvm_memslots *slots = kvm_memslots(kvm);
3058	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3059}
3060EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3061
3062int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3063				  void *data, unsigned int offset,
3064				  unsigned long len)
3065{
3066	struct kvm_memslots *slots = kvm_memslots(kvm);
3067	int r;
3068	gpa_t gpa = ghc->gpa + offset;
3069
3070	if (WARN_ON_ONCE(len + offset > ghc->len))
3071		return -EINVAL;
3072
3073	if (slots->generation != ghc->generation) {
3074		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3075			return -EFAULT;
3076	}
3077
3078	if (kvm_is_error_hva(ghc->hva))
3079		return -EFAULT;
3080
3081	if (unlikely(!ghc->memslot))
3082		return kvm_write_guest(kvm, gpa, data, len);
3083
3084	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3085	if (r)
3086		return -EFAULT;
3087	mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3088
3089	return 0;
3090}
3091EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3092
3093int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3094			   void *data, unsigned long len)
3095{
3096	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3097}
3098EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3099
3100int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3101				 void *data, unsigned int offset,
3102				 unsigned long len)
3103{
3104	struct kvm_memslots *slots = kvm_memslots(kvm);
3105	int r;
3106	gpa_t gpa = ghc->gpa + offset;
3107
3108	if (WARN_ON_ONCE(len + offset > ghc->len))
3109		return -EINVAL;
3110
3111	if (slots->generation != ghc->generation) {
3112		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3113			return -EFAULT;
3114	}
3115
3116	if (kvm_is_error_hva(ghc->hva))
3117		return -EFAULT;
3118
3119	if (unlikely(!ghc->memslot))
3120		return kvm_read_guest(kvm, gpa, data, len);
3121
3122	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3123	if (r)
3124		return -EFAULT;
3125
3126	return 0;
3127}
3128EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3129
3130int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3131			  void *data, unsigned long len)
3132{
3133	return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3134}
3135EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
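
/*
 * Minimal usage sketch of the cached accessors above.  The cache is
 * initialized once against a fixed gpa/len and then reused; a stale
 * generation is revalidated internally on the next access.  "gpa" and "val"
 * are assumed to come from the caller:
 *
 *	struct gfn_to_hva_cache ghc;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	if (kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 *	if (kvm_read_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 */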
3136
3137int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3138{
3139	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3140	gfn_t gfn = gpa >> PAGE_SHIFT;
3141	int seg;
3142	int offset = offset_in_page(gpa);
3143	int ret;
3144
3145	while ((seg = next_segment(len, offset)) != 0) {
3146		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3147		if (ret < 0)
3148			return ret;
3149		offset = 0;
3150		len -= seg;
3151		++gfn;
3152	}
3153	return 0;
3154}
3155EXPORT_SYMBOL_GPL(kvm_clear_guest);
3156
3157void mark_page_dirty_in_slot(struct kvm *kvm,
3158			     const struct kvm_memory_slot *memslot,
3159		 	     gfn_t gfn)
3160{
3161	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3162
3163#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3164	if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
3165		return;
3166#endif
3167
3168	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3169		unsigned long rel_gfn = gfn - memslot->base_gfn;
3170		u32 slot = (memslot->as_id << 16) | memslot->id;
3171
3172		if (kvm->dirty_ring_size)
3173			kvm_dirty_ring_push(&vcpu->dirty_ring,
3174					    slot, rel_gfn);
3175		else
3176			set_bit_le(rel_gfn, memslot->dirty_bitmap);
3177	}
3178}
3179EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3180
3181void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3182{
3183	struct kvm_memory_slot *memslot;
3184
3185	memslot = gfn_to_memslot(kvm, gfn);
3186	mark_page_dirty_in_slot(kvm, memslot, gfn);
3187}
3188EXPORT_SYMBOL_GPL(mark_page_dirty);
3189
3190void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3191{
3192	struct kvm_memory_slot *memslot;
3193
3194	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3195	mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3196}
3197EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3198
3199void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3200{
3201	if (!vcpu->sigset_active)
3202		return;
3203
3204	/*
3205	 * This does a lockless modification of ->real_blocked, which is fine
3206	 * because only current can change ->real_blocked and all readers of
3207	 * ->real_blocked don't care as long as ->real_blocked is always a subset
3208	 * of ->blocked.
3209	 */
3210	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3211}
3212
3213void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3214{
3215	if (!vcpu->sigset_active)
3216		return;
3217
3218	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3219	sigemptyset(&current->real_blocked);
3220}
3221
3222static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3223{
3224	unsigned int old, val, grow, grow_start;
3225
3226	old = val = vcpu->halt_poll_ns;
3227	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3228	grow = READ_ONCE(halt_poll_ns_grow);
3229	if (!grow)
3230		goto out;
3231
3232	val *= grow;
3233	if (val < grow_start)
3234		val = grow_start;
3235
3236	if (val > vcpu->kvm->max_halt_poll_ns)
3237		val = vcpu->kvm->max_halt_poll_ns;
3238
3239	vcpu->halt_poll_ns = val;
3240out:
3241	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3242}
3243
3244static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3245{
3246	unsigned int old, val, shrink, grow_start;
3247
3248	old = val = vcpu->halt_poll_ns;
3249	shrink = READ_ONCE(halt_poll_ns_shrink);
3250	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3251	if (shrink == 0)
3252		val = 0;
3253	else
3254		val /= shrink;
3255
3256	if (val < grow_start)
3257		val = 0;
3258
3259	vcpu->halt_poll_ns = val;
3260	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3261}
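
/*
 * Worked example of the grow/shrink arithmetic above, assuming illustrative
 * values halt_poll_ns_grow = 2, halt_poll_ns_grow_start = 10000,
 * halt_poll_ns_shrink = 0 and max_halt_poll_ns = 200000:
 *
 *	grow:   0 -> 10000 -> 20000 -> 40000 -> ... -> 200000 (clamped)
 *	shrink: any value -> 0 (a shrink divisor of 0 resets polling entirely)
 */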
3262
3263static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3264{
3265	int ret = -EINTR;
3266	int idx = srcu_read_lock(&vcpu->kvm->srcu);
3267
3268	if (kvm_arch_vcpu_runnable(vcpu)) {
3269		kvm_make_request(KVM_REQ_UNHALT, vcpu);
3270		goto out;
3271	}
3272	if (kvm_cpu_has_pending_timer(vcpu))
3273		goto out;
3274	if (signal_pending(current))
3275		goto out;
3276	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3277		goto out;
3278
3279	ret = 0;
3280out:
3281	srcu_read_unlock(&vcpu->kvm->srcu, idx);
3282	return ret;
3283}
3284
3285/*
3286 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3287 * pending.  This is mostly used when halting a vCPU, but may also be used
3288 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3289 */
3290bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3291{
3292	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3293	bool waited = false;
3294
3295	vcpu->stat.generic.blocking = 1;
3296
3297	kvm_arch_vcpu_blocking(vcpu);
3298
3299	prepare_to_rcuwait(wait);
3300	for (;;) {
3301		set_current_state(TASK_INTERRUPTIBLE);
3302
3303		if (kvm_vcpu_check_block(vcpu) < 0)
3304			break;
3305
3306		waited = true;
3307		schedule();
3308	}
3309	finish_rcuwait(wait);
3310
3311	kvm_arch_vcpu_unblocking(vcpu);
3312
3313	vcpu->stat.generic.blocking = 0;
3314
3315	return waited;
3316}
3317
3318static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3319					  ktime_t end, bool success)
3320{
3321	struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3322	u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3323
3324	++vcpu->stat.generic.halt_attempted_poll;
3325
3326	if (success) {
3327		++vcpu->stat.generic.halt_successful_poll;
3328
3329		if (!vcpu_valid_wakeup(vcpu))
3330			++vcpu->stat.generic.halt_poll_invalid;
3331
3332		stats->halt_poll_success_ns += poll_ns;
3333		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3334	} else {
3335		stats->halt_poll_fail_ns += poll_ns;
3336		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3337	}
3338}
3339
3340/*
3341 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
3342 * polling is enabled, busy wait for a short time before blocking to avoid the
3343 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3344 * is halted.
3345 */
3346void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3347{
3348	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3349	bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3350	ktime_t start, cur, poll_end;
3351	bool waited = false;
3352	u64 halt_ns;
3353
3354	start = cur = poll_end = ktime_get();
3355	if (do_halt_poll) {
3356		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3357
3358		do {
3359			/*
3360			 * This sets KVM_REQ_UNHALT if an interrupt
3361			 * arrives.
3362			 */
3363			if (kvm_vcpu_check_block(vcpu) < 0)
3364				goto out;
3365			cpu_relax();
3366			poll_end = cur = ktime_get();
3367		} while (kvm_vcpu_can_poll(cur, stop));
3368	}
3369
3370	waited = kvm_vcpu_block(vcpu);
3371
3372	cur = ktime_get();
3373	if (waited) {
3374		vcpu->stat.generic.halt_wait_ns +=
3375			ktime_to_ns(cur) - ktime_to_ns(poll_end);
3376		KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3377				ktime_to_ns(cur) - ktime_to_ns(poll_end));
3378	}
3379out:
3380	/* The total time the vCPU was "halted", including polling time. */
3381	halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3382
3383	/*
3384	 * Note, halt-polling is considered successful so long as the vCPU was
3385	 * never actually scheduled out, i.e. even if the wake event arrived
3386	 * after the halt-polling loop itself ended, but before the full wait.
3387	 */
3388	if (do_halt_poll)
3389		update_halt_poll_stats(vcpu, start, poll_end, !waited);
3390
3391	if (halt_poll_allowed) {
3392		if (!vcpu_valid_wakeup(vcpu)) {
3393			shrink_halt_poll_ns(vcpu);
3394		} else if (vcpu->kvm->max_halt_poll_ns) {
3395			if (halt_ns <= vcpu->halt_poll_ns)
3396				;
3397			/* we had a long block, shrink polling */
3398			else if (vcpu->halt_poll_ns &&
3399				 halt_ns > vcpu->kvm->max_halt_poll_ns)
3400				shrink_halt_poll_ns(vcpu);
3401			/* we had a short halt and our poll time is too small */
3402			else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3403				 halt_ns < vcpu->kvm->max_halt_poll_ns)
3404				grow_halt_poll_ns(vcpu);
3405		} else {
3406			vcpu->halt_poll_ns = 0;
3407		}
3408	}
3409
3410	trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3411}
3412EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3413
3414bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3415{
3416	if (__kvm_vcpu_wake_up(vcpu)) {
3417		WRITE_ONCE(vcpu->ready, true);
3418		++vcpu->stat.generic.halt_wakeup;
3419		return true;
3420	}
3421
3422	return false;
3423}
3424EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3425
3426#ifndef CONFIG_S390
3427/*
3428 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3429 */
3430void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3431{
3432	int me, cpu;
3433
3434	if (kvm_vcpu_wake_up(vcpu))
3435		return;
3436
3437	me = get_cpu();
3438	/*
3439	 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3440	 * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3441	 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3442	 * within the vCPU thread itself.
3443	 */
3444	if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3445		if (vcpu->mode == IN_GUEST_MODE)
3446			WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3447		goto out;
3448	}
3449
3450	/*
3451	 * Note, the vCPU could get migrated to a different pCPU at any point
3452	 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3453	 * IPI to the previous pCPU.  But, that's ok because the purpose of the
3454	 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3455	 * vCPU also requires it to leave IN_GUEST_MODE.
3456	 */
3457	if (kvm_arch_vcpu_should_kick(vcpu)) {
3458		cpu = READ_ONCE(vcpu->cpu);
3459		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3460			smp_send_reschedule(cpu);
3461	}
3462out:
3463	put_cpu();
3464}
3465EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3466#endif /* !CONFIG_S390 */
3467
3468int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3469{
3470	struct pid *pid;
3471	struct task_struct *task = NULL;
3472	int ret = 0;
3473
3474	rcu_read_lock();
3475	pid = rcu_dereference(target->pid);
3476	if (pid)
3477		task = get_pid_task(pid, PIDTYPE_PID);
3478	rcu_read_unlock();
3479	if (!task)
3480		return ret;
3481	ret = yield_to(task, 1);
3482	put_task_struct(task);
3483
3484	return ret;
3485}
3486EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3487
3488/*
3489 * Helper that checks whether a VCPU is eligible for directed yield.
3490 * Most eligible candidate to yield is decided by following heuristics:
3491 *
3492 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3493 *  (preempted lock holder), indicated by @in_spin_loop.
3494 *  Set at the beginning and cleared at the end of interception/PLE handler.
3495 *
3496 *  (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a
3497 *  chance last time (it has most likely become eligible now since we probably
3498 *  yielded to the lock holder in the last iteration. This is done by toggling
3499 *  @dy_eligible each time a VCPU is checked for eligibility.)
3500 *
3501 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3502 *  to preempted lock-holder could result in wrong VCPU selection and CPU
3503 *  burning. Giving priority for a potential lock-holder increases lock
3504 *  progress.
3505 *
3506 *  Since the algorithm is based on heuristics, accessing another VCPU's data
3507 *  without locking does no harm. It may result in trying to yield to the same
3508 *  VCPU, failing, and continuing with the next VCPU, and so on.
3509 */
3510static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3511{
3512#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3513	bool eligible;
3514
3515	eligible = !vcpu->spin_loop.in_spin_loop ||
3516		    vcpu->spin_loop.dy_eligible;
3517
3518	if (vcpu->spin_loop.in_spin_loop)
3519		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3520
3521	return eligible;
3522#else
3523	return true;
3524#endif
3525}
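
/*
 * Illustrative trace of the toggling above for one vCPU that stays in a spin
 * loop across successive directed-yield rounds:
 *
 *	round 1: in_spin_loop, dy_eligible=false -> not eligible, toggle to true
 *	round 2: in_spin_loop, dy_eligible=true  -> eligible,     toggle to false
 *
 * so a vCPU skipped in one round gets considered in the next.
 */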
3526
3527/*
3528 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3529 * a vcpu_load/vcpu_put pair.  However, for most architectures
3530 * kvm_arch_vcpu_runnable does not require vcpu_load.
3531 */
3532bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3533{
3534	return kvm_arch_vcpu_runnable(vcpu);
3535}
3536
3537static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3538{
3539	if (kvm_arch_dy_runnable(vcpu))
3540		return true;
3541
3542#ifdef CONFIG_KVM_ASYNC_PF
3543	if (!list_empty_careful(&vcpu->async_pf.done))
3544		return true;
3545#endif
3546
3547	return false;
3548}
3549
3550bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3551{
3552	return false;
3553}
3554
3555void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3556{
3557	struct kvm *kvm = me->kvm;
3558	struct kvm_vcpu *vcpu;
3559	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3560	unsigned long i;
3561	int yielded = 0;
3562	int try = 3;
3563	int pass;
3564
3565	kvm_vcpu_set_in_spin_loop(me, true);
3566	/*
3567	 * We boost the priority of a VCPU that is runnable but not
3568	 * currently running, because it got preempted by something
3569	 * else and called schedule in __vcpu_run.  Hopefully that
3570	 * VCPU is holding the lock that we need and will release it.
3571	 * We approximate round-robin by starting at the last boosted VCPU.
3572	 */
3573	for (pass = 0; pass < 2 && !yielded && try; pass++) {
3574		kvm_for_each_vcpu(i, vcpu, kvm) {
3575			if (!pass && i <= last_boosted_vcpu) {
3576				i = last_boosted_vcpu;
3577				continue;
3578			} else if (pass && i > last_boosted_vcpu)
3579				break;
3580			if (!READ_ONCE(vcpu->ready))
3581				continue;
3582			if (vcpu == me)
3583				continue;
3584			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3585				continue;
3586			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3587			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3588			    !kvm_arch_vcpu_in_kernel(vcpu))
3589				continue;
3590			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3591				continue;
3592
3593			yielded = kvm_vcpu_yield_to(vcpu);
3594			if (yielded > 0) {
3595				kvm->last_boosted_vcpu = i;
3596				break;
3597			} else if (yielded < 0) {
3598				try--;
3599				if (!try)
3600					break;
3601			}
3602		}
3603	}
3604	kvm_vcpu_set_in_spin_loop(me, false);
3605
3606	/* Ensure vcpu is not eligible during next spinloop */
3607	kvm_vcpu_set_dy_eligible(me, false);
3608}
3609EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3610
3611static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3612{
3613#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3614	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3615	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3616	     kvm->dirty_ring_size / PAGE_SIZE);
3617#else
3618	return false;
3619#endif
3620}
3621
3622static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3623{
3624	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3625	struct page *page;
3626
3627	if (vmf->pgoff == 0)
3628		page = virt_to_page(vcpu->run);
3629#ifdef CONFIG_X86
3630	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3631		page = virt_to_page(vcpu->arch.pio_data);
3632#endif
3633#ifdef CONFIG_KVM_MMIO
3634	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3635		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3636#endif
3637	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3638		page = kvm_dirty_ring_get_page(
3639		    &vcpu->dirty_ring,
3640		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3641	else
3642		return kvm_arch_vcpu_fault(vcpu, vmf);
3643	get_page(page);
3644	vmf->page = page;
3645	return 0;
3646}
3647
3648static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3649	.fault = kvm_vcpu_fault,
3650};
3651
3652static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3653{
3654	struct kvm_vcpu *vcpu = file->private_data;
3655	unsigned long pages = vma_pages(vma);
3656
3657	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3658	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3659	    ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3660		return -EINVAL;
3661
3662	vma->vm_ops = &kvm_vcpu_vm_ops;
3663	return 0;
3664}
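
/*
 * Illustrative userspace sketch (not part of this file) of how a vCPU fd is
 * typically mmap()ed to reach struct kvm_run at page offset 0; "kvm_fd" (the
 * /dev/kvm descriptor) and "vcpu_fd" are hypothetical:
 *
 *	int run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, run_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 *	if (run == MAP_FAILED)
 *		err(1, "mmap vcpu");
 */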
3665
3666static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3667{
3668	struct kvm_vcpu *vcpu = filp->private_data;
3669
3670	kvm_put_kvm(vcpu->kvm);
3671	return 0;
3672}
3673
3674static struct file_operations kvm_vcpu_fops = {
3675	.release        = kvm_vcpu_release,
3676	.unlocked_ioctl = kvm_vcpu_ioctl,
3677	.mmap           = kvm_vcpu_mmap,
3678	.llseek		= noop_llseek,
3679	KVM_COMPAT(kvm_vcpu_compat_ioctl),
3680};
3681
3682/*
3683 * Allocates an inode for the vcpu.
3684 */
3685static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3686{
3687	char name[8 + 1 + ITOA_MAX_LEN + 1];
3688
3689	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3690	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3691}
3692
3693static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3694{
3695#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3696	struct dentry *debugfs_dentry;
3697	char dir_name[ITOA_MAX_LEN * 2];
3698
3699	if (!debugfs_initialized())
3700		return;
3701
3702	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3703	debugfs_dentry = debugfs_create_dir(dir_name,
3704					    vcpu->kvm->debugfs_dentry);
3705
3706	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3707#endif
3708}
3709
3710/*
3711 * Creates some virtual cpus.  Good luck creating more than one.
3712 */
3713static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3714{
3715	int r;
3716	struct kvm_vcpu *vcpu;
3717	struct page *page;
3718
3719	if (id >= KVM_MAX_VCPU_IDS)
3720		return -EINVAL;
3721
3722	mutex_lock(&kvm->lock);
3723	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3724		mutex_unlock(&kvm->lock);
3725		return -EINVAL;
3726	}
3727
3728	kvm->created_vcpus++;
3729	mutex_unlock(&kvm->lock);
3730
3731	r = kvm_arch_vcpu_precreate(kvm, id);
3732	if (r)
3733		goto vcpu_decrement;
3734
3735	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3736	if (!vcpu) {
3737		r = -ENOMEM;
3738		goto vcpu_decrement;
3739	}
3740
3741	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3742	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3743	if (!page) {
3744		r = -ENOMEM;
3745		goto vcpu_free;
3746	}
3747	vcpu->run = page_address(page);
3748
3749	kvm_vcpu_init(vcpu, kvm, id);
3750
3751	r = kvm_arch_vcpu_create(vcpu);
3752	if (r)
3753		goto vcpu_free_run_page;
3754
3755	if (kvm->dirty_ring_size) {
3756		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3757					 id, kvm->dirty_ring_size);
3758		if (r)
3759			goto arch_vcpu_destroy;
3760	}
3761
3762	mutex_lock(&kvm->lock);
3763	if (kvm_get_vcpu_by_id(kvm, id)) {
3764		r = -EEXIST;
3765		goto unlock_vcpu_destroy;
3766	}
3767
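	/*
	 * Publish the vcpu in vcpu_array at the next free index; lookups
	 * via kvm_get_vcpu() only see it once online_vcpus is bumped below.
	 */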
3768	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3769	r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
3770	BUG_ON(r == -EBUSY);
3771	if (r)
3772		goto unlock_vcpu_destroy;
3773
3774	/* Fill the stats id string for the vcpu */
3775	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3776		 task_pid_nr(current), id);
3777
3778	/* Now it's all set up, let userspace reach it */
3779	kvm_get_kvm(kvm);
3780	r = create_vcpu_fd(vcpu);
3781	if (r < 0) {
3782		xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
3783		kvm_put_kvm_no_destroy(kvm);
3784		goto unlock_vcpu_destroy;
3785	}
3786
	/*
	 * Pairs with smp_rmb() in kvm_get_vcpu.  Ensure the vcpu pointer
	 * is stored before the incremented value of kvm->online_vcpus.
	 */
3791	smp_wmb();
3792	atomic_inc(&kvm->online_vcpus);
3793
3794	mutex_unlock(&kvm->lock);
3795	kvm_arch_vcpu_postcreate(vcpu);
3796	kvm_create_vcpu_debugfs(vcpu);
3797	return r;
3798
3799unlock_vcpu_destroy:
3800	mutex_unlock(&kvm->lock);
3801	kvm_dirty_ring_free(&vcpu->dirty_ring);
3802arch_vcpu_destroy:
3803	kvm_arch_vcpu_destroy(vcpu);
3804vcpu_free_run_page:
3805	free_page((unsigned long)vcpu->run);
3806vcpu_free:
3807	kmem_cache_free(kvm_vcpu_cache, vcpu);
3808vcpu_decrement:
3809	mutex_lock(&kvm->lock);
3810	kvm->created_vcpus--;
3811	mutex_unlock(&kvm->lock);
3812	return r;
3813}
3814
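/*
 * Install (or clear) the signal mask used while the vCPU runs; SIGKILL
 * and SIGSTOP can never be blocked.
 */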
3815static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3816{
3817	if (sigset) {
3818		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3819		vcpu->sigset_active = 1;
3820		vcpu->sigset = *sigset;
3821	} else
3822		vcpu->sigset_active = 0;
3823	return 0;
3824}
3825
3826static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3827			      size_t size, loff_t *offset)
3828{
3829	struct kvm_vcpu *vcpu = file->private_data;
3830
3831	return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3832			&kvm_vcpu_stats_desc[0], &vcpu->stat,
3833			sizeof(vcpu->stat), user_buffer, size, offset);
3834}
3835
3836static const struct file_operations kvm_vcpu_stats_fops = {
3837	.read = kvm_vcpu_stats_read,
3838	.llseek = noop_llseek,
3839};
3840
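/*
 * Create a read-only, pread-capable anonymous fd that exposes the vCPU's
 * binary stats.
 */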
3841static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3842{
3843	int fd;
3844	struct file *file;
3845	char name[15 + ITOA_MAX_LEN + 1];
3846
3847	snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3848
3849	fd = get_unused_fd_flags(O_CLOEXEC);
3850	if (fd < 0)
3851		return fd;
3852
3853	file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3854	if (IS_ERR(file)) {
3855		put_unused_fd(fd);
3856		return PTR_ERR(file);
3857	}
3858	file->f_mode |= FMODE_PREAD;
3859	fd_install(fd, file);
3860
3861	return fd;
3862}
3863
3864static long kvm_vcpu_ioctl(struct file *filp,
3865			   unsigned int ioctl, unsigned long arg)
3866{
3867	struct kvm_vcpu *vcpu = filp->private_data;
3868	void __user *argp = (void __user *)arg;
3869	int r;
3870	struct kvm_fpu *fpu = NULL;
3871	struct kvm_sregs *kvm_sregs = NULL;
3872
3873	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3874		return -EIO;
3875
3876	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3877		return -EINVAL;
3878
3879	/*
3880	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
3881	 * execution; mutex_lock() would break them.
3882	 */
3883	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3884	if (r != -ENOIOCTLCMD)
3885		return r;
3886
3887	if (mutex_lock_killable(&vcpu->mutex))
3888		return -EINTR;
3889	switch (ioctl) {
3890	case KVM_RUN: {
3891		struct pid *oldpid;
3892		r = -EINVAL;
3893		if (arg)
3894			goto out;
3895		oldpid = rcu_access_pointer(vcpu->pid);
3896		if (unlikely(oldpid != task_pid(current))) {
3897			/* The thread running this VCPU changed. */
3898			struct pid *newpid;
3899
3900			r = kvm_arch_vcpu_run_pid_change(vcpu);
3901			if (r)
3902				break;
3903
3904			newpid = get_task_pid(current, PIDTYPE_PID);
3905			rcu_assign_pointer(vcpu->pid, newpid);
3906			if (oldpid)
3907				synchronize_rcu();
3908			put_pid(oldpid);
3909		}
3910		r = kvm_arch_vcpu_ioctl_run(vcpu);
3911		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3912		break;
3913	}
3914	case KVM_GET_REGS: {
3915		struct kvm_regs *kvm_regs;
3916
3917		r = -ENOMEM;
3918		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3919		if (!kvm_regs)
3920			goto out;
3921		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3922		if (r)
3923			goto out_free1;
3924		r = -EFAULT;
3925		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3926			goto out_free1;
3927		r = 0;
3928out_free1:
3929		kfree(kvm_regs);
3930		break;
3931	}
3932	case KVM_SET_REGS: {
3933		struct kvm_regs *kvm_regs;
3934
3935		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3936		if (IS_ERR(kvm_regs)) {
3937			r = PTR_ERR(kvm_regs);
3938			goto out;
3939		}
3940		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3941		kfree(kvm_regs);
3942		break;
3943	}
3944	case KVM_GET_SREGS: {
3945		kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3946				    GFP_KERNEL_ACCOUNT);
3947		r = -ENOMEM;
3948		if (!kvm_sregs)
3949			goto out;
3950		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3951		if (r)
3952			goto out;
3953		r = -EFAULT;
3954		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3955			goto out;
3956		r = 0;
3957		break;
3958	}
3959	case KVM_SET_SREGS: {
3960		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3961		if (IS_ERR(kvm_sregs)) {
3962			r = PTR_ERR(kvm_sregs);
3963			kvm_sregs = NULL;
3964			goto out;
3965		}
3966		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3967		break;
3968	}
3969	case KVM_GET_MP_STATE: {
3970		struct kvm_mp_state mp_state;
3971
3972		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3973		if (r)
3974			goto out;
3975		r = -EFAULT;
3976		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3977			goto out;
3978		r = 0;
3979		break;
3980	}
3981	case KVM_SET_MP_STATE: {
3982		struct kvm_mp_state mp_state;
3983
3984		r = -EFAULT;
3985		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3986			goto out;
3987		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3988		break;
3989	}
3990	case KVM_TRANSLATE: {
3991		struct kvm_translation tr;
3992
3993		r = -EFAULT;
3994		if (copy_from_user(&tr, argp, sizeof(tr)))
3995			goto out;
3996		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3997		if (r)
3998			goto out;
3999		r = -EFAULT;
4000		if (copy_to_user(argp, &tr, sizeof(tr)))
4001			goto out;
4002		r = 0;
4003		break;
4004	}
4005	case KVM_SET_GUEST_DEBUG: {
4006		struct kvm_guest_debug dbg;
4007
4008		r = -EFAULT;
4009		if (copy_from_user(&dbg, argp, sizeof(dbg)))
4010			goto out;
4011		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4012		break;
4013	}
4014	case KVM_SET_SIGNAL_MASK: {
4015		struct kvm_signal_mask __user *sigmask_arg = argp;
4016		struct kvm_signal_mask kvm_sigmask;
4017		sigset_t sigset, *p;
4018
4019		p = NULL;
4020		if (argp) {
4021			r = -EFAULT;
4022			if (copy_from_user(&kvm_sigmask, argp,
4023					   sizeof(kvm_sigmask)))
4024				goto out;
4025			r = -EINVAL;
4026			if (kvm_sigmask.len != sizeof(sigset))
4027				goto out;
4028			r = -EFAULT;
4029			if (copy_from_user(&sigset, sigmask_arg->sigset,
4030					   sizeof(sigset)))
4031				goto out;
4032			p = &sigset;
4033		}
4034		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4035		break;
4036	}
4037	case KVM_GET_FPU: {
4038		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4039		r = -ENOMEM;
4040		if (!fpu)
4041			goto out;
4042		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4043		if (r)
4044			goto out;
4045		r = -EFAULT;
4046		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4047			goto out;
4048		r = 0;
4049		break;
4050	}
4051	case KVM_SET_FPU: {
4052		fpu = memdup_user(argp, sizeof(*fpu));
4053		if (IS_ERR(fpu)) {
4054			r = PTR_ERR(fpu);
4055			fpu = NULL;
4056			goto out;
4057		}
4058		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4059		break;
4060	}
4061	case KVM_GET_STATS_FD: {
4062		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4063		break;
4064	}
4065	default:
4066		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4067	}
4068out:
4069	mutex_unlock(&vcpu->mutex);
4070	kfree(fpu);
4071	kfree(kvm_sregs);
4072	return r;
4073}
4074
4075#ifdef CONFIG_KVM_COMPAT
4076static long kvm_vcpu_compat_ioctl(struct file *filp,
4077				  unsigned int ioctl, unsigned long arg)
4078{
4079	struct kvm_vcpu *vcpu = filp->private_data;
4080	void __user *argp = compat_ptr(arg);
4081	int r;
4082
4083	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4084		return -EIO;
4085
4086	switch (ioctl) {
4087	case KVM_SET_SIGNAL_MASK: {
4088		struct kvm_signal_mask __user *sigmask_arg = argp;
4089		struct kvm_signal_mask kvm_sigmask;
4090		sigset_t sigset;
4091
4092		if (argp) {
4093			r = -EFAULT;
4094			if (copy_from_user(&kvm_sigmask, argp,
4095					   sizeof(kvm_sigmask)))
4096				goto out;
4097			r = -EINVAL;
4098			if (kvm_sigmask.len != sizeof(compat_sigset_t))
4099				goto out;
4100			r = -EFAULT;
4101			if (get_compat_sigset(&sigset,
4102					      (compat_sigset_t __user *)sigmask_arg->sigset))
4103				goto out;
4104			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4105		} else
4106			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4107		break;
4108	}
4109	default:
4110		r = kvm_vcpu_ioctl(filp, ioctl, arg);
4111	}
4112
4113out:
4114	return r;
4115}
4116#endif
4117
4118static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4119{
4120	struct kvm_device *dev = filp->private_data;
4121
4122	if (dev->ops->mmap)
4123		return dev->ops->mmap(dev, vma);
4124
4125	return -ENODEV;
4126}
4127
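/*
 * Copy a struct kvm_device_attr from userspace and hand it to the
 * device's accessor (get/set/has); -EPERM if the device does not
 * implement the requested accessor.
 */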
4128static int kvm_device_ioctl_attr(struct kvm_device *dev,
4129				 int (*accessor)(struct kvm_device *dev,
4130						 struct kvm_device_attr *attr),
4131				 unsigned long arg)
4132{
4133	struct kvm_device_attr attr;
4134
4135	if (!accessor)
4136		return -EPERM;
4137
4138	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4139		return -EFAULT;
4140
4141	return accessor(dev, &attr);
4142}
4143
4144static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4145			     unsigned long arg)
4146{
4147	struct kvm_device *dev = filp->private_data;
4148
4149	if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4150		return -EIO;
4151
4152	switch (ioctl) {
4153	case KVM_SET_DEVICE_ATTR:
4154		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4155	case KVM_GET_DEVICE_ATTR:
4156		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4157	case KVM_HAS_DEVICE_ATTR:
4158		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4159	default:
4160		if (dev->ops->ioctl)
4161			return dev->ops->ioctl(dev, ioctl, arg);
4162
4163		return -ENOTTY;
4164	}
4165}
4166
4167static int kvm_device_release(struct inode *inode, struct file *filp)
4168{
4169	struct kvm_device *dev = filp->private_data;
4170	struct kvm *kvm = dev->kvm;
4171
4172	if (dev->ops->release) {
4173		mutex_lock(&kvm->lock);
4174		list_del(&dev->vm_node);
4175		dev->ops->release(dev);
4176		mutex_unlock(&kvm->lock);
4177	}
4178
4179	kvm_put_kvm(kvm);
4180	return 0;
4181}
4182
4183static const struct file_operations kvm_device_fops = {
4184	.unlocked_ioctl = kvm_device_ioctl,
4185	.release = kvm_device_release,
4186	KVM_COMPAT(kvm_device_ioctl),
4187	.mmap = kvm_device_mmap,
4188};
4189
4190struct kvm_device *kvm_device_from_filp(struct file *filp)
4191{
4192	if (filp->f_op != &kvm_device_fops)
4193		return NULL;
4194
4195	return filp->private_data;
4196}
4197
4198static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4199#ifdef CONFIG_KVM_MPIC
4200	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
4201	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
4202#endif
4203};
4204
4205int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4206{
4207	if (type >= ARRAY_SIZE(kvm_device_ops_table))
4208		return -ENOSPC;
4209
4210	if (kvm_device_ops_table[type] != NULL)
4211		return -EEXIST;
4212
4213	kvm_device_ops_table[type] = ops;
4214	return 0;
4215}
4216
4217void kvm_unregister_device_ops(u32 type)
4218{
4219	if (kvm_device_ops_table[type] != NULL)
4220		kvm_device_ops_table[type] = NULL;
4221}
4222
4223static int kvm_ioctl_create_device(struct kvm *kvm,
4224				   struct kvm_create_device *cd)
4225{
4226	const struct kvm_device_ops *ops = NULL;
4227	struct kvm_device *dev;
4228	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4229	int type;
4230	int ret;
4231
4232	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4233		return -ENODEV;
4234
4235	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4236	ops = kvm_device_ops_table[type];
4237	if (ops == NULL)
4238		return -ENODEV;
4239
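	/*
	 * KVM_CREATE_DEVICE_TEST only probes whether the device type is
	 * supported; no device is actually created.
	 */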
4240	if (test)
4241		return 0;
4242
4243	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4244	if (!dev)
4245		return -ENOMEM;
4246
4247	dev->ops = ops;
4248	dev->kvm = kvm;
4249
4250	mutex_lock(&kvm->lock);
4251	ret = ops->create(dev, type);
4252	if (ret < 0) {
4253		mutex_unlock(&kvm->lock);
4254		kfree(dev);
4255		return ret;
4256	}
4257	list_add(&dev->vm_node, &kvm->devices);
4258	mutex_unlock(&kvm->lock);
4259
4260	if (ops->init)
4261		ops->init(dev);
4262
4263	kvm_get_kvm(kvm);
4264	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4265	if (ret < 0) {
4266		kvm_put_kvm_no_destroy(kvm);
4267		mutex_lock(&kvm->lock);
4268		list_del(&dev->vm_node);
4269		mutex_unlock(&kvm->lock);
4270		ops->destroy(dev);
4271		return ret;
4272	}
4273
4274	cd->fd = ret;
4275	return 0;
4276}
4277
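/*
 * Generic handling of KVM_CHECK_EXTENSION; anything not recognized here
 * is forwarded to the architecture-specific kvm_vm_ioctl_check_extension().
 */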
4278static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4279{
4280	switch (arg) {
4281	case KVM_CAP_USER_MEMORY:
4282	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4283	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4284	case KVM_CAP_INTERNAL_ERROR_DATA:
4285#ifdef CONFIG_HAVE_KVM_MSI
4286	case KVM_CAP_SIGNAL_MSI:
4287#endif
4288#ifdef CONFIG_HAVE_KVM_IRQFD
4289	case KVM_CAP_IRQFD:
4290	case KVM_CAP_IRQFD_RESAMPLE:
4291#endif
4292	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4293	case KVM_CAP_CHECK_EXTENSION_VM:
4294	case KVM_CAP_ENABLE_CAP_VM:
4295	case KVM_CAP_HALT_POLL:
4296		return 1;
4297#ifdef CONFIG_KVM_MMIO
4298	case KVM_CAP_COALESCED_MMIO:
4299		return KVM_COALESCED_MMIO_PAGE_OFFSET;
4300	case KVM_CAP_COALESCED_PIO:
4301		return 1;
4302#endif
4303#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4304	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4305		return KVM_DIRTY_LOG_MANUAL_CAPS;
4306#endif
4307#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4308	case KVM_CAP_IRQ_ROUTING:
4309		return KVM_MAX_IRQ_ROUTES;
4310#endif
4311#if KVM_ADDRESS_SPACE_NUM > 1
4312	case KVM_CAP_MULTI_ADDRESS_SPACE:
4313		return KVM_ADDRESS_SPACE_NUM;
4314#endif
4315	case KVM_CAP_NR_MEMSLOTS:
4316		return KVM_USER_MEM_SLOTS;
4317	case KVM_CAP_DIRTY_LOG_RING:
4318#ifdef CONFIG_HAVE_KVM_DIRTY_RING
4319		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4320#else
4321		return 0;
4322#endif
4323	case KVM_CAP_BINARY_STATS_FD:
4324		return 1;
4325	default:
4326		break;
4327	}
4328	return kvm_vm_ioctl_check_extension(kvm, arg);
4329}
4330
4331static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4332{
4333	int r;
4334
4335	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4336		return -EINVAL;
4337
	/* the size should be a power of 2 */
4339	if (!size || (size & (size - 1)))
4340		return -EINVAL;
4341
	/* The size must be able to hold the reserved entries and be at least one page */
4343	if (size < kvm_dirty_ring_get_rsvd_entries() *
4344	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4345		return -EINVAL;
4346
4347	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4348	    sizeof(struct kvm_dirty_gfn))
4349		return -E2BIG;
4350
	/* The ring size may only be set once */
4352	if (kvm->dirty_ring_size)
4353		return -EINVAL;
4354
4355	mutex_lock(&kvm->lock);
4356
4357	if (kvm->created_vcpus) {
		/* The ring size cannot be changed after vcpus have been created */
4359		r = -EINVAL;
4360	} else {
4361		kvm->dirty_ring_size = size;
4362		r = 0;
4363	}
4364
4365	mutex_unlock(&kvm->lock);
4366	return r;
4367}
4368
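/*
 * Reset the harvested entries in every vCPU's dirty ring and flush remote
 * TLBs if anything was reset; returns the number of reset entries.
 */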
4369static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4370{
4371	unsigned long i;
4372	struct kvm_vcpu *vcpu;
4373	int cleared = 0;
4374
4375	if (!kvm->dirty_ring_size)
4376		return -EINVAL;
4377
4378	mutex_lock(&kvm->slots_lock);
4379
4380	kvm_for_each_vcpu(i, vcpu, kvm)
4381		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4382
4383	mutex_unlock(&kvm->slots_lock);
4384
4385	if (cleared)
4386		kvm_flush_remote_tlbs(kvm);
4387
4388	return cleared;
4389}
4390
4391int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4392						  struct kvm_enable_cap *cap)
4393{
4394	return -EINVAL;
4395}
4396
4397static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4398					   struct kvm_enable_cap *cap)
4399{
4400	switch (cap->cap) {
4401#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4402	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4403		u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4404
4405		if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4406			allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4407
4408		if (cap->flags || (cap->args[0] & ~allowed_options))
4409			return -EINVAL;
4410		kvm->manual_dirty_log_protect = cap->args[0];
4411		return 0;
4412	}
4413#endif
4414	case KVM_CAP_HALT_POLL: {
4415		if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4416			return -EINVAL;
4417
4418		kvm->max_halt_poll_ns = cap->args[0];
4419		return 0;
4420	}
4421	case KVM_CAP_DIRTY_LOG_RING:
4422		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4423	default:
4424		return kvm_vm_ioctl_enable_cap(kvm, cap);
4425	}
4426}
4427
4428static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4429			      size_t size, loff_t *offset)
4430{
4431	struct kvm *kvm = file->private_data;
4432
4433	return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4434				&kvm_vm_stats_desc[0], &kvm->stat,
4435				sizeof(kvm->stat), user_buffer, size, offset);
4436}
4437
4438static const struct file_operations kvm_vm_stats_fops = {
4439	.read = kvm_vm_stats_read,
4440	.llseek = noop_llseek,
4441};
4442
4443static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4444{
4445	int fd;
4446	struct file *file;
4447
4448	fd = get_unused_fd_flags(O_CLOEXEC);
4449	if (fd < 0)
4450		return fd;
4451
4452	file = anon_inode_getfile("kvm-vm-stats",
4453			&kvm_vm_stats_fops, kvm, O_RDONLY);
4454	if (IS_ERR(file)) {
4455		put_unused_fd(fd);
4456		return PTR_ERR(file);
4457	}
4458	file->f_mode |= FMODE_PREAD;
4459	fd_install(fd, file);
4460
4461	return fd;
4462}
4463
4464static long kvm_vm_ioctl(struct file *filp,
4465			   unsigned int ioctl, unsigned long arg)
4466{
4467	struct kvm *kvm = filp->private_data;
4468	void __user *argp = (void __user *)arg;
4469	int r;
4470
4471	if (kvm->mm != current->mm || kvm->vm_dead)
4472		return -EIO;
4473	switch (ioctl) {
4474	case KVM_CREATE_VCPU:
4475		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4476		break;
4477	case KVM_ENABLE_CAP: {
4478		struct kvm_enable_cap cap;
4479
4480		r = -EFAULT;
4481		if (copy_from_user(&cap, argp, sizeof(cap)))
4482			goto out;
4483		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4484		break;
4485	}
4486	case KVM_SET_USER_MEMORY_REGION: {
4487		struct kvm_userspace_memory_region kvm_userspace_mem;
4488
4489		r = -EFAULT;
4490		if (copy_from_user(&kvm_userspace_mem, argp,
4491						sizeof(kvm_userspace_mem)))
4492			goto out;
4493
4494		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4495		break;
4496	}
4497	case KVM_GET_DIRTY_LOG: {
4498		struct kvm_dirty_log log;
4499
4500		r = -EFAULT;
4501		if (copy_from_user(&log, argp, sizeof(log)))
4502			goto out;
4503		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4504		break;
4505	}
4506#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4507	case KVM_CLEAR_DIRTY_LOG: {
4508		struct kvm_clear_dirty_log log;
4509
4510		r = -EFAULT;
4511		if (copy_from_user(&log, argp, sizeof(log)))
4512			goto out;
4513		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4514		break;
4515	}
4516#endif
4517#ifdef CONFIG_KVM_MMIO
4518	case KVM_REGISTER_COALESCED_MMIO: {
4519		struct kvm_coalesced_mmio_zone zone;
4520
4521		r = -EFAULT;
4522		if (copy_from_user(&zone, argp, sizeof(zone)))
4523			goto out;
4524		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4525		break;
4526	}
4527	case KVM_UNREGISTER_COALESCED_MMIO: {
4528		struct kvm_coalesced_mmio_zone zone;
4529
4530		r = -EFAULT;
4531		if (copy_from_user(&zone, argp, sizeof(zone)))
4532			goto out;
4533		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4534		break;
4535	}
4536#endif
4537	case KVM_IRQFD: {
4538		struct kvm_irqfd data;
4539
4540		r = -EFAULT;
4541		if (copy_from_user(&data, argp, sizeof(data)))
4542			goto out;
4543		r = kvm_irqfd(kvm, &data);
4544		break;
4545	}
4546	case KVM_IOEVENTFD: {
4547		struct kvm_ioeventfd data;
4548
4549		r = -EFAULT;
4550		if (copy_from_user(&data, argp, sizeof(data)))
4551			goto out;
4552		r = kvm_ioeventfd(kvm, &data);
4553		break;
4554	}
4555#ifdef CONFIG_HAVE_KVM_MSI
4556	case KVM_SIGNAL_MSI: {
4557		struct kvm_msi msi;
4558
4559		r = -EFAULT;
4560		if (copy_from_user(&msi, argp, sizeof(msi)))
4561			goto out;
4562		r = kvm_send_userspace_msi(kvm, &msi);
4563		break;
4564	}
4565#endif
4566#ifdef __KVM_HAVE_IRQ_LINE
4567	case KVM_IRQ_LINE_STATUS:
4568	case KVM_IRQ_LINE: {
4569		struct kvm_irq_level irq_event;
4570
4571		r = -EFAULT;
4572		if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4573			goto out;
4574
4575		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4576					ioctl == KVM_IRQ_LINE_STATUS);
4577		if (r)
4578			goto out;
4579
4580		r = -EFAULT;
4581		if (ioctl == KVM_IRQ_LINE_STATUS) {
4582			if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4583				goto out;
4584		}
4585
4586		r = 0;
4587		break;
4588	}
4589#endif
4590#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4591	case KVM_SET_GSI_ROUTING: {
4592		struct kvm_irq_routing routing;
4593		struct kvm_irq_routing __user *urouting;
4594		struct kvm_irq_routing_entry *entries = NULL;
4595
4596		r = -EFAULT;
4597		if (copy_from_user(&routing, argp, sizeof(routing)))
4598			goto out;
4599		r = -EINVAL;
4600		if (!kvm_arch_can_set_irq_routing(kvm))
4601			goto out;
4602		if (routing.nr > KVM_MAX_IRQ_ROUTES)
4603			goto out;
4604		if (routing.flags)
4605			goto out;
4606		if (routing.nr) {
4607			urouting = argp;
4608			entries = vmemdup_user(urouting->entries,
4609					       array_size(sizeof(*entries),
4610							  routing.nr));
4611			if (IS_ERR(entries)) {
4612				r = PTR_ERR(entries);
4613				goto out;
4614			}
4615		}
4616		r = kvm_set_irq_routing(kvm, entries, routing.nr,
4617					routing.flags);
4618		kvfree(entries);
4619		break;
4620	}
4621#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4622	case KVM_CREATE_DEVICE: {
4623		struct kvm_create_device cd;
4624
4625		r = -EFAULT;
4626		if (copy_from_user(&cd, argp, sizeof(cd)))
4627			goto out;
4628
4629		r = kvm_ioctl_create_device(kvm, &cd);
4630		if (r)
4631			goto out;
4632
4633		r = -EFAULT;
4634		if (copy_to_user(argp, &cd, sizeof(cd)))
4635			goto out;
4636
4637		r = 0;
4638		break;
4639	}
4640	case KVM_CHECK_EXTENSION:
4641		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4642		break;
4643	case KVM_RESET_DIRTY_RINGS:
4644		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4645		break;
4646	case KVM_GET_STATS_FD:
4647		r = kvm_vm_ioctl_get_stats_fd(kvm);
4648		break;
4649	default:
4650		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4651	}
4652out:
4653	return r;
4654}
4655
4656#ifdef CONFIG_KVM_COMPAT
4657struct compat_kvm_dirty_log {
4658	__u32 slot;
4659	__u32 padding1;
4660	union {
4661		compat_uptr_t dirty_bitmap; /* one bit per page */
4662		__u64 padding2;
4663	};
4664};
4665
4666struct compat_kvm_clear_dirty_log {
4667	__u32 slot;
4668	__u32 num_pages;
4669	__u64 first_page;
4670	union {
4671		compat_uptr_t dirty_bitmap; /* one bit per page */
4672		__u64 padding2;
4673	};
4674};
4675
4676static long kvm_vm_compat_ioctl(struct file *filp,
4677			   unsigned int ioctl, unsigned long arg)
4678{
4679	struct kvm *kvm = filp->private_data;
4680	int r;
4681
4682	if (kvm->mm != current->mm || kvm->vm_dead)
4683		return -EIO;
4684	switch (ioctl) {
4685#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4686	case KVM_CLEAR_DIRTY_LOG: {
4687		struct compat_kvm_clear_dirty_log compat_log;
4688		struct kvm_clear_dirty_log log;
4689
4690		if (copy_from_user(&compat_log, (void __user *)arg,
4691				   sizeof(compat_log)))
4692			return -EFAULT;
4693		log.slot	 = compat_log.slot;
4694		log.num_pages	 = compat_log.num_pages;
4695		log.first_page	 = compat_log.first_page;
4696		log.padding2	 = compat_log.padding2;
4697		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4698
4699		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4700		break;
4701	}
4702#endif
4703	case KVM_GET_DIRTY_LOG: {
4704		struct compat_kvm_dirty_log compat_log;
4705		struct kvm_dirty_log log;
4706
4707		if (copy_from_user(&compat_log, (void __user *)arg,
4708				   sizeof(compat_log)))
4709			return -EFAULT;
4710		log.slot	 = compat_log.slot;
4711		log.padding1	 = compat_log.padding1;
4712		log.padding2	 = compat_log.padding2;
4713		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4714
4715		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4716		break;
4717	}
4718	default:
4719		r = kvm_vm_ioctl(filp, ioctl, arg);
4720	}
4721	return r;
4722}
4723#endif
4724
4725static struct file_operations kvm_vm_fops = {
4726	.release        = kvm_vm_release,
4727	.unlocked_ioctl = kvm_vm_ioctl,
4728	.llseek		= noop_llseek,
4729	KVM_COMPAT(kvm_vm_compat_ioctl),
4730};
4731
4732bool file_is_kvm(struct file *file)
4733{
4734	return file && file->f_op == &kvm_vm_fops;
4735}
4736EXPORT_SYMBOL_GPL(file_is_kvm);
4737
4738static int kvm_dev_ioctl_create_vm(unsigned long type)
4739{
4740	int r;
4741	struct kvm *kvm;
4742	struct file *file;
4743
4744	kvm = kvm_create_vm(type);
4745	if (IS_ERR(kvm))
4746		return PTR_ERR(kvm);
4747#ifdef CONFIG_KVM_MMIO
4748	r = kvm_coalesced_mmio_init(kvm);
4749	if (r < 0)
4750		goto put_kvm;
4751#endif
4752	r = get_unused_fd_flags(O_CLOEXEC);
4753	if (r < 0)
4754		goto put_kvm;
4755
4756	snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4757			"kvm-%d", task_pid_nr(current));
4758
4759	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4760	if (IS_ERR(file)) {
4761		put_unused_fd(r);
4762		r = PTR_ERR(file);
4763		goto put_kvm;
4764	}
4765
4766	/*
4767	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
4768	 * already set, with ->release() being kvm_vm_release().  In error
4769	 * cases it will be called by the final fput(file) and will take
4770	 * care of doing kvm_put_kvm(kvm).
4771	 */
4772	if (kvm_create_vm_debugfs(kvm, r) < 0) {
4773		put_unused_fd(r);
4774		fput(file);
4775		return -ENOMEM;
4776	}
4777	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4778
4779	fd_install(r, file);
4780	return r;
4781
4782put_kvm:
4783	kvm_put_kvm(kvm);
4784	return r;
4785}
4786
4787static long kvm_dev_ioctl(struct file *filp,
4788			  unsigned int ioctl, unsigned long arg)
4789{
4790	long r = -EINVAL;
4791
4792	switch (ioctl) {
4793	case KVM_GET_API_VERSION:
4794		if (arg)
4795			goto out;
4796		r = KVM_API_VERSION;
4797		break;
4798	case KVM_CREATE_VM:
4799		r = kvm_dev_ioctl_create_vm(arg);
4800		break;
4801	case KVM_CHECK_EXTENSION:
4802		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4803		break;
4804	case KVM_GET_VCPU_MMAP_SIZE:
4805		if (arg)
4806			goto out;
4807		r = PAGE_SIZE;     /* struct kvm_run */
4808#ifdef CONFIG_X86
4809		r += PAGE_SIZE;    /* pio data page */
4810#endif
4811#ifdef CONFIG_KVM_MMIO
4812		r += PAGE_SIZE;    /* coalesced mmio ring page */
4813#endif
4814		break;
4815	case KVM_TRACE_ENABLE:
4816	case KVM_TRACE_PAUSE:
4817	case KVM_TRACE_DISABLE:
4818		r = -EOPNOTSUPP;
4819		break;
4820	default:
4821		return kvm_arch_dev_ioctl(filp, ioctl, arg);
4822	}
4823out:
4824	return r;
4825}
4826
4827static struct file_operations kvm_chardev_ops = {
4828	.unlocked_ioctl = kvm_dev_ioctl,
4829	.llseek		= noop_llseek,
4830	KVM_COMPAT(kvm_dev_ioctl),
4831};
4832
4833static struct miscdevice kvm_dev = {
4834	KVM_MINOR,
4835	"kvm",
4836	&kvm_chardev_ops,
4837};
4838
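/*
 * Enable hardware virtualization on the local CPU unless it is already
 * enabled; failures are counted in hardware_enable_failed so that
 * hardware_enable_all() can back out.
 */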
4839static void hardware_enable_nolock(void *junk)
4840{
4841	int cpu = raw_smp_processor_id();
4842	int r;
4843
4844	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4845		return;
4846
4847	cpumask_set_cpu(cpu, cpus_hardware_enabled);
4848
4849	r = kvm_arch_hardware_enable();
4850
4851	if (r) {
4852		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4853		atomic_inc(&hardware_enable_failed);
4854		pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4855	}
4856}
4857
4858static int kvm_starting_cpu(unsigned int cpu)
4859{
4860	raw_spin_lock(&kvm_count_lock);
4861	if (kvm_usage_count)
4862		hardware_enable_nolock(NULL);
4863	raw_spin_unlock(&kvm_count_lock);
4864	return 0;
4865}
4866
4867static void hardware_disable_nolock(void *junk)
4868{
4869	int cpu = raw_smp_processor_id();
4870
4871	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4872		return;
4873	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4874	kvm_arch_hardware_disable();
4875}
4876
4877static int kvm_dying_cpu(unsigned int cpu)
4878{
4879	raw_spin_lock(&kvm_count_lock);
4880	if (kvm_usage_count)
4881		hardware_disable_nolock(NULL);
4882	raw_spin_unlock(&kvm_count_lock);
4883	return 0;
4884}
4885
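/*
 * kvm_usage_count tracks the number of VMs: hardware virtualization is
 * enabled on all CPUs when the first VM is created and disabled again
 * when the last one goes away.
 */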
4886static void hardware_disable_all_nolock(void)
4887{
4888	BUG_ON(!kvm_usage_count);
4889
4890	kvm_usage_count--;
4891	if (!kvm_usage_count)
4892		on_each_cpu(hardware_disable_nolock, NULL, 1);
4893}
4894
4895static void hardware_disable_all(void)
4896{
4897	raw_spin_lock(&kvm_count_lock);
4898	hardware_disable_all_nolock();
4899	raw_spin_unlock(&kvm_count_lock);
4900}
4901
4902static int hardware_enable_all(void)
4903{
4904	int r = 0;
4905
4906	raw_spin_lock(&kvm_count_lock);
4907
4908	kvm_usage_count++;
4909	if (kvm_usage_count == 1) {
4910		atomic_set(&hardware_enable_failed, 0);
4911		on_each_cpu(hardware_enable_nolock, NULL, 1);
4912
4913		if (atomic_read(&hardware_enable_failed)) {
4914			hardware_disable_all_nolock();
4915			r = -EBUSY;
4916		}
4917	}
4918
4919	raw_spin_unlock(&kvm_count_lock);
4920
4921	return r;
4922}
4923
4924static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4925		      void *v)
4926{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if the CPU is
	 * still in VMX root mode.
	 *
	 * In addition, Intel TXT requires VMX to be off on all CPUs when
	 * the system shuts down.
	 */
4933	pr_info("kvm: exiting hardware virtualization\n");
4934	kvm_rebooting = true;
4935	on_each_cpu(hardware_disable_nolock, NULL, 1);
4936	return NOTIFY_OK;
4937}
4938
4939static struct notifier_block kvm_reboot_notifier = {
4940	.notifier_call = kvm_reboot,
4941	.priority = 0,
4942};
4943
4944static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4945{
4946	int i;
4947
4948	for (i = 0; i < bus->dev_count; i++) {
4949		struct kvm_io_device *pos = bus->range[i].dev;
4950
4951		kvm_iodevice_destructor(pos);
4952	}
4953	kfree(bus);
4954}
4955
4956static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4957				 const struct kvm_io_range *r2)
4958{
4959	gpa_t addr1 = r1->addr;
4960	gpa_t addr2 = r2->addr;
4961
4962	if (addr1 < addr2)
4963		return -1;
4964
	/* If r2->len == 0, match the exact address.  If r2->len != 0,
	 * accept any overlapping access.  Any order is acceptable for
	 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
	 * we process all of them.
	 */
4970	if (r2->len) {
4971		addr1 += r1->len;
4972		addr2 += r2->len;
4973	}
4974
4975	if (addr1 > addr2)
4976		return 1;
4977
4978	return 0;
4979}
4980
4981static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4982{
4983	return kvm_io_bus_cmp(p1, p2);
4984}
4985
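/*
 * bsearch() may return any entry that compares equal to the key, so
 * rewind to the first matching range; callers then iterate forward over
 * all overlapping ranges.
 */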
4986static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4987			     gpa_t addr, int len)
4988{
4989	struct kvm_io_range *range, key;
4990	int off;
4991
4992	key = (struct kvm_io_range) {
4993		.addr = addr,
4994		.len = len,
4995	};
4996
4997	range = bsearch(&key, bus->range, bus->dev_count,
4998			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4999	if (range == NULL)
5000		return -ENOENT;
5001
5002	off = range - bus->range;
5003
5004	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5005		off--;
5006
5007	return off;
5008}
5009
5010static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5011			      struct kvm_io_range *range, const void *val)
5012{
5013	int idx;
5014
5015	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5016	if (idx < 0)
5017		return -EOPNOTSUPP;
5018
5019	while (idx < bus->dev_count &&
5020		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5021		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5022					range->len, val))
5023			return idx;
5024		idx++;
5025	}
5026
5027	return -EOPNOTSUPP;
5028}
5029
5030/* kvm_io_bus_write - called under kvm->slots_lock */
5031int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5032		     int len, const void *val)
5033{
5034	struct kvm_io_bus *bus;
5035	struct kvm_io_range range;
5036	int r;
5037
5038	range = (struct kvm_io_range) {
5039		.addr = addr,
5040		.len = len,
5041	};
5042
5043	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5044	if (!bus)
5045		return -ENOMEM;
5046	r = __kvm_io_bus_write(vcpu, bus, &range, val);
5047	return r < 0 ? r : 0;
5048}
5049EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5050
5051/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5052int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5053			    gpa_t addr, int len, const void *val, long cookie)
5054{
5055	struct kvm_io_bus *bus;
5056	struct kvm_io_range range;
5057
5058	range = (struct kvm_io_range) {
5059		.addr = addr,
5060		.len = len,
5061	};
5062
5063	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5064	if (!bus)
5065		return -ENOMEM;
5066
5067	/* First try the device referenced by cookie. */
5068	if ((cookie >= 0) && (cookie < bus->dev_count) &&
5069	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5070		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5071					val))
5072			return cookie;
5073
5074	/*
5075	 * cookie contained garbage; fall back to search and return the
5076	 * correct cookie value.
5077	 */
5078	return __kvm_io_bus_write(vcpu, bus, &range, val);
5079}
5080
5081static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5082			     struct kvm_io_range *range, void *val)
5083{
5084	int idx;
5085
5086	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5087	if (idx < 0)
5088		return -EOPNOTSUPP;
5089
5090	while (idx < bus->dev_count &&
5091		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5092		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5093				       range->len, val))
5094			return idx;
5095		idx++;
5096	}
5097
5098	return -EOPNOTSUPP;
5099}
5100
5101/* kvm_io_bus_read - called under kvm->slots_lock */
5102int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5103		    int len, void *val)
5104{
5105	struct kvm_io_bus *bus;
5106	struct kvm_io_range range;
5107	int r;
5108
5109	range = (struct kvm_io_range) {
5110		.addr = addr,
5111		.len = len,
5112	};
5113
5114	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5115	if (!bus)
5116		return -ENOMEM;
5117	r = __kvm_io_bus_read(vcpu, bus, &range, val);
5118	return r < 0 ? r : 0;
5119}
5120
5121/* Caller must hold slots_lock. */
5122int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5123			    int len, struct kvm_io_device *dev)
5124{
5125	int i;
5126	struct kvm_io_bus *new_bus, *bus;
5127	struct kvm_io_range range;
5128
5129	bus = kvm_get_bus(kvm, bus_idx);
5130	if (!bus)
5131		return -ENOMEM;
5132
	/* exclude ioeventfds, which are already limited by the maximum number of fds */
5134	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5135		return -ENOSPC;
5136
5137	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5138			  GFP_KERNEL_ACCOUNT);
5139	if (!new_bus)
5140		return -ENOMEM;
5141
5142	range = (struct kvm_io_range) {
5143		.addr = addr,
5144		.len = len,
5145		.dev = dev,
5146	};
5147
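	/*
	 * Keep the range array sorted by address: find the insertion point,
	 * copy the entries before it, insert the new range and copy the
	 * remaining entries after it.
	 */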
5148	for (i = 0; i < bus->dev_count; i++)
5149		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5150			break;
5151
5152	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5153	new_bus->dev_count++;
5154	new_bus->range[i] = range;
5155	memcpy(new_bus->range + i + 1, bus->range + i,
5156		(bus->dev_count - i) * sizeof(struct kvm_io_range));
5157	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5158	synchronize_srcu_expedited(&kvm->srcu);
5159	kfree(bus);
5160
5161	return 0;
5162}
5163
5164int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5165			      struct kvm_io_device *dev)
5166{
5167	int i, j;
5168	struct kvm_io_bus *new_bus, *bus;
5169
5170	lockdep_assert_held(&kvm->slots_lock);
5171
5172	bus = kvm_get_bus(kvm, bus_idx);
5173	if (!bus)
5174		return 0;
5175
5176	for (i = 0; i < bus->dev_count; i++) {
5177		if (bus->range[i].dev == dev) {
5178			break;
5179		}
5180	}
5181
5182	if (i == bus->dev_count)
5183		return 0;
5184
5185	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5186			  GFP_KERNEL_ACCOUNT);
5187	if (new_bus) {
5188		memcpy(new_bus, bus, struct_size(bus, range, i));
5189		new_bus->dev_count--;
5190		memcpy(new_bus->range + i, bus->range + i + 1,
5191				flex_array_size(new_bus, range, new_bus->dev_count - i));
5192	}
5193
5194	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5195	synchronize_srcu_expedited(&kvm->srcu);
5196
5197	/* Destroy the old bus _after_ installing the (null) bus. */
5198	if (!new_bus) {
5199		pr_err("kvm: failed to shrink bus, removing it completely\n");
5200		for (j = 0; j < bus->dev_count; j++) {
5201			if (j == i)
5202				continue;
5203			kvm_iodevice_destructor(bus->range[j].dev);
5204		}
5205	}
5206
5207	kfree(bus);
5208	return new_bus ? 0 : -ENOMEM;
5209}
5210
5211struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5212					 gpa_t addr)
5213{
5214	struct kvm_io_bus *bus;
5215	int dev_idx, srcu_idx;
5216	struct kvm_io_device *iodev = NULL;
5217
5218	srcu_idx = srcu_read_lock(&kvm->srcu);
5219
5220	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5221	if (!bus)
5222		goto out_unlock;
5223
5224	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5225	if (dev_idx < 0)
5226		goto out_unlock;
5227
5228	iodev = bus->range[dev_idx].dev;
5229
5230out_unlock:
5231	srcu_read_unlock(&kvm->srcu, srcu_idx);
5232
5233	return iodev;
5234}
5235EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5236
5237static int kvm_debugfs_open(struct inode *inode, struct file *file,
5238			   int (*get)(void *, u64 *), int (*set)(void *, u64),
5239			   const char *fmt)
5240{
5241	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5242					  inode->i_private;
5243
	/*
	 * The debugfs files are a reference to the kvm struct which
	 * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
	 * avoids the race between open and the removal of the debugfs
	 * directory.
	 */
5249	if (!kvm_get_kvm_safe(stat_data->kvm))
5250		return -ENOENT;
5251
5252	if (simple_attr_open(inode, file, get,
5253		    kvm_stats_debugfs_mode(stat_data->desc) & 0222
5254		    ? set : NULL,
5255		    fmt)) {
5256		kvm_put_kvm(stat_data->kvm);
5257		return -ENOMEM;
5258	}
5259
5260	return 0;
5261}
5262
5263static int kvm_debugfs_release(struct inode *inode, struct file *file)
5264{
5265	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5266					  inode->i_private;
5267
5268	simple_attr_release(inode, file);
5269	kvm_put_kvm(stat_data->kvm);
5270
5271	return 0;
5272}
5273
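/*
 * Stat accessors used by the debugfs code: @offset is the byte offset of
 * the stat within kvm->stat (or vcpu->stat for the per-vCPU variants).
 */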
5274static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5275{
5276	*val = *(u64 *)((void *)(&kvm->stat) + offset);
5277
5278	return 0;
5279}
5280
5281static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5282{
5283	*(u64 *)((void *)(&kvm->stat) + offset) = 0;
5284
5285	return 0;
5286}
5287
5288static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5289{
5290	unsigned long i;
5291	struct kvm_vcpu *vcpu;
5292
5293	*val = 0;
5294
5295	kvm_for_each_vcpu(i, vcpu, kvm)
5296		*val += *(u64 *)((void *)(&vcpu->stat) + offset);
5297
5298	return 0;
5299}
5300
5301static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5302{
5303	unsigned long i;
5304	struct kvm_vcpu *vcpu;
5305
5306	kvm_for_each_vcpu(i, vcpu, kvm)
5307		*(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5308
5309	return 0;
5310}
5311
5312static int kvm_stat_data_get(void *data, u64 *val)
5313{
5314	int r = -EFAULT;
5315	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5316
5317	switch (stat_data->kind) {
5318	case KVM_STAT_VM:
5319		r = kvm_get_stat_per_vm(stat_data->kvm,
5320					stat_data->desc->desc.offset, val);
5321		break;
5322	case KVM_STAT_VCPU:
5323		r = kvm_get_stat_per_vcpu(stat_data->kvm,
5324					  stat_data->desc->desc.offset, val);
5325		break;
5326	}
5327
5328	return r;
5329}
5330
5331static int kvm_stat_data_clear(void *data, u64 val)
5332{
5333	int r = -EFAULT;
5334	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5335
5336	if (val)
5337		return -EINVAL;
5338
5339	switch (stat_data->kind) {
5340	case KVM_STAT_VM:
5341		r = kvm_clear_stat_per_vm(stat_data->kvm,
5342					  stat_data->desc->desc.offset);
5343		break;
5344	case KVM_STAT_VCPU:
5345		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5346					    stat_data->desc->desc.offset);
5347		break;
5348	}
5349
5350	return r;
5351}
5352
5353static int kvm_stat_data_open(struct inode *inode, struct file *file)
5354{
5355	__simple_attr_check_format("%llu\n", 0ull);
5356	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5357				kvm_stat_data_clear, "%llu\n");
5358}
5359
5360static const struct file_operations stat_fops_per_vm = {
5361	.owner = THIS_MODULE,
5362	.open = kvm_stat_data_open,
5363	.release = kvm_debugfs_release,
5364	.read = simple_attr_read,
5365	.write = simple_attr_write,
5366	.llseek = no_llseek,
5367};
5368
5369static int vm_stat_get(void *_offset, u64 *val)
5370{
5371	unsigned offset = (long)_offset;
5372	struct kvm *kvm;
5373	u64 tmp_val;
5374
5375	*val = 0;
5376	mutex_lock(&kvm_lock);
5377	list_for_each_entry(kvm, &vm_list, vm_list) {
5378		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5379		*val += tmp_val;
5380	}
5381	mutex_unlock(&kvm_lock);
5382	return 0;
5383}
5384
5385static int vm_stat_clear(void *_offset, u64 val)
5386{
5387	unsigned offset = (long)_offset;
5388	struct kvm *kvm;
5389
5390	if (val)
5391		return -EINVAL;
5392
5393	mutex_lock(&kvm_lock);
5394	list_for_each_entry(kvm, &vm_list, vm_list) {
5395		kvm_clear_stat_per_vm(kvm, offset);
5396	}
5397	mutex_unlock(&kvm_lock);
5398
5399	return 0;
5400}
5401
5402DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5403DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5404
5405static int vcpu_stat_get(void *_offset, u64 *val)
5406{
5407	unsigned offset = (long)_offset;
5408	struct kvm *kvm;
5409	u64 tmp_val;
5410
5411	*val = 0;
5412	mutex_lock(&kvm_lock);
5413	list_for_each_entry(kvm, &vm_list, vm_list) {
5414		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5415		*val += tmp_val;
5416	}
5417	mutex_unlock(&kvm_lock);
5418	return 0;
5419}
5420
5421static int vcpu_stat_clear(void *_offset, u64 val)
5422{
5423	unsigned offset = (long)_offset;
5424	struct kvm *kvm;
5425
5426	if (val)
5427		return -EINVAL;
5428
5429	mutex_lock(&kvm_lock);
5430	list_for_each_entry(kvm, &vm_list, vm_list) {
5431		kvm_clear_stat_per_vcpu(kvm, offset);
5432	}
5433	mutex_unlock(&kvm_lock);
5434
5435	return 0;
5436}
5437
5438DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5439			"%llu\n");
5440DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5441
5442static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5443{
5444	struct kobj_uevent_env *env;
5445	unsigned long long created, active;
5446
5447	if (!kvm_dev.this_device || !kvm)
5448		return;
5449
5450	mutex_lock(&kvm_lock);
5451	if (type == KVM_EVENT_CREATE_VM) {
5452		kvm_createvm_count++;
5453		kvm_active_vms++;
5454	} else if (type == KVM_EVENT_DESTROY_VM) {
5455		kvm_active_vms--;
5456	}
5457	created = kvm_createvm_count;
5458	active = kvm_active_vms;
5459	mutex_unlock(&kvm_lock);
5460
5461	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5462	if (!env)
5463		return;
5464
5465	add_uevent_var(env, "CREATED=%llu", created);
5466	add_uevent_var(env, "COUNT=%llu", active);
5467
5468	if (type == KVM_EVENT_CREATE_VM) {
5469		add_uevent_var(env, "EVENT=create");
5470		kvm->userspace_pid = task_pid_nr(current);
5471	} else if (type == KVM_EVENT_DESTROY_VM) {
5472		add_uevent_var(env, "EVENT=destroy");
5473	}
5474	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5475
5476	if (kvm->debugfs_dentry) {
5477		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5478
5479		if (p) {
5480			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5481			if (!IS_ERR(tmp))
5482				add_uevent_var(env, "STATS_PATH=%s", tmp);
5483			kfree(p);
5484		}
5485	}
	/* no need for error checks, since we add at most 5 keys */
5487	env->envp[env->envp_idx++] = NULL;
5488	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5489	kfree(env);
5490}
5491
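/*
 * Create the files under the kvm debugfs directory; each file aggregates
 * (and, if writable, clears) the corresponding stat across all VMs.
 */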
5492static void kvm_init_debug(void)
5493{
5494	const struct file_operations *fops;
5495	const struct _kvm_stats_desc *pdesc;
5496	int i;
5497
5498	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5499
5500	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5501		pdesc = &kvm_vm_stats_desc[i];
5502		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5503			fops = &vm_stat_fops;
5504		else
5505			fops = &vm_stat_readonly_fops;
5506		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5507				kvm_debugfs_dir,
5508				(void *)(long)pdesc->desc.offset, fops);
5509	}
5510
5511	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5512		pdesc = &kvm_vcpu_stats_desc[i];
5513		if (kvm_stats_debugfs_mode(pdesc) & 0222)
5514			fops = &vcpu_stat_fops;
5515		else
5516			fops = &vcpu_stat_readonly_fops;
5517		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5518				kvm_debugfs_dir,
5519				(void *)(long)pdesc->desc.offset, fops);
5520	}
5521}
5522
5523static int kvm_suspend(void)
5524{
5525	if (kvm_usage_count)
5526		hardware_disable_nolock(NULL);
5527	return 0;
5528}
5529
5530static void kvm_resume(void)
5531{
5532	if (kvm_usage_count) {
5533#ifdef CONFIG_LOCKDEP
5534		WARN_ON(lockdep_is_held(&kvm_count_lock));
5535#endif
5536		hardware_enable_nolock(NULL);
5537	}
5538}
5539
5540static struct syscore_ops kvm_syscore_ops = {
5541	.suspend = kvm_suspend,
5542	.resume = kvm_resume,
5543};
5544
5545static inline
5546struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5547{
5548	return container_of(pn, struct kvm_vcpu, preempt_notifier);
5549}
5550
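/*
 * Preempt notifier hooks: load/put the vCPU around context switches,
 * track its preemption state and record which vCPU is running on each
 * physical CPU.
 */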
5551static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5552{
5553	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5554
5555	WRITE_ONCE(vcpu->preempted, false);
5556	WRITE_ONCE(vcpu->ready, false);
5557
5558	__this_cpu_write(kvm_running_vcpu, vcpu);
5559	kvm_arch_sched_in(vcpu, cpu);
5560	kvm_arch_vcpu_load(vcpu, cpu);
5561}
5562
5563static void kvm_sched_out(struct preempt_notifier *pn,
5564			  struct task_struct *next)
5565{
5566	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5567
5568	if (current->on_rq) {
5569		WRITE_ONCE(vcpu->preempted, true);
5570		WRITE_ONCE(vcpu->ready, true);
5571	}
5572	kvm_arch_vcpu_put(vcpu);
5573	__this_cpu_write(kvm_running_vcpu, NULL);
5574}
5575
5576/**
5577 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5578 *
 * It is safe to disable preemption only around the per-CPU access and to
 * use the resolved vcpu pointer after preemption is re-enabled: even if
 * the current thread migrates to another CPU, reading the per-CPU value
 * later still yields the same vcpu, because the preempt notifier handlers
 * update the variable on every context switch.
5584 */
5585struct kvm_vcpu *kvm_get_running_vcpu(void)
5586{
5587	struct kvm_vcpu *vcpu;
5588
5589	preempt_disable();
5590	vcpu = __this_cpu_read(kvm_running_vcpu);
5591	preempt_enable();
5592
5593	return vcpu;
5594}
5595EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5596
5597/**
5598 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5599 */
5600struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5601{
	return &kvm_running_vcpu;
5603}
5604
5605#ifdef CONFIG_GUEST_PERF_EVENTS
5606static unsigned int kvm_guest_state(void)
5607{
5608	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5609	unsigned int state;
5610
5611	if (!kvm_arch_pmi_in_guest(vcpu))
5612		return 0;
5613
5614	state = PERF_GUEST_ACTIVE;
5615	if (!kvm_arch_vcpu_in_kernel(vcpu))
5616		state |= PERF_GUEST_USER;
5617
5618	return state;
5619}
5620
5621static unsigned long kvm_guest_get_ip(void)
5622{
5623	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5624
5625	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
5626	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
5627		return 0;
5628
5629	return kvm_arch_vcpu_get_ip(vcpu);
5630}
5631
5632static struct perf_guest_info_callbacks kvm_guest_cbs = {
5633	.state			= kvm_guest_state,
5634	.get_ip			= kvm_guest_get_ip,
5635	.handle_intel_pt_intr	= NULL,
5636};
5637
5638void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
5639{
5640	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
5641	perf_register_guest_info_callbacks(&kvm_guest_cbs);
5642}
5643void kvm_unregister_perf_callbacks(void)
5644{
5645	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5646}
5647#endif
5648
5649struct kvm_cpu_compat_check {
5650	void *opaque;
5651	int *ret;
5652};
5653
5654static void check_processor_compat(void *data)
5655{
5656	struct kvm_cpu_compat_check *c = data;
5657
5658	*c->ret = kvm_arch_check_processor_compat(c->opaque);
5659}
5660
5661int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5662		  struct module *module)
5663{
5664	struct kvm_cpu_compat_check c;
5665	int r;
5666	int cpu;
5667
5668	r = kvm_arch_init(opaque);
5669	if (r)
5670		goto out_fail;
5671
5672	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like Intel and AMD on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid
	 * conflicts in case kvm is already set up for another implementation.
5678	 */
5679	r = kvm_irqfd_init();
5680	if (r)
5681		goto out_irqfd;
5682
5683	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5684		r = -ENOMEM;
5685		goto out_free_0;
5686	}
5687
5688	r = kvm_arch_hardware_setup(opaque);
5689	if (r < 0)
5690		goto out_free_1;
5691
5692	c.ret = &r;
5693	c.opaque = opaque;
5694	for_each_online_cpu(cpu) {
5695		smp_call_function_single(cpu, check_processor_compat, &c, 1);
5696		if (r < 0)
5697			goto out_free_2;
5698	}
5699
5700	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5701				      kvm_starting_cpu, kvm_dying_cpu);
5702	if (r)
5703		goto out_free_2;
5704	register_reboot_notifier(&kvm_reboot_notifier);
5705
5706	/* A kmem cache lets us meet the alignment requirements of fx_save. */
5707	if (!vcpu_align)
5708		vcpu_align = __alignof__(struct kvm_vcpu);
5709	kvm_vcpu_cache =
5710		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5711					   SLAB_ACCOUNT,
5712					   offsetof(struct kvm_vcpu, arch),
5713					   offsetofend(struct kvm_vcpu, stats_id)
5714					   - offsetof(struct kvm_vcpu, arch),
5715					   NULL);
5716	if (!kvm_vcpu_cache) {
5717		r = -ENOMEM;
5718		goto out_free_3;
5719	}
5720
5721	for_each_possible_cpu(cpu) {
5722		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5723					    GFP_KERNEL, cpu_to_node(cpu))) {
5724			r = -ENOMEM;
5725			goto out_free_4;
5726		}
5727	}
5728
5729	r = kvm_async_pf_init();
5730	if (r)
5731		goto out_free_5;
5732
5733	kvm_chardev_ops.owner = module;
5734	kvm_vm_fops.owner = module;
5735	kvm_vcpu_fops.owner = module;
5736
5737	r = misc_register(&kvm_dev);
5738	if (r) {
5739		pr_err("kvm: misc device register failed\n");
5740		goto out_unreg;
5741	}
5742
5743	register_syscore_ops(&kvm_syscore_ops);
5744
5745	kvm_preempt_ops.sched_in = kvm_sched_in;
5746	kvm_preempt_ops.sched_out = kvm_sched_out;
5747
5748	kvm_init_debug();
5749
5750	r = kvm_vfio_ops_init();
5751	WARN_ON(r);
5752
5753	return 0;
5754
5755out_unreg:
5756	kvm_async_pf_deinit();
5757out_free_5:
5758	for_each_possible_cpu(cpu)
5759		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5760out_free_4:
5761	kmem_cache_destroy(kvm_vcpu_cache);
5762out_free_3:
5763	unregister_reboot_notifier(&kvm_reboot_notifier);
5764	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5765out_free_2:
5766	kvm_arch_hardware_unsetup();
5767out_free_1:
5768	free_cpumask_var(cpus_hardware_enabled);
5769out_free_0:
5770	kvm_irqfd_exit();
5771out_irqfd:
5772	kvm_arch_exit();
5773out_fail:
5774	return r;
5775}
5776EXPORT_SYMBOL_GPL(kvm_init);
5777
5778void kvm_exit(void)
5779{
5780	int cpu;
5781
5782	debugfs_remove_recursive(kvm_debugfs_dir);
5783	misc_deregister(&kvm_dev);
5784	for_each_possible_cpu(cpu)
5785		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5786	kmem_cache_destroy(kvm_vcpu_cache);
5787	kvm_async_pf_deinit();
5788	unregister_syscore_ops(&kvm_syscore_ops);
5789	unregister_reboot_notifier(&kvm_reboot_notifier);
5790	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5791	on_each_cpu(hardware_disable_nolock, NULL, 1);
5792	kvm_arch_hardware_unsetup();
5793	kvm_arch_exit();
5794	kvm_irqfd_exit();
5795	free_cpumask_var(cpus_hardware_enabled);
5796	kvm_vfio_ops_exit();
5797}
5798EXPORT_SYMBOL_GPL(kvm_exit);
5799
5800struct kvm_vm_worker_thread_context {
5801	struct kvm *kvm;
5802	struct task_struct *parent;
5803	struct completion init_done;
5804	kvm_vm_thread_fn_t thread_fn;
5805	uintptr_t data;
5806	int err;
5807};
5808
5809static int kvm_vm_worker_thread(void *context)
5810{
5811	/*
5812	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization.
5814	 */
5815	struct kvm_vm_worker_thread_context *init_context = context;
5816	struct kvm *kvm = init_context->kvm;
5817	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5818	uintptr_t data = init_context->data;
5819	int err;
5820
5821	err = kthread_park(current);
5822	/* kthread_park(current) is never supposed to return an error */
5823	WARN_ON(err != 0);
5824	if (err)
5825		goto init_complete;
5826
5827	err = cgroup_attach_task_all(init_context->parent, current);
5828	if (err) {
5829		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5830			__func__, err);
5831		goto init_complete;
5832	}
5833
5834	set_user_nice(current, task_nice(init_context->parent));
5835
5836init_complete:
5837	init_context->err = err;
5838	complete(&init_context->init_done);
5839	init_context = NULL;
5840
5841	if (err)
5842		return err;
5843
5844	/* Wait to be woken up by the spawner before proceeding. */
5845	kthread_parkme();
5846
5847	if (!kthread_should_stop())
5848		err = thread_fn(kvm, data);
5849
5850	return err;
5851}
5852
5853int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5854				uintptr_t data, const char *name,
5855				struct task_struct **thread_ptr)
5856{
5857	struct kvm_vm_worker_thread_context init_context = {};
5858	struct task_struct *thread;
5859
5860	*thread_ptr = NULL;
5861	init_context.kvm = kvm;
5862	init_context.parent = current;
5863	init_context.thread_fn = thread_fn;
5864	init_context.data = data;
5865	init_completion(&init_context.init_done);
5866
5867	thread = kthread_run(kvm_vm_worker_thread, &init_context,
5868			     "%s-%d", name, task_pid_nr(current));
5869	if (IS_ERR(thread))
5870		return PTR_ERR(thread);
5871
5872	/* kthread_run is never supposed to return NULL */
5873	WARN_ON(thread == NULL);
5874
5875	wait_for_completion(&init_context.init_done);
5876
5877	if (!init_context.err)
5878		*thread_ptr = thread;
5879
5880	return init_context.err;
5881}
5882