svm.c revision 271415
1/*-
2 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: projects/bhyve_svm/sys/amd64/vmm/amd/svm.c 271415 2014-09-11 02:37:02Z neel $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/smp.h>
33#include <sys/kernel.h>
34#include <sys/malloc.h>
35#include <sys/pcpu.h>
36#include <sys/proc.h>
37
38#include <vm/vm.h>
39#include <vm/pmap.h>
40
41#include <machine/cpufunc.h>
42#include <machine/psl.h>
43#include <machine/pmap.h>
44#include <machine/md_var.h>
45#include <machine/vmparam.h>
46#include <machine/specialreg.h>
47#include <machine/segments.h>
48#include <machine/smp.h>
49#include <machine/vmm.h>
50#include <machine/vmm_dev.h>
51#include <machine/vmm_instruction_emul.h>
52
53#include <x86/apicreg.h>
54
55#include "vmm_lapic.h"
56#include "vmm_msr.h"
57#include "vmm_stat.h"
58#include "vmm_ktr.h"
59#include "vmm_ioport.h"
60#include "vatpic.h"
61#include "vlapic.h"
62#include "vlapic_priv.h"
63
64#include "x86.h"
65#include "vmcb.h"
66#include "svm.h"
67#include "svm_softc.h"
68#include "npt.h"
69
70/*
71 * SVM CPUID function 0x8000_000A, edx bit decoding.
72 */
73#define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
74#define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
75#define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
76#define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
77#define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
78#define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
79#define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
80#define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
81#define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
82#define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
83
84#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
85				VMCB_CACHE_IOPM		|	\
86				VMCB_CACHE_I		|	\
87				VMCB_CACHE_TPR		|	\
88				VMCB_CACHE_NP)
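
/*
 * Editorial cross-reference: these are the VMCB state-cache ("clean")
 * bits that bhyve allows the processor to cache across VMRUN.  Any of
 * them marked dirty via vcpu_set_dirty() are cleared again in
 * svm_vmrun():
 *
 *	ctrl->vmcb_clean = VMCB_CACHE_DEFAULT & ~vcpustate->dirty;
 */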
89
90MALLOC_DEFINE(M_SVM, "svm", "svm");
91MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
92
93/* Per-CPU context area. */
94extern struct pcpu __pcpu[];
95
96static int svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc);
97
98static uint32_t svm_feature;	/* AMD SVM features. */
99
100/* Maximum ASIDs supported by the processor */
101static uint32_t nasid;
102
103/* Current ASID generation for each host cpu */
104static struct asid asid[MAXCPU];
105
106/*
107 * SVM host state saved area of size 4KB for each core.
108 */
109static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
110
111/*
112 * S/w saved host context.
113 */
114static struct svm_regctx host_ctx[MAXCPU];
115
116static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
117static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
118static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
119
120/*
121 * Common function to enable or disable SVM on a CPU.
122 */
123static int
124cpu_svm_enable_disable(boolean_t enable)
125{
126	uint64_t efer_msr;
127
128	efer_msr = rdmsr(MSR_EFER);
129
130	if (enable)
131		efer_msr |= EFER_SVM;
132	else
133		efer_msr &= ~EFER_SVM;
134
135	wrmsr(MSR_EFER, efer_msr);
136
137	return(0);
138}
139
140/*
141 * Disable SVM on a CPU.
142 */
143static void
144svm_disable(void *arg __unused)
145{
146
147	(void)cpu_svm_enable_disable(FALSE);
148}
149
150/*
151 * Disable SVM for all CPUs.
152 */
153static int
154svm_cleanup(void)
155{
156
157	smp_rendezvous(NULL, svm_disable, NULL, NULL);
158	return (0);
159}
160
161/*
162 * Check for the SVM features required by bhyve.
163 */
164static int
165svm_cpuid_features(void)
166{
167	u_int regs[4];
168
169	/* CPUID Fn8000_000A is for SVM */
170	do_cpuid(0x8000000A, regs);
171	svm_feature = regs[3];
172
173	printf("SVM rev: 0x%x NASID:0x%x\n", regs[0] & 0xFF, regs[1]);
174	nasid = regs[1];
175	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
176
177	printf("SVM Features:0x%b\n", svm_feature,
178		"\020"
179		"\001NP"		/* Nested paging */
180		"\002LbrVirt"		/* LBR virtualization */
181		"\003SVML"		/* SVM lock */
182		"\004NRIPS"		/* NRIP save */
183		"\005TscRateMsr"	/* MSR based TSC rate control */
184		"\006VmcbClean"		/* VMCB clean bits */
185		"\007FlushByAsid"	/* Flush by ASID */
186		"\010DecodeAssist"	/* Decode assist */
187		"\011<b8>"
188		"\012<b9>"
189		"\013PauseFilter"
190		"\014<b11>"
191		"\015PauseFilterThreshold"
192		"\016AVIC"
193		);
194
195	/* SVM Lock */
196	if (!(svm_feature & AMD_CPUID_SVM_SVML)) {
197		printf("SVM is disabled by BIOS, please enable in BIOS.\n");
198		return (ENXIO);
199	}
200
201	/*
202	 * bhyve needs RVI (nested paging) to work.
203	 */
204	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
205		printf("Missing Nested paging or RVI SVM support in processor.\n");
206		return (EIO);
207	}
208
209	if (svm_feature & AMD_CPUID_SVM_NRIP_SAVE)
210		return (0);
211
212	return (EIO);
213}
214
215static __inline int
216flush_by_asid(void)
217{
218	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
219}
220
221/*
222 * Enable SVM for a CPU.
223 */
224static void
225svm_enable(void *arg __unused)
226{
227	uint64_t hsave_pa;
228
229	(void)cpu_svm_enable_disable(TRUE);
230
231	hsave_pa = vtophys(hsave[curcpu]);
232	wrmsr(MSR_VM_HSAVE_PA, hsave_pa);
233
234	if (rdmsr(MSR_VM_HSAVE_PA) != hsave_pa) {
235		panic("VM_HSAVE_PA is wrong on CPU%d\n", curcpu);
236	}
237}
238
239/*
240 * Check if the processor supports SVM.
241 */
242static int
243is_svm_enabled(void)
244{
245	uint64_t msr;
246
247	 /* Section 15.4 Enabling SVM from APM2. */
248	if ((amd_feature2 & AMDID2_SVM) == 0) {
249		printf("SVM is not supported on this processor.\n");
250		return (ENXIO);
251	}
252
253	msr = rdmsr(MSR_VM_CR);
254	/* Make sure SVM is not disabled by BIOS. */
255	if ((msr & VM_CR_SVMDIS) == 0) {
256		return svm_cpuid_features();
257	}
258
259	printf("SVM disabled by Key, consult TPM/BIOS manual.\n");
260	return (ENXIO);
261}
262
263/*
264 * Enable SVM on CPU and initialize nested page table h/w.
265 */
266static int
267svm_init(int ipinum)
268{
269	int err, cpu;
270
271	err = is_svm_enabled();
272	if (err)
273		return (err);
274
275	for (cpu = 0; cpu < MAXCPU; cpu++) {
276		/*
277		 * Initialize the host ASIDs to their "highest" valid values.
278		 *
279		 * The next ASID allocation will rollover both 'gen' and 'num'
280		 * and start off the sequence at {1,1}.
281		 */
282		asid[cpu].gen = ~0UL;
283		asid[cpu].num = nasid - 1;
284	}
285
286	svm_npt_init(ipinum);
287
288	/* Start SVM on all CPUs */
289	smp_rendezvous(NULL, svm_enable, NULL, NULL);
290
291	return (0);
292}
293
294static void
295svm_restore(void)
296{
297	svm_enable(NULL);
298}
299
300/*
301 * Get the index and bit position for an MSR in the MSR permission
302 * bitmap. Two bits are used for each MSR: the lower bit controls
303 * read access and the higher bit controls write access.
304 */
305static int
306svm_msr_index(uint64_t msr, int *index, int *bit)
307{
308	uint32_t base, off;
309
310/* Pentium compatible MSRs */
311#define MSR_PENTIUM_START 	0
312#define MSR_PENTIUM_END 	0x1FFF
313/* AMD 6th generation and Intel compatible MSRs */
314#define MSR_AMD6TH_START 	0xC0000000UL
315#define MSR_AMD6TH_END 		0xC0001FFFUL
316/* AMD 7th and 8th generation compatible MSRs */
317#define MSR_AMD7TH_START 	0xC0010000UL
318#define MSR_AMD7TH_END 		0xC0011FFFUL
319
320	*index = -1;
321	*bit = (msr % 4) * 2;
322	base = 0;
323
324	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
325		*index = msr / 4;
326		return (0);
327	}
328
329	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
330	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
331		off = (msr - MSR_AMD6TH_START);
332		*index = (off + base) / 4;
333		return (0);
334	}
335
336	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
337	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
338		off = (msr - MSR_AMD7TH_START);
339		*index = (off + base) / 4;
340		return (0);
341	}
342
343	return (EIO);
344}
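
/*
 * Editorial example (not part of the original code): a worked pass
 * through svm_msr_index() for MSR_STAR (0xC0000081), which falls in the
 * AMD 6th-generation range:
 *
 *	off   = 0xC0000081 - MSR_AMD6TH_START		= 0x81
 *	base  = MSR_PENTIUM_END - MSR_PENTIUM_START + 1	= 0x2000
 *	index = (0x81 + 0x2000) / 4			= 0x820
 *	bit   = (0xC0000081 % 4) * 2			= 2
 *
 * Read access is therefore controlled by bit 2 of perm_bitmap[0x820]
 * and write access by bit 3, matching the two-bits-per-MSR layout
 * described above.
 */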
345
346/*
347 * Give the virtual cpu the requested access (read and/or write) to the MSR.
348 */
349static int
350svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
351{
352	int index, bit, err;
353
354	err = svm_msr_index(msr, &index, &bit);
355	if (err) {
356		ERR("MSR 0x%lx is not supported by the permission bitmap.\n", msr);
357		return (err);
358	}
359
360	if (index < 0 || index > (SVM_MSR_BITMAP_SIZE)) {
361		ERR("MSR 0x%lx index out of range(%d).\n", msr, index);
362		return (EINVAL);
363	}
364	if (bit < 0 || bit > 8) {
365		ERR("MSR 0x%lx bit out of range(%d).\n", msr, bit);
366		return (EINVAL);
367	}
368
369	/* Disable intercept for read and write. */
370	if (read)
371		perm_bitmap[index] &= ~(1UL << bit);
372	if (write)
373		perm_bitmap[index] &= ~(2UL << bit);
374	CTR2(KTR_VMM, "Guest has control:0x%x on SVM:MSR(0x%lx).\n",
375		(perm_bitmap[index] >> bit) & 0x3, msr);
376
377	return (0);
378}
379
380static int
381svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
382{
383	return svm_msr_perm(perm_bitmap, msr, true, true);
384}
385
386static int
387svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
388{
389	return svm_msr_perm(perm_bitmap, msr, true, false);
390}
391
392static __inline void
393vcpu_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
394{
395	struct svm_vcpu *vcpustate;
396
397	vcpustate = svm_get_vcpu(sc, vcpu);
398
399	vcpustate->dirty |= dirtybits;
400}
401
402static __inline int
403svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
404{
405	struct vmcb_ctrl *ctrl;
406
407	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
408
409	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
410	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
411}
412
413static __inline void
414svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
415    int enabled)
416{
417	struct vmcb_ctrl *ctrl;
418	uint32_t oldval;
419
420	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
421
422	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
423	oldval = ctrl->intercept[idx];
424
425	if (enabled)
426		ctrl->intercept[idx] |= bitmask;
427	else
428		ctrl->intercept[idx] &= ~bitmask;
429
430	if (ctrl->intercept[idx] != oldval) {
431		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_I);
432		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
433		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
434	}
435}
436
437static __inline void
438svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
439{
440	svm_set_intercept(sc, vcpu, off, bitmask, 0);
441}
442
443static __inline void
444svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
445{
446	svm_set_intercept(sc, vcpu, off, bitmask, 1);
447}
448
449static void
450vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
451    uint64_t msrpm_base_pa, uint64_t np_pml4)
452{
453	struct vmcb_ctrl *ctrl;
454	struct vmcb_state *state;
455	uint32_t mask;
456	int n;
457
458	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
459	state = svm_get_vmcb_state(sc, vcpu);
460
461	ctrl->iopm_base_pa = iopm_base_pa;
462	ctrl->msrpm_base_pa = msrpm_base_pa;
463
464	/* Enable nested paging */
465	ctrl->np_enable = 1;
466	ctrl->n_cr3 = np_pml4;
467
468	/*
469	 * Intercept accesses to the control registers that are not shadowed
470	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
471	 */
472	for (n = 0; n < 16; n++) {
473		mask = (BIT(n) << 16) | BIT(n);
474		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
475			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
476		else
477			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
478	}
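
	/*
	 * Editorial note (per the APMv2 VMCB layout): bits 0-15 of this
	 * intercept vector control reads of %cr0-%cr15 and bits 16-31
	 * control writes, so the mask computed above for %cr0 is
	 * (BIT(0) << 16) | BIT(0) == 0x00010001, covering both accesses.
	 */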
479
480	/* Intercept Machine Check exceptions. */
481	svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
482
483	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
484	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
485	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
486	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
487	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
488	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
489	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
490	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
491	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
492	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
493	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
494	    VMCB_INTCPT_FERR_FREEZE);
495
496	/*
497	 * From section "Canonicalization and Consistency Checks" in APMv2
498	 * the VMRUN intercept bit must be set to pass the consistency check.
499	 */
500	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
501
502	/*
503	 * The ASID will be set to a non-zero value just before VMRUN.
504	 */
505	ctrl->asid = 0;
506
507	/*
508	 * Section 15.21.1, Interrupt Masking in EFLAGS
509	 * Section 15.21.2, Virtualizing APIC.TPR
510	 *
511	 * This must be set for %rflags and %cr8 isolation of guest and host.
512	 */
513	ctrl->v_intr_masking = 1;
514
515	/* Enable Last Branch Record aka LBR for debugging */
516	ctrl->lbr_virt_en = 1;
517	state->dbgctl = BIT(0);
518
519	/* EFER_SVM must always be set when the guest is executing */
520	state->efer = EFER_SVM;
521
522	/* Set up the PAT to power-on state */
523	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
524	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
525	    PAT_VALUE(2, PAT_UNCACHED)		|
526	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
527	    PAT_VALUE(4, PAT_WRITE_BACK)	|
528	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
529	    PAT_VALUE(6, PAT_UNCACHED)		|
530	    PAT_VALUE(7, PAT_UNCACHEABLE);
531}
532
533/*
534 * Initialise a virtual machine.
535 */
536static void *
537svm_vminit(struct vm *vm, pmap_t pmap)
538{
539	struct svm_softc *svm_sc;
540	struct svm_vcpu *vcpu;
541	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
542	int i;
543
544	svm_sc = (struct svm_softc *)malloc(sizeof (struct svm_softc),
545			M_SVM, M_WAITOK | M_ZERO);
546
547	svm_sc->vm = vm;
548	svm_sc->svm_feature = svm_feature;
549	svm_sc->vcpu_cnt = VM_MAXCPU;
550	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
551
552	/*
553	 * Intercept access to all MSRs by default; exceptions (GSBASE, FSBASE, etc.) follow below.
554	 */
555	 memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));
556
557	/*
558	 * The following MSRs can be given entirely to the virtual machine
559	 * because accesses to them are redirected to fields in the VMCB.
560	 */
561	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
562	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
563	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
564
565	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
566	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
567	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
568	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
569	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
570	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
571	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
572
573	/* For Nested Paging/RVI only. */
574	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
575
576	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
577	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
578
579	 /* Intercept access to all I/O ports. */
580	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));
581
582	/* Cache physical address for multiple vcpus. */
583	iopm_pa = vtophys(svm_sc->iopm_bitmap);
584	msrpm_pa = vtophys(svm_sc->msr_bitmap);
585	pml4_pa = svm_sc->nptp;
586
587	for (i = 0; i < svm_sc->vcpu_cnt; i++) {
588		vcpu = svm_get_vcpu(svm_sc, i);
589		vcpu->lastcpu = NOCPU;
590		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
591		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
592	}
593	return (svm_sc);
594}
595
596static int
597svm_cpl(struct vmcb_state *state)
598{
599
600	/*
601	 * From APMv2:
602	 *   "Retrieve the CPL from the CPL field in the VMCB, not
603	 *    from any segment DPL"
604	 */
605	return (state->cpl);
606}
607
608static enum vm_cpu_mode
609svm_vcpu_mode(struct vmcb *vmcb)
610{
611	struct vmcb_segment *seg;
612	struct vmcb_state *state;
613
614	state = &vmcb->state;
615
616	if (state->efer & EFER_LMA) {
617		seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
618		/*
619		 * Section 4.8.1 of APM2: check whether the code segment has
620		 * the Long (L) attribute set in its descriptor.
621		 */
622		if (seg->attrib & VMCB_CS_ATTRIB_L)
623			return (CPU_MODE_64BIT);
624		else
625			return (CPU_MODE_COMPATIBILITY);
626	} else  if (state->cr0 & CR0_PE) {
627		return (CPU_MODE_PROTECTED);
628	} else {
629		return (CPU_MODE_REAL);
630	}
631}
632
633static enum vm_paging_mode
634svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
635{
636
637	if ((cr0 & CR0_PG) == 0)
638		return (PAGING_MODE_FLAT);
639	if ((cr4 & CR4_PAE) == 0)
640		return (PAGING_MODE_32);
641	if (efer & EFER_LME)
642		return (PAGING_MODE_64);
643	else
644		return (PAGING_MODE_PAE);
645}
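
/*
 * Editorial summary of the decision tree above:
 *
 *	CR0.PG	CR4.PAE	EFER.LME	paging mode
 *	  0	   x	   x		FLAT
 *	  1	   0	   x		32-bit
 *	  1	   1	   0		PAE
 *	  1	   1	   1		64-bit (long mode)
 */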
646
647/*
648 * ins/outs utility routines
649 */
650static uint64_t
651svm_inout_str_index(struct svm_regctx *regs, int in)
652{
653	uint64_t val;
654
655	val = in ? regs->e.g.sctx_rdi : regs->e.g.sctx_rsi;
656
657	return (val);
658}
659
660static uint64_t
661svm_inout_str_count(struct svm_regctx *regs, int rep)
662{
663	uint64_t val;
664
665	val = rep ? regs->sctx_rcx : 1;
666
667	return (val);
668}
669
670static void
671svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
672    int in, struct vm_inout_str *vis)
673{
674	int error, s;
675
676	if (in) {
677		vis->seg_name = VM_REG_GUEST_ES;
678	} else {
679		/* The segment field has standard encoding */
680		s = (info1 >> 10) & 0x7;
681		vis->seg_name = vm_segment_name(s);
682	}
683
684	error = svm_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
685	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
686}
687
688static int
689svm_inout_str_addrsize(uint64_t info1)
690{
691        uint32_t size;
692
693        size = (info1 >> 7) & 0x7;
694        switch (size) {
695        case 1:
696                return (2);     /* 16 bit */
697        case 2:
698                return (4);     /* 32 bit */
699        case 4:
700                return (8);     /* 64 bit */
701        default:
702                panic("%s: invalid size encoding %d", __func__, size);
703        }
704}
705
706static void
707svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
708{
709	struct vmcb_state *state;
710
711	state = &vmcb->state;
712	paging->cr3 = state->cr3;
713	paging->cpl = svm_cpl(state);
714	paging->cpu_mode = svm_vcpu_mode(vmcb);
715	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
716	    state->efer);
717}
718
719
720/*
721 * Handle guest I/O intercept.
722 */
723static bool
724svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
725{
726	struct vmcb_ctrl *ctrl;
727	struct vmcb_state *state;
728	struct svm_regctx *regs;
729	struct vm_inout_str *vis;
730	uint64_t info1;
731
732	state = svm_get_vmcb_state(svm_sc, vcpu);
733	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
734	regs  = svm_get_guest_regctx(svm_sc, vcpu);
735	info1 = ctrl->exitinfo1;
736
737	vmexit->exitcode 	= VM_EXITCODE_INOUT;
738	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
739	vmexit->u.inout.string 	= (info1 & BIT(2)) ? 1 : 0;
740	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
741	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
742	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
743	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
744
745	if (vmexit->u.inout.string) {
746		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
747		vis = &vmexit->u.inout_str;
748		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
749		vis->rflags = state->rflags;
750		vis->cr0 = state->cr0;
751		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
752		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
753		vis->addrsize = svm_inout_str_addrsize(info1);
754		svm_inout_str_seginfo(svm_sc, vcpu, info1,
755		    vmexit->u.inout.in, vis);
756	}
757
758	return (false);
759}
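
/*
 * Editorial example (IOIO EXITINFO1 layout per APMv2): a single
 * "outb %al, $0x3f8" by the guest produces
 *
 *	exitinfo1 = (0x3f8 << 16) | SZ8 = 0x03f80010
 *
 * which the decoding above turns into in=0, string=0, rep=0, bytes=1
 * and port=0x3f8.
 */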
760
761static int
762svm_npf_paging(uint64_t exitinfo1)
763{
764
765	if (exitinfo1 & VMCB_NPF_INFO1_W)
766		return (VM_PROT_WRITE);
767
768	return (VM_PROT_READ);
769}
770
771static bool
772svm_npf_emul_fault(uint64_t exitinfo1)
773{
774
775	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
776		return (false);
777	}
778
779	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
780		return (false);
781	}
782
783	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
784		return (false);
785	}
786
787	return (true);
788}
789
790static void
791svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
792{
793	struct vm_guest_paging *paging;
794	struct vmcb_segment *seg;
795
796	paging = &vmexit->u.inst_emul.paging;
797	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
798	vmexit->u.inst_emul.gpa = gpa;
799	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
800	svm_paging_info(vmcb, paging);
801
802	/*
803	 * Without the DecodeAssist SVM feature the instruction length at the
804	 * NPF is not available. RIP will be advanced by the length determined
805	 * during instruction emulation.
806	 */
807	vmexit->inst_length = VIE_INST_SIZE;
808
809	seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
810	switch(paging->cpu_mode) {
811	case CPU_MODE_PROTECTED:
812	case CPU_MODE_COMPATIBILITY:
813		/*
814		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
815		 */
816		vmexit->u.inst_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
817		    1 : 0;
818		break;
819	default:
820		vmexit->u.inst_emul.cs_d = 0;
821		break;
822	}
823}
824
825/*
826 * Intercept access to MSR_EFER to prevent the guest from clearing the
827 * SVM enable bit.
828 */
829static void
830svm_write_efer(struct svm_softc *sc, int vcpu, uint32_t edx, uint32_t eax)
831{
832	struct vmcb_state *state;
833	uint64_t oldval;
834
835	state = svm_get_vmcb_state(sc, vcpu);
836
837	oldval = state->efer;
838	state->efer = (uint64_t)edx << 32 | eax | EFER_SVM;
839	if (state->efer != oldval) {
840		VCPU_CTR2(sc->vm, vcpu, "Guest EFER changed from %#lx to %#lx",
841		    oldval, state->efer);
842		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_CR);
843	}
844}
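
/*
 * Editorial example: if the guest executes wrmsr(MSR_EFER, 0x500)
 * (EFER.LME | EFER.LMA), the value written to the VMCB above is 0x1500
 * because EFER_SVM (bit 12, the architectural SVME bit) is forced back
 * on, so the guest can never turn off SVM underneath the hypervisor.
 */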
845
846#ifdef KTR
847static const char *
848intrtype_to_str(int intr_type)
849{
850	switch (intr_type) {
851	case VMCB_EVENTINJ_TYPE_INTR:
852		return ("hwintr");
853	case VMCB_EVENTINJ_TYPE_NMI:
854		return ("nmi");
855	case VMCB_EVENTINJ_TYPE_INTn:
856		return ("swintr");
857	case VMCB_EVENTINJ_TYPE_EXCEPTION:
858		return ("exception");
859	default:
860		panic("%s: unknown intr_type %d", __func__, intr_type);
861	}
862}
863#endif
864
865/*
866 * Inject an event to vcpu as described in section 15.20, "Event injection".
867 */
868static void
869svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
870		 uint32_t error, bool ec_valid)
871{
872	struct vmcb_ctrl *ctrl;
873
874	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
875
876	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
877	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
878
879	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
880	    __func__, vector));
881
882	switch (intr_type) {
883	case VMCB_EVENTINJ_TYPE_INTR:
884	case VMCB_EVENTINJ_TYPE_NMI:
885	case VMCB_EVENTINJ_TYPE_INTn:
886		break;
887	case VMCB_EVENTINJ_TYPE_EXCEPTION:
888		if (vector >= 0 && vector <= 31 && vector != 2)
889			break;
890		/* FALLTHROUGH */
891	default:
892		panic("%s: invalid intr_type/vector: %d/%d", __func__,
893		    intr_type, vector);
894	}
895	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
896	if (ec_valid) {
897		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
898		ctrl->eventinj |= (uint64_t)error << 32;
899		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
900		    intrtype_to_str(intr_type), vector, error);
901	} else {
902		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
903		    intrtype_to_str(intr_type), vector);
904	}
905}
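
/*
 * Editorial example (EVENTINJ layout per APMv2, assuming the usual
 * vmcb.h encodings VMCB_EVENTINJ_VALID == BIT(31) and
 * VMCB_EVENTINJ_EC_VALID == BIT(11)): injecting a #GP (vector 13,
 * type EXCEPTION == 3) with an error code of 0 yields
 *
 *	eventinj = 13 | (3 << 8) | EC_VALID | VALID = 0x80000b0d
 *
 * with the 32-bit error code stored in bits 63:32.
 */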
906
907static void
908svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
909{
910	struct vmcb_ctrl *ctrl;
911	uint64_t intinfo;
912
913	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
914	intinfo = ctrl->exitintinfo;
915	if (!VMCB_EXITINTINFO_VALID(intinfo))
916		return;
917
918	/*
919	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
920	 *
921	 * If a #VMEXIT happened during event delivery then record the event
922	 * that was being delivered.
923	 */
924	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
925		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
926	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
927	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
928}
929
930static __inline void
931enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
932{
933	struct vmcb_ctrl *ctrl;
934
935	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
936
937	if (ctrl->v_irq == 0) {
938		VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
939		ctrl->v_irq = 1;
940		ctrl->v_ign_tpr = 1;
941		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
942		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
943		    VMCB_INTCPT_VINTR);
944	}
945}
946
947static __inline void
948disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
949{
950	struct vmcb_ctrl *ctrl;
951
952	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
953
954	if (ctrl->v_irq) {
955		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
956		ctrl->v_irq = 0;
957		vcpu_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
958		svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
959		    VMCB_INTCPT_VINTR);
960	}
961}
962
963static int
964nmi_blocked(struct svm_softc *sc, int vcpu)
965{
966	/* XXX need to track NMI blocking */
967	return (0);
968}
969
970static void
971enable_nmi_blocking(struct svm_softc *sc, int vcpu)
972{
973	/* XXX enable iret intercept */
974}
975
976#ifdef notyet
977static void
978clear_nmi_blocking(struct svm_softc *sc, int vcpu)
979{
980	/* XXX disable iret intercept */
981}
982#endif
983
984#ifdef KTR
985static const char *
986exit_reason_to_str(uint64_t reason)
987{
988	static char reasonbuf[32];
989
990	switch (reason) {
991	case VMCB_EXIT_INVALID:
992		return ("invalvmcb");
993	case VMCB_EXIT_SHUTDOWN:
994		return ("shutdown");
995	case VMCB_EXIT_NPF:
996		return ("nptfault");
997	case VMCB_EXIT_PAUSE:
998		return ("pause");
999	case VMCB_EXIT_HLT:
1000		return ("hlt");
1001	case VMCB_EXIT_CPUID:
1002		return ("cpuid");
1003	case VMCB_EXIT_IO:
1004		return ("inout");
1005	case VMCB_EXIT_MC:
1006		return ("mchk");
1007	case VMCB_EXIT_INTR:
1008		return ("extintr");
1009	case VMCB_EXIT_VINTR:
1010		return ("vintr");
1011	case VMCB_EXIT_MSR:
1012		return ("msr");
1013	default:
1014		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
1015		return (reasonbuf);
1016	}
1017}
1018#endif	/* KTR */
1019
1020/*
1021 * Determine the cause of virtual cpu exit and handle VMEXIT.
1022 * Return: false - Break vcpu execution loop and handle vmexit
1023 *		   in kernel or user space.
1024 *	   true  - Continue vcpu run.
1025 */
1026static bool
1027svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1028{
1029	struct vmcb_state *state;
1030	struct vmcb_ctrl *ctrl;
1031	struct svm_regctx *ctx;
1032	uint64_t code, info1, info2, val;
1033	uint32_t eax, ecx, edx;
1034	bool update_rip, loop, retu;
1035
1036	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1037
1038	state = svm_get_vmcb_state(svm_sc, vcpu);
1039	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
1040	ctx   = svm_get_guest_regctx(svm_sc, vcpu);
1041	code  = ctrl->exitcode;
1042	info1 = ctrl->exitinfo1;
1043	info2 = ctrl->exitinfo2;
1044
1045	update_rip = true;
1046	loop = true;
1047	vmexit->exitcode = VM_EXITCODE_VMX;
1048	vmexit->u.vmx.status = 0;
1049
1050	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1051
1052	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1053	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
1054
1055	svm_save_intinfo(svm_sc, vcpu);
1056
1057	switch (code) {
1058		case	VMCB_EXIT_VINTR:
1059			update_rip = false;
1060			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1061			break;
1062		case	VMCB_EXIT_MC: /* Machine Check. */
1063			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MTRAP, 1);
1064			vmexit->exitcode = VM_EXITCODE_MTRAP;
1065			loop = false;
1066			break;
1067
1068		case	VMCB_EXIT_MSR:	/* MSR access. */
1069			eax = state->rax;
1070			ecx = ctx->sctx_rcx;
1071			edx = ctx->e.g.sctx_rdx;
1072
1073			if (ecx == MSR_EFER) {
1074				KASSERT(info1 != 0, ("rdmsr(MSR_EFER) is not "
1075				    "emulated: info1(%#lx) info2(%#lx)",
1076				    info1, info2));
1077				svm_write_efer(svm_sc, vcpu, edx, eax);
1078				break;
1079			}
1080
1081			retu = false;
1082			if (info1) {
1083				/* VM exited because of write MSR */
1084				vmm_stat_incr(svm_sc->vm, vcpu,
1085					VMEXIT_WRMSR, 1);
1086				vmexit->exitcode = VM_EXITCODE_WRMSR;
1087				vmexit->u.msr.code = ecx;
1088				val = (uint64_t)edx << 32 | eax;
1089				if (emulate_wrmsr(svm_sc->vm, vcpu, ecx, val,
1090					&retu)) {
1091					vmexit->u.msr.wval = val;
1092					loop = false;
1093				} else
1094					loop = retu ? false : true;
1095
1096				VCPU_CTR3(svm_sc->vm, vcpu,
1097					"VMEXIT WRMSR(%s handling) 0x%lx @0x%x",
1098					loop ? "kernel" : "user", val, ecx);
1099			} else {
1100				vmm_stat_incr(svm_sc->vm, vcpu,
1101					VMEXIT_RDMSR, 1);
1102				vmexit->exitcode = VM_EXITCODE_RDMSR;
1103				vmexit->u.msr.code = ecx;
1104				if (emulate_rdmsr(svm_sc->vm, vcpu, ecx,
1105					&retu)) {
1106					loop = false;
1107				} else
1108					loop = retu ? false : true;
1109				VCPU_CTR3(svm_sc->vm, vcpu, "SVM:VMEXIT RDMSR"
1110					" MSB=0x%08x, LSB=%08x @0x%x",
1111					ctx->e.g.sctx_rdx, state->rax, ecx);
1112			}
1113
1114#define MSR_AMDK8_IPM           0xc0010055
1115			/*
1116			 * We can't hide the AMD C1E idle capability since it is
1117			 * based on the CPU generation. For now, ignore guest
1118			 * accesses to this MSR.
1119			 * XXX: special handling of AMD C1E - ignore.
1120			 */
1121			 if (ecx == MSR_AMDK8_IPM)
1122				loop = true;
1123			break;
1124
1125		case VMCB_EXIT_INTR:
1126			/*
1127			 * Exit on an external interrupt.
1128			 * Let the host interrupt handler run; if the interrupt is
1129			 * destined for the guest, the local APIC will inject it.
1130			 */
1131			update_rip = false;
1132			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1133			break;
1134
1135		case VMCB_EXIT_IO:
1136			loop = svm_handle_io(svm_sc, vcpu, vmexit);
1137			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1138			break;
1139
1140		case VMCB_EXIT_CPUID:
1141			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1142			(void)x86_emulate_cpuid(svm_sc->vm, vcpu,
1143					(uint32_t *)&state->rax,
1144					(uint32_t *)&ctx->sctx_rbx,
1145					(uint32_t *)&ctx->sctx_rcx,
1146					(uint32_t *)&ctx->e.g.sctx_rdx);
1147			break;
1148
1149		case VMCB_EXIT_HLT:
1150			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1151			vmexit->exitcode = VM_EXITCODE_HLT;
1152			vmexit->u.hlt.rflags = state->rflags;
1153			loop = false;
1154			break;
1155
1156		case VMCB_EXIT_PAUSE:
1157			vmexit->exitcode = VM_EXITCODE_PAUSE;
1158			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1159
1160			break;
1161
1162		case VMCB_EXIT_NPF:
1163			loop = false;
1164			update_rip = false;
1165
1166        		if (info1 & VMCB_NPF_INFO1_RSV) {
1167 				VCPU_CTR2(svm_sc->vm, vcpu, "SVM_ERR:NPT"
1168					" reserved bit is set,"
1169					"INFO1:0x%lx INFO2:0x%lx .\n",
1170					info1, info2);
1171        			break;
1172			}
1173
1174			/* EXITINFO2 has the physical fault address (GPA). */
1175			if (vm_mem_allocated(svm_sc->vm, info2)) {
1176				vmexit->exitcode = VM_EXITCODE_PAGING;
1177				vmexit->u.paging.gpa = info2;
1178				vmexit->u.paging.fault_type =
1179					svm_npf_paging(info1);
1180				vmm_stat_incr(svm_sc->vm, vcpu,
1181					VMEXIT_NESTED_FAULT, 1);
1182				VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1183				    "on gpa %#lx/%#lx at rip %#lx",
1184				    info2, info1, state->rip);
1185			} else if (svm_npf_emul_fault(info1)) {
1186				svm_handle_inst_emul(svm_get_vmcb(svm_sc, vcpu),
1187					info2, vmexit);
1188				vmm_stat_incr(svm_sc->vm, vcpu,
1189				    VMEXIT_INST_EMUL, 1);
1190				VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
1191				    "for gpa %#lx/%#lx at rip %#lx",
1192				    info2, info1, state->rip);
1193			}
1194			break;
1195
1196		case VMCB_EXIT_SHUTDOWN:
1197			loop = false;
1198			break;
1199
1200		case VMCB_EXIT_INVALID:
1201			loop = false;
1202			break;
1203
1204		default:
1205			 /* Return to user space. */
1206			loop = false;
1207			update_rip = false;
1208			VCPU_CTR3(svm_sc->vm, vcpu, "VMEXIT=0x%lx"
1209				" EXITINFO1: 0x%lx EXITINFO2:0x%lx\n",
1210		 		ctrl->exitcode, info1, info2);
1211			VCPU_CTR3(svm_sc->vm, vcpu, "SVM:RIP: 0x%lx nRIP:0x%lx"
1212				" Inst decoder len:%d\n", state->rip,
1213				ctrl->nrip, ctrl->inst_decode_size);
1214			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1215			break;
1216	}
1217
1218	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx nrip %#lx",
1219	    loop ? "handled" : "unhandled", exit_reason_to_str(code),
1220	    state->rip, update_rip ? ctrl->nrip : state->rip);
1221
1222	vmexit->rip = state->rip;
1223	if (update_rip) {
1224		if (ctrl->nrip == 0) {
1225 			VCPU_CTR1(svm_sc->vm, vcpu, "SVM_ERR:nRIP is not set "
1226				 "for RIP0x%lx.\n", state->rip);
1227			vmexit->exitcode = VM_EXITCODE_VMX;
1228		} else
1229			vmexit->rip = ctrl->nrip;
1230	}
1231
1232	/* If vcpu execution is continued, update RIP. */
1233	if (loop) {
1234		state->rip = vmexit->rip;
1235	}
1236
1237	return (loop);
1238}
1239
1240static void
1241svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
1242{
1243	uint64_t intinfo;
1244
1245	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
1246		return;
1247
1248	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
1249	    "valid: %#lx", __func__, intinfo));
1250
1251	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
1252		VMCB_EXITINTINFO_VECTOR(intinfo),
1253		VMCB_EXITINTINFO_EC(intinfo),
1254		VMCB_EXITINTINFO_EC_VALID(intinfo));
1255	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1256	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
1257}
1258
1259/*
1260 * Inject event to virtual cpu.
1261 */
1262static void
1263svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
1264{
1265	struct vmcb_ctrl *ctrl;
1266	struct vmcb_state *state;
1267	int extint_pending;
1268	int vector, need_intr_window;
1269
1270	state = svm_get_vmcb_state(sc, vcpu);
1271	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1272
1273	need_intr_window = 0;
1274
1275	/*
1276	 * Inject pending events or exceptions for this vcpu.
1277	 *
1278	 * An event might be pending because the previous #VMEXIT happened
1279	 * during event delivery (i.e. ctrl->exitintinfo).
1280	 *
1281	 * An event might also be pending because an exception was injected
1282	 * by the hypervisor (e.g. #PF during instruction emulation).
1283	 */
1284	svm_inj_intinfo(sc, vcpu);
1285
1286	/* NMI event has priority over interrupts. */
1287	if (vm_nmi_pending(sc->vm, vcpu)) {
1288		if (nmi_blocked(sc, vcpu)) {
1289			/*
1290			 * Can't inject another NMI if the guest has not
1291			 * yet executed an "iret" after the last NMI.
1292			 */
1293			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
1294			    "to NMI-blocking");
1295		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1296			/*
1297			 * If there is already an exception/interrupt pending
1298			 * then defer the NMI until after that.
1299			 */
1300			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
1301			    "eventinj %#lx", ctrl->eventinj);
1302
1303			/*
1304			 * Use self-IPI to trigger a VM-exit as soon as
1305			 * possible after the event injection is completed.
1306			 *
1307			 * This works only if the external interrupt exiting
1308			 * is at a lower priority than the event injection.
1309			 *
1310			 * Although not explicitly specified in APMv2 the
1311			 * relative priorities were verified empirically.
1312			 */
1313			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
1314		} else {
1315			vm_nmi_clear(sc->vm, vcpu);
1316
1317			/* Inject NMI, vector number is not used */
1318			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
1319			    IDT_NMI, 0, false);
1320
1321			/* virtual NMI blocking is now in effect */
1322			enable_nmi_blocking(sc, vcpu);
1323
1324			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
1325		}
1326	}
1327
1328	extint_pending = vm_extint_pending(sc->vm, vcpu);
1329
1330	if (!extint_pending) {
1331		/* Ask the local apic for a vector to inject */
1332		if (!vlapic_pending_intr(vlapic, &vector)) {
1333			goto done;	/* nothing to inject */
1334		}
1335		KASSERT(vector >= 16 && vector <= 255,
1336		    ("invalid vector %d from local APIC", vector));
1337	} else {
1338                /* Ask the legacy pic for a vector to inject */
1339                vatpic_pending_intr(sc->vm, &vector);
1340		KASSERT(vector >= 0 && vector <= 255,
1341		    ("invalid vector %d from legacy PIC", vector));
1342	}
1343
1344	/*
1345	 * If the guest has disabled interrupts or is in an interrupt shadow
1346	 * then we cannot inject the pending interrupt.
1347	 */
1348	if ((state->rflags & PSL_I) == 0) {
1349		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1350		    "rflags %#lx", vector, state->rflags);
1351		need_intr_window = 1;
1352		goto done;
1353	}
1354
1355	if (ctrl->intr_shadow) {
1356		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
1357		    "interrupt shadow", vector);
1358		need_intr_window = 1;
1359		goto done;
1360	}
1361
1362	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1363		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1364		    "eventinj %#lx", vector, ctrl->eventinj);
1365		need_intr_window = 1;
1366		goto done;
1367	}
1368
1369	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
1370
1371        if (!extint_pending) {
1372                /* Update the Local APIC ISR */
1373                vlapic_intr_accepted(vlapic, vector);
1374        } else {
1375                vm_extint_clear(sc->vm, vcpu);
1376                vatpic_intr_accepted(sc->vm, vector);
1377		/*
1378		 * Force a VM-exit as soon as the vcpu is ready to accept
1379		 * another interrupt. This is done because the PIC might
1380		 * have another vector that it wants to inject. Also, if
1381		 * the vlapic has a pending interrupt that was preempted
1382		 * by the ExtInt then it allows us to inject the APIC
1383		 * vector as soon as possible.
1384		 */
1385		need_intr_window = 1;
1386        }
1387done:
1388	if (need_intr_window) {
1389		/*
1390		 * We use V_IRQ in conjunction with the VINTR intercept to
1391		 * trap into the hypervisor as soon as a virtual interrupt
1392		 * can be delivered.
1393		 *
1394		 * Since injected events are not subject to intercept checks
1395		 * we need to ensure that the V_IRQ is not actually going to
1396		 * be delivered on VM entry. The KASSERT below enforces this.
1397		 */
1398		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
1399		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
1400		    ("Bogus intr_window_exiting: eventinj (%#lx), "
1401		    "intr_shadow (%u), rflags (%#lx)",
1402		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
1403		enable_intr_window_exiting(sc, vcpu);
1404	} else {
1405		disable_intr_window_exiting(sc, vcpu);
1406	}
1407}
1408
1409static __inline void
1410restore_host_tss(void)
1411{
1412	struct system_segment_descriptor *tss_sd;
1413
1414	/*
1415	 * The TSS descriptor was in use prior to launching the guest so it
1416	 * has been marked busy.
1417	 *
1418	 * 'ltr' requires the descriptor to be marked available so change the
1419	 * type to "64-bit available TSS".
1420	 */
1421	tss_sd = PCPU_GET(tss);
1422	tss_sd->sd_type = SDT_SYSTSS;
1423	ltr(GSEL(GPROC0_SEL, SEL_KPL));
1424}
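
/*
 * Editorial note (architectural detail): an available 64-bit TSS
 * descriptor has type 0x9 and 'ltr' changes it to the busy type 0xB;
 * loading a busy TSS descriptor faults, which is why the type is reset
 * to SDT_SYSTSS before the 'ltr' above.
 */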
1425
1426static void
1427check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
1428{
1429	struct svm_vcpu *vcpustate;
1430	struct vmcb_ctrl *ctrl;
1431	long eptgen;
1432	bool alloc_asid;
1433
1434	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
1435	    "active on cpu %u", __func__, thiscpu));
1436
1437	vcpustate = svm_get_vcpu(sc, vcpuid);
1438	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1439
1440	/*
1441	 * The TLB entries associated with the vcpu's ASID are not valid
1442	 * if either of the following conditions is true:
1443	 *
1444	 * 1. The vcpu's ASID generation is different than the host cpu's
1445	 *    ASID generation. This happens when the vcpu migrates to a new
1446	 *    host cpu. It can also happen when the number of vcpus executing
1447	 *    on a host cpu is greater than the number of ASIDs available.
1448	 *
1449	 * 2. The pmap generation number is different than the value cached in
1450	 *    the 'vcpustate'. This happens when the host invalidates pages
1451	 *    belonging to the guest.
1452	 *
1453	 *	asidgen		eptgen	      Action
1454	 *	mismatch	mismatch
1455	 *	   0		   0		(a)
1456	 *	   0		   1		(b1) or (b2)
1457	 *	   1		   0		(c)
1458	 *	   1		   1		(d)
1459	 *
1460	 * (a) There is no mismatch in eptgen or ASID generation and therefore
1461	 *     no further action is needed.
1462	 *
1463	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
1464	 *      retained and the TLB entries associated with this ASID
1465	 *      are flushed by VMRUN.
1466	 *
1467	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
1468	 *      allocated.
1469	 *
1470	 * (c) A new ASID is allocated.
1471	 *
1472	 * (d) A new ASID is allocated.
1473	 */
1474
1475	alloc_asid = false;
1476	eptgen = pmap->pm_eptgen;
1477	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1478
1479	if (vcpustate->asid.gen != asid[thiscpu].gen) {
1480		alloc_asid = true;	/* (c) and (d) */
1481	} else if (vcpustate->eptgen != eptgen) {
1482		if (flush_by_asid())
1483			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
1484		else
1485			alloc_asid = true;			/* (b2) */
1486	} else {
1487		/*
1488		 * This is the common case (a).
1489		 */
1490		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1491		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1492		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
1493	}
1494
1495	if (alloc_asid) {
1496		if (++asid[thiscpu].num >= nasid) {
1497			asid[thiscpu].num = 1;
1498			if (++asid[thiscpu].gen == 0)
1499				asid[thiscpu].gen = 1;
1500			/*
1501			 * If this cpu does not support "flush-by-asid"
1502			 * then flush the entire TLB on a generation
1503			 * bump. Subsequent ASID allocation in this
1504			 * generation can be done without a TLB flush.
1505			 */
1506			if (!flush_by_asid())
1507				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1508		}
1509		vcpustate->asid.gen = asid[thiscpu].gen;
1510		vcpustate->asid.num = asid[thiscpu].num;
1511
1512		ctrl->asid = vcpustate->asid.num;
1513		vcpu_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1514		/*
1515		 * If this cpu supports "flush-by-asid" then the TLB
1516		 * was not flushed after the generation bump. The TLB
1517		 * is flushed selectively after every new ASID allocation.
1518		 */
1519		if (flush_by_asid())
1520			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1521	}
1522	vcpustate->eptgen = eptgen;
1523
1524	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1525	KASSERT(ctrl->asid == vcpustate->asid.num,
1526	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1527}
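
/*
 * Editorial walk-through of the allocator above: with nasid == 4 a host
 * cpu hands out ASIDs 1, 2 and 3; the next allocation wraps 'num' back
 * to 1 and bumps 'gen', which invalidates every vcpu ASID cached from
 * the previous generation and, on cpus without FlushByAsid, forces a
 * full TLB flush.
 */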
1528
1529/*
1530 * Start vcpu with specified RIP.
1531 */
1532static int
1533svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
1534	void *rend_cookie, void *suspended_cookie)
1535{
1536	struct svm_regctx *hctx, *gctx;
1537	struct svm_softc *svm_sc;
1538	struct svm_vcpu *vcpustate;
1539	struct vmcb_state *state;
1540	struct vmcb_ctrl *ctrl;
1541	struct vm_exit *vmexit;
1542	struct vlapic *vlapic;
1543	struct vm *vm;
1544	uint64_t vmcb_pa;
1545	u_int thiscpu;
1546	bool loop;	/* Continue vcpu execution loop. */
1547
1548	loop = true;
1549	svm_sc = arg;
1550	vm = svm_sc->vm;
1551
1552	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1553	state = svm_get_vmcb_state(svm_sc, vcpu);
1554	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1555	vmexit = vm_exitinfo(vm, vcpu);
1556	vlapic = vm_lapic(vm, vcpu);
1557
1558	/*
1559	 * Stash 'curcpu' on the stack as 'thiscpu'.
1560	 *
1561	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
1562	 * after the #VMEXIT. Since VMRUN is executed inside a critical section
1563	 * 'curcpu' and 'thiscpu' are guaranteed to be identical.
1564	 */
1565	thiscpu = curcpu;
1566
1567	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1568	hctx = &host_ctx[thiscpu];
1569	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1570
1571	if (vcpustate->lastcpu != thiscpu) {
1572		/*
1573		 * Force new ASID allocation by invalidating the generation.
1574		 */
1575		vcpustate->asid.gen = 0;
1576
1577		/*
1578		 * Invalidate the VMCB state cache by marking all fields dirty.
1579		 */
1580		vcpu_set_dirty(svm_sc, vcpu, 0xffffffff);
1581
1582		/*
1583		 * XXX
1584		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1585		 * we may return from this function without actually executing
1586		 * the VMRUN instruction. This could happen if a rendezvous
1587		 * or an AST is pending on the first time through the loop.
1588		 *
1589		 * This works for now but any new side-effects of vcpu
1590		 * migration should take this case into account.
1591		 */
1592		vcpustate->lastcpu = thiscpu;
1593		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1594	}
1595
1596	/* Update Guest RIP */
1597	state->rip = rip;
1598
1599	do {
1600		vmexit->inst_length = 0;
1601
1602		/*
1603		 * Disable global interrupts to guarantee atomicity during
1604		 * loading of guest state. This includes not only the state
1605		 * loaded by the "vmrun" instruction but also software state
1606		 * maintained by the hypervisor: suspended and rendezvous
1607		 * state, NPT generation number, vlapic interrupts etc.
1608		 */
1609		disable_gintr();
1610
1611		if (vcpu_suspended(suspended_cookie)) {
1612			enable_gintr();
1613			vm_exit_suspended(vm, vcpu, state->rip);
1614			break;
1615		}
1616
1617		if (vcpu_rendezvous_pending(rend_cookie)) {
1618			enable_gintr();
1619			vm_exit_rendezvous(vm, vcpu, state->rip);
1620			break;
1621		}
1622
1623		/* The scheduler has asked us to give up the cpu. */
1624		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1625			enable_gintr();
1626			vm_exit_astpending(vm, vcpu, state->rip);
1627			break;
1628		}
1629
1630		svm_inj_interrupts(svm_sc, vcpu, vlapic);
1631
1632		/* Activate the nested pmap on 'thiscpu' */
1633		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);
1634
1635		/*
1636		 * Check the pmap generation and the ASID generation to
1637		 * ensure that the vcpu does not use stale TLB mappings.
1638		 */
1639		check_asid(svm_sc, vcpu, pmap, thiscpu);
1640
1641		ctrl->vmcb_clean = VMCB_CACHE_DEFAULT & ~vcpustate->dirty;
1642		vcpustate->dirty = 0;
1643		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
1644
1645		/* Launch Virtual Machine. */
1646		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
1647		svm_launch(vmcb_pa, gctx, hctx);
1648
1649		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);
1650
1651		/*
1652		 * Restore MSR_GSBASE to point to the pcpu data area.
1653		 *
1654		 * Note that accesses done via PCPU_GET/PCPU_SET will work
1655		 * only after MSR_GSBASE is restored.
1656		 *
1657		 * Also note that we don't bother restoring MSR_KGSBASE
1658		 * since it is not used in the kernel and will be restored
1659		 * when the VMRUN ioctl returns to userspace.
1660		 */
1661		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
1662		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
1663		    thiscpu, curcpu));
1664
1665		/*
1666		 * The host GDTR and IDTR are saved by VMRUN and restored
1667		 * automatically on #VMEXIT. However, the host TSS needs
1668		 * to be restored explicitly.
1669		 */
1670		restore_host_tss();
1671
1672		/* #VMEXIT disables interrupts so re-enable them here. */
1673		enable_gintr();
1674
1675		/* Handle #VMEXIT and if required return to user space. */
1676		loop = svm_vmexit(svm_sc, vcpu, vmexit);
1677	} while (loop);
1678
1679	return (0);
1680}
1681
1682/*
1683 * Cleanup for virtual machine.
1684 */
1685static void
1686svm_vmcleanup(void *arg)
1687{
1688	struct svm_softc *svm_sc;
1689
1690	svm_sc = arg;
1691
1692	VCPU_CTR0(svm_sc->vm, 0, "SVM:cleanup\n");
1693
1694	free(svm_sc, M_SVM);
1695}
1696
1697/*
1698 * Return pointer to hypervisor saved register state.
1699 */
1700static register_t *
1701swctx_regptr(struct svm_regctx *regctx, int reg)
1702{
1703
1704	switch (reg) {
1705		case VM_REG_GUEST_RBX:
1706			return (&regctx->sctx_rbx);
1707		case VM_REG_GUEST_RCX:
1708			return (&regctx->sctx_rcx);
1709		case VM_REG_GUEST_RDX:
1710			return (&regctx->e.g.sctx_rdx);
1711		case VM_REG_GUEST_RDI:
1712			return (&regctx->e.g.sctx_rdi);
1713		case VM_REG_GUEST_RSI:
1714			return (&regctx->e.g.sctx_rsi);
1715		case VM_REG_GUEST_RBP:
1716			return (&regctx->sctx_rbp);
1717		case VM_REG_GUEST_R8:
1718			return (&regctx->sctx_r8);
1719		case VM_REG_GUEST_R9:
1720			return (&regctx->sctx_r9);
1721		case VM_REG_GUEST_R10:
1722			return (&regctx->sctx_r10);
1723		case VM_REG_GUEST_R11:
1724			return (&regctx->sctx_r11);
1725		case VM_REG_GUEST_R12:
1726			return (&regctx->sctx_r12);
1727		case VM_REG_GUEST_R13:
1728			return (&regctx->sctx_r13);
1729		case VM_REG_GUEST_R14:
1730			return (&regctx->sctx_r14);
1731		case VM_REG_GUEST_R15:
1732			return (&regctx->sctx_r15);
1733		default:
1734			ERR("Unknown register requested, reg=%d.\n", reg);
1735			break;
1736	}
1737
1738	return (NULL);
1739}
1740
1741/*
1742 * Interface to read guest registers.
1743 * This can be SVM h/w saved or hypervisor saved register.
1744 */
1745static int
1746svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
1747{
1748	struct svm_softc *svm_sc;
1749	struct vmcb *vmcb;
1750	register_t *reg;
1751
1752	svm_sc = arg;
1753	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1754
1755	vmcb = svm_get_vmcb(svm_sc, vcpu);
1756
1757	if (vmcb_read(vmcb, ident, val) == 0) {
1758		return (0);
1759	}
1760
1761	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1762
1763	if (reg != NULL) {
1764		*val = *reg;
1765		return (0);
1766	}
1767
1768 	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
1769	return (EINVAL);
1770}
1771
1772/*
1773 * Interface to write to guest registers.
1774 * This can be SVM h/w saved or hypervisor saved register.
1775 */
1776static int
1777svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
1778{
1779	struct svm_softc *svm_sc;
1780	struct vmcb *vmcb;
1781	register_t *reg;
1782
1783	svm_sc = arg;
1784	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1785
1786	vmcb = svm_get_vmcb(svm_sc, vcpu);
1787	if (vmcb_write(vmcb, ident, val) == 0) {
1788		return (0);
1789	}
1790
1791	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1792
1793	if (reg != NULL) {
1794		*reg = val;
1795		return (0);
1796	}
1797
1798	/*
1799	 * XXX deal with CR3 and invalidate TLB entries tagged with the
1800	 * vcpu's ASID. This needs to be treated differently depending on
1801	 * whether 'running' is true/false.
1802	 */
1803
1804 	ERR("SVM_ERR:reg type %x is not saved in VMCB.\n", ident);
1805	return (EINVAL);
1806}
1807
1808
1809/*
1810 * Interface to set various descriptors.
1811 */
1812static int
1813svm_setdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
1814{
1815	struct svm_softc *svm_sc;
1816	struct vmcb *vmcb;
1817	struct vmcb_segment *seg;
1818	uint16_t attrib;
1819
1820	svm_sc = arg;
1821	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1822
1823	vmcb = svm_get_vmcb(svm_sc, vcpu);
1824
1825	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:set_desc: Type%d\n", type);
1826
1827	seg = vmcb_seg(vmcb, type);
1828	if (seg == NULL) {
1829		ERR("SVM_ERR:Unsupported segment type%d\n", type);
1830		return (EINVAL);
1831	}
1832
1833	/* Map seg_desc access to VMCB attribute format.*/
1834	attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
1835	VCPU_CTR3(svm_sc->vm, vcpu, "SVM:[sel %d attribute 0x%x limit:0x%x]\n",
1836		type, desc->access, desc->limit);
1837	seg->attrib = attrib;
1838	seg->base = desc->base;
1839	seg->limit = desc->limit;
1840
1841	return (0);
1842}
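
/*
 * Editorial example of the attribute mapping above: a flat 64-bit code
 * segment with a seg_desc 'access' value of 0xa09b (type byte 0x9b,
 * flags G|L) becomes the 12-bit VMCB attribute
 *
 *	((0xa09b & 0xf000) >> 4) | (0xa09b & 0xff) = 0xa9b
 *
 * and svm_getdesc() below applies the inverse transformation.
 */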
1843
1844/*
1845 * Interface to get guest descriptor.
1846 */
1847static int
1848svm_getdesc(void *arg, int vcpu, int type, struct seg_desc *desc)
1849{
1850	struct svm_softc *svm_sc;
1851	struct vmcb_segment	*seg;
1852
1853	svm_sc = arg;
1854	KASSERT(vcpu < svm_sc->vcpu_cnt, ("Guest doesn't have VCPU%d", vcpu));
1855
1856	VCPU_CTR1(svm_sc->vm, vcpu, "SVM:get_desc: Type%d\n", type);
1857
1858	seg = vmcb_seg(svm_get_vmcb(svm_sc, vcpu), type);
1859	if (!seg) {
1860		ERR("SVM_ERR:Unsupported segment type%d\n", type);
1861		return (EINVAL);
1862	}
1863
1864	/* Map the VMCB attribute format back to seg_desc access. */
1865	desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF);
1866	desc->base = seg->base;
1867	desc->limit = seg->limit;
1868
1869	/*
1870	 * VT-x uses bit 16 (Unusable) to indicate a segment that has been
1871	 * loaded with a NULL segment selector. The 'desc->access' field is
1872	 * interpreted in the VT-x format by the processor-independent code.
1873	 *
1874	 * SVM uses the 'P' bit to convey the same information so convert it
1875	 * into the VT-x format. For more details refer to section
1876	 * "Segment State in the VMCB" in APMv2.
1877	 */
1878	if (type == VM_REG_GUEST_CS || type == VM_REG_GUEST_TR)
1879		desc->access |= 0x80;		/* CS and TR always present */
1880
1881	if (!(desc->access & 0x80))
1882		desc->access |= 0x10000;	/* Unusable segment */
1883
1884	return (0);
1885}
1886
1887static int
1888svm_setcap(void *arg, int vcpu, int type, int val)
1889{
1890	struct svm_softc *sc;
1891	int error;
1892
1893	sc = arg;
1894	error = 0;
1895	switch (type) {
1896	case VM_CAP_HALT_EXIT:
1897		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1898		    VMCB_INTCPT_HLT, val);
1899		break;
1900	case VM_CAP_PAUSE_EXIT:
1901		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1902		    VMCB_INTCPT_PAUSE, val);
1903		break;
1904	case VM_CAP_UNRESTRICTED_GUEST:
1905		/* Unrestricted guest execution cannot be disabled in SVM */
1906		if (val == 0)
1907			error = EINVAL;
1908		break;
1909	default:
1910		error = ENOENT;
1911		break;
1912	}
1913	return (error);
1914}
1915
1916static int
1917svm_getcap(void *arg, int vcpu, int type, int *retval)
1918{
1919	struct svm_softc *sc;
1920	int error;
1921
1922	sc = arg;
1923	error = 0;
1924
1925	switch (type) {
1926	case VM_CAP_HALT_EXIT:
1927		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1928		    VMCB_INTCPT_HLT);
1929		break;
1930	case VM_CAP_PAUSE_EXIT:
1931		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1932		    VMCB_INTCPT_PAUSE);
1933		break;
1934	case VM_CAP_UNRESTRICTED_GUEST:
1935		*retval = 1;	/* unrestricted guest is always enabled */
1936		break;
1937	default:
1938		error = ENOENT;
1939		break;
1940	}
1941	return (error);
1942}
1943
1944static struct vlapic *
1945svm_vlapic_init(void *arg, int vcpuid)
1946{
1947	struct svm_softc *svm_sc;
1948	struct vlapic *vlapic;
1949
1950	svm_sc = arg;
1951	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
1952	vlapic->vm = svm_sc->vm;
1953	vlapic->vcpuid = vcpuid;
1954	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
1955
1956	vlapic_init(vlapic);
1957
1958	return (vlapic);
1959}
1960
1961static void
1962svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
1963{
1964
1965        vlapic_cleanup(vlapic);
1966        free(vlapic, M_SVM_VLAPIC);
1967}
1968
1969struct vmm_ops vmm_ops_amd = {
1970	svm_init,
1971	svm_cleanup,
1972	svm_restore,
1973	svm_vminit,
1974	svm_vmrun,
1975	svm_vmcleanup,
1976	svm_getreg,
1977	svm_setreg,
1978	svm_getdesc,
1979	svm_setdesc,
1980	svm_getcap,
1981	svm_setcap,
1982	svm_npt_alloc,
1983	svm_npt_free,
1984	svm_vlapic_init,
1985	svm_vlapic_cleanup
1986};
1987