/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 267427 2014-06-12 19:58:12Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 267427 2014-06-12 19:58:12Z jhb $");

#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/cpuset.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>

#include "vmm_host.h"
#include "x86.h"

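/*
 * CPUID leaves in the 0x40000000 range are by convention reserved for
 * hypervisor use; CPUID_VM_HIGH is the highest such leaf bhyve reports.
 */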
#define	CPUID_VM_HIGH		0x40000000

static const char bhyve_id[12] = "bhyve bhyve ";

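/*
 * Count of CPUID requests that fell through to the default pass-through
 * case below.
 */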
static uint64_t bhyve_xcpuids;

int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid;
	unsigned int func, regs[4];
	enum x2apic_state x2apic_state;

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.  For example, on a host whose
	 * cpu_exthigh is 0x80000008, a guest request for leaf
	 * 0x800000ff is clamped down to 0x80000008.
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest.
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			break;

		case CPUID_8000_0001:
			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			cpuid_count(*eax, *ecx, regs);
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			cpuid_count(*eax, *ecx, regs);
			/*
			 * If the host TSCs are not synchronized across
			 * physical cpus then we cannot advertise an
			 * invariant tsc to a vcpu.
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (!smp_tsc)
				regs[3] &= ~AMDPM_TSC_INVARIANT;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx.
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep (EST) or Thermal
			 * Monitor 2 (TM2) capability.  Advertise the
			 * x2APIC capability and hypervisor guest status.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet.
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring.
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Machine check handling is done in the host.
			 * Hide MTRR capability.
			 */
			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Disable multi-core.
			 */
			regs[1] &= ~CPUID_HTT_CORES;
			regs[3] &= ~CPUID_HTT;
			break;

		case CPUID_0000_0004:
			do_cpuid(4, regs);

			/*
			 * Do not expose topology.
			 */
			regs[0] &= 0xffff8000;
			regs[0] |= 0x04008000;
			break;

		case CPUID_0000_0007:
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* sub-leaf 0 */
			if (*ecx == 0) {
				cpuid_count(*eax, *ecx, regs);

				/* Only sub-leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD);
				regs[2] = 0;
				regs[3] = 0;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all fields.
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Processor topology enumeration.
			 */
			regs[0] = 0;
			regs[1] = 0;
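			/*
			 * ecx[7:0] must echo the requested level number;
			 * edx returns the x2APIC ID, for which the vcpu
			 * id is used.
			 */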
			regs[2] = *ecx & 0xff;
			regs[3] = vcpu_id;
			break;

		case CPUID_0000_000D:
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case 0x40000000:
			regs[0] = CPUID_VM_HIGH;
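			/*
			 * Return the 12-byte vendor identification
			 * string "bhyve bhyve " split across ebx, ecx
			 * and edx, 4 bytes per register, following the
			 * usual hypervisor CPUID convention.
			 */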
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	return (1);
}
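
/*
 * Usage sketch (illustrative only, not code from this file): a CPUID
 * exit handler calls this function with pointers to the saved guest
 * registers, which arrive holding the guest's inputs (leaf in *eax,
 * sub-leaf in *ecx) and are overwritten with the emulated results:
 *
 *	uint32_t eax, ebx, ecx, edx;
 *
 *	eax = <guest %rax>; ebx = <guest %rbx>;
 *	ecx = <guest %rcx>; edx = <guest %rdx>;
 *	(void) x86_emulate_cpuid(vm, vcpuid, &eax, &ebx, &ecx, &edx);
 *	<copy eax/ebx/ecx/edx back into the guest register state>
 *
 * The <guest %r..> placeholders stand for the caller's vcpu register
 * state.  The return value of 1 indicates the exit was handled.
 */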