/* x86.c revision 280839 */
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 280839 2015-03-30 07:11:49Z mav $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 280839 2015-03-30 07:11:49Z mav $");
31
32#include <sys/param.h>
33#include <sys/pcpu.h>
34#include <sys/systm.h>
35#include <sys/cpuset.h>
36#include <sys/sysctl.h>
37
38#include <machine/clock.h>
39#include <machine/cpufunc.h>
40#include <machine/md_var.h>
41#include <machine/segments.h>
42#include <machine/specialreg.h>
43
44#include <machine/vmm.h>
45
46#include "vmm_host.h"
47#include "vmm_ktr.h"
48#include "vmm_util.h"
49#include "x86.h"
50
/* Parent sysctl node hw.vmm (defined elsewhere in vmm). */
SYSCTL_DECL(_hw_vmm);
/* hw.vmm.topology: tunables controlling the guest-visible CPU topology. */
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

/* Highest hypervisor CPUID leaf advertised to the guest. */
#define	CPUID_VM_HIGH		0x40000000

/* 12-byte hypervisor signature returned in ebx:ecx:edx of leaf 0x40000000. */
static const char bhyve_id[12] = "bhyve bhyve ";

/* Count of CPUID leaves that fell through to the default pass-through case. */
static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");
61
62/*
63 * The default CPU topology is a single thread per package.
64 */
65static u_int threads_per_core = 1;
66SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
67    &threads_per_core, 0, NULL);
68
69static u_int cores_per_package = 1;
70SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
71    &cores_per_package, 0, NULL);
72
73static int cpuid_leaf_b = 1;
74SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
75    &cpuid_leaf_b, 0, NULL);
76
77/*
78 * Round up to the next power of two, if necessary, and then take log2.
79 * Returns -1 if argument is zero.
80 */
81static __inline int
82log2(u_int x)
83{
84
85	return (fls(x << (1 - powerof2(x))) - 1);
86}
87
/*
 * Emulate the CPUID instruction for a guest vcpu.
 *
 * On entry *eax and *ecx hold the leaf and sub-leaf requested by the
 * guest; on return all four register pointers hold the values to reflect
 * back into the guest's registers.  Host CPUID results are filtered so
 * that features the hypervisor does not virtualize are hidden, and the
 * advertised CPU topology is synthesized from the hw.vmm.topology
 * tunables (threads_per_core / cores_per_package).
 *
 * Always returns 1 (the leaf was handled, possibly by the pass-through
 * default case).
 */
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, level, width, x2apic_id;
	unsigned int func, regs[4], logical_cpus;
	enum x2apic_state x2apic_state;

	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.  The three ranges are: extended
	 * leaves (0x80000000+), hypervisor leaves (0x40000000+) and
	 * standard leaves.
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
			cpuid_count(*eax, *ecx, regs);
			break;
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			if (vmm_is_amd()) {
				/*
				 * XXX this might appear silly because AMD
				 * cpus don't have threads.
				 *
				 * However this matches the logical cpus as
				 * advertised by leaf 0x1 and will work even
				 * if the 'threads_per_core' tunable is set
				 * incorrectly on an AMD host.
				 */
				logical_cpus = threads_per_core *
				    cores_per_package;
				regs[2] = logical_cpus - 1;
			}
			break;

		case CPUID_8000_0001:
			cpuid_count(*eax, *ecx, regs);

			/*
			 * Hide SVM and Topology Extension features from guest.
			 */
			regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);

			/*
			 * Don't advertise extended performance counter MSRs
			 * to the guest.
			 */
			regs[2] &= ~AMDID2_PCXC;
			regs[2] &= ~AMDID2_PNXC;
			regs[2] &= ~AMDID2_PTSCEL2I;

			/*
			 * Don't advertise Instruction Based Sampling feature.
			 */
			regs[2] &= ~AMDID2_IBS;

			/* NodeID MSR not available */
			regs[2] &= ~AMDID2_NODE_ID;

			/* Don't advertise the OS visible workaround feature */
			regs[2] &= ~AMDID2_OSVW;

			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			/*
			 * AMD uses this leaf to advertise the processor's
			 * power monitoring and RAS capabilities. These
			 * features are hardware-specific and exposing
			 * them to a guest doesn't make a lot of sense.
			 *
			 * Intel uses this leaf only to advertise the
			 * "Invariant TSC" feature with all other bits
			 * being reserved (set to zero).
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/*
			 * "Invariant TSC" can be advertised to the guest if:
			 * - host TSC frequency is invariant
			 * - host TSCs are synchronized across physical cpus
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (tsc_is_invariant && smp_tsc)
				regs[3] |= AMDPM_TSC_INVARIANT;
			break;

		case CPUID_0000_0001:
			/* Basic feature information: start from the host's. */
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep (EST) or TM2 capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Machine check handling is done in the host.
			 * Hide MTRR capability.
			 */
			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Advertise the logical processor count per package
			 * (ebx bits 23:16) from the topology tunables and set
			 * HTT so the guest honors that count.
			 */
			logical_cpus = threads_per_core * cores_per_package;
			regs[1] &= ~CPUID_HTT_CORES;
			regs[1] |= (logical_cpus & 0xff) << 16;
			regs[3] |= CPUID_HTT;
			break;

		case CPUID_0000_0004:
			/* Deterministic cache parameters (Intel). */
			cpuid_count(*eax, *ecx, regs);

			/* Rewrite sharing info only for a valid cache level. */
			if (regs[0] || regs[1] || regs[2] || regs[3]) {
				regs[0] &= 0x3ff;
				regs[0] |= (cores_per_package - 1) << 26;
				/*
				 * Cache topology:
				 * - L1 and L2 are shared only by the logical
				 *   processors in a single core.
				 * - L3 and above are shared by all logical
				 *   processors in the package.
				 */
				logical_cpus = threads_per_core;
				level = (regs[0] >> 5) & 0x7;
				if (level >= 3)
					logical_cpus *= cores_per_package;
				regs[0] |= (logical_cpus - 1) << 14;
			}
			break;

		case CPUID_0000_0007:
			/* Structured extended features; default to empty. */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* leaf 0 */
			if (*ecx == 0) {
				cpuid_count(*eax, *ecx, regs);

				/* Only leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD);
				regs[2] = 0;
				regs[3] = 0;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
			/*
			 * Thermal/power management leaf: advertise only the
			 * ARAT (always-running APIC timer) bit.
			 */
			regs[0] = CPUTPM1_ARAT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Processor topology enumeration.  Sub-leaf 0 is the
			 * SMT level, sub-leaf 1 the core level; everything
			 * else (or the leaf as a whole, when the cpuid_leaf_b
			 * tunable is cleared) reports zeroes.
			 */
			if (*ecx == 0) {
				logical_cpus = threads_per_core;
				width = log2(logical_cpus);
				level = CPUID_TYPE_SMT;
				x2apic_id = vcpu_id;
			}

			if (*ecx == 1) {
				logical_cpus = threads_per_core *
				    cores_per_package;
				width = log2(logical_cpus);
				level = CPUID_TYPE_CORE;
				x2apic_id = vcpu_id;
			}

			if (!cpuid_leaf_b || *ecx >= 2) {
				width = 0;
				logical_cpus = 0;
				level = 0;
				x2apic_id = 0;
			}

			regs[0] = width & 0x1f;
			regs[1] = logical_cpus & 0xffff;
			regs[2] = (level << 8) | (*ecx & 0xff);
			regs[3] = x2apic_id;
			break;

		case CPUID_0000_000D:
			/* XSAVE state enumeration. */
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case 0x40000000:
			/*
			 * Hypervisor vendor leaf: highest hypervisor leaf in
			 * eax and the bhyve signature in ebx:ecx:edx.
			 */
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	/* Reflect the filtered results back to the guest. */
	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	return (1);
}
489