Deleted Added
full compact
x86.c (276349) x86.c (276403)
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 276349 2014-12-28 21:27:13Z neel $
26 * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 276403 2014-12-30 08:24:14Z neel $
27 */
28
29#include <sys/cdefs.h>
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 276349 2014-12-28 21:27:13Z neel $");
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 276403 2014-12-30 08:24:14Z neel $");
31
32#include <sys/param.h>
33#include <sys/pcpu.h>
34#include <sys/systm.h>
35#include <sys/cpuset.h>
36#include <sys/sysctl.h>
37
38#include <machine/clock.h>
39#include <machine/cpufunc.h>
40#include <machine/md_var.h>
41#include <machine/segments.h>
42#include <machine/specialreg.h>
43
44#include <machine/vmm.h>
45
46#include "vmm_host.h"
31
32#include <sys/param.h>
33#include <sys/pcpu.h>
34#include <sys/systm.h>
35#include <sys/cpuset.h>
36#include <sys/sysctl.h>
37
38#include <machine/clock.h>
39#include <machine/cpufunc.h>
40#include <machine/md_var.h>
41#include <machine/segments.h>
42#include <machine/specialreg.h>
43
44#include <machine/vmm.h>
45
46#include "vmm_host.h"
47#include "vmm_ktr.h"
48#include "vmm_util.h"
47#include "x86.h"
48
49SYSCTL_DECL(_hw_vmm);
/* "topology" node under _hw_vmm; the _hw_vmm_topology knobs in this file hang off it. */
50static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
51
/*
 * Highest hypervisor CPUID leaf reported to the guest.  x86_emulate_cpuid()
 * clamps larger hypervisor-range leaf numbers down to this value and returns
 * it in regs[0] when leaf 0x40000000 is queried.
 */
52#define CPUID_VM_HIGH 0x40000000
53
/*
 * Hypervisor signature returned for leaf 0x40000000: the 12 bytes are copied
 * four at a time into regs[1], regs[2] and regs[3].  The array is sized to
 * exactly 12 bytes, so no terminating NUL is stored.
 */
54static const char bhyve_id[12] = "bhyve bhyve ";
55
/*
 * Count of CPUID requests that reached the default (pass-through) case of
 * x86_emulate_cpuid(); bumped there with atomic_add_long().
 */
56static uint64_t bhyve_xcpuids;
49#include "x86.h"
50
51SYSCTL_DECL(_hw_vmm);
/* "topology" node under _hw_vmm; the _hw_vmm_topology knobs in this file hang off it. */
52static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
53
/*
 * Highest hypervisor CPUID leaf reported to the guest.  x86_emulate_cpuid()
 * clamps larger hypervisor-range leaf numbers down to this value and returns
 * it in regs[0] when leaf 0x40000000 is queried.
 */
54#define CPUID_VM_HIGH 0x40000000
55
/*
 * Hypervisor signature returned for leaf 0x40000000: the 12 bytes are copied
 * four at a time into regs[1], regs[2] and regs[3].  The array is sized to
 * exactly 12 bytes, so no terminating NUL is stored.
 */
56static const char bhyve_id[12] = "bhyve bhyve ";
57
/*
 * Count of CPUID requests that reached the default (pass-through) case of
 * x86_emulate_cpuid(); bumped there with atomic_add_long().  Exported under
 * _hw_vmm with CTLFLAG_RW.
 */
58static uint64_t bhyve_xcpuids;
59SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
60 "Number of times an unknown cpuid leaf was accessed");
57
58/*
59 * The default CPU topology is a single thread per package.
60 */
/*
 * Threads per core reported to the guest.  x86_emulate_cpuid() uses it for
 * the logical-processor counts in CPUID leaves 0x1, 0x4 and 0xB.
 */
61static u_int threads_per_core = 1;
62SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
63 &threads_per_core, 0, NULL);
64
/*
 * Cores per package reported to the guest.  Multiplied with threads_per_core
 * for the package-wide logical CPU count (leaves 0x1 and 0xB sub-leaf 1) and
 * encoded as (cores_per_package - 1) in bits 31:26 of leaf 0x4 %eax.
 */
65static u_int cores_per_package = 1;
66SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
67 &cores_per_package, 0, NULL);
68
/*
 * When zero, CPUID leaf 0xB is answered with a zero width, logical CPU
 * count, level type and x2APIC id for every sub-leaf.
 */
69static int cpuid_leaf_b = 1;
70SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
71 &cpuid_leaf_b, 0, NULL);
72
73/*
74 * Round up to the next power of two, if necessary, and then take log2.
75 * Returns -1 if argument is zero.
76 */
77static __inline int
78log2(u_int x)
79{
80
81 return (fls(x << (1 - powerof2(x))) - 1);
82}
83
84int
85x86_emulate_cpuid(struct vm *vm, int vcpu_id,
86 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
87{
88 const struct xsave_limits *limits;
89 uint64_t cr4;
90 int error, enable_invpcid, level, width, x2apic_id;
91 unsigned int func, regs[4], logical_cpus;
92 enum x2apic_state x2apic_state;
93
61
62/*
63 * The default CPU topology is a single thread per package.
64 */
/*
 * Threads per core reported to the guest.  x86_emulate_cpuid() uses it for
 * the logical-processor counts in CPUID leaves 0x1, 0x4 and 0xB.
 */
65static u_int threads_per_core = 1;
66SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
67 &threads_per_core, 0, NULL);
68
/*
 * Cores per package reported to the guest.  Multiplied with threads_per_core
 * for the package-wide logical CPU count (leaves 0x1 and 0xB sub-leaf 1) and
 * encoded as (cores_per_package - 1) in bits 31:26 of leaf 0x4 %eax.
 */
69static u_int cores_per_package = 1;
70SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
71 &cores_per_package, 0, NULL);
72
/*
 * When zero, CPUID leaf 0xB is answered with a zero width, logical CPU
 * count, level type and x2APIC id for every sub-leaf.
 */
73static int cpuid_leaf_b = 1;
74SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
75 &cpuid_leaf_b, 0, NULL);
76
77/*
78 * Round up to the next power of two, if necessary, and then take log2.
79 * Returns -1 if argument is zero.
80 */
81static __inline int
82log2(u_int x)
83{
84
85 return (fls(x << (1 - powerof2(x))) - 1);
86}
87
88int
89x86_emulate_cpuid(struct vm *vm, int vcpu_id,
90 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
91{
92 const struct xsave_limits *limits;
93 uint64_t cr4;
94 int error, enable_invpcid, level, width, x2apic_id;
95 unsigned int func, regs[4], logical_cpus;
96 enum x2apic_state x2apic_state;
97
98 VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
99
94 /*
95 * Requests for invalid CPUID levels should map to the highest
96 * available level instead.
97 */
98 if (cpu_exthigh != 0 && *eax >= 0x80000000) {
99 if (*eax > cpu_exthigh)
100 *eax = cpu_exthigh;
101 } else if (*eax >= 0x40000000) {
102 if (*eax > CPUID_VM_HIGH)
103 *eax = CPUID_VM_HIGH;
104 } else if (*eax > cpu_high) {
105 *eax = cpu_high;
106 }
107
108 func = *eax;
109
110 /*
111 * In general the approach used for CPU topology is to
112 * advertise a flat topology where all CPUs are packages with
113 * no multi-core or SMT.
114 */
115 switch (func) {
116 /*
117 * Pass these through to the guest
118 */
119 case CPUID_0000_0000:
120 case CPUID_0000_0002:
121 case CPUID_0000_0003:
122 case CPUID_8000_0000:
123 case CPUID_8000_0002:
124 case CPUID_8000_0003:
125 case CPUID_8000_0004:
126 case CPUID_8000_0006:
100 /*
101 * Requests for invalid CPUID levels should map to the highest
102 * available level instead.
103 */
104 if (cpu_exthigh != 0 && *eax >= 0x80000000) {
105 if (*eax > cpu_exthigh)
106 *eax = cpu_exthigh;
107 } else if (*eax >= 0x40000000) {
108 if (*eax > CPUID_VM_HIGH)
109 *eax = CPUID_VM_HIGH;
110 } else if (*eax > cpu_high) {
111 *eax = cpu_high;
112 }
113
114 func = *eax;
115
116 /*
117 * In general the approach used for CPU topology is to
118 * advertise a flat topology where all CPUs are packages with
119 * no multi-core or SMT.
120 */
121 switch (func) {
122 /*
123 * Pass these through to the guest
124 */
125 case CPUID_0000_0000:
126 case CPUID_0000_0002:
127 case CPUID_0000_0003:
128 case CPUID_8000_0000:
129 case CPUID_8000_0002:
130 case CPUID_8000_0003:
131 case CPUID_8000_0004:
132 case CPUID_8000_0006:
133 cpuid_count(*eax, *ecx, regs);
134 break;
127 case CPUID_8000_0008:
128 cpuid_count(*eax, *ecx, regs);
135 case CPUID_8000_0008:
136 cpuid_count(*eax, *ecx, regs);
137 if (vmm_is_amd()) {
138 /*
139 * XXX this might appear silly because AMD
140 * cpus don't have threads.
141 *
142 * However this matches the logical cpus as
143 * advertised by leaf 0x1 and will work even
144 * if the 'threads_per_core' tunable is set
145 * incorrectly on an AMD host.
146 */
147 logical_cpus = threads_per_core *
148 cores_per_package;
149 regs[2] = logical_cpus - 1;
150 }
129 break;
130
131 case CPUID_8000_0001:
151 break;
152
153 case CPUID_8000_0001:
154 cpuid_count(*eax, *ecx, regs);
155
132 /*
156 /*
157 * Hide SVM and Topology Extension features from guest.
158 */
159 regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
160
161 /*
162 * Don't advertise extended performance counter MSRs
163 * to the guest.
164 */
165 regs[2] &= ~AMDID2_PCXC;
166 regs[2] &= ~AMDID2_PNXC;
167 regs[2] &= ~AMDID2_PTSCEL2I;
168
169 /*
170 * Don't advertise Instruction Based Sampling feature.
171 */
172 regs[2] &= ~AMDID2_IBS;
173
174 /* NodeID MSR not available */
175 regs[2] &= ~AMDID2_NODE_ID;
176
177 /* Don't advertise the OS visible workaround feature */
178 regs[2] &= ~AMDID2_OSVW;
179
180 /*
133 * Hide rdtscp/ia32_tsc_aux until we know how
134 * to deal with them.
135 */
181 * Hide rdtscp/ia32_tsc_aux until we know how
182 * to deal with them.
183 */
136 cpuid_count(*eax, *ecx, regs);
137 regs[3] &= ~AMDID_RDTSCP;
138 break;
139
140 case CPUID_8000_0007:
184 regs[3] &= ~AMDID_RDTSCP;
185 break;
186
187 case CPUID_8000_0007:
141 cpuid_count(*eax, *ecx, regs);
142 /*
188 /*
143 * If the host TSCs are not synchronized across
144 * physical cpus then we cannot advertise an
145 * invariant tsc to a vcpu.
189 * AMD uses this leaf to advertise the processor's
190 * power monitoring and RAS capabilities. These
191 * features are hardware-specific and exposing
192 * them to a guest doesn't make a lot of sense.
146 *
193 *
194 * Intel uses this leaf only to advertise the
195 * "Invariant TSC" feature with all other bits
196 * being reserved (set to zero).
197 */
198 regs[0] = 0;
199 regs[1] = 0;
200 regs[2] = 0;
201 regs[3] = 0;
202
203 /*
204 * "Invariant TSC" can be advertised to the guest if:
205 * - host TSC frequency is invariant
206 * - host TSCs are synchronized across physical cpus
207 *
147 * XXX This still falls short because the vcpu
148 * can observe the TSC moving backwards as it
149 * migrates across physical cpus. But at least
150 * it should discourage the guest from using the
151 * TSC to keep track of time.
152 */
208 * XXX This still falls short because the vcpu
209 * can observe the TSC moving backwards as it
210 * migrates across physical cpus. But at least
211 * it should discourage the guest from using the
212 * TSC to keep track of time.
213 */
153 if (!smp_tsc)
154 regs[3] &= ~AMDPM_TSC_INVARIANT;
214 if (tsc_is_invariant && smp_tsc)
215 regs[3] |= AMDPM_TSC_INVARIANT;
155 break;
156
157 case CPUID_0000_0001:
158 do_cpuid(1, regs);
159
160 error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
161 if (error) {
162 panic("x86_emulate_cpuid: error %d "
163 "fetching x2apic state", error);
164 }
165
166 /*
167 * Override the APIC ID only in ebx
168 */
169 regs[1] &= ~(CPUID_LOCAL_APIC_ID);
170 regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
171
172 /*
173 * Don't expose VMX, SpeedStep (EST) or Thermal Monitor 2 (TM2).
174 * Advertise x2APIC capability and Hypervisor guest.
175 */
176 regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
177
178 regs[2] |= CPUID2_HV;
179
180 if (x2apic_state != X2APIC_DISABLED)
181 regs[2] |= CPUID2_X2APIC;
182 else
183 regs[2] &= ~CPUID2_X2APIC;
184
185 /*
186 * Only advertise CPUID2_XSAVE in the guest if
187 * the host is using XSAVE.
188 */
189 if (!(regs[2] & CPUID2_OSXSAVE))
190 regs[2] &= ~CPUID2_XSAVE;
191
192 /*
193 * If CPUID2_XSAVE is being advertised and the
194 * guest has set CR4_XSAVE, set
195 * CPUID2_OSXSAVE.
196 */
197 regs[2] &= ~CPUID2_OSXSAVE;
198 if (regs[2] & CPUID2_XSAVE) {
199 error = vm_get_register(vm, vcpu_id,
200 VM_REG_GUEST_CR4, &cr4);
201 if (error)
202 panic("x86_emulate_cpuid: error %d "
203 "fetching %%cr4", error);
204 if (cr4 & CR4_XSAVE)
205 regs[2] |= CPUID2_OSXSAVE;
206 }
207
208 /*
209 * Hide monitor/mwait until we know how to deal with
210 * these instructions.
211 */
212 regs[2] &= ~CPUID2_MON;
213
214 /*
215 * Hide the performance and debug features.
216 */
217 regs[2] &= ~CPUID2_PDCM;
218
219 /*
220 * No TSC deadline support in the APIC yet
221 */
222 regs[2] &= ~CPUID2_TSCDLT;
223
224 /*
225 * Hide thermal monitoring
226 */
227 regs[3] &= ~(CPUID_ACPI | CPUID_TM);
228
229 /*
230 * Machine check handling is done in the host.
231 * Hide MTRR capability.
232 */
233 regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
234
235 /*
236 * Hide the debug store capability.
237 */
238 regs[3] &= ~CPUID_DS;
239
240 logical_cpus = threads_per_core * cores_per_package;
241 regs[1] &= ~CPUID_HTT_CORES;
242 regs[1] |= (logical_cpus & 0xff) << 16;
243 regs[3] |= CPUID_HTT;
244 break;
245
246 case CPUID_0000_0004:
247 cpuid_count(*eax, *ecx, regs);
248
249 if (regs[0] || regs[1] || regs[2] || regs[3]) {
250 regs[0] &= 0x3ff;
251 regs[0] |= (cores_per_package - 1) << 26;
252 /*
253 * Cache topology:
254 * - L1 and L2 are shared only by the logical
255 * processors in a single core.
256 * - L3 and above are shared by all logical
257 * processors in the package.
258 */
259 logical_cpus = threads_per_core;
260 level = (regs[0] >> 5) & 0x7;
261 if (level >= 3)
262 logical_cpus *= cores_per_package;
263 regs[0] |= (logical_cpus - 1) << 14;
264 }
265 break;
266
267 case CPUID_0000_0007:
268 regs[0] = 0;
269 regs[1] = 0;
270 regs[2] = 0;
271 regs[3] = 0;
272
273 /* leaf 0 */
274 if (*ecx == 0) {
275 cpuid_count(*eax, *ecx, regs);
276
277 /* Only leaf 0 is supported */
278 regs[0] = 0;
279
280 /*
281 * Expose known-safe features.
282 */
283 regs[1] &= (CPUID_STDEXT_FSGSBASE |
284 CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
285 CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
286 CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
287 CPUID_STDEXT_AVX512F |
288 CPUID_STDEXT_AVX512PF |
289 CPUID_STDEXT_AVX512ER |
290 CPUID_STDEXT_AVX512CD);
291 regs[2] = 0;
292 regs[3] = 0;
293
294 /* Advertise INVPCID if it is enabled. */
295 error = vm_get_capability(vm, vcpu_id,
296 VM_CAP_ENABLE_INVPCID, &enable_invpcid);
297 if (error == 0 && enable_invpcid)
298 regs[1] |= CPUID_STDEXT_INVPCID;
299 }
300 break;
301
302 case CPUID_0000_0006:
303 case CPUID_0000_000A:
304 /*
305 * Handle the access, but report 0 for
306 * all options
307 */
308 regs[0] = 0;
309 regs[1] = 0;
310 regs[2] = 0;
311 regs[3] = 0;
312 break;
313
314 case CPUID_0000_000B:
315 /*
316 * Processor topology enumeration
317 */
318 if (*ecx == 0) {
319 logical_cpus = threads_per_core;
320 width = log2(logical_cpus);
321 level = CPUID_TYPE_SMT;
322 x2apic_id = vcpu_id;
323 }
324
325 if (*ecx == 1) {
326 logical_cpus = threads_per_core *
327 cores_per_package;
328 width = log2(logical_cpus);
329 level = CPUID_TYPE_CORE;
330 x2apic_id = vcpu_id;
331 }
332
333 if (!cpuid_leaf_b || *ecx >= 2) {
334 width = 0;
335 logical_cpus = 0;
336 level = 0;
337 x2apic_id = 0;
338 }
339
340 regs[0] = width & 0x1f;
341 regs[1] = logical_cpus & 0xffff;
342 regs[2] = (level << 8) | (*ecx & 0xff);
343 regs[3] = x2apic_id;
344 break;
345
346 case CPUID_0000_000D:
347 limits = vmm_get_xsave_limits();
348 if (!limits->xsave_enabled) {
349 regs[0] = 0;
350 regs[1] = 0;
351 regs[2] = 0;
352 regs[3] = 0;
353 break;
354 }
355
356 cpuid_count(*eax, *ecx, regs);
357 switch (*ecx) {
358 case 0:
359 /*
360 * Only permit the guest to use bits
361 * that are active in the host in
362 * %xcr0. Also, claim that the
363 * maximum save area size is
364 * equivalent to the host's current
365 * save area size. Since this runs
366 * "inside" of vmrun(), it runs with
367 * the guest's xcr0, so the current
368 * save area size is correct as-is.
369 */
370 regs[0] &= limits->xcr0_allowed;
371 regs[2] = limits->xsave_max_size;
372 regs[3] &= (limits->xcr0_allowed >> 32);
373 break;
374 case 1:
375 /* Only permit XSAVEOPT. */
376 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
377 regs[1] = 0;
378 regs[2] = 0;
379 regs[3] = 0;
380 break;
381 default:
382 /*
383 * If the leaf is for a permitted feature,
384 * pass through as-is, otherwise return
385 * all zeroes.
386 */
387 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
388 regs[0] = 0;
389 regs[1] = 0;
390 regs[2] = 0;
391 regs[3] = 0;
392 }
393 break;
394 }
395 break;
396
397 case 0x40000000:
398 regs[0] = CPUID_VM_HIGH;
399 bcopy(bhyve_id, &regs[1], 4);
400 bcopy(bhyve_id + 4, &regs[2], 4);
401 bcopy(bhyve_id + 8, &regs[3], 4);
402 break;
403
404 default:
405 /*
406 * The leaf value has already been clamped so
407 * simply pass this through, keeping count of
408 * how many unhandled leaf values have been seen.
409 */
410 atomic_add_long(&bhyve_xcpuids, 1);
411 cpuid_count(*eax, *ecx, regs);
412 break;
413 }
414
415 *eax = regs[0];
416 *ebx = regs[1];
417 *ecx = regs[2];
418 *edx = regs[3];
419
420 return (1);
421}
216 break;
217
218 case CPUID_0000_0001:
219 do_cpuid(1, regs);
220
221 error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
222 if (error) {
223 panic("x86_emulate_cpuid: error %d "
224 "fetching x2apic state", error);
225 }
226
227 /*
228 * Override the APIC ID only in ebx
229 */
230 regs[1] &= ~(CPUID_LOCAL_APIC_ID);
231 regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
232
233 /*
234 * Don't expose VMX, SpeedStep (EST) or Thermal Monitor 2 (TM2).
235 * Advertise x2APIC capability and Hypervisor guest.
236 */
237 regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
238
239 regs[2] |= CPUID2_HV;
240
241 if (x2apic_state != X2APIC_DISABLED)
242 regs[2] |= CPUID2_X2APIC;
243 else
244 regs[2] &= ~CPUID2_X2APIC;
245
246 /*
247 * Only advertise CPUID2_XSAVE in the guest if
248 * the host is using XSAVE.
249 */
250 if (!(regs[2] & CPUID2_OSXSAVE))
251 regs[2] &= ~CPUID2_XSAVE;
252
253 /*
254 * If CPUID2_XSAVE is being advertised and the
255 * guest has set CR4_XSAVE, set
256 * CPUID2_OSXSAVE.
257 */
258 regs[2] &= ~CPUID2_OSXSAVE;
259 if (regs[2] & CPUID2_XSAVE) {
260 error = vm_get_register(vm, vcpu_id,
261 VM_REG_GUEST_CR4, &cr4);
262 if (error)
263 panic("x86_emulate_cpuid: error %d "
264 "fetching %%cr4", error);
265 if (cr4 & CR4_XSAVE)
266 regs[2] |= CPUID2_OSXSAVE;
267 }
268
269 /*
270 * Hide monitor/mwait until we know how to deal with
271 * these instructions.
272 */
273 regs[2] &= ~CPUID2_MON;
274
275 /*
276 * Hide the performance and debug features.
277 */
278 regs[2] &= ~CPUID2_PDCM;
279
280 /*
281 * No TSC deadline support in the APIC yet
282 */
283 regs[2] &= ~CPUID2_TSCDLT;
284
285 /*
286 * Hide thermal monitoring
287 */
288 regs[3] &= ~(CPUID_ACPI | CPUID_TM);
289
290 /*
291 * Machine check handling is done in the host.
292 * Hide MTRR capability.
293 */
294 regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
295
296 /*
297 * Hide the debug store capability.
298 */
299 regs[3] &= ~CPUID_DS;
300
301 logical_cpus = threads_per_core * cores_per_package;
302 regs[1] &= ~CPUID_HTT_CORES;
303 regs[1] |= (logical_cpus & 0xff) << 16;
304 regs[3] |= CPUID_HTT;
305 break;
306
307 case CPUID_0000_0004:
308 cpuid_count(*eax, *ecx, regs);
309
310 if (regs[0] || regs[1] || regs[2] || regs[3]) {
311 regs[0] &= 0x3ff;
312 regs[0] |= (cores_per_package - 1) << 26;
313 /*
314 * Cache topology:
315 * - L1 and L2 are shared only by the logical
316 * processors in a single core.
317 * - L3 and above are shared by all logical
318 * processors in the package.
319 */
320 logical_cpus = threads_per_core;
321 level = (regs[0] >> 5) & 0x7;
322 if (level >= 3)
323 logical_cpus *= cores_per_package;
324 regs[0] |= (logical_cpus - 1) << 14;
325 }
326 break;
327
328 case CPUID_0000_0007:
329 regs[0] = 0;
330 regs[1] = 0;
331 regs[2] = 0;
332 regs[3] = 0;
333
334 /* leaf 0 */
335 if (*ecx == 0) {
336 cpuid_count(*eax, *ecx, regs);
337
338 /* Only leaf 0 is supported */
339 regs[0] = 0;
340
341 /*
342 * Expose known-safe features.
343 */
344 regs[1] &= (CPUID_STDEXT_FSGSBASE |
345 CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
346 CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
347 CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
348 CPUID_STDEXT_AVX512F |
349 CPUID_STDEXT_AVX512PF |
350 CPUID_STDEXT_AVX512ER |
351 CPUID_STDEXT_AVX512CD);
352 regs[2] = 0;
353 regs[3] = 0;
354
355 /* Advertise INVPCID if it is enabled. */
356 error = vm_get_capability(vm, vcpu_id,
357 VM_CAP_ENABLE_INVPCID, &enable_invpcid);
358 if (error == 0 && enable_invpcid)
359 regs[1] |= CPUID_STDEXT_INVPCID;
360 }
361 break;
362
363 case CPUID_0000_0006:
364 case CPUID_0000_000A:
365 /*
366 * Handle the access, but report 0 for
367 * all options
368 */
369 regs[0] = 0;
370 regs[1] = 0;
371 regs[2] = 0;
372 regs[3] = 0;
373 break;
374
375 case CPUID_0000_000B:
376 /*
377 * Processor topology enumeration
378 */
379 if (*ecx == 0) {
380 logical_cpus = threads_per_core;
381 width = log2(logical_cpus);
382 level = CPUID_TYPE_SMT;
383 x2apic_id = vcpu_id;
384 }
385
386 if (*ecx == 1) {
387 logical_cpus = threads_per_core *
388 cores_per_package;
389 width = log2(logical_cpus);
390 level = CPUID_TYPE_CORE;
391 x2apic_id = vcpu_id;
392 }
393
394 if (!cpuid_leaf_b || *ecx >= 2) {
395 width = 0;
396 logical_cpus = 0;
397 level = 0;
398 x2apic_id = 0;
399 }
400
401 regs[0] = width & 0x1f;
402 regs[1] = logical_cpus & 0xffff;
403 regs[2] = (level << 8) | (*ecx & 0xff);
404 regs[3] = x2apic_id;
405 break;
406
407 case CPUID_0000_000D:
408 limits = vmm_get_xsave_limits();
409 if (!limits->xsave_enabled) {
410 regs[0] = 0;
411 regs[1] = 0;
412 regs[2] = 0;
413 regs[3] = 0;
414 break;
415 }
416
417 cpuid_count(*eax, *ecx, regs);
418 switch (*ecx) {
419 case 0:
420 /*
421 * Only permit the guest to use bits
422 * that are active in the host in
423 * %xcr0. Also, claim that the
424 * maximum save area size is
425 * equivalent to the host's current
426 * save area size. Since this runs
427 * "inside" of vmrun(), it runs with
428 * the guest's xcr0, so the current
429 * save area size is correct as-is.
430 */
431 regs[0] &= limits->xcr0_allowed;
432 regs[2] = limits->xsave_max_size;
433 regs[3] &= (limits->xcr0_allowed >> 32);
434 break;
435 case 1:
436 /* Only permit XSAVEOPT. */
437 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
438 regs[1] = 0;
439 regs[2] = 0;
440 regs[3] = 0;
441 break;
442 default:
443 /*
444 * If the leaf is for a permitted feature,
445 * pass through as-is, otherwise return
446 * all zeroes.
447 */
448 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
449 regs[0] = 0;
450 regs[1] = 0;
451 regs[2] = 0;
452 regs[3] = 0;
453 }
454 break;
455 }
456 break;
457
458 case 0x40000000:
459 regs[0] = CPUID_VM_HIGH;
460 bcopy(bhyve_id, &regs[1], 4);
461 bcopy(bhyve_id + 4, &regs[2], 4);
462 bcopy(bhyve_id + 8, &regs[3], 4);
463 break;
464
465 default:
466 /*
467 * The leaf value has already been clamped so
468 * simply pass this through, keeping count of
469 * how many unhandled leaf values have been seen.
470 */
471 atomic_add_long(&bhyve_xcpuids, 1);
472 cpuid_count(*eax, *ecx, regs);
473 break;
474 }
475
476 *eax = regs[0];
477 *ebx = regs[1];
478 *ecx = regs[2];
479 *edx = regs[3];
480
481 return (1);
482}