1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/sysctl.h>
38
39#include <machine/clock.h>
40#include <machine/cpufunc.h>
41#include <machine/md_var.h>
42#include <machine/segments.h>
43#include <machine/specialreg.h>
44
45#include <machine/vmm.h>
46
47#include "vmm_host.h"
48#include "vmm_ktr.h"
49#include "vmm_util.h"
50#include "x86.h"
51
SYSCTL_DECL(_hw_vmm);
/* hw.vmm.topology: read-only tunables describing the emulated CPU topology. */
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

/* Highest hypervisor-reserved CPUID leaf (0x4000_0000 range) we expose. */
#define	CPUID_VM_HIGH		0x40000000

/*
 * Hypervisor vendor signature returned in leaf 0x40000000 (EBX:ECX:EDX).
 * Exactly 12 bytes, deliberately not NUL-terminated (copied 4 bytes at a
 * time into the three registers).
 */
static const char bhyve_id[12] = "bhyve bhyve ";

/* Count of guest accesses to CPUID leaves we do not explicitly handle. */
static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");

#if __FreeBSD_version < 1200060	/* Remove after 11 EOL helps MFCing */
/* Legacy topology tunables, superseded by vm_get_topology(). */
extern u_int threads_per_core;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
    &threads_per_core, 0, NULL);

extern u_int cores_per_package;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
    &cores_per_package, 0, NULL);
#endif

/* Whether to emulate Intel topology leaf 0x0B (x2APIC enumeration). */
static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);
76
77/*
78 * Round up to the next power of two, if necessary, and then take log2.
79 * Returns -1 if argument is zero.
80 */
81static __inline int
82log2(u_int x)
83{
84
85	return (fls(x << (1 - powerof2(x))) - 1);
86}
87
/*
 * Emulate the CPUID instruction for a guest vcpu.
 *
 * On entry *eax and *ecx hold the leaf/subleaf requested by the guest;
 * on return all four register pointers hold the values to reflect back
 * into the guest's registers.  Always returns 1 (i.e. the exit was
 * handled).
 *
 * The general policy is to pass host CPUID data through while hiding
 * features that cannot be safely virtualized, and to advertise a flat
 * CPU topology derived from vm_get_topology().
 */
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, enable_rdpid, enable_rdtscp, level,
	    width, x2apic_id;
	unsigned int func, regs[4], logical_cpus;
	enum x2apic_state x2apic_state;
	uint16_t cores, maxcpus, sockets, threads;

	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.  Each of the three ranges (extended,
	 * hypervisor, standard) is clamped independently.
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
			cpuid_count(*eax, *ecx, regs);
			break;
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			if (vmm_is_svm()) {
				/*
				 * As on Intel (0000_0007:0, EDX), mask out
				 * unsupported or unsafe AMD extended features
				 * (8000_0008 EBX).
				 */
				regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
				    AMDFEID_XSAVEERPTR);

				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				/*
				 * Here, width is ApicIdCoreIdSize, present on
				 * at least Family 15h and newer.  It
				 * represents the "number of bits in the
				 * initial apicid that indicate thread id
				 * within a package."
				 *
				 * Our topo_probe_amd() uses it for
				 * pkg_id_shift and other OSes may rely on it.
				 */
				width = MIN(0xF, log2(threads * cores));
				if (width < 0x4)
					width = 0;
				logical_cpus = MIN(0xFF, threads * cores - 1);
				regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
			}
			break;

		case CPUID_8000_0001:
			cpuid_count(*eax, *ecx, regs);

			/*
			 * Hide SVM from guest.
			 */
			regs[2] &= ~AMDID2_SVM;

			/*
			 * Don't advertise extended performance counter MSRs
			 * to the guest.
			 */
			regs[2] &= ~AMDID2_PCXC;
			regs[2] &= ~AMDID2_PNXC;
			regs[2] &= ~AMDID2_PTSCEL2I;

			/*
			 * Don't advertise Instruction Based Sampling feature.
			 */
			regs[2] &= ~AMDID2_IBS;

			/* NodeID MSR not available */
			regs[2] &= ~AMDID2_NODE_ID;

			/* Don't advertise the OS visible workaround feature */
			regs[2] &= ~AMDID2_OSVW;

			/* Hide mwaitx/monitorx capability from the guest */
			regs[2] &= ~AMDID2_MWAITX;

			/* Advertise RDTSCP if it is enabled. */
			error = vm_get_capability(vm, vcpu_id,
			    VM_CAP_RDTSCP, &enable_rdtscp);
			if (error == 0 && enable_rdtscp)
				regs[3] |= AMDID_RDTSCP;
			else
				regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			/*
			 * AMD uses this leaf to advertise the processor's
			 * power monitoring and RAS capabilities. These
			 * features are hardware-specific and exposing
			 * them to a guest doesn't make a lot of sense.
			 *
			 * Intel uses this leaf only to advertise the
			 * "Invariant TSC" feature with all other bits
			 * being reserved (set to zero).
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/*
			 * "Invariant TSC" can be advertised to the guest if:
			 * - host TSC frequency is invariant
			 * - host TSCs are synchronized across physical cpus
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (tsc_is_invariant && smp_tsc)
				regs[3] |= AMDPM_TSC_INVARIANT;
			break;

		case CPUID_8000_001D:
			/* AMD Cache topology, like 0000_0004 for Intel. */
			if (!vmm_is_svm())
				goto default_leaf;

			/*
			 * Similar to Intel, generate a fictitious cache
			 * topology for the guest with L3 shared by the
			 * package, and L1 and L2 local to a core.
			 */
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			switch (*ecx) {
			case 0:
				logical_cpus = threads;
				level = 1;
				func = 1;	/* data cache */
				break;
			case 1:
				logical_cpus = threads;
				level = 2;
				func = 3;	/* unified cache */
				break;
			case 2:
				logical_cpus = threads * cores;
				level = 3;
				func = 3;	/* unified cache */
				break;
			default:
				logical_cpus = 0;
				level = 0;
				func = 0;
				break;
			}

			/* NumSharingCache field is encoded as (count - 1). */
			logical_cpus = MIN(0xfff, logical_cpus - 1);
			regs[0] = (logical_cpus << 14) | (1 << 8) |
			    (level << 5) | func;
			regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_8000_001E:
			/*
			 * AMD Family 16h+ and Hygon Family 18h additional
			 * identifiers.
			 */
			if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
				goto default_leaf;

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			regs[0] = vcpu_id;
			threads = MIN(0xFF, threads - 1);
			regs[1] = (threads << 8) |
			    (vcpu_id >> log2(threads + 1));
			/*
			 * XXX Bhyve topology cannot yet represent >1 node per
			 * processor.
			 */
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep, TME or SMX capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
			regs[2] &= ~(CPUID2_SMX);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Advertise the Machine Check and MTRR capability.
			 *
			 * Some guest OSes (e.g. Windows) will not boot if
			 * these features are absent.
			 */
			regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			logical_cpus = threads * cores;
			regs[1] &= ~CPUID_HTT_CORES;
			regs[1] |= (logical_cpus & 0xff) << 16;
			regs[3] |= CPUID_HTT;
			break;

		case CPUID_0000_0004:
			cpuid_count(*eax, *ecx, regs);

			/* An all-zero leaf means "no more cache levels". */
			if (regs[0] || regs[1] || regs[2] || regs[3]) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				regs[0] &= 0x3ff;
				regs[0] |= (cores - 1) << 26;
				/*
				 * Cache topology:
				 * - L1 and L2 are shared only by the logical
				 *   processors in a single core.
				 * - L3 and above are shared by all logical
				 *   processors in the package.
				 */
				logical_cpus = threads;
				level = (regs[0] >> 5) & 0x7;
				if (level >= 3)
					logical_cpus *= cores;
				regs[0] |= (logical_cpus - 1) << 14;
			}
			break;

		case CPUID_0000_0007:
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* leaf 0 */
			if (*ecx == 0) {
				cpuid_count(*eax, *ecx, regs);

				/* Only leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_RDSEED |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
				regs[2] = 0;
				regs[3] &= CPUID_STDEXT3_MD_CLEAR;

				/* Advertise RDPID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_RDPID, &enable_rdpid);
				if (error == 0 && enable_rdpid)
					regs[2] |= CPUID_STDEXT2_RDPID;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
			/* Advertise only "always running APIC timer" (ARAT). */
			regs[0] = CPUTPM1_ARAT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Intel processor topology enumeration
			 */
			if (vmm_is_intel()) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				if (*ecx == 0) {
					logical_cpus = threads;
					width = log2(logical_cpus);
					level = CPUID_TYPE_SMT;
					x2apic_id = vcpu_id;
				}

				if (*ecx == 1) {
					logical_cpus = threads * cores;
					width = log2(logical_cpus);
					level = CPUID_TYPE_CORE;
					x2apic_id = vcpu_id;
				}

				/* Out-of-range subleaf, or leaf B disabled. */
				if (!cpuid_leaf_b || *ecx >= 2) {
					width = 0;
					logical_cpus = 0;
					level = 0;
					x2apic_id = 0;
				}

				regs[0] = width & 0x1f;
				regs[1] = logical_cpus & 0xffff;
				regs[2] = (level << 8) | (*ecx & 0xff);
				regs[3] = x2apic_id;
			} else {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
			}
			break;

		case CPUID_0000_000D:
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case CPUID_0000_0015:
			/*
			 * Don't report CPU TSC/Crystal ratio and clock
			 * values since guests may use these to derive the
			 * local APIC frequency.
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case 0x40000000:
			/*
			 * Hypervisor identification leaf: maximum
			 * hypervisor leaf in EAX, vendor signature in
			 * EBX:ECX:EDX.
			 */
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
default_leaf:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	/* Reflect the emulated register values back to the guest. */
	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	return (1);
}
609
610bool
611vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
612{
613	bool rv;
614
615	KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
616	    __func__, cap));
617
618	/*
619	 * Simply passthrough the capabilities of the host cpu for now.
620	 */
621	rv = false;
622	switch (cap) {
623	case VCC_NO_EXECUTE:
624		if (amd_feature & AMDID_NX)
625			rv = true;
626		break;
627	case VCC_FFXSR:
628		if (amd_feature & AMDID_FFXSR)
629			rv = true;
630		break;
631	case VCC_TCE:
632		if (amd_feature2 & AMDID2_TCE)
633			rv = true;
634		break;
635	default:
636		panic("%s: unknown vm_cpu_capability %d", __func__, cap);
637	}
638	return (rv);
639}
640