x86.c revision 280839
1221828Sgrehan/*- 2221828Sgrehan * Copyright (c) 2011 NetApp, Inc. 3221828Sgrehan * All rights reserved. 4221828Sgrehan * 5221828Sgrehan * Redistribution and use in source and binary forms, with or without 6221828Sgrehan * modification, are permitted provided that the following conditions 7221828Sgrehan * are met: 8221828Sgrehan * 1. Redistributions of source code must retain the above copyright 9221828Sgrehan * notice, this list of conditions and the following disclaimer. 10221828Sgrehan * 2. Redistributions in binary form must reproduce the above copyright 11221828Sgrehan * notice, this list of conditions and the following disclaimer in the 12221828Sgrehan * documentation and/or other materials provided with the distribution. 13221828Sgrehan * 14221828Sgrehan * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15221828Sgrehan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16221828Sgrehan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17221828Sgrehan * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18221828Sgrehan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19221828Sgrehan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20221828Sgrehan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21221828Sgrehan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22221828Sgrehan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23221828Sgrehan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24221828Sgrehan * SUCH DAMAGE. 25221828Sgrehan * 26221828Sgrehan * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 280839 2015-03-30 07:11:49Z mav $ 27221828Sgrehan */ 28221828Sgrehan 29221828Sgrehan#include <sys/cdefs.h> 30221828Sgrehan__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 280839 2015-03-30 07:11:49Z mav $"); 31221828Sgrehan 32240941Sneel#include <sys/param.h> 33267427Sjhb#include <sys/pcpu.h> 34222610Sjhb#include <sys/systm.h> 35240941Sneel#include <sys/cpuset.h> 36276349Sneel#include <sys/sysctl.h> 37221828Sgrehan 38249324Sneel#include <machine/clock.h> 39221828Sgrehan#include <machine/cpufunc.h> 40222610Sjhb#include <machine/md_var.h> 41267427Sjhb#include <machine/segments.h> 42221828Sgrehan#include <machine/specialreg.h> 43221828Sgrehan 44240941Sneel#include <machine/vmm.h> 45240941Sneel 46267427Sjhb#include "vmm_host.h" 47276403Sneel#include "vmm_ktr.h" 48276403Sneel#include "vmm_util.h" 49221828Sgrehan#include "x86.h" 50221828Sgrehan 51276349SneelSYSCTL_DECL(_hw_vmm); 52276349Sneelstatic SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); 53276349Sneel 54222610Sjhb#define CPUID_VM_HIGH 0x40000000 55222610Sjhb 56252335Sgrehanstatic const char bhyve_id[12] = "bhyve bhyve "; 57222610Sjhb 58252335Sgrehanstatic uint64_t bhyve_xcpuids; 59276403SneelSYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, 60276403Sneel "Number of times an unknown cpuid leaf was accessed"); 61252335Sgrehan 62276349Sneel/* 63276349Sneel * The default CPU topology is a single thread per package. 64276349Sneel */ 65276349Sneelstatic u_int threads_per_core = 1; 66276349SneelSYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN, 67276349Sneel &threads_per_core, 0, NULL); 68276349Sneel 69276349Sneelstatic u_int cores_per_package = 1; 70276349SneelSYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN, 71276349Sneel &cores_per_package, 0, NULL); 72276349Sneel 73276349Sneelstatic int cpuid_leaf_b = 1; 74276349SneelSYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, 75276349Sneel &cpuid_leaf_b, 0, NULL); 76276349Sneel 77276349Sneel/* 78276349Sneel * Round up to the next power of two, if necessary, and then take log2. 79276349Sneel * Returns -1 if argument is zero. 80276349Sneel */ 81276349Sneelstatic __inline int 82276349Sneellog2(u_int x) 83276349Sneel{ 84276349Sneel 85276349Sneel return (fls(x << (1 - powerof2(x))) - 1); 86276349Sneel} 87276349Sneel 88221828Sgrehanint 89240941Sneelx86_emulate_cpuid(struct vm *vm, int vcpu_id, 90240941Sneel uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) 91221828Sgrehan{ 92267427Sjhb const struct xsave_limits *limits; 93267427Sjhb uint64_t cr4; 94276349Sneel int error, enable_invpcid, level, width, x2apic_id; 95276349Sneel unsigned int func, regs[4], logical_cpus; 96240941Sneel enum x2apic_state x2apic_state; 97221828Sgrehan 98276403Sneel VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); 99276403Sneel 100222610Sjhb /* 101222610Sjhb * Requests for invalid CPUID levels should map to the highest 102222610Sjhb * available level instead. 103222610Sjhb */ 104222610Sjhb if (cpu_exthigh != 0 && *eax >= 0x80000000) { 105222610Sjhb if (*eax > cpu_exthigh) 106222610Sjhb *eax = cpu_exthigh; 107222610Sjhb } else if (*eax >= 0x40000000) { 108222610Sjhb if (*eax > CPUID_VM_HIGH) 109222610Sjhb *eax = CPUID_VM_HIGH; 110222610Sjhb } else if (*eax > cpu_high) { 111222610Sjhb *eax = cpu_high; 112222610Sjhb } 113221828Sgrehan 114246774Sneel func = *eax; 115246774Sneel 116222610Sjhb /* 117222610Sjhb * In general the approach used for CPU topology is to 118222610Sjhb * advertise a flat topology where all CPUs are packages with 119222610Sjhb * no multi-core or SMT. 120222610Sjhb */ 121222610Sjhb switch (func) { 122252335Sgrehan /* 123252335Sgrehan * Pass these through to the guest 124252335Sgrehan */ 125221828Sgrehan case CPUID_0000_0000: 126221828Sgrehan case CPUID_0000_0002: 127221828Sgrehan case CPUID_0000_0003: 128221828Sgrehan case CPUID_8000_0000: 129221828Sgrehan case CPUID_8000_0002: 130221828Sgrehan case CPUID_8000_0003: 131221828Sgrehan case CPUID_8000_0004: 132221828Sgrehan case CPUID_8000_0006: 133276403Sneel cpuid_count(*eax, *ecx, regs); 134276403Sneel break; 135221828Sgrehan case CPUID_8000_0008: 136222610Sjhb cpuid_count(*eax, *ecx, regs); 137276403Sneel if (vmm_is_amd()) { 138276403Sneel /* 139276403Sneel * XXX this might appear silly because AMD 140276403Sneel * cpus don't have threads. 141276403Sneel * 142276403Sneel * However this matches the logical cpus as 143276403Sneel * advertised by leaf 0x1 and will work even 144276403Sneel * if the 'threads_per_core' tunable is set 145276403Sneel * incorrectly on an AMD host. 146276403Sneel */ 147276403Sneel logical_cpus = threads_per_core * 148276403Sneel cores_per_package; 149276403Sneel regs[2] = logical_cpus - 1; 150276403Sneel } 151221828Sgrehan break; 152221828Sgrehan 153252335Sgrehan case CPUID_8000_0001: 154276403Sneel cpuid_count(*eax, *ecx, regs); 155276403Sneel 156252335Sgrehan /* 157276403Sneel * Hide SVM and Topology Extension features from guest. 158276403Sneel */ 159276403Sneel regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY); 160276403Sneel 161276403Sneel /* 162276403Sneel * Don't advertise extended performance counter MSRs 163276403Sneel * to the guest. 164276403Sneel */ 165276403Sneel regs[2] &= ~AMDID2_PCXC; 166276403Sneel regs[2] &= ~AMDID2_PNXC; 167276403Sneel regs[2] &= ~AMDID2_PTSCEL2I; 168276403Sneel 169276403Sneel /* 170276403Sneel * Don't advertise Instruction Based Sampling feature. 171276403Sneel */ 172276403Sneel regs[2] &= ~AMDID2_IBS; 173276403Sneel 174276403Sneel /* NodeID MSR not available */ 175276403Sneel regs[2] &= ~AMDID2_NODE_ID; 176276403Sneel 177276403Sneel /* Don't advertise the OS visible workaround feature */ 178276403Sneel regs[2] &= ~AMDID2_OSVW; 179276403Sneel 180276403Sneel /* 181252335Sgrehan * Hide rdtscp/ia32_tsc_aux until we know how 182252335Sgrehan * to deal with them. 183252335Sgrehan */ 184252335Sgrehan regs[3] &= ~AMDID_RDTSCP; 185252335Sgrehan break; 186252335Sgrehan 187249324Sneel case CPUID_8000_0007: 188249324Sneel /* 189276403Sneel * AMD uses this leaf to advertise the processor's 190276403Sneel * power monitoring and RAS capabilities. These 191276403Sneel * features are hardware-specific and exposing 192276403Sneel * them to a guest doesn't make a lot of sense. 193249324Sneel * 194276403Sneel * Intel uses this leaf only to advertise the 195276403Sneel * "Invariant TSC" feature with all other bits 196276403Sneel * being reserved (set to zero). 197276403Sneel */ 198276403Sneel regs[0] = 0; 199276403Sneel regs[1] = 0; 200276403Sneel regs[2] = 0; 201276403Sneel regs[3] = 0; 202276403Sneel 203276403Sneel /* 204276403Sneel * "Invariant TSC" can be advertised to the guest if: 205276403Sneel * - host TSC frequency is invariant 206276403Sneel * - host TSCs are synchronized across physical cpus 207276403Sneel * 208249324Sneel * XXX This still falls short because the vcpu 209249324Sneel * can observe the TSC moving backwards as it 210249324Sneel * migrates across physical cpus. But at least 211249324Sneel * it should discourage the guest from using the 212249324Sneel * TSC to keep track of time. 213249324Sneel */ 214276403Sneel if (tsc_is_invariant && smp_tsc) 215276403Sneel regs[3] |= AMDPM_TSC_INVARIANT; 216249324Sneel break; 217249324Sneel 218221828Sgrehan case CPUID_0000_0001: 219222610Sjhb do_cpuid(1, regs); 220222610Sjhb 221240941Sneel error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); 222240941Sneel if (error) { 223240941Sneel panic("x86_emulate_cpuid: error %d " 224240941Sneel "fetching x2apic state", error); 225240941Sneel } 226240941Sneel 227221828Sgrehan /* 228221828Sgrehan * Override the APIC ID only in ebx 229221828Sgrehan */ 230222610Sjhb regs[1] &= ~(CPUID_LOCAL_APIC_ID); 231222610Sjhb regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); 232221828Sgrehan 233221828Sgrehan /* 234222105Sgrehan * Don't expose VMX, SpeedStep or TME capability. 235222610Sjhb * Advertise x2APIC capability and Hypervisor guest. 236221828Sgrehan */ 237222610Sjhb regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); 238221828Sgrehan 239240941Sneel regs[2] |= CPUID2_HV; 240240941Sneel 241240941Sneel if (x2apic_state != X2APIC_DISABLED) 242240941Sneel regs[2] |= CPUID2_X2APIC; 243267447Sjhb else 244267447Sjhb regs[2] &= ~CPUID2_X2APIC; 245240941Sneel 246221828Sgrehan /* 247267427Sjhb * Only advertise CPUID2_XSAVE in the guest if 248267427Sjhb * the host is using XSAVE. 249234939Sgrehan */ 250267427Sjhb if (!(regs[2] & CPUID2_OSXSAVE)) 251267427Sjhb regs[2] &= ~CPUID2_XSAVE; 252234939Sgrehan 253234939Sgrehan /* 254267427Sjhb * If CPUID2_XSAVE is being advertised and the 255267427Sjhb * guest has set CR4_XSAVE, set 256267427Sjhb * CPUID2_OSXSAVE. 257267427Sjhb */ 258267427Sjhb regs[2] &= ~CPUID2_OSXSAVE; 259267427Sjhb if (regs[2] & CPUID2_XSAVE) { 260267427Sjhb error = vm_get_register(vm, vcpu_id, 261267427Sjhb VM_REG_GUEST_CR4, &cr4); 262267427Sjhb if (error) 263267427Sjhb panic("x86_emulate_cpuid: error %d " 264267427Sjhb "fetching %%cr4", error); 265267427Sjhb if (cr4 & CR4_XSAVE) 266267427Sjhb regs[2] |= CPUID2_OSXSAVE; 267267427Sjhb } 268267427Sjhb 269267427Sjhb /* 270242060Sneel * Hide monitor/mwait until we know how to deal with 271242060Sneel * these instructions. 272242060Sneel */ 273242060Sneel regs[2] &= ~CPUID2_MON; 274242060Sneel 275252335Sgrehan /* 276252335Sgrehan * Hide the performance and debug features. 277252335Sgrehan */ 278252335Sgrehan regs[2] &= ~CPUID2_PDCM; 279255645Sgrehan 280242060Sneel /* 281255645Sgrehan * No TSC deadline support in the APIC yet 282255645Sgrehan */ 283255645Sgrehan regs[2] &= ~CPUID2_TSCDLT; 284255645Sgrehan 285255645Sgrehan /* 286222105Sgrehan * Hide thermal monitoring 287222105Sgrehan */ 288222105Sgrehan regs[3] &= ~(CPUID_ACPI | CPUID_TM); 289222105Sgrehan 290222105Sgrehan /* 291221828Sgrehan * Machine check handling is done in the host. 292221828Sgrehan * Hide MTRR capability. 293221828Sgrehan */ 294221828Sgrehan regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); 295221828Sgrehan 296252335Sgrehan /* 297252335Sgrehan * Hide the debug store capability. 298252335Sgrehan */ 299252335Sgrehan regs[3] &= ~CPUID_DS; 300252335Sgrehan 301276349Sneel logical_cpus = threads_per_core * cores_per_package; 302222610Sjhb regs[1] &= ~CPUID_HTT_CORES; 303276349Sneel regs[1] |= (logical_cpus & 0xff) << 16; 304276349Sneel regs[3] |= CPUID_HTT; 305221828Sgrehan break; 306221828Sgrehan 307222610Sjhb case CPUID_0000_0004: 308276349Sneel cpuid_count(*eax, *ecx, regs); 309222610Sjhb 310276349Sneel if (regs[0] || regs[1] || regs[2] || regs[3]) { 311276349Sneel regs[0] &= 0x3ff; 312276349Sneel regs[0] |= (cores_per_package - 1) << 26; 313276349Sneel /* 314276349Sneel * Cache topology: 315276349Sneel * - L1 and L2 are shared only by the logical 316276349Sneel * processors in a single core. 317276349Sneel * - L3 and above are shared by all logical 318276349Sneel * processors in the package. 319276349Sneel */ 320276349Sneel logical_cpus = threads_per_core; 321276349Sneel level = (regs[0] >> 5) & 0x7; 322276349Sneel if (level >= 3) 323276349Sneel logical_cpus *= cores_per_package; 324276349Sneel regs[0] |= (logical_cpus - 1) << 14; 325276349Sneel } 326222610Sjhb break; 327222610Sjhb 328256869Sneel case CPUID_0000_0007: 329256869Sneel regs[0] = 0; 330256869Sneel regs[1] = 0; 331256869Sneel regs[2] = 0; 332256869Sneel regs[3] = 0; 333256869Sneel 334256869Sneel /* leaf 0 */ 335256869Sneel if (*ecx == 0) { 336267427Sjhb cpuid_count(*eax, *ecx, regs); 337267427Sjhb 338267427Sjhb /* Only leaf 0 is supported */ 339267427Sjhb regs[0] = 0; 340267427Sjhb 341267427Sjhb /* 342267427Sjhb * Expose known-safe features. 343267427Sjhb */ 344267427Sjhb regs[1] &= (CPUID_STDEXT_FSGSBASE | 345267427Sjhb CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | 346267427Sjhb CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | 347267427Sjhb CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | 348267427Sjhb CPUID_STDEXT_AVX512F | 349267427Sjhb CPUID_STDEXT_AVX512PF | 350267427Sjhb CPUID_STDEXT_AVX512ER | 351267427Sjhb CPUID_STDEXT_AVX512CD); 352267427Sjhb regs[2] = 0; 353267427Sjhb regs[3] = 0; 354267427Sjhb 355267427Sjhb /* Advertise INVPCID if it is enabled. */ 356256869Sneel error = vm_get_capability(vm, vcpu_id, 357256869Sneel VM_CAP_ENABLE_INVPCID, &enable_invpcid); 358256869Sneel if (error == 0 && enable_invpcid) 359256869Sneel regs[1] |= CPUID_STDEXT_INVPCID; 360256869Sneel } 361256869Sneel break; 362256869Sneel 363222105Sgrehan case CPUID_0000_0006: 364280839Smav regs[0] = CPUTPM1_ARAT; 365280839Smav regs[1] = 0; 366280839Smav regs[2] = 0; 367280839Smav regs[3] = 0; 368280839Smav break; 369280839Smav 370252335Sgrehan case CPUID_0000_000A: 371222105Sgrehan /* 372222105Sgrehan * Handle the access, but report 0 for 373222105Sgrehan * all options 374222105Sgrehan */ 375222105Sgrehan regs[0] = 0; 376222105Sgrehan regs[1] = 0; 377222105Sgrehan regs[2] = 0; 378222105Sgrehan regs[3] = 0; 379222105Sgrehan break; 380222105Sgrehan 381221828Sgrehan case CPUID_0000_000B: 382221828Sgrehan /* 383221828Sgrehan * Processor topology enumeration 384221828Sgrehan */ 385276349Sneel if (*ecx == 0) { 386276349Sneel logical_cpus = threads_per_core; 387276349Sneel width = log2(logical_cpus); 388276349Sneel level = CPUID_TYPE_SMT; 389276349Sneel x2apic_id = vcpu_id; 390276349Sneel } 391276349Sneel 392276349Sneel if (*ecx == 1) { 393276349Sneel logical_cpus = threads_per_core * 394276349Sneel cores_per_package; 395276349Sneel width = log2(logical_cpus); 396276349Sneel level = CPUID_TYPE_CORE; 397276349Sneel x2apic_id = vcpu_id; 398276349Sneel } 399276349Sneel 400276349Sneel if (!cpuid_leaf_b || *ecx >= 2) { 401276349Sneel width = 0; 402276349Sneel logical_cpus = 0; 403276349Sneel level = 0; 404276349Sneel x2apic_id = 0; 405276349Sneel } 406276349Sneel 407276349Sneel regs[0] = width & 0x1f; 408276349Sneel regs[1] = logical_cpus & 0xffff; 409276349Sneel regs[2] = (level << 8) | (*ecx & 0xff); 410276349Sneel regs[3] = x2apic_id; 411221828Sgrehan break; 412221828Sgrehan 413267427Sjhb case CPUID_0000_000D: 414267427Sjhb limits = vmm_get_xsave_limits(); 415267427Sjhb if (!limits->xsave_enabled) { 416267427Sjhb regs[0] = 0; 417267427Sjhb regs[1] = 0; 418267427Sjhb regs[2] = 0; 419267427Sjhb regs[3] = 0; 420267427Sjhb break; 421267427Sjhb } 422267427Sjhb 423267427Sjhb cpuid_count(*eax, *ecx, regs); 424267427Sjhb switch (*ecx) { 425267427Sjhb case 0: 426267427Sjhb /* 427267427Sjhb * Only permit the guest to use bits 428267427Sjhb * that are active in the host in 429267427Sjhb * %xcr0. Also, claim that the 430267427Sjhb * maximum save area size is 431267427Sjhb * equivalent to the host's current 432267427Sjhb * save area size. Since this runs 433267427Sjhb * "inside" of vmrun(), it runs with 434267427Sjhb * the guest's xcr0, so the current 435267427Sjhb * save area size is correct as-is. 436267427Sjhb */ 437267427Sjhb regs[0] &= limits->xcr0_allowed; 438267427Sjhb regs[2] = limits->xsave_max_size; 439267427Sjhb regs[3] &= (limits->xcr0_allowed >> 32); 440267427Sjhb break; 441267427Sjhb case 1: 442267427Sjhb /* Only permit XSAVEOPT. */ 443267427Sjhb regs[0] &= CPUID_EXTSTATE_XSAVEOPT; 444267427Sjhb regs[1] = 0; 445267427Sjhb regs[2] = 0; 446267427Sjhb regs[3] = 0; 447267427Sjhb break; 448267427Sjhb default: 449267427Sjhb /* 450267427Sjhb * If the leaf is for a permitted feature, 451267427Sjhb * pass through as-is, otherwise return 452267427Sjhb * all zeroes. 453267427Sjhb */ 454267427Sjhb if (!(limits->xcr0_allowed & (1ul << *ecx))) { 455267427Sjhb regs[0] = 0; 456267427Sjhb regs[1] = 0; 457267427Sjhb regs[2] = 0; 458267427Sjhb regs[3] = 0; 459267427Sjhb } 460267427Sjhb break; 461267427Sjhb } 462267427Sjhb break; 463267427Sjhb 464222610Sjhb case 0x40000000: 465222610Sjhb regs[0] = CPUID_VM_HIGH; 466222610Sjhb bcopy(bhyve_id, ®s[1], 4); 467252335Sgrehan bcopy(bhyve_id + 4, ®s[2], 4); 468252335Sgrehan bcopy(bhyve_id + 8, ®s[3], 4); 469222610Sjhb break; 470252335Sgrehan 471221828Sgrehan default: 472252335Sgrehan /* 473252335Sgrehan * The leaf value has already been clamped so 474252335Sgrehan * simply pass this through, keeping count of 475252335Sgrehan * how many unhandled leaf values have been seen. 476252335Sgrehan */ 477252335Sgrehan atomic_add_long(&bhyve_xcpuids, 1); 478252335Sgrehan cpuid_count(*eax, *ecx, regs); 479252335Sgrehan break; 480221828Sgrehan } 481221828Sgrehan 482221828Sgrehan *eax = regs[0]; 483221828Sgrehan *ebx = regs[1]; 484221828Sgrehan *ecx = regs[2]; 485221828Sgrehan *edx = regs[3]; 486252335Sgrehan 487221828Sgrehan return (1); 488221828Sgrehan} 489