/* x86.c revision 267427 */
1221828Sgrehan/*- 2221828Sgrehan * Copyright (c) 2011 NetApp, Inc. 3221828Sgrehan * All rights reserved. 4221828Sgrehan * 5221828Sgrehan * Redistribution and use in source and binary forms, with or without 6221828Sgrehan * modification, are permitted provided that the following conditions 7221828Sgrehan * are met: 8221828Sgrehan * 1. Redistributions of source code must retain the above copyright 9221828Sgrehan * notice, this list of conditions and the following disclaimer. 10221828Sgrehan * 2. Redistributions in binary form must reproduce the above copyright 11221828Sgrehan * notice, this list of conditions and the following disclaimer in the 12221828Sgrehan * documentation and/or other materials provided with the distribution. 13221828Sgrehan * 14221828Sgrehan * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15221828Sgrehan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16221828Sgrehan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17221828Sgrehan * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18221828Sgrehan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19221828Sgrehan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20221828Sgrehan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21221828Sgrehan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22221828Sgrehan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23221828Sgrehan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24221828Sgrehan * SUCH DAMAGE. 
25221828Sgrehan * 26221828Sgrehan * $FreeBSD: stable/10/sys/amd64/vmm/x86.c 267427 2014-06-12 19:58:12Z jhb $ 27221828Sgrehan */ 28221828Sgrehan 29221828Sgrehan#include <sys/cdefs.h> 30221828Sgrehan__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/x86.c 267427 2014-06-12 19:58:12Z jhb $"); 31221828Sgrehan 32240941Sneel#include <sys/param.h> 33267427Sjhb#include <sys/pcpu.h> 34222610Sjhb#include <sys/systm.h> 35240941Sneel#include <sys/cpuset.h> 36221828Sgrehan 37249324Sneel#include <machine/clock.h> 38221828Sgrehan#include <machine/cpufunc.h> 39222610Sjhb#include <machine/md_var.h> 40267427Sjhb#include <machine/segments.h> 41221828Sgrehan#include <machine/specialreg.h> 42221828Sgrehan 43240941Sneel#include <machine/vmm.h> 44240941Sneel 45267427Sjhb#include "vmm_host.h" 46221828Sgrehan#include "x86.h" 47221828Sgrehan 48222610Sjhb#define CPUID_VM_HIGH 0x40000000 49222610Sjhb 50252335Sgrehanstatic const char bhyve_id[12] = "bhyve bhyve "; 51222610Sjhb 52252335Sgrehanstatic uint64_t bhyve_xcpuids; 53252335Sgrehan 54221828Sgrehanint 55240941Sneelx86_emulate_cpuid(struct vm *vm, int vcpu_id, 56240941Sneel uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) 57221828Sgrehan{ 58267427Sjhb const struct xsave_limits *limits; 59267427Sjhb uint64_t cr4; 60256869Sneel int error, enable_invpcid; 61221828Sgrehan unsigned int func, regs[4]; 62240941Sneel enum x2apic_state x2apic_state; 63221828Sgrehan 64222610Sjhb /* 65222610Sjhb * Requests for invalid CPUID levels should map to the highest 66222610Sjhb * available level instead. 
67222610Sjhb */ 68222610Sjhb if (cpu_exthigh != 0 && *eax >= 0x80000000) { 69222610Sjhb if (*eax > cpu_exthigh) 70222610Sjhb *eax = cpu_exthigh; 71222610Sjhb } else if (*eax >= 0x40000000) { 72222610Sjhb if (*eax > CPUID_VM_HIGH) 73222610Sjhb *eax = CPUID_VM_HIGH; 74222610Sjhb } else if (*eax > cpu_high) { 75222610Sjhb *eax = cpu_high; 76222610Sjhb } 77221828Sgrehan 78246774Sneel func = *eax; 79246774Sneel 80222610Sjhb /* 81222610Sjhb * In general the approach used for CPU topology is to 82222610Sjhb * advertise a flat topology where all CPUs are packages with 83222610Sjhb * no multi-core or SMT. 84222610Sjhb */ 85222610Sjhb switch (func) { 86252335Sgrehan /* 87252335Sgrehan * Pass these through to the guest 88252335Sgrehan */ 89221828Sgrehan case CPUID_0000_0000: 90221828Sgrehan case CPUID_0000_0002: 91221828Sgrehan case CPUID_0000_0003: 92221828Sgrehan case CPUID_8000_0000: 93221828Sgrehan case CPUID_8000_0002: 94221828Sgrehan case CPUID_8000_0003: 95221828Sgrehan case CPUID_8000_0004: 96221828Sgrehan case CPUID_8000_0006: 97221828Sgrehan case CPUID_8000_0008: 98222610Sjhb cpuid_count(*eax, *ecx, regs); 99221828Sgrehan break; 100221828Sgrehan 101252335Sgrehan case CPUID_8000_0001: 102252335Sgrehan /* 103252335Sgrehan * Hide rdtscp/ia32_tsc_aux until we know how 104252335Sgrehan * to deal with them. 105252335Sgrehan */ 106252335Sgrehan cpuid_count(*eax, *ecx, regs); 107252335Sgrehan regs[3] &= ~AMDID_RDTSCP; 108252335Sgrehan break; 109252335Sgrehan 110249324Sneel case CPUID_8000_0007: 111249324Sneel cpuid_count(*eax, *ecx, regs); 112249324Sneel /* 113249324Sneel * If the host TSCs are not synchronized across 114249324Sneel * physical cpus then we cannot advertise an 115249324Sneel * invariant tsc to a vcpu. 116249324Sneel * 117249324Sneel * XXX This still falls short because the vcpu 118249324Sneel * can observe the TSC moving backwards as it 119249324Sneel * migrates across physical cpus. 
But at least 120249324Sneel * it should discourage the guest from using the 121249324Sneel * TSC to keep track of time. 122249324Sneel */ 123249324Sneel if (!smp_tsc) 124249324Sneel regs[3] &= ~AMDPM_TSC_INVARIANT; 125249324Sneel break; 126249324Sneel 127221828Sgrehan case CPUID_0000_0001: 128222610Sjhb do_cpuid(1, regs); 129222610Sjhb 130240941Sneel error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); 131240941Sneel if (error) { 132240941Sneel panic("x86_emulate_cpuid: error %d " 133240941Sneel "fetching x2apic state", error); 134240941Sneel } 135240941Sneel 136221828Sgrehan /* 137221828Sgrehan * Override the APIC ID only in ebx 138221828Sgrehan */ 139222610Sjhb regs[1] &= ~(CPUID_LOCAL_APIC_ID); 140222610Sjhb regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); 141221828Sgrehan 142221828Sgrehan /* 143222105Sgrehan * Don't expose VMX, SpeedStep or TME capability. 144222610Sjhb * Advertise x2APIC capability and Hypervisor guest. 145221828Sgrehan */ 146222610Sjhb regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); 147221828Sgrehan 148240941Sneel regs[2] |= CPUID2_HV; 149240941Sneel 150240941Sneel if (x2apic_state != X2APIC_DISABLED) 151240941Sneel regs[2] |= CPUID2_X2APIC; 152240941Sneel 153221828Sgrehan /* 154267427Sjhb * Only advertise CPUID2_XSAVE in the guest if 155267427Sjhb * the host is using XSAVE. 156234939Sgrehan */ 157267427Sjhb if (!(regs[2] & CPUID2_OSXSAVE)) 158267427Sjhb regs[2] &= ~CPUID2_XSAVE; 159234939Sgrehan 160234939Sgrehan /* 161267427Sjhb * If CPUID2_XSAVE is being advertised and the 162267427Sjhb * guest has set CR4_XSAVE, set 163267427Sjhb * CPUID2_OSXSAVE. 
164267427Sjhb */ 165267427Sjhb regs[2] &= ~CPUID2_OSXSAVE; 166267427Sjhb if (regs[2] & CPUID2_XSAVE) { 167267427Sjhb error = vm_get_register(vm, vcpu_id, 168267427Sjhb VM_REG_GUEST_CR4, &cr4); 169267427Sjhb if (error) 170267427Sjhb panic("x86_emulate_cpuid: error %d " 171267427Sjhb "fetching %%cr4", error); 172267427Sjhb if (cr4 & CR4_XSAVE) 173267427Sjhb regs[2] |= CPUID2_OSXSAVE; 174267427Sjhb } 175267427Sjhb 176267427Sjhb /* 177242060Sneel * Hide monitor/mwait until we know how to deal with 178242060Sneel * these instructions. 179242060Sneel */ 180242060Sneel regs[2] &= ~CPUID2_MON; 181242060Sneel 182252335Sgrehan /* 183252335Sgrehan * Hide the performance and debug features. 184252335Sgrehan */ 185252335Sgrehan regs[2] &= ~CPUID2_PDCM; 186255645Sgrehan 187242060Sneel /* 188255645Sgrehan * No TSC deadline support in the APIC yet 189255645Sgrehan */ 190255645Sgrehan regs[2] &= ~CPUID2_TSCDLT; 191255645Sgrehan 192255645Sgrehan /* 193222105Sgrehan * Hide thermal monitoring 194222105Sgrehan */ 195222105Sgrehan regs[3] &= ~(CPUID_ACPI | CPUID_TM); 196222105Sgrehan 197222105Sgrehan /* 198221828Sgrehan * Machine check handling is done in the host. 199221828Sgrehan * Hide MTRR capability. 200221828Sgrehan */ 201221828Sgrehan regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); 202221828Sgrehan 203252335Sgrehan /* 204252335Sgrehan * Hide the debug store capability. 205252335Sgrehan */ 206252335Sgrehan regs[3] &= ~CPUID_DS; 207252335Sgrehan 208222610Sjhb /* 209222610Sjhb * Disable multi-core. 210222610Sjhb */ 211222610Sjhb regs[1] &= ~CPUID_HTT_CORES; 212222610Sjhb regs[3] &= ~CPUID_HTT; 213221828Sgrehan break; 214221828Sgrehan 215222610Sjhb case CPUID_0000_0004: 216222610Sjhb do_cpuid(4, regs); 217222610Sjhb 218222610Sjhb /* 219222610Sjhb * Do not expose topology. 
220222610Sjhb */ 221222610Sjhb regs[0] &= 0xffff8000; 222222610Sjhb regs[0] |= 0x04008000; 223222610Sjhb break; 224222610Sjhb 225256869Sneel case CPUID_0000_0007: 226256869Sneel regs[0] = 0; 227256869Sneel regs[1] = 0; 228256869Sneel regs[2] = 0; 229256869Sneel regs[3] = 0; 230256869Sneel 231256869Sneel /* leaf 0 */ 232256869Sneel if (*ecx == 0) { 233267427Sjhb cpuid_count(*eax, *ecx, regs); 234267427Sjhb 235267427Sjhb /* Only leaf 0 is supported */ 236267427Sjhb regs[0] = 0; 237267427Sjhb 238267427Sjhb /* 239267427Sjhb * Expose known-safe features. 240267427Sjhb */ 241267427Sjhb regs[1] &= (CPUID_STDEXT_FSGSBASE | 242267427Sjhb CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | 243267427Sjhb CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | 244267427Sjhb CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | 245267427Sjhb CPUID_STDEXT_AVX512F | 246267427Sjhb CPUID_STDEXT_AVX512PF | 247267427Sjhb CPUID_STDEXT_AVX512ER | 248267427Sjhb CPUID_STDEXT_AVX512CD); 249267427Sjhb regs[2] = 0; 250267427Sjhb regs[3] = 0; 251267427Sjhb 252267427Sjhb /* Advertise INVPCID if it is enabled. 
*/ 253256869Sneel error = vm_get_capability(vm, vcpu_id, 254256869Sneel VM_CAP_ENABLE_INVPCID, &enable_invpcid); 255256869Sneel if (error == 0 && enable_invpcid) 256256869Sneel regs[1] |= CPUID_STDEXT_INVPCID; 257256869Sneel } 258256869Sneel break; 259256869Sneel 260222105Sgrehan case CPUID_0000_0006: 261252335Sgrehan case CPUID_0000_000A: 262222105Sgrehan /* 263222105Sgrehan * Handle the access, but report 0 for 264222105Sgrehan * all options 265222105Sgrehan */ 266222105Sgrehan regs[0] = 0; 267222105Sgrehan regs[1] = 0; 268222105Sgrehan regs[2] = 0; 269222105Sgrehan regs[3] = 0; 270222105Sgrehan break; 271222105Sgrehan 272221828Sgrehan case CPUID_0000_000B: 273221828Sgrehan /* 274221828Sgrehan * Processor topology enumeration 275221828Sgrehan */ 276221828Sgrehan regs[0] = 0; 277221828Sgrehan regs[1] = 0; 278221828Sgrehan regs[2] = *ecx & 0xff; 279222610Sjhb regs[3] = vcpu_id; 280221828Sgrehan break; 281221828Sgrehan 282267427Sjhb case CPUID_0000_000D: 283267427Sjhb limits = vmm_get_xsave_limits(); 284267427Sjhb if (!limits->xsave_enabled) { 285267427Sjhb regs[0] = 0; 286267427Sjhb regs[1] = 0; 287267427Sjhb regs[2] = 0; 288267427Sjhb regs[3] = 0; 289267427Sjhb break; 290267427Sjhb } 291267427Sjhb 292267427Sjhb cpuid_count(*eax, *ecx, regs); 293267427Sjhb switch (*ecx) { 294267427Sjhb case 0: 295267427Sjhb /* 296267427Sjhb * Only permit the guest to use bits 297267427Sjhb * that are active in the host in 298267427Sjhb * %xcr0. Also, claim that the 299267427Sjhb * maximum save area size is 300267427Sjhb * equivalent to the host's current 301267427Sjhb * save area size. Since this runs 302267427Sjhb * "inside" of vmrun(), it runs with 303267427Sjhb * the guest's xcr0, so the current 304267427Sjhb * save area size is correct as-is. 
305267427Sjhb */ 306267427Sjhb regs[0] &= limits->xcr0_allowed; 307267427Sjhb regs[2] = limits->xsave_max_size; 308267427Sjhb regs[3] &= (limits->xcr0_allowed >> 32); 309267427Sjhb break; 310267427Sjhb case 1: 311267427Sjhb /* Only permit XSAVEOPT. */ 312267427Sjhb regs[0] &= CPUID_EXTSTATE_XSAVEOPT; 313267427Sjhb regs[1] = 0; 314267427Sjhb regs[2] = 0; 315267427Sjhb regs[3] = 0; 316267427Sjhb break; 317267427Sjhb default: 318267427Sjhb /* 319267427Sjhb * If the leaf is for a permitted feature, 320267427Sjhb * pass through as-is, otherwise return 321267427Sjhb * all zeroes. 322267427Sjhb */ 323267427Sjhb if (!(limits->xcr0_allowed & (1ul << *ecx))) { 324267427Sjhb regs[0] = 0; 325267427Sjhb regs[1] = 0; 326267427Sjhb regs[2] = 0; 327267427Sjhb regs[3] = 0; 328267427Sjhb } 329267427Sjhb break; 330267427Sjhb } 331267427Sjhb break; 332267427Sjhb 333222610Sjhb case 0x40000000: 334222610Sjhb regs[0] = CPUID_VM_HIGH; 335222610Sjhb bcopy(bhyve_id, ®s[1], 4); 336252335Sgrehan bcopy(bhyve_id + 4, ®s[2], 4); 337252335Sgrehan bcopy(bhyve_id + 8, ®s[3], 4); 338222610Sjhb break; 339252335Sgrehan 340221828Sgrehan default: 341252335Sgrehan /* 342252335Sgrehan * The leaf value has already been clamped so 343252335Sgrehan * simply pass this through, keeping count of 344252335Sgrehan * how many unhandled leaf values have been seen. 345252335Sgrehan */ 346252335Sgrehan atomic_add_long(&bhyve_xcpuids, 1); 347252335Sgrehan cpuid_count(*eax, *ecx, regs); 348252335Sgrehan break; 349221828Sgrehan } 350221828Sgrehan 351221828Sgrehan *eax = regs[0]; 352221828Sgrehan *ebx = regs[1]; 353221828Sgrehan *ecx = regs[2]; 354221828Sgrehan *edx = regs[3]; 355252335Sgrehan 356221828Sgrehan return (1); 357221828Sgrehan} 358