/*-
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/11/sys/amd64/vmm/vmm_instruction_emul.c 349809 2019-07-07 17:31:13Z markj $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/vmm/vmm_instruction_emul.c 349809 2019-07-07 17:31:13Z markj $");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
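	/*
	 * Opcodes not listed in the designated initializers below are
	 * zero-filled by the compiler, so their op_type is
	 * VIE_OP_TYPE_NONE and decode_opcode() rejects them.
	 */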
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}
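/*
 * A worked example of the size handling above (values assumed for
 * illustration): with %rax = 0x1122334455667788, a 2-byte update of %ax
 * with 0xaabb leaves 0x112233445566aabb, while a 4-byte update of %eax
 * with 0xccddaabb yields 0x00000000ccddaabb, since 32-bit writes
 * zero-extend to 64 bits in long mode.
 */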
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}
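/*
 * GETCC(8), for instance, expands to a getcc8() that performs the 8-bit
 * subtraction in hardware and captures the resulting %rflags with
 * pushfq/popq, so the emulated status flags match the CPU exactly rather
 * than being recomputed in C.  The same technique is used below for ADD
 * and AND.
 */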
/*
 * Return the status flags that would result from doing (x + y).
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static u_long
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}

/*
 * Return the status flags that would result from doing (x & y).
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static u_long
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}

static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}
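/*
 * An example of the distinction handled by emulate_movx() below: a
 * one-byte value 0x80 read from memory becomes 0x00000080 when
 * zero-extended (movzx) into a 32-bit register, but 0xffffff80 when
 * sign-extended (movsx).
 */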
static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vm, vcpuid, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Helper function to calculate and validate a linear address.
 */
static int
get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
    int opsize, int addrsize, int prot, enum vm_reg_name seg,
    enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error;

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vm, vcpuid, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}
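/*
 * Note the fault convention used by get_gla() above and the copy
 * routines below: when an exception has been injected into the guest
 * the function still returns 0 but sets '*fault', so callers bail out
 * and let the guest re-execute the faulting instruction.
 */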
static int
emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;	/* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}
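	/*
	 * Note that the index register updates below go through
	 * vie_update_register() with the instruction's address size, so
	 * with a 16-bit address size, for example, only %si/%di/%cx change
	 * and the upper register bits are preserved.
	 */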
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}
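/*
 * STOS always writes a value taken from %rax to the destination, so
 * unlike emulate_movs() no source/destination case analysis is needed:
 * the faulting 'gpa' is necessarily the destination of the store.
 */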
static int
emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}

	return (0);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
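/*
 * Flag trick used above and in emulate_or() below: getcc(size, result, 0)
 * computes the flags of 'result - 0', whose SF, ZF and PF describe
 * 'result' itself; CF and OF from the subtraction are discarded since
 * AND/OR define them as cleared.
 */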
static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = memread(vm, vcpuid, gpa, &memop, size, arg);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results. The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results. The comparison is
		 * performed by ANDing the first operand with the
		 * immediate and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = memread(vm, vcpuid, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
static int
emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r		ADD r16, r/m16
		 * 03/r		ADD r32, r/m32
		 * REX.W + 03/r	ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}
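/*
 * PUSH and POP to or from an MMIO address share the logic below: the
 * MMIO side is handled with memread/memwrite while the stack side goes
 * through vm_copy_setup(), since the stack itself lives in regular
 * guest memory.
 */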
static int
emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}
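/*
 * Example encoding for the check below: "ff 30" is pushq (%rax); the
 * ModRM byte 0x30 decodes to mod=00b, reg=110b (PUSH within group 5)
 * and r/m=000b (%rax).
 */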
static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}
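/*
 * An example of the bit-offset masking above: "bt $17, r/m16" has
 * bitmask = 15, so bitoff = 17 & 15 = 1 and bit 1 of the 16-bit operand
 * is copied into PSL_C.
 */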
static int
emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE.  Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT.  Only check for access
			 * rights.
			 */
			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_STOS:
		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = emulate_bittest(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_ADD:
		error = emulate_add(vm, vcpuid, gpa, vie, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_TEST:
		error = emulate_test(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
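/*
 * Examples for the canonical check below, with 48-bit linear addresses:
 * 0x00007fffffffffff and 0xffff800000000000 are canonical, while
 * 0x0000800000000000 is not, because bits 63:47 must all equal bit 47.
 */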
int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of the bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to a exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}
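	/*
	 * Example of the limit checks above (with a fully expanded limit of
	 * 0xffff): an expand-up segment permits offsets 0 through 0xffff,
	 * while an expand-down segment with a 32-bit default size permits
	 * offsets 0x10000 through 0xffffffff.
	 */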
	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

#ifdef _KERNEL
void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;

	if (inst_length) {
		bcopy(inst_bytes, vie->inst, inst_length);
		vie->num_valid = inst_length;
	}
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}
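/*
 * For example, a user-mode write that faults on a present read-only
 * page (PG_V set in the pte) yields pf_error_code(1, VM_PROT_WRITE, 0,
 * pte) == (PGEX_P | PGEX_W | PGEX_U).
 */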
static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}
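/*
 * A sketch of the 32-bit walk above for gla 0x00403004 with 4KB pages:
 * the page-directory index is 1 (gla >> 22), the page-table index is 3
 * ((gla >> 12) & 0x3ff), and the resulting gpa is the frame address
 * from the PTE plus the low 12 bits of the gla.
 */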
int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}
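
/*
 * Illustrative sketch, not part of the original file: decoding a concrete
 * REX prefix byte with the same bit tests used by decode_prefixes() above.
 * VMM_EMUL_EXAMPLES is a hypothetical guard macro.
 */
#ifdef VMM_EMUL_EXAMPLES
static void
rex_prefix_example(void)
{
	/*
	 * 0x48 is REX.W: bit 3 (W) is set and bits 2..0 (R, X, B) are
	 * clear, so a following opcode uses a 64-bit operand size.
	 */
	uint8_t x = 0x48;

	KASSERT((x & 0x8) != 0 && (x & 0x7) == 0,
	    ("0x48 should set only REX.W"));
}
#endif /* VMM_EMUL_EXAMPLES */
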
static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is a don't-care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */
			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}
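
/*
 * Illustrative sketch, not part of the original file: splitting a sample
 * ModRM byte with the same shifts and masks as decode_modrm() above.
 * VMM_EMUL_EXAMPLES is a hypothetical guard macro.
 */
#ifdef VMM_EMUL_EXAMPLES
static void
modrm_example(void)
{
	/*
	 * 0x51 = 01 010 001b: mod=1 (indirect + disp8), reg=2 (%rdx),
	 * rm=1 (%rcx), so the memory reference is [rcx + disp8].
	 */
	uint8_t x = 0x51;
	uint8_t mod = (x >> 6) & 0x3;
	uint8_t reg = (x >> 3) & 0x7;
	uint8_t rm = (x >> 0) & 0x7;

	KASSERT(mod == 1 && reg == 2 && rm == 1, ("bad ModRM split"));
}
#endif /* VMM_EMUL_EXAMPLES */
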
static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}
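
/*
 * Illustrative sketch, not part of the original file: the union trick used
 * by decode_displacement() above to sign-extend a little-endian byte
 * sequence. VMM_EMUL_EXAMPLES is a hypothetical guard macro.
 */
#ifdef VMM_EMUL_EXAMPLES
static void
displacement_example(void)
{
	union {
		char	buf[4];
		int8_t	signed8;
	} u;
	int64_t displacement;

	/* A disp8 of 0xF0 is -16, not 240, once assigned to a wider type. */
	u.buf[0] = (char)0xF0;
	displacement = u.signed8;
	KASSERT(displacement == -16, ("disp8 should sign-extend to -16"));
}
#endif /* VMM_EMUL_EXAMPLES */
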
static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}
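
/*
 * Illustrative sketch, not part of the original file: a direct memory-offset
 * load of %eax (opcode 0xA1) carries the absolute address inline, sized by
 * the effective address size, so with a 64-bit address size decode_moffset()
 * above consumes 8 moffset bytes. VMM_EMUL_EXAMPLES is a hypothetical guard
 * macro.
 */
#ifdef VMM_EMUL_EXAMPLES
static void
moffset_example(void)
{
	struct vie vie;
	/* 0xA1 followed by the 8-byte offset 0x1234 (little-endian). */
	const uint8_t inst[] = { 0xa1, 0x34, 0x12, 0, 0, 0, 0, 0, 0 };

	vie_init(&vie, (const char *)inst, sizeof(inst));
	/* The decode_* helpers rely on state normally set by the caller. */
	vie.addrsize = 8;
	vie.op = one_byte_opcodes[0xA1];
	vie.num_processed = 1;		/* skip the opcode byte */
	if (decode_moffset(&vie) == 0)
		KASSERT(vie.displacement == 0x1234, ("unexpected moffset"));
}
#endif /* VMM_EMUL_EXAMPLES */
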
/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled. The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment. For
	 * other data references, except those relative to the stack or
	 * to a string destination, the DS segment is the default. These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d\n", error,
			    vie->segment_register);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
			return (-1);
	}

	vie->decoded = 1;	/* success */

	return (0);
}
#endif /* _KERNEL */
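
/*
 * Illustrative sketch, not part of the original file: the order in which a
 * hypothetical exit handler would drive the fetch/decode pipeline above
 * before emulating a faulting instruction. The called functions are the ones
 * defined in this file; the surrounding handler and the VMM_EMUL_EXAMPLES
 * guard macro are assumptions made for illustration only.
 */
#if defined(_KERNEL) && defined(VMM_EMUL_EXAMPLES)
static int
decode_pipeline_example(struct vm *vm, int vcpuid, uint64_t rip, uint64_t gla,
    struct vm_guest_paging *paging, int cs_d, struct vie *vie)
{
	int error, fault;

	/* Fetch up to VIE_INST_SIZE bytes starting at the guest %rip. */
	vie_init(vie, NULL, 0);
	error = vmm_fetch_instruction(vm, vcpuid, paging, rip, VIE_INST_SIZE,
	    vie, &fault);
	if (error || fault)
		return (error);

	/* Decode prefixes, opcode, ModRM/SIB and operands in one call. */
	return (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode,
	    cs_d, vie));
}
#endif /* _KERNEL && VMM_EMUL_EXAMPLES */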