1/* 2 * Copyright 2019, Data61, CSIRO (ABN 41 687 119 230) 3 * 4 * SPDX-License-Identifier: GPL-2.0-only 5 */ 6 7/* x86 fetch/decode/emulate code 8 9Author: W.A. 10*/ 11 12#include <stdio.h> 13#include <stdlib.h> 14#include <string.h> 15 16#include <sel4vm/guest_vm.h> 17#include <sel4vm/guest_ram.h> 18#include <sel4vm/arch/guest_x86_context.h> 19 20#include "sel4vm/guest_memory.h" 21 22#include "processor/platfeature.h" 23#include "processor/decode.h" 24#include "guest_state.h" 25 26/* TODO are these defined elsewhere? */ 27#define IA32_PDE_SIZE(pde) (pde & BIT(7)) 28#define IA32_PDE_PRESENT(pde) (pde & BIT(0)) 29#define IA32_PTE_ADDR(pte) (pte & 0xFFFFF000) 30#define IA32_PSE_ADDR(pde) (pde & 0xFFC00000) 31 32#define IA32_OPCODE_S(op) (op & BIT(0)) 33#define IA32_OPCODE_D(op) (op & BIT(1)) 34#define IA32_OPCODY_BODY(op) (op & 0b11111100) 35#define IA32_MODRM_REG(m) ((m & 0b00111000) >> 3) 36 37#define SEG_MULT (0x10) 38 39enum decode_instr { 40 DECODE_INSTR_MOV, 41 DECODE_INSTR_MOVQ, 42 DECODE_INSTR_INVALID 43}; 44 45enum decode_prefix { 46 ES_SEG_OVERRIDE = 0x26, 47 CS_SEG_OVERRIDE = 0x2e, 48 SS_SEG_OVERRIDE = 0x36, 49 DS_SEG_OVERRIDE = 0x3e, 50 FS_SEG_OVERRIDE = 0x64, 51 GS_SEG_OVERRIDE = 0x65, 52 OP_SIZE_OVERRIDE = 0x66, 53 ADDR_SIZE_OVERRIDE = 0x67 54}; 55 56struct x86_op { 57 int reg; 58 uint32_t val; 59 size_t len; 60}; 61 62struct decode_op { 63 int curr_byte; 64 uint8_t *instr; 65 size_t instr_len; 66 struct x86_op op; 67}; 68 69struct decode_table { 70 enum decode_instr instr; 71 void (*decode_fn)(struct decode_op *); 72}; 73 74static void debug_print_instruction(uint8_t *instr, int instr_len); 75 76static void decode_modrm_reg_op(struct decode_op *decode_op) 77{ 78 /* Mov with register */ 79 uint8_t modrm = decode_op->instr[decode_op->curr_byte]; 80 decode_op->curr_byte++; 81 decode_op->op.reg = IA32_MODRM_REG(modrm); 82 return; 83} 84 85static void decode_imm_op(struct decode_op *decode_op) 86{ 87 /* Mov with immediate */ 88 decode_op->op.reg = -1; 89 uint32_t immediate = 0; 90 for (int j = 0; j < decode_op->op.len; j++) { 91 immediate <<= 8; 92 immediate |= decode_op->instr[decode_op->instr_len - j - 1]; 93 } 94 decode_op->op.val = immediate; 95 return; 96} 97 98static void decode_invalid_op(struct decode_op *decode_op) 99{ 100 ZF_LOGE("can't emulate instruction!\n"); 101 debug_print_instruction(decode_op->instr, decode_op->instr_len); 102 assert(0); 103} 104 105static const struct decode_table decode_table_1op[] = { 106 [0 ... MAX_INSTR_OPCODES] = {DECODE_INSTR_INVALID, decode_invalid_op}, 107 [0x88] = {DECODE_INSTR_MOV, decode_modrm_reg_op}, 108 [0x89] = {DECODE_INSTR_MOV, decode_modrm_reg_op}, 109 [0x8a] = {DECODE_INSTR_MOV, decode_modrm_reg_op}, 110 [0x8b] = {DECODE_INSTR_MOV, decode_modrm_reg_op}, 111 [0x8c] = {DECODE_INSTR_MOV, decode_modrm_reg_op}, 112 [0xc6] = {DECODE_INSTR_MOV, decode_imm_op}, 113 [0xc7] = {DECODE_INSTR_MOV, decode_imm_op} 114}; 115 116static const struct decode_table decode_table_2op[] = { 117 [0 ... MAX_INSTR_OPCODES] = {DECODE_INSTR_INVALID, decode_invalid_op}, 118 [0x6f] = {DECODE_INSTR_MOVQ, decode_modrm_reg_op} 119}; 120 121/* Get a word from a guest physical address */ 122inline static uint32_t guest_get_phys_word(vm_t *vm, uintptr_t addr) 123{ 124 uint32_t val; 125 126 vm_ram_touch(vm, addr, sizeof(uint32_t), 127 vm_guest_ram_read_callback, &val); 128 129 return val; 130} 131 132/* Fetch a guest's instruction */ 133int vm_fetch_instruction(vm_vcpu_t *vcpu, uint32_t eip, uintptr_t cr3, 134 int len, uint8_t *buf) 135{ 136 /* Walk page tables to get physical address of instruction */ 137 uintptr_t instr_phys = 0; 138 139 /* ensure that PAE is not enabled */ 140 if (vm_guest_state_get_cr4(vcpu->vcpu_arch.guest_state, vcpu->vcpu.cptr) & X86_CR4_PAE) { 141 ZF_LOGE("Do not support walking PAE paging structures"); 142 return -1; 143 } 144 145 // TODO implement page-boundary crossing properly 146 assert((eip >> 12) == ((eip + len) >> 12)); 147 148 uint32_t pdi = eip >> 22; 149 uint32_t pti = (eip >> 12) & 0x3FF; 150 151 uint32_t pde = guest_get_phys_word(vcpu->vm, cr3 + pdi * 4); 152 153 assert(IA32_PDE_PRESENT(pde)); /* WTF? */ 154 155 if (IA32_PDE_SIZE(pde)) { 156 /* PSE is used, 4M pages */ 157 instr_phys = (uintptr_t)IA32_PSE_ADDR(pde) + (eip & 0x3FFFFF); 158 } else { 159 /* 4k pages */ 160 uint32_t pte = guest_get_phys_word(vcpu->vm, 161 (uintptr_t)IA32_PTE_ADDR(pde) + pti * 4); 162 163 assert(IA32_PDE_PRESENT(pte)); 164 165 instr_phys = (uintptr_t)IA32_PTE_ADDR(pte) + (eip & 0xFFF); 166 } 167 168 /* Fetch instruction */ 169 vm_ram_touch(vcpu->vm, instr_phys, len, 170 vm_guest_ram_read_callback, buf); 171 172 return 0; 173} 174 175/* Returns 1 if this byte is an x86 instruction prefix */ 176static int is_prefix(uint8_t byte) 177{ 178 switch (byte) { 179 case ES_SEG_OVERRIDE: 180 case CS_SEG_OVERRIDE: 181 case SS_SEG_OVERRIDE: 182 case DS_SEG_OVERRIDE: 183 case FS_SEG_OVERRIDE: 184 case GS_SEG_OVERRIDE: 185 case ADDR_SIZE_OVERRIDE: 186 case OP_SIZE_OVERRIDE: 187 return 1; 188 } 189 190 return 0; 191} 192 193static void debug_print_instruction(uint8_t *instr, int instr_len) 194{ 195 printf("instruction dump: "); 196 for (int j = 0; j < instr_len; j++) { 197 printf("%2x ", instr[j]); 198 } 199 printf("\n"); 200} 201 202/* Partial support to decode an instruction for a memory access 203 This is very crude. It can break in many ways. */ 204int vm_decode_instruction(uint8_t *instr, int instr_len, int *reg, uint32_t *imm, int *op_len) 205{ 206 struct decode_op dec_op; 207 dec_op.instr = instr; 208 dec_op.instr_len = instr_len; 209 dec_op.op.len = 1; 210 /* First loop through and check prefixes */ 211 int i; 212 for (i = 0; i < instr_len; i++) { 213 if (is_prefix(instr[i])) { 214 if (instr[i] == OP_SIZE_OVERRIDE) { 215 /* 16 bit modifier */ 216 dec_op.op.len = 2; 217 } 218 } else { 219 /* We've hit the opcode */ 220 break; 221 } 222 } 223 224 dec_op.curr_byte = i; 225 assert(dec_op.curr_byte < instr_len); /* We still need an opcode */ 226 227 uint8_t opcode = instr[dec_op.curr_byte]; 228 dec_op.curr_byte++; 229 if (opcode == OP_ESCAPE) { 230 opcode = instr[dec_op.curr_byte]; 231 dec_op.curr_byte++; 232 decode_table_2op[opcode].decode_fn(&dec_op); 233 } else { 234 decode_table_1op[opcode].decode_fn(&dec_op); 235 } 236 237 if (dec_op.op.len != 2 && IA32_OPCODE_S(opcode)) { 238 dec_op.op.len = 4; 239 } 240 241 *reg = dec_op.op.reg; 242 *imm = dec_op.op.val; 243 *op_len = dec_op.op.len; 244 return 0; 245} 246 247void vm_decode_ept_violation(vm_vcpu_t *vcpu, int *reg, uint32_t *imm, int *size) 248{ 249 /* Decode instruction */ 250 uint8_t ibuf[15]; 251 int instr_len = vm_guest_exit_get_int_len(vcpu->vcpu_arch.guest_state); 252 vm_fetch_instruction(vcpu, 253 vm_guest_state_get_eip(vcpu->vcpu_arch.guest_state), 254 vm_guest_state_get_cr3(vcpu->vcpu_arch.guest_state, vcpu->vcpu.cptr), 255 instr_len, ibuf); 256 257 vm_decode_instruction(ibuf, instr_len, reg, imm, size); 258} 259 260/* 261 Useful information: The GDT loaded by the Linux SMP trampoline looks like: 2620x00: 00 00 00 00 00 00 00 00 2630x08: 00 00 00 00 00 00 00 00 2640x10: ff ff 00 00 00 9b cf 00 <- Executable 0x00000000-0xffffffff 2650x18: ff ff 00 00 00 93 cf 00 <- RW data 0x00000000-0xffffffff 266*/ 267 268/* Interpret just enough virtual 8086 instructions to run trampoline code. 269 Returns the final jump address */ 270uintptr_t vm_emulate_realmode(vm_vcpu_t *vcpu, uint8_t *instr_buf, 271 uint16_t *segment, uintptr_t eip, uint32_t len, guest_state_t *gs) 272{ 273 /* We only track one segment, and assume that code and data are in the same 274 segment, which is valid for most trampoline and bootloader code */ 275 uint8_t *instr = instr_buf; 276 assert(segment); 277 278 while (instr - instr_buf < len) { 279 uintptr_t mem = 0; 280 uint32_t lit = 0; 281 int m66 = 0; 282 283 uint32_t base = 0; 284 uint32_t limit = 0; 285 286 if (*instr == 0x66) { 287 m66 = 1; 288 instr++; 289 } 290 291 if (*instr == 0x0f) { 292 instr++; 293 if (*instr == 0x01) { 294 instr++; 295 if (*instr == 0x1e) { 296 // lidtl 297 instr++; 298 memcpy(&mem, instr, 2); 299 mem += *segment * SEG_MULT; 300 instr += 2; 301 302 /* Limit is first 2 bytes, base is next 4 bytes */ 303 vm_ram_touch(vcpu->vm, mem, 304 2, vm_guest_ram_read_callback, &limit); 305 vm_ram_touch(vcpu->vm, mem + 2, 306 4, vm_guest_ram_read_callback, &base); 307 ZF_LOGD("lidtl %p\n", (void *)mem); 308 309 vm_guest_state_set_idt_base(gs, base); 310 vm_guest_state_set_idt_limit(gs, limit); 311 } else if (*instr == 0x16) { 312 // lgdtl 313 instr++; 314 memcpy(&mem, instr, 2); 315 mem += *segment * SEG_MULT; 316 instr += 2; 317 318 /* Limit is first 2 bytes, base is next 4 bytes */ 319 vm_ram_touch(vcpu->vm, mem, 320 2, vm_guest_ram_read_callback, &limit); 321 vm_ram_touch(vcpu->vm, mem + 2, 322 4, vm_guest_ram_read_callback, &base); 323 ZF_LOGD("lgdtl %p; base = %x, limit = %x\n", (void *)mem, 324 base, limit); 325 326 vm_guest_state_set_gdt_base(gs, base); 327 vm_guest_state_set_gdt_limit(gs, limit); 328 } else { 329 //ignore 330 instr++; 331 } 332 } else { 333 //ignore 334 instr++; 335 } 336 } else if (*instr == 0xea) { 337 /* Absolute jmp */ 338 instr++; 339 uint32_t base = 0; 340 uintptr_t jmp_addr = 0; 341 if (m66) { 342 // base is 4 bytes 343 /* Make the wild assumptions that we are now in protected mode 344 and the relevant GDT entry just covers all memory. Therefore 345 the base address is our absolute address. This just happens 346 to work with Linux and probably other modern systems that 347 don't use the GDT much. */ 348 memcpy(&base, instr, 4); 349 instr += 4; 350 jmp_addr = base; 351 memcpy(segment, instr, 2); 352 } else { 353 memcpy(&base, instr, 2); 354 instr += 2; 355 memcpy(segment, instr, 2); 356 jmp_addr = *segment * SEG_MULT + base; 357 } 358 instr += 2; 359 ZF_LOGD("absolute jmpf $%p, cs now %04x\n", (void *)jmp_addr, *segment); 360 if (((int64_t)jmp_addr - (int64_t)(len + eip)) >= 0) { 361 vm_guest_state_set_cs_selector(gs, *segment); 362 return jmp_addr; 363 } else { 364 instr = jmp_addr - eip + instr_buf; 365 } 366 } else { 367 switch (*instr) { 368 case 0xa1: 369 /* mov offset memory to eax */ 370 instr++; 371 memcpy(&mem, instr, 2); 372 instr += 2; 373 mem += *segment * SEG_MULT; 374 ZF_LOGD("mov %p, eax\n", (void *)mem); 375 uint32_t eax; 376 vm_ram_touch(vcpu->vm, mem, 377 4, vm_guest_ram_read_callback, &eax); 378 vm_set_thread_context_reg(vcpu, VCPU_CONTEXT_EAX, eax); 379 break; 380 case 0xc7: 381 instr++; 382 if (*instr == 0x06) { // modrm 383 int size; 384 instr++; 385 /* mov literal to memory */ 386 memcpy(&mem, instr, 2); 387 mem += *segment * SEG_MULT; 388 instr += 2; 389 if (m66) { 390 memcpy(&lit, instr, 4); 391 size = 4; 392 } else { 393 memcpy(&lit, instr, 2); 394 size = 2; 395 } 396 instr += size; 397 ZF_LOGD("mov $0x%x, %p\n", lit, (void *)mem); 398 vm_ram_touch(vcpu->vm, mem, 399 size, vm_guest_ram_write_callback, &lit); 400 } 401 break; 402 case 0xba: 403 //?????mov literal to dx 404 /* ignore */ 405 instr += 2; 406 break; 407 case 0x8c: 408 case 0x8e: 409 /* mov to/from sreg. ignore */ 410 instr += 2; 411 break; 412 default: 413 /* Assume this is a single byte instruction we can ignore */ 414 instr++; 415 } 416 } 417 418 ZF_LOGI("read %zu bytes\n", (size_t)(instr - instr_buf)); 419 } 420 421 return 0; 422} 423