/*
 * Copyright 2019, Data61, CSIRO (ABN 41 687 119 230)
 *
 * SPDX-License-Identifier: GPL-2.0-only
 */

/* x86 fetch/decode/emulate code
 *
 * Author: W.A.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <utils/util.h>

#include <sel4vm/guest_vm.h>
#include <sel4vm/guest_ram.h>
#include <sel4vm/arch/guest_x86_context.h>

#include "sel4vm/guest_memory.h"

#include "processor/platfeature.h"
#include "processor/decode.h"
#include "guest_state.h"

/* TODO: check whether these are already defined elsewhere */
#define IA32_PDE_SIZE(pde) ((pde) & BIT(7))
#define IA32_PDE_PRESENT(pde) ((pde) & BIT(0))
#define IA32_PTE_PRESENT(pte) ((pte) & BIT(0))
#define IA32_PTE_ADDR(pte) ((pte) & 0xFFFFF000)
#define IA32_PSE_ADDR(pde) ((pde) & 0xFFC00000)

#define IA32_OPCODE_S(op) ((op) & BIT(0))
#define IA32_OPCODE_D(op) ((op) & BIT(1))
#define IA32_OPCODE_BODY(op) ((op) & 0b11111100)
#define IA32_MODRM_REG(m) (((m) & 0b00111000) >> 3)

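/* In real mode a segment register contributes (segment << 4), i.e.
   segment * SEG_MULT, to the linear address */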
#define SEG_MULT (0x10)

enum decode_instr {
    DECODE_INSTR_MOV,
    DECODE_INSTR_MOVQ,
    DECODE_INSTR_INVALID
};

enum decode_prefix {
    ES_SEG_OVERRIDE = 0x26,
    CS_SEG_OVERRIDE = 0x2e,
    SS_SEG_OVERRIDE = 0x36,
    DS_SEG_OVERRIDE = 0x3e,
    FS_SEG_OVERRIDE = 0x64,
    GS_SEG_OVERRIDE = 0x65,
    OP_SIZE_OVERRIDE = 0x66,
    ADDR_SIZE_OVERRIDE = 0x67
};

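/* A decoded operand: either a general purpose register index (reg >= 0),
   or an immediate value in val (reg == -1), plus its width in bytes */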
struct x86_op {
    int reg;
    uint32_t val;
    size_t len;
};

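/* Decoder state: a cursor (curr_byte) over the fetched instruction
   bytes, plus the operand decoded so far */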
struct decode_op {
    int curr_byte;
    uint8_t *instr;
    size_t instr_len;
    struct x86_op op;
};

struct decode_table {
    enum decode_instr instr;
    void (*decode_fn)(struct decode_op *);
};

static void debug_print_instruction(uint8_t *instr, int instr_len);

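/* For register-form movs the ModR/M byte follows the opcode; bits 5:3
   (the "reg" field) name the general purpose register involved */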
static void decode_modrm_reg_op(struct decode_op *decode_op)
{
    /* Mov with register */
    uint8_t modrm = decode_op->instr[decode_op->curr_byte];
    decode_op->curr_byte++;
    decode_op->op.reg = IA32_MODRM_REG(modrm);
}

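/* The immediate occupies the final op.len bytes of the instruction and
   is little-endian, so accumulate it from the last byte backwards */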
static void decode_imm_op(struct decode_op *decode_op)
{
    /* Mov with immediate */
    decode_op->op.reg = -1;
    uint32_t immediate = 0;
    for (size_t j = 0; j < decode_op->op.len; j++) {
        immediate <<= 8;
        immediate |= decode_op->instr[decode_op->instr_len - j - 1];
    }
    decode_op->op.val = immediate;
}

static void decode_invalid_op(struct decode_op *decode_op)
{
    ZF_LOGE("can't emulate instruction!");
    debug_print_instruction(decode_op->instr, decode_op->instr_len);
    assert(0);
}

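/* Opcode dispatch tables: every opcode defaults to decode_invalid_op and
   the handful of mov forms we emulate are overridden below */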
static const struct decode_table decode_table_1op[] = {
    [0 ... MAX_INSTR_OPCODES] = {DECODE_INSTR_INVALID, decode_invalid_op},
    [0x88] = {DECODE_INSTR_MOV, decode_modrm_reg_op},
    [0x89] = {DECODE_INSTR_MOV, decode_modrm_reg_op},
    [0x8a] = {DECODE_INSTR_MOV, decode_modrm_reg_op},
    [0x8b] = {DECODE_INSTR_MOV, decode_modrm_reg_op},
    [0x8c] = {DECODE_INSTR_MOV, decode_modrm_reg_op},
    [0xc6] = {DECODE_INSTR_MOV, decode_imm_op},
    [0xc7] = {DECODE_INSTR_MOV, decode_imm_op}
};

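/* Two-byte opcodes, i.e. those reached through the 0x0f escape byte */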
static const struct decode_table decode_table_2op[] = {
    [0 ... MAX_INSTR_OPCODES] = {DECODE_INSTR_INVALID, decode_invalid_op},
    [0x6f] = {DECODE_INSTR_MOVQ, decode_modrm_reg_op}
};

/* Get a word from a guest physical address */
static inline uint32_t guest_get_phys_word(vm_t *vm, uintptr_t addr)
{
    uint32_t val;

    vm_ram_touch(vm, addr, sizeof(uint32_t),
                 vm_guest_ram_read_callback, &val);

    return val;
}

/* Fetch a guest's instruction by walking its 32-bit, two-level page
   tables: cr3 points at the page directory, whose entries either map
   4M pages directly (PSE) or point at page tables mapping 4K pages */
int vm_fetch_instruction(vm_vcpu_t *vcpu, uint32_t eip, uintptr_t cr3,
                         int len, uint8_t *buf)
{
    /* Walk page tables to get physical address of instruction */
    uintptr_t instr_phys = 0;

    /* Ensure that PAE is not enabled */
    if (vm_guest_state_get_cr4(vcpu->vcpu_arch.guest_state, vcpu->vcpu.cptr) & X86_CR4_PAE) {
        ZF_LOGE("Do not support walking PAE paging structures");
        return -1;
    }

    /* TODO: implement page-boundary crossing properly; for now the whole
       instruction must sit within a single 4K page */
    assert((eip >> 12) == ((eip + len - 1) >> 12));

    uint32_t pdi = eip >> 22;
    uint32_t pti = (eip >> 12) & 0x3FF;

    uint32_t pde = guest_get_phys_word(vcpu->vm, cr3 + pdi * 4);

    /* The page the guest is executing from must be mapped */
    assert(IA32_PDE_PRESENT(pde));

    if (IA32_PDE_SIZE(pde)) {
        /* PSE is used, 4M pages */
        instr_phys = (uintptr_t)IA32_PSE_ADDR(pde) + (eip & 0x3FFFFF);
    } else {
        /* 4K pages */
        uint32_t pte = guest_get_phys_word(vcpu->vm,
                                           (uintptr_t)IA32_PTE_ADDR(pde) + pti * 4);

        assert(IA32_PTE_PRESENT(pte));

        instr_phys = (uintptr_t)IA32_PTE_ADDR(pte) + (eip & 0xFFF);
    }

    /* Fetch instruction */
    vm_ram_touch(vcpu->vm, instr_phys, len,
                 vm_guest_ram_read_callback, buf);

    return 0;
}
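
/* Worked example: for eip = 0x00402345 the walk uses pdi = 1 and pti = 2,
   so the PDE is read from cr3 + 4; with 4K pages the PTE is then read from
   IA32_PTE_ADDR(pde) + 8 and instr_phys = IA32_PTE_ADDR(pte) + 0x345 */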

/* Returns 1 if this byte is an x86 instruction prefix */
static int is_prefix(uint8_t byte)
{
    switch (byte) {
    case ES_SEG_OVERRIDE:
    case CS_SEG_OVERRIDE:
    case SS_SEG_OVERRIDE:
    case DS_SEG_OVERRIDE:
    case FS_SEG_OVERRIDE:
    case GS_SEG_OVERRIDE:
    case ADDR_SIZE_OVERRIDE:
    case OP_SIZE_OVERRIDE:
        return 1;
    }

    return 0;
}

static void debug_print_instruction(uint8_t *instr, int instr_len)
{
    printf("instruction dump: ");
    for (int j = 0; j < instr_len; j++) {
        printf("%02x ", instr[j]);
    }
    printf("\n");
}

/* Partial support to decode an instruction for a memory access.
   This is very crude and can break in many ways: it only understands the
   mov forms in the dispatch tables above and ignores addressing bytes */
int vm_decode_instruction(uint8_t *instr, int instr_len, int *reg, uint32_t *imm, int *op_len)
{
    struct decode_op dec_op;
    dec_op.instr = instr;
    dec_op.instr_len = instr_len;
    dec_op.op.len = 1;
    /* First, skip over any prefixes */
    int i;
    for (i = 0; i < instr_len; i++) {
        if (is_prefix(instr[i])) {
            if (instr[i] == OP_SIZE_OVERRIDE) {
                /* 16-bit operand-size modifier */
                dec_op.op.len = 2;
            }
        } else {
            /* We've hit the opcode */
            break;
        }
    }

    dec_op.curr_byte = i;
    assert(dec_op.curr_byte < instr_len); /* We still need an opcode */

    uint8_t opcode = instr[dec_op.curr_byte];
    dec_op.curr_byte++;
    int two_byte = (opcode == OP_ESCAPE);
    if (two_byte) {
        opcode = instr[dec_op.curr_byte];
        dec_op.curr_byte++;
    }

    /* Fix the operand size before decoding so that immediates are read at
       full width: the opcode's s bit selects a 4-byte operand unless a
       0x66 prefix already forced 16 bits */
    if (dec_op.op.len != 2 && IA32_OPCODE_S(opcode)) {
        dec_op.op.len = 4;
    }

    if (two_byte) {
        decode_table_2op[opcode].decode_fn(&dec_op);
    } else {
        decode_table_1op[opcode].decode_fn(&dec_op);
    }

    *reg = dec_op.op.reg;
    *imm = dec_op.op.val;
    *op_len = dec_op.op.len;
    return 0;
}
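
/* For example, 66 c7 05 00 80 0b 00 34 12 (mov $0x1234 to the 16-bit word
   at 0xb8000) decodes to reg == -1, imm == 0x1234 and op_len == 2: the
   0x66 prefix selects a 2-byte operand and the immediate is taken from
   the final two bytes of the instruction */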

void vm_decode_ept_violation(vm_vcpu_t *vcpu, int *reg, uint32_t *imm, int *size)
{
    /* Fetch the faulting instruction and decode the access it makes.
       15 bytes is the architectural maximum x86 instruction length */
    uint8_t ibuf[15];
    int instr_len = vm_guest_exit_get_int_len(vcpu->vcpu_arch.guest_state);
    vm_fetch_instruction(vcpu,
                         vm_guest_state_get_eip(vcpu->vcpu_arch.guest_state),
                         vm_guest_state_get_cr3(vcpu->vcpu_arch.guest_state, vcpu->vcpu.cptr),
                         instr_len, ibuf);

    vm_decode_instruction(ibuf, instr_len, reg, imm, size);
}

/*
   Useful information: the GDT loaded by the Linux SMP trampoline looks like:
0x00: 00 00 00 00 00 00 00 00
0x08: 00 00 00 00 00 00 00 00
0x10: ff ff 00 00 00 9b cf 00 <- Executable 0x00000000-0xffffffff
0x18: ff ff 00 00 00 93 cf 00 <- RW data    0x00000000-0xffffffff
*/
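/* Decoding the executable descriptor above: limit bytes ff ff plus the
   low nibble of the cf flags byte give limit 0xfffff with 4K granularity
   (the full 4GB), the base bytes are all zero, and access byte 9b marks
   a present, ring-0, execute/read code segment */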

/* Interpret just enough virtual 8086 instructions to run trampoline code.
   Returns the final jump address */
uintptr_t vm_emulate_realmode(vm_vcpu_t *vcpu, uint8_t *instr_buf,
                              uint16_t *segment, uintptr_t eip, uint32_t len, guest_state_t *gs)
{
    /* We only track one segment, and assume that code and data are in the same
       segment, which is valid for most trampoline and bootloader code */
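    /* Handled below: lidtl/lgdtl, absolute far jumps, and the few mov
       forms the Linux SMP trampoline uses; anything unrecognised is
       skipped as if it were a single-byte instruction */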
    uint8_t *instr = instr_buf;
    assert(segment);

    while (instr - instr_buf < len) {
        uintptr_t mem = 0;
        uint32_t lit = 0;
        int m66 = 0;

        uint32_t base = 0;
        uint32_t limit = 0;

        if (*instr == OP_SIZE_OVERRIDE) {
            m66 = 1;
            instr++;
        }

        if (*instr == OP_ESCAPE) {
            instr++;
            if (*instr == 0x01) {
                instr++;
                if (*instr == 0x1e) {
                    /* lidtl */
                    instr++;
                    memcpy(&mem, instr, 2);
                    mem += *segment * SEG_MULT;
                    instr += 2;

                    /* Limit is first 2 bytes, base is next 4 bytes */
                    vm_ram_touch(vcpu->vm, mem,
                                 2, vm_guest_ram_read_callback, &limit);
                    vm_ram_touch(vcpu->vm, mem + 2,
                                 4, vm_guest_ram_read_callback, &base);
                    ZF_LOGD("lidtl %p", (void *)mem);

                    vm_guest_state_set_idt_base(gs, base);
                    vm_guest_state_set_idt_limit(gs, limit);
                } else if (*instr == 0x16) {
                    /* lgdtl */
                    instr++;
                    memcpy(&mem, instr, 2);
                    mem += *segment * SEG_MULT;
                    instr += 2;

                    /* Limit is first 2 bytes, base is next 4 bytes */
                    vm_ram_touch(vcpu->vm, mem,
                                 2, vm_guest_ram_read_callback, &limit);
                    vm_ram_touch(vcpu->vm, mem + 2,
                                 4, vm_guest_ram_read_callback, &base);
                    ZF_LOGD("lgdtl %p; base = %x, limit = %x", (void *)mem,
                            base, limit);

                    vm_guest_state_set_gdt_base(gs, base);
                    vm_guest_state_set_gdt_limit(gs, limit);
                } else {
                    /* Some other 0x0f 0x01 instruction; ignore it */
                    instr++;
                }
            } else {
                /* Some other two-byte instruction; ignore it */
                instr++;
            }
        } else if (*instr == 0xea) {
            /* Absolute far jmp */
            instr++;
            uint32_t base = 0;
            uintptr_t jmp_addr = 0;
            if (m66) {
                /* base is 4 bytes.
                   Make the wild assumption that we are now in protected mode
                   and the relevant GDT entry just covers all memory. Therefore
                   the base address is our absolute address. This just happens
                   to work with Linux and probably other modern systems that
                   don't use the GDT much. */
                memcpy(&base, instr, 4);
                instr += 4;
                jmp_addr = base;
                memcpy(segment, instr, 2);
            } else {
                memcpy(&base, instr, 2);
                instr += 2;
                memcpy(segment, instr, 2);
                jmp_addr = *segment * SEG_MULT + base;
            }
            instr += 2;
            ZF_LOGD("absolute jmpf $%p, cs now %04x", (void *)jmp_addr, *segment);
            if (((int64_t)jmp_addr - (int64_t)(len + eip)) >= 0) {
                /* The target lies past the code we were asked to emulate:
                   we are done */
                vm_guest_state_set_cs_selector(gs, *segment);
                return jmp_addr;
            } else {
                /* Jump within the emulated code: continue from the target */
                instr = jmp_addr - eip + instr_buf;
            }
        } else {
            switch (*instr) {
            case 0xa1:
                /* mov offset memory to eax */
                instr++;
                memcpy(&mem, instr, 2);
                instr += 2;
                mem += *segment * SEG_MULT;
                ZF_LOGD("mov %p, eax", (void *)mem);
                uint32_t eax;
                vm_ram_touch(vcpu->vm, mem,
                             4, vm_guest_ram_read_callback, &eax);
                vm_set_thread_context_reg(vcpu, VCPU_CONTEXT_EAX, eax);
                break;
            case 0xc7:
                instr++;
                if (*instr == 0x06) { /* ModR/M byte: 16-bit direct address */
                    int size;
                    instr++;
                    /* mov literal to memory */
                    memcpy(&mem, instr, 2);
                    mem += *segment * SEG_MULT;
                    instr += 2;
                    if (m66) {
                        memcpy(&lit, instr, 4);
                        size = 4;
                    } else {
                        memcpy(&lit, instr, 2);
                        size = 2;
                    }
                    instr += size;
                    ZF_LOGD("mov $0x%x, %p", lit, (void *)mem);
                    vm_ram_touch(vcpu->vm, mem,
                                 size, vm_guest_ram_write_callback, &lit);
                }
                break;
            case 0xba:
                /* mov literal to dx; not emulated, but skip the opcode and
                   its immediate so decoding stays aligned */
                instr += m66 ? 5 : 3;
                break;
            case 0x8c:
            case 0x8e:
                /* mov to/from sreg: opcode plus ModR/M byte; ignore */
                instr += 2;
                break;
            default:
                /* Assume this is a single-byte instruction we can ignore */
                instr++;
            }
        }

        ZF_LOGI("read %zu bytes", (size_t)(instr - instr_buf));
    }

    return 0;
}