1/*-
2 * Copyright (c) 2012 Sandvine, Inc.
3 * Copyright (c) 2012 NetApp, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: stable/11/sys/amd64/vmm/vmm_instruction_emul.c 349809 2019-07-07 17:31:13Z markj $
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/11/sys/amd64/vmm/vmm_instruction_emul.c 349809 2019-07-07 17:31:13Z markj $");
32
33#ifdef _KERNEL
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/proc.h>
38
39#include <vm/vm.h>
40#include <vm/pmap.h>
41
42#include <machine/vmparam.h>
43#include <machine/vmm.h>
44#else	/* !_KERNEL */
45#include <sys/types.h>
46#include <sys/errno.h>
47#include <sys/_iovec.h>
48
49#include <machine/vmm.h>
50
51#include <assert.h>
52#include <vmmapi.h>
53#define	KASSERT(exp,msg)	assert((exp))
54#endif	/* _KERNEL */
55
56#include <machine/vmm_instruction_emul.h>
57#include <x86/psl.h>
58#include <x86/specialreg.h>
59
60/* struct vie_op.op_type */
61enum {
62	VIE_OP_TYPE_NONE = 0,
63	VIE_OP_TYPE_MOV,
64	VIE_OP_TYPE_MOVSX,
65	VIE_OP_TYPE_MOVZX,
66	VIE_OP_TYPE_AND,
67	VIE_OP_TYPE_OR,
68	VIE_OP_TYPE_SUB,
69	VIE_OP_TYPE_TWO_BYTE,
70	VIE_OP_TYPE_PUSH,
71	VIE_OP_TYPE_CMP,
72	VIE_OP_TYPE_POP,
73	VIE_OP_TYPE_MOVS,
74	VIE_OP_TYPE_GROUP1,
75	VIE_OP_TYPE_STOS,
76	VIE_OP_TYPE_BITTEST,
77	VIE_OP_TYPE_TWOB_GRP15,
78	VIE_OP_TYPE_ADD,
79	VIE_OP_TYPE_TEST,
80	VIE_OP_TYPE_LAST
81};
82
83/* struct vie_op.op_flags */
84#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
85#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
86#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
87#define	VIE_OP_F_NO_MODRM	(1 << 3)
88#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
89
90static const struct vie_op two_byte_opcodes[256] = {
91	[0xAE] = {
92		  .op_byte = 0xAE,
93		  .op_type = VIE_OP_TYPE_TWOB_GRP15,
94	},
95	[0xB6] = {
96		.op_byte = 0xB6,
97		.op_type = VIE_OP_TYPE_MOVZX,
98	},
99	[0xB7] = {
100		.op_byte = 0xB7,
101		.op_type = VIE_OP_TYPE_MOVZX,
102	},
103	[0xBA] = {
104		.op_byte = 0xBA,
105		.op_type = VIE_OP_TYPE_BITTEST,
106		.op_flags = VIE_OP_F_IMM8,
107	},
108	[0xBE] = {
109		.op_byte = 0xBE,
110		.op_type = VIE_OP_TYPE_MOVSX,
111	},
112};
113
114static const struct vie_op one_byte_opcodes[256] = {
115	[0x03] = {
116		.op_byte = 0x03,
117		.op_type = VIE_OP_TYPE_ADD,
118	},
119	[0x0F] = {
120		.op_byte = 0x0F,
121		.op_type = VIE_OP_TYPE_TWO_BYTE
122	},
123	[0x0B] = {
124		.op_byte = 0x0B,
125		.op_type = VIE_OP_TYPE_OR,
126	},
127	[0x2B] = {
128		.op_byte = 0x2B,
129		.op_type = VIE_OP_TYPE_SUB,
130	},
131	[0x39] = {
132		.op_byte = 0x39,
133		.op_type = VIE_OP_TYPE_CMP,
134	},
135	[0x3B] = {
136		.op_byte = 0x3B,
137		.op_type = VIE_OP_TYPE_CMP,
138	},
139	[0x88] = {
140		.op_byte = 0x88,
141		.op_type = VIE_OP_TYPE_MOV,
142	},
143	[0x89] = {
144		.op_byte = 0x89,
145		.op_type = VIE_OP_TYPE_MOV,
146	},
147	[0x8A] = {
148		.op_byte = 0x8A,
149		.op_type = VIE_OP_TYPE_MOV,
150	},
151	[0x8B] = {
152		.op_byte = 0x8B,
153		.op_type = VIE_OP_TYPE_MOV,
154	},
155	[0xA1] = {
156		.op_byte = 0xA1,
157		.op_type = VIE_OP_TYPE_MOV,
158		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
159	},
160	[0xA3] = {
161		.op_byte = 0xA3,
162		.op_type = VIE_OP_TYPE_MOV,
163		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
164	},
165	[0xA4] = {
166		.op_byte = 0xA4,
167		.op_type = VIE_OP_TYPE_MOVS,
168		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
169	},
170	[0xA5] = {
171		.op_byte = 0xA5,
172		.op_type = VIE_OP_TYPE_MOVS,
173		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
174	},
175	[0xAA] = {
176		.op_byte = 0xAA,
177		.op_type = VIE_OP_TYPE_STOS,
178		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
179	},
180	[0xAB] = {
181		.op_byte = 0xAB,
182		.op_type = VIE_OP_TYPE_STOS,
183		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
184	},
185	[0xC6] = {
186		/* XXX Group 11 extended opcode - not just MOV */
187		.op_byte = 0xC6,
188		.op_type = VIE_OP_TYPE_MOV,
189		.op_flags = VIE_OP_F_IMM8,
190	},
191	[0xC7] = {
192		.op_byte = 0xC7,
193		.op_type = VIE_OP_TYPE_MOV,
194		.op_flags = VIE_OP_F_IMM,
195	},
196	[0x23] = {
197		.op_byte = 0x23,
198		.op_type = VIE_OP_TYPE_AND,
199	},
200	[0x80] = {
201		/* Group 1 extended opcode */
202		.op_byte = 0x80,
203		.op_type = VIE_OP_TYPE_GROUP1,
204		.op_flags = VIE_OP_F_IMM8,
205	},
206	[0x81] = {
207		/* Group 1 extended opcode */
208		.op_byte = 0x81,
209		.op_type = VIE_OP_TYPE_GROUP1,
210		.op_flags = VIE_OP_F_IMM,
211	},
212	[0x83] = {
213		/* Group 1 extended opcode */
214		.op_byte = 0x83,
215		.op_type = VIE_OP_TYPE_GROUP1,
216		.op_flags = VIE_OP_F_IMM8,
217	},
218	[0x8F] = {
219		/* XXX Group 1A extended opcode - not just POP */
220		.op_byte = 0x8F,
221		.op_type = VIE_OP_TYPE_POP,
222	},
223	[0xF7] = {
224		/* XXX Group 3 extended opcode - not just TEST */
225		.op_byte = 0xF7,
226		.op_type = VIE_OP_TYPE_TEST,
227		.op_flags = VIE_OP_F_IMM,
228	},
229	[0xFF] = {
230		/* XXX Group 5 extended opcode - not just PUSH */
231		.op_byte = 0xFF,
232		.op_type = VIE_OP_TYPE_PUSH,
233	}
234};
235
236/* struct vie.mod */
237#define	VIE_MOD_INDIRECT		0
238#define	VIE_MOD_INDIRECT_DISP8		1
239#define	VIE_MOD_INDIRECT_DISP32		2
240#define	VIE_MOD_DIRECT			3
241
242/* struct vie.rm */
243#define	VIE_RM_SIB			4
244#define	VIE_RM_DISP32			5
245
246#define	GB				(1024 * 1024 * 1024)
247
248static enum vm_reg_name gpr_map[16] = {
249	VM_REG_GUEST_RAX,
250	VM_REG_GUEST_RCX,
251	VM_REG_GUEST_RDX,
252	VM_REG_GUEST_RBX,
253	VM_REG_GUEST_RSP,
254	VM_REG_GUEST_RBP,
255	VM_REG_GUEST_RSI,
256	VM_REG_GUEST_RDI,
257	VM_REG_GUEST_R8,
258	VM_REG_GUEST_R9,
259	VM_REG_GUEST_R10,
260	VM_REG_GUEST_R11,
261	VM_REG_GUEST_R12,
262	VM_REG_GUEST_R13,
263	VM_REG_GUEST_R14,
264	VM_REG_GUEST_R15
265};
266
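/* Mask covering an operand of 'size' bytes, e.g. size2mask[2] == 0xffff. */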
267static uint64_t size2mask[] = {
268	[1] = 0xff,
269	[2] = 0xffff,
270	[4] = 0xffffffff,
271	[8] = 0xffffffffffffffff,
272};
273
274static int
275vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
276{
277	int error;
278
279	error = vm_get_register(vm, vcpuid, reg, rval);
280
281	return (error);
282}
283
284static void
285vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
286{
287	*lhbr = 0;
288	*reg = gpr_map[vie->reg];
289
290	/*
291	 * 64-bit mode imposes limitations on accessing legacy high byte
292	 * registers (lhbr).
293	 *
294	 * The legacy high-byte registers cannot be addressed if the REX
295	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
296	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
297	 *
298	 * If the REX prefix is not present then the values 4, 5, 6 and 7
299	 * of the 'ModRM:reg' field address the legacy high-byte registers,
300	 * %ah, %ch, %dh and %bh respectively.
301	 */
302	if (!vie->rex_present) {
303		if (vie->reg & 0x4) {
304			*lhbr = 1;
305			*reg = gpr_map[vie->reg & 0x3];
306		}
307	}
308}
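
/*
 * Worked example (informational only): without a REX prefix, ModRM:reg
 * values 0-3 select %al, %cl, %dl and %bl (lhbr = 0), while values 4-7
 * select %ah, %ch, %dh and %bh (lhbr = 1, base register from 'reg & 0x3').
 * With any REX prefix present, values 4-7 select %spl, %bpl, %sil and
 * %dil instead.
 */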
309
310static int
311vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
312{
313	uint64_t val;
314	int error, lhbr;
315	enum vm_reg_name reg;
316
317	vie_calc_bytereg(vie, &reg, &lhbr);
318	error = vm_get_register(vm, vcpuid, reg, &val);
319
320	/*
321	 * To obtain the value of a legacy high byte register shift the
322	 * base register right by 8 bits (%ah = %rax >> 8).
323	 */
324	if (lhbr)
325		*rval = val >> 8;
326	else
327		*rval = val;
328	return (error);
329}
330
331static int
332vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
333{
334	uint64_t origval, val, mask;
335	int error, lhbr;
336	enum vm_reg_name reg;
337
338	vie_calc_bytereg(vie, &reg, &lhbr);
339	error = vm_get_register(vm, vcpuid, reg, &origval);
340	if (error == 0) {
341		val = byte;
342		mask = 0xff;
343		if (lhbr) {
344			/*
345			 * Shift left by 8 to store 'byte' in a legacy high
346			 * byte register.
347			 */
348			val <<= 8;
349			mask <<= 8;
350		}
351		val |= origval & ~mask;
352		error = vm_set_register(vm, vcpuid, reg, val);
353	}
354	return (error);
355}
356
357int
358vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
359		    uint64_t val, int size)
360{
361	int error;
362	uint64_t origval;
363
364	switch (size) {
365	case 1:
366	case 2:
367		error = vie_read_register(vm, vcpuid, reg, &origval);
368		if (error)
369			return (error);
370		val &= size2mask[size];
371		val |= origval & ~size2mask[size];
372		break;
373	case 4:
374		val &= 0xffffffffUL;
375		break;
376	case 8:
377		break;
378	default:
379		return (EINVAL);
380	}
381
382	error = vm_set_register(vm, vcpuid, reg, val);
383	return (error);
384}
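
/*
 * A minimal usage sketch (illustrative only, compiled out): sub-word writes
 * merge with the register's previous value, while a 4-byte write clears the
 * upper 32 bits, mirroring hardware behavior.  The register contents shown
 * in the comments are assumptions for the example.
 */
#if 0
	uint64_t v;

	vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 0xffffffffffffffffUL);
	vie_update_register(vm, vcpuid, VM_REG_GUEST_RAX, 0x1234, 2);
	vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &v);
	/* v == 0xffffffffffff1234 */
	vie_update_register(vm, vcpuid, VM_REG_GUEST_RAX, 0x1234, 4);
	vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &v);
	/* v == 0x0000000000001234 */
#endif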
385
386#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
387
388/*
389 * Return the status flags that would result from doing (x - y).
390 */
391#define	GETCC(sz)							\
392static u_long								\
393getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
394{									\
395	u_long rflags;							\
396									\
397	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
398	    "=r" (rflags), "+r" (x) : "m" (y));				\
399	return (rflags);						\
400} struct __hack
401
402GETCC(8);
403GETCC(16);
404GETCC(32);
405GETCC(64);
406
407static u_long
408getcc(int opsize, uint64_t x, uint64_t y)
409{
410	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
411	    ("getcc: invalid operand size %d", opsize));
412
413	if (opsize == 1)
414		return (getcc8(x, y));
415	else if (opsize == 2)
416		return (getcc16(x, y));
417	else if (opsize == 4)
418		return (getcc32(x, y));
419	else
420		return (getcc64(x, y));
421}
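
/*
 * Example (informational): getcc(4, 1, 2) returns the flags of the 32-bit
 * subtraction 1 - 2, so PSL_C (borrow) and PSL_N (negative result) are set
 * while PSL_Z is clear; emulate_cmp() and emulate_sub() then fold these
 * status bits back into the guest's %rflags.
 */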
422
423/*
424 * Return the status flags that would result from doing (x + y).
425 */
426#define	GETADDFLAGS(sz)							\
427static u_long								\
428getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
429{									\
430	u_long rflags;							\
431									\
432	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
433	    "=r" (rflags), "+r" (x) : "m" (y));				\
434	return (rflags);						\
435} struct __hack
436
437GETADDFLAGS(8);
438GETADDFLAGS(16);
439GETADDFLAGS(32);
440GETADDFLAGS(64);
441
442static u_long
443getaddflags(int opsize, uint64_t x, uint64_t y)
444{
445	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
446	    ("getaddflags: invalid operand size %d", opsize));
447
448	if (opsize == 1)
449		return (getaddflags8(x, y));
450	else if (opsize == 2)
451		return (getaddflags16(x, y));
452	else if (opsize == 4)
453		return (getaddflags32(x, y));
454	else
455		return (getaddflags64(x, y));
456}
457
458/*
459 * Return the status flags that would result from doing (x & y).
460 */
461#define	GETANDFLAGS(sz)							\
462static u_long								\
463getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
464{									\
465	u_long rflags;							\
466									\
467	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
468	    "=r" (rflags), "+r" (x) : "m" (y));				\
469	return (rflags);						\
470} struct __hack
471
472GETANDFLAGS(8);
473GETANDFLAGS(16);
474GETANDFLAGS(32);
475GETANDFLAGS(64);
476
477static u_long
478getandflags(int opsize, uint64_t x, uint64_t y)
479{
480	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
481	    ("getandflags: invalid operand size %d", opsize));
482
483	if (opsize == 1)
484		return (getandflags8(x, y));
485	else if (opsize == 2)
486		return (getandflags16(x, y));
487	else if (opsize == 4)
488		return (getandflags32(x, y));
489	else
490		return (getandflags64(x, y));
491}
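
/*
 * Example (informational): getandflags(1, 0xf0, 0x0f) reflects the 8-bit
 * AND 0xf0 & 0x0f == 0, so PSL_Z is set and PSL_C/PSL_V are clear, which
 * is what emulate_test() reports back to the guest.
 */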
492
493static int
494emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
495	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
496{
497	int error, size;
498	enum vm_reg_name reg;
499	uint8_t byte;
500	uint64_t val;
501
502	size = vie->opsize;
503	error = EINVAL;
504
505	switch (vie->op.op_byte) {
506	case 0x88:
507		/*
508		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
509		 * 88/r:	mov r/m8, r8
510		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
511		 */
512		size = 1;	/* override for byte operation */
513		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
514		if (error == 0)
515			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
516		break;
517	case 0x89:
518		/*
519		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
520		 * 89/r:	mov r/m16, r16
521		 * 89/r:	mov r/m32, r32
522		 * REX.W + 89/r	mov r/m64, r64
523		 */
524		reg = gpr_map[vie->reg];
525		error = vie_read_register(vm, vcpuid, reg, &val);
526		if (error == 0) {
527			val &= size2mask[size];
528			error = memwrite(vm, vcpuid, gpa, val, size, arg);
529		}
530		break;
531	case 0x8A:
532		/*
533		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
534		 * 8A/r:	mov r8, r/m8
535		 * REX + 8A/r:	mov r8, r/m8
536		 */
537		size = 1;	/* override for byte operation */
538		error = memread(vm, vcpuid, gpa, &val, size, arg);
539		if (error == 0)
540			error = vie_write_bytereg(vm, vcpuid, vie, val);
541		break;
542	case 0x8B:
543		/*
544		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
545		 * 8B/r:	mov r16, r/m16
546		 * 8B/r:	mov r32, r/m32
547		 * REX.W 8B/r:	mov r64, r/m64
548		 */
549		error = memread(vm, vcpuid, gpa, &val, size, arg);
550		if (error == 0) {
551			reg = gpr_map[vie->reg];
552			error = vie_update_register(vm, vcpuid, reg, val, size);
553		}
554		break;
555	case 0xA1:
556		/*
557		 * MOV from seg:moffset to AX/EAX/RAX
558		 * A1:		mov AX, moffs16
559		 * A1:		mov EAX, moffs32
560		 * REX.W + A1:	mov RAX, moffs64
561		 */
562		error = memread(vm, vcpuid, gpa, &val, size, arg);
563		if (error == 0) {
564			reg = VM_REG_GUEST_RAX;
565			error = vie_update_register(vm, vcpuid, reg, val, size);
566		}
567		break;
568	case 0xA3:
569		/*
570		 * MOV from AX/EAX/RAX to seg:moffset
571		 * A3:		mov moffs16, AX
572		 * A3:		mov moffs32, EAX
573		 * REX.W + A3:	mov moffs64, RAX
574		 */
575		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
576		if (error == 0) {
577			val &= size2mask[size];
578			error = memwrite(vm, vcpuid, gpa, val, size, arg);
579		}
580		break;
581	case 0xC6:
582		/*
583		 * MOV from imm8 to mem (ModRM:r/m)
584		 * C6/0		mov r/m8, imm8
585		 * REX + C6/0	mov r/m8, imm8
586		 */
587		size = 1;	/* override for byte operation */
588		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
589		break;
590	case 0xC7:
591		/*
592		 * MOV from imm16/imm32 to mem (ModRM:r/m)
593		 * C7/0		mov r/m16, imm16
594		 * C7/0		mov r/m32, imm32
595		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64 bits)
596		 */
597		val = vie->immediate & size2mask[size];
598		error = memwrite(vm, vcpuid, gpa, val, size, arg);
599		break;
600	default:
601		break;
602	}
603
604	return (error);
605}
606
607static int
608emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
609	     mem_region_read_t memread, mem_region_write_t memwrite,
610	     void *arg)
611{
612	int error, size;
613	enum vm_reg_name reg;
614	uint64_t val;
615
616	size = vie->opsize;
617	error = EINVAL;
618
619	switch (vie->op.op_byte) {
620	case 0xB6:
621		/*
622		 * MOV and zero extend byte from mem (ModRM:r/m) to
623		 * reg (ModRM:reg).
624		 *
625		 * 0F B6/r		movzx r16, r/m8
626		 * 0F B6/r		movzx r32, r/m8
627		 * REX.W + 0F B6/r	movzx r64, r/m8
628		 */
629
630		/* get the first operand */
631		error = memread(vm, vcpuid, gpa, &val, 1, arg);
632		if (error)
633			break;
634
635		/* get the second operand */
636		reg = gpr_map[vie->reg];
637
638		/* zero-extend byte */
639		val = (uint8_t)val;
640
641		/* write the result */
642		error = vie_update_register(vm, vcpuid, reg, val, size);
643		break;
644	case 0xB7:
645		/*
646		 * MOV and zero extend word from mem (ModRM:r/m) to
647		 * reg (ModRM:reg).
648		 *
649		 * 0F B7/r		movzx r32, r/m16
650		 * REX.W + 0F B7/r	movzx r64, r/m16
651		 */
652		error = memread(vm, vcpuid, gpa, &val, 2, arg);
653		if (error)
654			return (error);
655
656		reg = gpr_map[vie->reg];
657
658		/* zero-extend word */
659		val = (uint16_t)val;
660
661		error = vie_update_register(vm, vcpuid, reg, val, size);
662		break;
663	case 0xBE:
664		/*
665		 * MOV and sign extend byte from mem (ModRM:r/m) to
666		 * reg (ModRM:reg).
667		 *
668		 * 0F BE/r		movsx r16, r/m8
669		 * 0F BE/r		movsx r32, r/m8
670		 * REX.W + 0F BE/r	movsx r64, r/m8
671		 */
672
673		/* get the first operand */
674		error = memread(vm, vcpuid, gpa, &val, 1, arg);
675		if (error)
676			break;
677
678		/* get the second operand */
679		reg = gpr_map[vie->reg];
680
681		/* sign extend byte */
682		val = (int8_t)val;
683
684		/* write the result */
685		error = vie_update_register(vm, vcpuid, reg, val, size);
686		break;
687	default:
688		break;
689	}
690	return (error);
691}
692
693/*
694 * Helper function to calculate and validate a linear address.
695 */
696static int
697get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
698    int opsize, int addrsize, int prot, enum vm_reg_name seg,
699    enum vm_reg_name gpr, uint64_t *gla, int *fault)
700{
701	struct seg_desc desc;
702	uint64_t cr0, val, rflags;
703	int error;
704
705	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
706	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
707
708	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
709	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
710
711	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
712	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
713	    __func__, error, seg));
714
715	error = vie_read_register(vm, vcpuid, gpr, &val);
716	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
717	    error, gpr));
718
719	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
720	    addrsize, prot, gla)) {
721		if (seg == VM_REG_GUEST_SS)
722			vm_inject_ss(vm, vcpuid, 0);
723		else
724			vm_inject_gp(vm, vcpuid);
725		goto guest_fault;
726	}
727
728	if (vie_canonical_check(paging->cpu_mode, *gla)) {
729		if (seg == VM_REG_GUEST_SS)
730			vm_inject_ss(vm, vcpuid, 0);
731		else
732			vm_inject_gp(vm, vcpuid);
733		goto guest_fault;
734	}
735
736	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
737		vm_inject_ac(vm, vcpuid, 0);
738		goto guest_fault;
739	}
740
741	*fault = 0;
742	return (0);
743
744guest_fault:
745	*fault = 1;
746	return (0);
747}
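
/*
 * Note (informational): a zero return with '*fault' set means the exception
 * has already been injected and the caller should simply resume the guest;
 * a non-zero return would indicate an emulation failure instead.
 */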
748
749static int
750emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
751    struct vm_guest_paging *paging, mem_region_read_t memread,
752    mem_region_write_t memwrite, void *arg)
753{
754#ifdef _KERNEL
755	struct vm_copyinfo copyinfo[2];
756#else
757	struct iovec copyinfo[2];
758#endif
759	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
760	uint64_t rcx, rdi, rsi, rflags;
761	int error, fault, opsize, seg, repeat;
762
763	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
764	val = 0;
765	error = 0;
766
767	/*
768	 * XXX although the MOVS instruction is only supposed to be used with
769	 * the "rep" prefix, some guests like FreeBSD use "repnz" instead.
770	 *
771	 * Empirically the "repnz" prefix has identical behavior to "rep"
772	 * and the zero flag does not make a difference.
773	 */
774	repeat = vie->repz_present | vie->repnz_present;
775
776	if (repeat) {
777		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
778		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
779
780		/*
781		 * The count register is %rcx, %ecx or %cx depending on the
782		 * address size of the instruction.
783		 */
784		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
785			error = 0;
786			goto done;
787		}
788	}
789
790	/*
791	 *	Source		Destination	Comments
792	 *	--------------------------------------------
793	 * (1)  memory		memory		n/a
794	 * (2)  memory		mmio		emulated
795	 * (3)  mmio		memory		emulated
796	 * (4)  mmio		mmio		emulated
797	 *
798	 * At this point we don't have sufficient information to distinguish
799	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
800	 * out because it will succeed only when operating on regular memory.
801	 *
802	 * XXX the emulation doesn't properly handle the case where 'gpa'
803	 * straddles the boundary between normal memory and MMIO.
804	 */
805
806	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
807	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
808	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
809	if (error || fault)
810		goto done;
811
812	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
813	    copyinfo, nitems(copyinfo), &fault);
814	if (error == 0) {
815		if (fault)
816			goto done;	/* Resume guest to handle fault */
817
818		/*
819		 * case (2): read from system memory and write to mmio.
820		 */
821		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
822		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
823		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
824		if (error)
825			goto done;
826	} else {
827		/*
828		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
829		 * if 'srcaddr' is in the mmio space.
830		 */
831
832		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
833		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
834		    &fault);
835		if (error || fault)
836			goto done;
837
838		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
839		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
840		if (error == 0) {
841			if (fault)
842				goto done;    /* Resume guest to handle fault */
843
844			/*
845			 * case (3): read from MMIO and write to system memory.
846			 *
847			 * An MMIO read can have side-effects so we
848			 * commit to it only after vm_copy_setup() is
849			 * successful. If a page-fault needs to be
850			 * injected into the guest then it will happen
851			 * before the MMIO read is attempted.
852			 */
853			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
854			if (error)
855				goto done;
856
857			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
858			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
859		} else {
860			/*
861			 * Case (4): read from and write to mmio.
862			 *
863			 * Commit to the MMIO read/write (with potential
864			 * side-effects) only after we are sure that the
865			 * instruction is not going to be restarted due
866			 * to address translation faults.
867			 */
868			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
869			    PROT_READ, &srcgpa, &fault);
870			if (error || fault)
871				goto done;
872
873			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
874			   PROT_WRITE, &dstgpa, &fault);
875			if (error || fault)
876				goto done;
877
878			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
879			if (error)
880				goto done;
881
882			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
883			if (error)
884				goto done;
885		}
886	}
887
888	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
889	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
890
891	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
892	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
893
894	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
895	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
896
897	if (rflags & PSL_D) {
898		rsi -= opsize;
899		rdi -= opsize;
900	} else {
901		rsi += opsize;
902		rdi += opsize;
903	}
904
905	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
906	    vie->addrsize);
907	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
908
909	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
910	    vie->addrsize);
911	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
912
913	if (repeat) {
914		rcx = rcx - 1;
915		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
916		    rcx, vie->addrsize);
917		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
918
919		/*
920		 * Repeat the instruction if the count register is not zero.
921		 */
922		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
923			vm_restart_instruction(vm, vcpuid);
924	}
925done:
926	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
927	    __func__, error));
928	return (error);
929}
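
/*
 * Example (informational): a guest "rep movsb" with %rcx == 3 that targets
 * an MMIO region is emulated one iteration at a time; each pass above moves
 * one byte, advances %rsi/%rdi (or decrements them when PSL_D is set),
 * decrements %rcx and restarts the instruction until the count reaches 0.
 */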
930
931static int
932emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
933    struct vm_guest_paging *paging, mem_region_read_t memread,
934    mem_region_write_t memwrite, void *arg)
935{
936	int error, opsize, repeat;
937	uint64_t val;
938	uint64_t rcx, rdi, rflags;
939
940	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
941	repeat = vie->repz_present | vie->repnz_present;
942
943	if (repeat) {
944		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
945		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
946
947		/*
948		 * The count register is %rcx, %ecx or %cx depending on the
949		 * address size of the instruction.
950		 */
951		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
952			return (0);
953	}
954
955	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
956	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
957
958	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
959	if (error)
960		return (error);
961
962	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
963	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
964
965	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
966	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
967
968	if (rflags & PSL_D)
969		rdi -= opsize;
970	else
971		rdi += opsize;
972
973	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
974	    vie->addrsize);
975	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
976
977	if (repeat) {
978		rcx = rcx - 1;
979		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
980		    rcx, vie->addrsize);
981		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
982
983		/*
984		 * Repeat the instruction if the count register is not zero.
985		 */
986		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
987			vm_restart_instruction(vm, vcpuid);
988	}
989
990	return (0);
991}
992
993static int
994emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
995	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
996{
997	int error, size;
998	enum vm_reg_name reg;
999	uint64_t result, rflags, rflags2, val1, val2;
1000
1001	size = vie->opsize;
1002	error = EINVAL;
1003
1004	switch (vie->op.op_byte) {
1005	case 0x23:
1006		/*
1007		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1008		 * result in reg.
1009		 *
1010		 * 23/r		and r16, r/m16
1011		 * 23/r		and r32, r/m32
1012		 * REX.W + 23/r	and r64, r/m64
1013		 */
1014
1015		/* get the first operand */
1016		reg = gpr_map[vie->reg];
1017		error = vie_read_register(vm, vcpuid, reg, &val1);
1018		if (error)
1019			break;
1020
1021		/* get the second operand */
1022		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1023		if (error)
1024			break;
1025
1026		/* perform the operation and write the result */
1027		result = val1 & val2;
1028		error = vie_update_register(vm, vcpuid, reg, result, size);
1029		break;
1030	case 0x81:
1031	case 0x83:
1032		/*
1033		 * AND mem (ModRM:r/m) with immediate and store the
1034		 * result in mem.
1035		 *
1036		 * 81 /4		and r/m16, imm16
1037		 * 81 /4		and r/m32, imm32
1038		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1039		 *
1040		 * 83 /4		and r/m16, imm8 sign-extended to 16
1041		 * 83 /4		and r/m32, imm8 sign-extended to 32
1042		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1043		 */
1044
1045		/* get the first operand */
1046		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1047		if (error)
1048			break;
1049
1050		/*
1051		 * perform the operation with the pre-fetched immediate
1052		 * operand and write the result
1053		 */
1054		result = val1 & vie->immediate;
1055		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1056		break;
1057	default:
1058		break;
1059	}
1060	if (error)
1061		return (error);
1062
1063	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1064	if (error)
1065		return (error);
1066
1067	/*
1068	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1069	 * to the result; AF is undefined.
1070	 *
1071	 * The updated status flags are obtained by subtracting 0 from 'result'.
1072	 */
1073	rflags2 = getcc(size, result, 0);
1074	rflags &= ~RFLAGS_STATUS_BITS;
1075	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1076
1077	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1078	return (error);
1079}
1080
1081static int
1082emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1083	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1084{
1085	int error, size;
1086	enum vm_reg_name reg;
1087	uint64_t result, rflags, rflags2, val1, val2;
1088
1089	size = vie->opsize;
1090	error = EINVAL;
1091
1092	switch (vie->op.op_byte) {
1093	case 0x0B:
1094		/*
1095		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1096		 * result in reg.
1097		 *
1098		 * 0B/r         or r16, r/m16
1099		 * 0B/r         or r32, r/m32
1100		 * REX.W + 0B/r or r64, r/m64
1101		 */
1102
1103		/* get the first operand */
1104		reg = gpr_map[vie->reg];
1105		error = vie_read_register(vm, vcpuid, reg, &val1);
1106		if (error)
1107			break;
1108
1109		/* get the second operand */
1110		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1111		if (error)
1112			break;
1113
1114		/* perform the operation and write the result */
1115		result = val1 | val2;
1116		error = vie_update_register(vm, vcpuid, reg, result, size);
1117		break;
1118	case 0x81:
1119	case 0x83:
1120		/*
1121		 * OR mem (ModRM:r/m) with immediate and store the
1122		 * result in mem.
1123		 *
1124		 * 81 /1		or r/m16, imm16
1125		 * 81 /1		or r/m32, imm32
1126		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1127		 *
1128		 * 83 /1		or r/m16, imm8 sign-extended to 16
1129		 * 83 /1		or r/m32, imm8 sign-extended to 32
1130		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1131		 */
1132
1133		/* get the first operand */
1134		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1135		if (error)
1136			break;
1137
1138		/*
1139		 * perform the operation with the pre-fetched immediate
1140		 * operand and write the result
1141		 */
1142		result = val1 | vie->immediate;
1143		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1144		break;
1145	default:
1146		break;
1147	}
1148	if (error)
1149		return (error);
1150
1151	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1152	if (error)
1153		return (error);
1154
1155	/*
1156	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1157	 * to the result; AF is undefined.
1158	 *
1159	 * The updated status flags are obtained by subtracting 0 from 'result'.
1160	 */
1161	rflags2 = getcc(size, result, 0);
1162	rflags &= ~RFLAGS_STATUS_BITS;
1163	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1164
1165	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1166	return (error);
1167}
1168
1169static int
1170emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1171	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1172{
1173	int error, size;
1174	uint64_t regop, memop, op1, op2, rflags, rflags2;
1175	enum vm_reg_name reg;
1176
1177	size = vie->opsize;
1178	switch (vie->op.op_byte) {
1179	case 0x39:
1180	case 0x3B:
1181		/*
1182		 * 39/r		CMP r/m16, r16
1183		 * 39/r		CMP r/m32, r32
1184		 * REX.W 39/r	CMP r/m64, r64
1185		 *
1186		 * 3B/r		CMP r16, r/m16
1187		 * 3B/r		CMP r32, r/m32
1188		 * REX.W + 3B/r	CMP r64, r/m64
1189		 *
1190		 * Compare the first operand with the second operand and
1191		 * set status flags in EFLAGS register. The comparison is
1192		 * performed by subtracting the second operand from the first
1193		 * operand and then setting the status flags.
1194		 */
1195
1196		/* Get the register operand */
1197		reg = gpr_map[vie->reg];
1198		error = vie_read_register(vm, vcpuid, reg, &regop);
1199		if (error)
1200			return (error);
1201
1202		/* Get the memory operand */
1203		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1204		if (error)
1205			return (error);
1206
1207		if (vie->op.op_byte == 0x3B) {
1208			op1 = regop;
1209			op2 = memop;
1210		} else {
1211			op1 = memop;
1212			op2 = regop;
1213		}
1214		rflags2 = getcc(size, op1, op2);
1215		break;
1216	case 0x80:
1217	case 0x81:
1218	case 0x83:
1219		/*
1220		 * 80 /7		cmp r/m8, imm8
1221		 * REX + 80 /7		cmp r/m8, imm8
1222		 *
1223		 * 81 /7		cmp r/m16, imm16
1224		 * 81 /7		cmp r/m32, imm32
1225		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1226		 *
1227		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1228		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1229		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1230		 *
1231		 * Compare mem (ModRM:r/m) with immediate and set
1232		 * status flags according to the results.  The
1233		 * comparison is performed by subtracting the
1234		 * immediate from the first operand and then setting
1235		 * the status flags.
1236		 *
1237		 */
1238		if (vie->op.op_byte == 0x80)
1239			size = 1;
1240
1241		/* get the first operand */
1242		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1243		if (error)
1244			return (error);
1245
1246		rflags2 = getcc(size, op1, vie->immediate);
1247		break;
1248	default:
1249		return (EINVAL);
1250	}
1251	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1252	if (error)
1253		return (error);
1254	rflags &= ~RFLAGS_STATUS_BITS;
1255	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1256
1257	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1258	return (error);
1259}
1260
1261static int
1262emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1263    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1264{
1265	int error, size;
1266	uint64_t op1, rflags, rflags2;
1267
1268	size = vie->opsize;
1269	error = EINVAL;
1270
1271	switch (vie->op.op_byte) {
1272	case 0xF7:
1273		/*
1274		 * F7 /0		test r/m16, imm16
1275		 * F7 /0		test r/m32, imm32
1276		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1277		 *
1278		 * Test mem (ModRM:r/m) with immediate and set status
1279		 * flags according to the results.  The comparison is
1280		 * performed by anding the immediate from the first
1281		 * performed by ANDing the immediate with the first
1282		 */
1283		if ((vie->reg & 7) != 0)
1284			return (EINVAL);
1285
1286		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1287		if (error)
1288			return (error);
1289
1290		rflags2 = getandflags(size, op1, vie->immediate);
1291		break;
1292	default:
1293		return (EINVAL);
1294	}
1295	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1296	if (error)
1297		return (error);
1298
1299	/*
1300	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1301	 * to the result; AF is undefined.
1302	 */
1303	rflags &= ~RFLAGS_STATUS_BITS;
1304	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1305
1306	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1307	return (error);
1308}
1309
1310static int
1311emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1312	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1313{
1314	int error, size;
1315	uint64_t nval, rflags, rflags2, val1, val2;
1316	enum vm_reg_name reg;
1317
1318	size = vie->opsize;
1319	error = EINVAL;
1320
1321	switch (vie->op.op_byte) {
1322	case 0x03:
1323		/*
1324		 * ADD r/m to r and store the result in r
1325		 *
1326		 * 03/r            ADD r16, r/m16
1327		 * 03/r            ADD r32, r/m32
1328		 * REX.W + 03/r    ADD r64, r/m64
1329		 */
1330
1331		/* get the first operand */
1332		reg = gpr_map[vie->reg];
1333		error = vie_read_register(vm, vcpuid, reg, &val1);
1334		if (error)
1335			break;
1336
1337		/* get the second operand */
1338		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1339		if (error)
1340			break;
1341
1342		/* perform the operation and write the result */
1343		nval = val1 + val2;
1344		error = vie_update_register(vm, vcpuid, reg, nval, size);
1345		break;
1346	default:
1347		break;
1348	}
1349
1350	if (!error) {
1351		rflags2 = getaddflags(size, val1, val2);
1352		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1353		    &rflags);
1354		if (error)
1355			return (error);
1356
1357		rflags &= ~RFLAGS_STATUS_BITS;
1358		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1359		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1360		    rflags, 8);
1361	}
1362
1363	return (error);
1364}
1365
1366static int
1367emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1368	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1369{
1370	int error, size;
1371	uint64_t nval, rflags, rflags2, val1, val2;
1372	enum vm_reg_name reg;
1373
1374	size = vie->opsize;
1375	error = EINVAL;
1376
1377	switch (vie->op.op_byte) {
1378	case 0x2B:
1379		/*
1380		 * SUB r/m from r and store the result in r
1381		 *
1382		 * 2B/r            SUB r16, r/m16
1383		 * 2B/r            SUB r32, r/m32
1384		 * REX.W + 2B/r    SUB r64, r/m64
1385		 */
1386
1387		/* get the first operand */
1388		reg = gpr_map[vie->reg];
1389		error = vie_read_register(vm, vcpuid, reg, &val1);
1390		if (error)
1391			break;
1392
1393		/* get the second operand */
1394		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1395		if (error)
1396			break;
1397
1398		/* perform the operation and write the result */
1399		nval = val1 - val2;
1400		error = vie_update_register(vm, vcpuid, reg, nval, size);
1401		break;
1402	default:
1403		break;
1404	}
1405
1406	if (!error) {
1407		rflags2 = getcc(size, val1, val2);
1408		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1409		    &rflags);
1410		if (error)
1411			return (error);
1412
1413		rflags &= ~RFLAGS_STATUS_BITS;
1414		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1415		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1416		    rflags, 8);
1417	}
1418
1419	return (error);
1420}
1421
1422static int
1423emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1424    struct vm_guest_paging *paging, mem_region_read_t memread,
1425    mem_region_write_t memwrite, void *arg)
1426{
1427#ifdef _KERNEL
1428	struct vm_copyinfo copyinfo[2];
1429#else
1430	struct iovec copyinfo[2];
1431#endif
1432	struct seg_desc ss_desc;
1433	uint64_t cr0, rflags, rsp, stack_gla, val;
1434	int error, fault, size, stackaddrsize, pushop;
1435
1436	val = 0;
1437	size = vie->opsize;
1438	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1439
1440	/*
1441	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1442	 */
1443	if (paging->cpu_mode == CPU_MODE_REAL) {
1444		stackaddrsize = 2;
1445	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1446		/*
1447		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1448		 * - Stack pointer size is always 64-bits.
1449		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1450		 * - 16-bit PUSH/POP is supported by using the operand size
1451		 *   override prefix (66H).
1452		 */
1453		stackaddrsize = 8;
1454		size = vie->opsize_override ? 2 : 8;
1455	} else {
1456		/*
1457		 * In protected or compatibility mode the 'B' flag in the
1458		 * stack-segment descriptor determines the size of the
1459		 * stack pointer.
1460		 */
1461		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1462		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1463		    __func__, error));
1464		if (SEG_DESC_DEF32(ss_desc.access))
1465			stackaddrsize = 4;
1466		else
1467			stackaddrsize = 2;
1468	}
1469
1470	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1471	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1472
1473	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1474	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1475
1476	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1477	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1478	if (pushop) {
1479		rsp -= size;
1480	}
1481
1482	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1483	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1484	    &stack_gla)) {
1485		vm_inject_ss(vm, vcpuid, 0);
1486		return (0);
1487	}
1488
1489	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1490		vm_inject_ss(vm, vcpuid, 0);
1491		return (0);
1492	}
1493
1494	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1495		vm_inject_ac(vm, vcpuid, 0);
1496		return (0);
1497	}
1498
1499	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1500	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1501	    &fault);
1502	if (error || fault)
1503		return (error);
1504
1505	if (pushop) {
1506		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1507		if (error == 0)
1508			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1509	} else {
1510		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1511		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1512		rsp += size;
1513	}
1514	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1515
1516	if (error == 0) {
1517		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1518		    stackaddrsize);
1519		KASSERT(error == 0, ("error %d updating rsp", error));
1520	}
1521	return (error);
1522}
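
/*
 * Example (informational): a PUSH in 64-bit mode with a 66H operand-size
 * override stores 2 bytes and moves %rsp by 2, while the same opcode
 * without the prefix stores 8 bytes; in real mode the stack pointer is
 * always treated as 16 bits wide.
 */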
1523
1524static int
1525emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1526    struct vm_guest_paging *paging, mem_region_read_t memread,
1527    mem_region_write_t memwrite, void *arg)
1528{
1529	int error;
1530
1531	/*
1532	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1533	 *
1534	 * PUSH is part of the group 5 extended opcodes and is identified
1535	 * by ModRM:reg = b110.
1536	 */
1537	if ((vie->reg & 7) != 6)
1538		return (EINVAL);
1539
1540	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1541	    memwrite, arg);
1542	return (error);
1543}
1544
1545static int
1546emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1547    struct vm_guest_paging *paging, mem_region_read_t memread,
1548    mem_region_write_t memwrite, void *arg)
1549{
1550	int error;
1551
1552	/*
1553	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1554	 *
1555	 * POP is part of the group 1A extended opcodes and is identified
1556	 * by ModRM:reg = b000.
1557	 */
1558	if ((vie->reg & 7) != 0)
1559		return (EINVAL);
1560
1561	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1562	    memwrite, arg);
1563	return (error);
1564}
1565
1566static int
1567emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1568    struct vm_guest_paging *paging, mem_region_read_t memread,
1569    mem_region_write_t memwrite, void *memarg)
1570{
1571	int error;
1572
1573	switch (vie->reg & 7) {
1574	case 0x1:	/* OR */
1575		error = emulate_or(vm, vcpuid, gpa, vie,
1576		    memread, memwrite, memarg);
1577		break;
1578	case 0x4:	/* AND */
1579		error = emulate_and(vm, vcpuid, gpa, vie,
1580		    memread, memwrite, memarg);
1581		break;
1582	case 0x7:	/* CMP */
1583		error = emulate_cmp(vm, vcpuid, gpa, vie,
1584		    memread, memwrite, memarg);
1585		break;
1586	default:
1587		error = EINVAL;
1588		break;
1589	}
1590
1591	return (error);
1592}
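
/*
 * Example (informational): for opcodes 80/81/83 a ModRM reg field of 1
 * selects OR, 4 selects AND and 7 selects CMP, per Table A-6 ("Opcode
 * Extensions") in the Intel SDM, Vol 2; the remaining group 1 operations
 * are not emulated here.
 */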
1593
1594static int
1595emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1596    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1597{
1598	uint64_t val, rflags;
1599	int error, bitmask, bitoff;
1600
1601	/*
1602	 * 0F BA is a Group 8 extended opcode.
1603	 *
1604	 * Currently we only emulate the 'Bit Test' instruction which is
1605	 * identified by a ModR/M:reg encoding of 100b.
1606	 */
1607	if ((vie->reg & 7) != 4)
1608		return (EINVAL);
1609
1610	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1611	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1612
1613	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1614	if (error)
1615		return (error);
1616
1617	/*
1618	 * Intel SDM, Vol 2, Table 3-2:
1619	 * "Range of Bit Positions Specified by Bit Offset Operands"
1620	 */
1621	bitmask = vie->opsize * 8 - 1;
1622	bitoff = vie->immediate & bitmask;
1623
1624	/* Copy the bit into the Carry flag in %rflags */
1625	if (val & (1UL << bitoff))
1626		rflags |= PSL_C;
1627	else
1628		rflags &= ~PSL_C;
1629
1630	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1631	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1632
1633	return (0);
1634}
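
/*
 * Example (informational): for a 4-byte "bt r/m32, imm8" the bit offset is
 * taken modulo 32, so an immediate of 35 tests bit 3 of the value read from
 * 'gpa' and copies it into PSL_C.
 */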
1635
1636static int
1637emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1638    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1639{
1640	int error;
1641	uint64_t buf;
1642
1643	switch (vie->reg & 7) {
1644	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1645		if (vie->mod == 0x3) {
1646			/*
1647			 * SFENCE.  Ignore it, VM exit provides enough
1648			 * barriers on its own.
1649			 */
1650			error = 0;
1651		} else {
1652			/*
1653			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1654			 * rights.
1655			 */
1656			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1657		}
1658		break;
1659	default:
1660		error = EINVAL;
1661		break;
1662	}
1663
1664	return (error);
1665}
1666
1667int
1668vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1669    struct vm_guest_paging *paging, mem_region_read_t memread,
1670    mem_region_write_t memwrite, void *memarg)
1671{
1672	int error;
1673
1674	if (!vie->decoded)
1675		return (EINVAL);
1676
1677	switch (vie->op.op_type) {
1678	case VIE_OP_TYPE_GROUP1:
1679		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1680		    memwrite, memarg);
1681		break;
1682	case VIE_OP_TYPE_POP:
1683		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1684		    memwrite, memarg);
1685		break;
1686	case VIE_OP_TYPE_PUSH:
1687		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1688		    memwrite, memarg);
1689		break;
1690	case VIE_OP_TYPE_CMP:
1691		error = emulate_cmp(vm, vcpuid, gpa, vie,
1692				    memread, memwrite, memarg);
1693		break;
1694	case VIE_OP_TYPE_MOV:
1695		error = emulate_mov(vm, vcpuid, gpa, vie,
1696				    memread, memwrite, memarg);
1697		break;
1698	case VIE_OP_TYPE_MOVSX:
1699	case VIE_OP_TYPE_MOVZX:
1700		error = emulate_movx(vm, vcpuid, gpa, vie,
1701				     memread, memwrite, memarg);
1702		break;
1703	case VIE_OP_TYPE_MOVS:
1704		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1705		    memwrite, memarg);
1706		break;
1707	case VIE_OP_TYPE_STOS:
1708		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1709		    memwrite, memarg);
1710		break;
1711	case VIE_OP_TYPE_AND:
1712		error = emulate_and(vm, vcpuid, gpa, vie,
1713				    memread, memwrite, memarg);
1714		break;
1715	case VIE_OP_TYPE_OR:
1716		error = emulate_or(vm, vcpuid, gpa, vie,
1717				    memread, memwrite, memarg);
1718		break;
1719	case VIE_OP_TYPE_SUB:
1720		error = emulate_sub(vm, vcpuid, gpa, vie,
1721				    memread, memwrite, memarg);
1722		break;
1723	case VIE_OP_TYPE_BITTEST:
1724		error = emulate_bittest(vm, vcpuid, gpa, vie,
1725		    memread, memwrite, memarg);
1726		break;
1727	case VIE_OP_TYPE_TWOB_GRP15:
1728		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1729		    memread, memwrite, memarg);
1730		break;
1731	case VIE_OP_TYPE_ADD:
1732		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1733		    memwrite, memarg);
1734		break;
1735	case VIE_OP_TYPE_TEST:
1736		error = emulate_test(vm, vcpuid, gpa, vie,
1737		    memread, memwrite, memarg);
1738		break;
1739	default:
1740		error = EINVAL;
1741		break;
1742	}
1743
1744	return (error);
1745}
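
/*
 * A minimal caller sketch (illustrative only, compiled out): a hypothetical
 * device model backs a single MMIO register and lets the emulator above
 * perform the actual load or store.  The callback signatures follow the
 * mem_region_read_t/mem_region_write_t usage in this file; the names
 * 'mydev_softc', 'mydev_read' and 'mydev_write' are made up for the example.
 */
#if 0
struct mydev_softc {
	uint64_t	reg;		/* backing store for one register */
};

static int
mydev_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size,
    void *arg)
{
	struct mydev_softc *sc = arg;

	*rval = sc->reg;		/* value the guest load will observe */
	return (0);
}

static int
mydev_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, int size,
    void *arg)
{
	struct mydev_softc *sc = arg;

	sc->reg = wval;			/* latch the guest store */
	return (0);
}

	/* After the faulting instruction has been decoded into 'vie': */
	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
	    mydev_read, mydev_write, sc);
#endif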
1746
1747int
1748vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1749{
1750	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1751	    ("%s: invalid size %d", __func__, size));
1752	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1753
1754	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1755		return (0);
1756
1757	return ((gla & (size - 1)) ? 1 : 0);
1758}
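
/*
 * Example (informational): with CR0.AM and PSL_AC both set, a CPL 3 access
 * of 4 bytes at a linear address that is not a multiple of 4 returns 1 and
 * the callers above inject #AC; supervisor accesses (cpl != 3) never do.
 */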
1759
1760int
1761vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1762{
1763	uint64_t mask;
1764
1765	if (cpu_mode != CPU_MODE_64BIT)
1766		return (0);
1767
1768	/*
1769	 * The value of bit 47 in 'gla' should be replicated in the
1770	 * most significant 16 bits.
1771	 */
1772	mask = ~((1UL << 48) - 1);
1773	if (gla & (1UL << 47))
1774		return ((gla & mask) != mask);
1775	else
1776		return ((gla & mask) != 0);
1777}
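
/*
 * Example (informational): in 64-bit mode 0x00007fffffffffff and
 * 0xffff800000000000 are canonical (bits 63:48 replicate bit 47), while
 * 0x0000800000000000 is not and the check above returns non-zero.
 */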
1778
1779uint64_t
1780vie_size2mask(int size)
1781{
1782	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1783	    ("vie_size2mask: invalid size %d", size));
1784	return (size2mask[size]);
1785}
1786
1787int
1788vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1789    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1790    int prot, uint64_t *gla)
1791{
1792	uint64_t firstoff, low_limit, high_limit, segbase;
1793	int glasize, type;
1794
1795	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1796	    ("%s: invalid segment %d", __func__, seg));
1797	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1798	    ("%s: invalid operand size %d", __func__, length));
1799	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1800	    ("%s: invalid prot %#x", __func__, prot));
1801
1802	firstoff = offset;
1803	if (cpu_mode == CPU_MODE_64BIT) {
1804		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1805		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1806		glasize = 8;
1807	} else {
1808		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1809		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1810		glasize = 4;
1811		/*
1812		 * If the segment selector is loaded with a NULL selector
1813		 * then the descriptor is unusable and attempting to use
1814		 * it results in a #GP(0).
1815		 */
1816		if (SEG_DESC_UNUSABLE(desc->access))
1817			return (-1);
1818
1819		/*
1820		 * The processor generates a #NP exception when a segment
1821		 * register is loaded with a selector that points to a
1822		 * descriptor that is not present. If this was the case then
1823		 * it would have been checked before the VM-exit.
1824		 */
1825		KASSERT(SEG_DESC_PRESENT(desc->access),
1826		    ("segment %d not present: %#x", seg, desc->access));
1827
1828		/*
1829		 * The descriptor type must indicate a code/data segment.
1830		 */
1831		type = SEG_DESC_TYPE(desc->access);
1832		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1833		    "descriptor type %#x", seg, type));
1834
1835		if (prot & PROT_READ) {
1836			/* #GP on a read access to an exec-only code segment */
1837			if ((type & 0xA) == 0x8)
1838				return (-1);
1839		}
1840
1841		if (prot & PROT_WRITE) {
1842			/*
1843			 * #GP on a write access to a code segment or a
1844			 * read-only data segment.
1845			 */
1846			if (type & 0x8)			/* code segment */
1847				return (-1);
1848
1849			if ((type & 0xA) == 0)		/* read-only data seg */
1850				return (-1);
1851		}
1852
1853		/*
1854		 * 'desc->limit' is fully expanded taking granularity into
1855		 * account.
1856		 */
1857		if ((type & 0xC) == 0x4) {
1858			/* expand-down data segment */
1859			low_limit = desc->limit + 1;
1860			high_limit = SEG_DESC_DEF32(desc->access) ?
1861			    0xffffffff : 0xffff;
1862		} else {
1863			/* code segment or expand-up data segment */
1864			low_limit = 0;
1865			high_limit = desc->limit;
1866		}
1867
1868		while (length > 0) {
1869			offset &= vie_size2mask(addrsize);
1870			if (offset < low_limit || offset > high_limit)
1871				return (-1);
1872			offset++;
1873			length--;
1874		}
1875	}
1876
1877	/*
1878	 * In 64-bit mode all segments except %fs and %gs have a segment
1879	 * base address of 0.
1880	 */
1881	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1882	    seg != VM_REG_GUEST_GS) {
1883		segbase = 0;
1884	} else {
1885		segbase = desc->base;
1886	}
1887
1888	/*
1889	 * Truncate 'firstoff' to the effective address size before adding
1890	 * it to the segment base.
1891	 */
1892	firstoff &= vie_size2mask(addrsize);
1893	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1894	return (0);
1895}
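
/*
 * Example (informational): in 64-bit mode only %fs and %gs contribute a
 * non-zero base, so a %gs-relative access with a descriptor base of 0x10000
 * and an offset of 0x20 yields a GLA of 0x10020.  Outside 64-bit mode the
 * offset is first checked, one byte at a time, against the expanded segment
 * limit before the base is added.
 */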
1896
1897#ifdef _KERNEL
1898void
1899vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1900{
1901	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1902	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1903
1904	bzero(vie, sizeof(struct vie));
1905
1906	vie->base_register = VM_REG_LAST;
1907	vie->index_register = VM_REG_LAST;
1908	vie->segment_register = VM_REG_LAST;
1909
1910	if (inst_length) {
1911		bcopy(inst_bytes, vie->inst, inst_length);
1912		vie->num_valid = inst_length;
1913	}
1914}
1915
1916static int
1917pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1918{
1919	int error_code = 0;
1920
1921	if (pte & PG_V)
1922		error_code |= PGEX_P;
1923	if (prot & VM_PROT_WRITE)
1924		error_code |= PGEX_W;
1925	if (usermode)
1926		error_code |= PGEX_U;
1927	if (rsvd)
1928		error_code |= PGEX_RSV;
1929	if (prot & VM_PROT_EXECUTE)
1930		error_code |= PGEX_I;
1931
1932	return (error_code);
1933}
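
/*
 * Example (informational): a user-mode write that hits a present page
 * lacking PG_RW yields PGEX_P | PGEX_W | PGEX_U, which is the error code
 * pushed for the #PF that vm_gla2gpa() injects below.
 */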
1934
1935static void
1936ptp_release(void **cookie)
1937{
1938	if (*cookie != NULL) {
1939		vm_gpa_release(*cookie);
1940		*cookie = NULL;
1941	}
1942}
1943
1944static void *
1945ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1946{
1947	void *ptr;
1948
1949	ptp_release(cookie);
1950	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1951	return (ptr);
1952}
1953
1954int
1955vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1956    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
1957{
1958	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1959	u_int retries;
1960	uint64_t *ptpbase, ptpphys, pte, pgsize;
1961	uint32_t *ptpbase32, pte32;
1962	void *cookie;
1963
1964	*guest_fault = 0;
1965
1966	usermode = (paging->cpl == 3 ? 1 : 0);
1967	writable = prot & VM_PROT_WRITE;
1968	cookie = NULL;
1969	retval = 0;
1970	retries = 0;
1971restart:
1972	ptpphys = paging->cr3;		/* root of the page tables */
1973	ptp_release(&cookie);
1974	if (retries++ > 0)
1975		maybe_yield();
1976
1977	if (vie_canonical_check(paging->cpu_mode, gla)) {
1978		/*
1979		 * XXX assuming a non-stack reference, otherwise a stack
1980		 * fault should be generated.
1981		 */
1982		vm_inject_gp(vm, vcpuid);
1983		goto fault;
1984	}
1985
1986	if (paging->paging_mode == PAGING_MODE_FLAT) {
1987		*gpa = gla;
1988		goto done;
1989	}
1990
1991	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
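	/*
	 * Walk the remaining long mode or PAE page table levels.  Each level
	 * translates 9 bits of the linear address; a PG_PS entry terminates
	 * the walk early, with mappings larger than 1GB rejected as a
	 * reserved bit violation.
	 */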
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}

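/*
 * Fetch 'inst_length' bytes of the instruction starting at the guest linear
 * address 'rip' into 'vie->inst'.  '*faultptr' is set if an exception was
 * injected into the guest while translating 'rip'.
 */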
int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}

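/*
 * Return the next unconsumed instruction byte without advancing, or -1 if all
 * of the fetched bytes have been consumed.
 */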
static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

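/*
 * Map a legacy segment-override prefix byte to the corresponding guest
 * segment register.  Returns false if 'x' is not a segment-override prefix.
 */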
static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

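/*
 * Consume the legacy prefixes (operand/address size overrides, REP/REPNE and
 * segment overrides) and, in 64-bit mode, an optional REX prefix, then derive
 * the effective operand and address sizes for the instruction.
 */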
static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

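/*
 * Look up the opcode byte in the one-byte opcode table; a two-byte escape
 * opcode dispatches to the two-byte opcode table.
 */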
static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

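/*
 * Decode the ModR/M byte: extract the mod, reg and r/m fields, apply the
 * REX.R/REX.B extensions and select the base register and displacement size
 * for the memory operand.
 */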
static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm =  (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is ignored in this
		 * case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

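/*
 * Decode the SIB byte, if present: extract the scale, index and base fields,
 * apply the REX.X/REX.B extensions and select the index/base registers and
 * the scale factor.
 */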
static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case: the base register is unused when mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

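/*
 * Fetch the 1- or 4-byte displacement, if any, and sign-extend it into
 * 'vie->displacement'.
 */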
static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

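/*
 * Fetch the immediate operand, if the opcode has one, and sign-extend it into
 * 'vie->immediate'.
 */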
static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
				error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
				error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except those relative to the stack or
	 * to a string destination, the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			       " descriptor %d\n", error, seg);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		       "base(0x%0lx), scale(%d), index(0x%0lx), "
		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		       segbase, base, vie->scale, idx, vie->displacement,
		       gla, gla2);
		return (-1);
	}

	return (0);
}

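/*
 * Decode the instruction bytes previously fetched into 'vie': prefixes,
 * opcode, ModR/M, SIB, displacement, immediate and memory offset.  Unless the
 * opcode opts out with VIE_OP_F_NO_GLA_VERIFICATION, the decoded operand
 * address is cross-checked against the guest linear address reported with
 * the nested page fault.
 */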
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
			return (-1);
	}

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */
