/*	$OpenBSD: x86_vm.c,v 1.2 2024/07/12 13:51:12 dv Exp $	*/
/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/stat.h>
#include <sys/types.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>

#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <errno.h>
#include <string.h>
#include <unistd.h>

#include <zlib.h>

#include "atomicio.h"
#include "fw_cfg.h"
#include "i8253.h"
#include "i8259.h"
#include "loadfile.h"
#include "mc146818.h"
#include "ns8250.h"
#include "pci.h"
#include "virtio.h"

typedef uint8_t (*io_fn_t)(struct vm_run_params *);

#define MAX_PORTS 65536

io_fn_t	ioports_map[MAX_PORTS];
extern char *__progname;

void	 create_memory_map(struct vm_create_params *);
int	 translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);
static int	loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
static int	vcpu_exit_eptviolation(struct vm_run_params *);
static void	vcpu_exit_inout(struct vm_run_params *);

extern struct vmd_vm	*current_vm;
extern int		 con_fd;

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
};

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes;
	size_t above_1m = 0, above_4g = 0;

	mem_bytes = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
		return;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = MB(1) - (LOWMEM_KB * 1024);
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
	mem_bytes -= len;

	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
	if (mem_bytes <= MB(2)) {
		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
		vcp->vcp_memranges[2].vmr_size = MB(2);
		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
		vcp->vcp_nmemranges = 3;
		return;
	}

	/*
	 * Calculate how to split any remaining memory across the 4GB
	 * boundary while making sure we do not place physical memory into
	 * MMIO ranges.
	 */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
		above_4g = mem_bytes - above_1m;
	} else {
		above_1m = mem_bytes;
		above_4g = 0;
	}

	/* Third memory region: area above 1MB to MMIO region */
	vcp->vcp_memranges[2].vmr_gpa = MB(1);
	vcp->vcp_memranges[2].vmr_size = above_1m;
	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;

	/* Fourth region: PCI MMIO range */
	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
	    VMM_PCI_MMIO_BAR_BASE + 1;
	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;

	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
	vcp->vcp_memranges[4].vmr_size = MB(2);
	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;

	/* Sixth region: any remainder above 4GB */
	if (above_4g > 0) {
		vcp->vcp_memranges[5].vmr_gpa = GB(4);
		vcp->vcp_memranges[5].vmr_size = above_4g;
		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
		vcp->vcp_nmemranges = 6;
	} else
		vcp->vcp_nmemranges = 5;
}

int
load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
{
	int		ret;
	gzFile		fp;
	struct stat	sb;

	/*
	 * Set up default "flat 64 bit" register state - RIP, RSP, and
	 * GDT info will be set in bootloader
	 */
	memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));

	/* Find and open kernel image */
	if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
		fatalx("failed to open kernel - exiting");

	/* Load kernel image */
	ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice);

	/*
	 * Try BIOS as a fallback (only if it was provided as an image
	 * with vm->vm_kernel and the file is not compressed)
	 */
	if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
	    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
		ret = loadfile_bios(fp, sb.st_size, vrs);

	gzclose(fp);

	return (ret);
}


/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: gzFile handle of the BIOS image to load
 *  size: uncompressed size of the image
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
{
	off_t	 off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Seek to the beginning of the BIOS image */
	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1MB */
	if ((off = MB(1) - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* Read a second BIOS copy into memory ending at 4GB */
	off = GB(4) - size;
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	size_t i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
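	/*
	 * memlo is the amount of RAM above 16MB (below the PCI MMIO hole)
	 * and memhi the amount above 4GB; both are passed to mc146818_init()
	 * below for the RTC/NVRAM memory size registers.
	 */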
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
			memhi = vcp->vcp_memranges[i].vmr_size;
	}

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);

	/*
	 * Init QEMU fw_cfg interface. Must be done last for PCI hardware
	 * detection.
	 */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;

	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

void
pause_vm_md(struct vmd_vm *vm)
{
	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vm);
}

void
unpause_vm_md(struct vmd_vm *vm)
{
	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vm);
}

int
dump_devs(int fd)
{
	int ret = 0;

	if ((ret = i8253_dump(fd)))
		return ret;
	if ((ret = i8259_dump(fd)))
		return ret;
	if ((ret = ns8250_dump(fd)))
		return ret;
	if ((ret = mc146818_dump(fd)))
		return ret;
	ret = fw_cfg_dump(fd);

	return ret;
}

int
dump_send_header(int fd)
{
	struct vm_dump_header	   vmh;
	int			   i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

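	/*
	 * Record the host CPUID leaves that vmd_check_vmh() compares against
	 * on receive: basic level/vendor (0x00), feature flags (0x01),
	 * structured extended features (0x07), XSAVE state (0x0d) and
	 * extended features (0x80000001).
	 */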
	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}


/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (vei->vei.vei_rep || vei->vei.vei_string) {
#ifdef MMIO_DEBUG
		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
		    __func__,
		    vei->vei.vei_rep == 0 ? "" : "REP ",
		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
		    vei->vei.vei_string == 0 ? "" : "S",
		    vei->vei.vei_size, vei->vei.vei_encoding,
		    vei->vei.vei_data, vei->vei.vei_port);
		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
		    __func__,
		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
#endif /* MMIO_DEBUG */
		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
		    __func__);
	}

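	/*
	 * Dispatch to the handler registered for this port, if any. Reads
	 * from unhandled ports return all 1's, as on real hardware with no
	 * device present.
	 */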
	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;

	if (intr != 0xFF)
		vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  !0: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case SVM_VMEXIT_NPF:
	case VMX_EXIT_EPT_VIOLATION:
		ret = vcpu_exit_eptviolation(vrp);
		if (ret)
			return (ret);
		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		vcpu_halt(vrp->vrp_vcpu_id);
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	return (0);
}

/*
 * vcpu_exit_eptviolation
 *
 * Handle an EPT violation.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EFAULT: a protection fault occurred, kill the vm.
 */
static int
vcpu_exit_eptviolation(struct vm_run_params *vrp)
{
	struct vm_exit *ve = vrp->vrp_exit;
	int ret = 0;
#if MMIO_NOTYET
	struct x86_insn insn;
	uint64_t va, pa;
	size_t len = 15;		/* Max instruction length in x86. */
#endif /* MMIO_NOTYET */

	switch (ve->vee.vee_fault_type) {
	case VEE_FAULT_HANDLED:
		break;

#if MMIO_NOTYET
	case VEE_FAULT_MMIO_ASSIST:
		/* Intel VMX might give us the length of the instruction. */
		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
			len = ve->vee.vee_insn_len;

		if (len > 15)
			fatalx("%s: invalid instruction length %lu", __func__,
			    len);

		/* If we weren't given instruction bytes, we need to fetch. */
		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
			memset(ve->vee.vee_insn_bytes, 0,
			    sizeof(ve->vee.vee_insn_bytes));
			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];

			/* XXX Only support instructions that fit on 1 page. */
			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
				log_warnx("%s: instruction might cross page "
				    "boundary", __func__);
				ret = EINVAL;
				break;
			}

			ret = translate_gva(ve, va, &pa, PROT_EXEC);
			if (ret != 0) {
				log_warnx("%s: failed gva translation",
				    __func__);
				break;
			}

			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
			if (ret != 0) {
				log_warnx("%s: failed to fetch instruction "
				    "bytes from 0x%llx", __func__, pa);
				break;
			}
		}

		ret = insn_decode(ve, &insn);
		if (ret == 0)
			ret = insn_emulate(ve, &insn);
		break;
#endif /* MMIO_NOTYET */

	case VEE_FAULT_PROTECT:
		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
		ret = EFAULT;
		break;

	default:
		fatalx("%s: invalid fault_type %d", __progname,
		    ve->vee.vee_fault_type);
		/* UNREACHED */
	}

	return (ret);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
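	/*
	 * Copy in chunks: fill up to the end of the current range, then
	 * continue into the next (contiguous) vm_mem_range.
	 */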
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * hvaddr_mem
 *
 * Translate a guest physical address to a host virtual address, checking the
 * provided memory range length to confirm it's contiguous within the same
 * guest memory range (vm_mem_range).
 *
 * Parameters:
 *  gpa: guest physical address to translate
 *  len: number of bytes in the intended range
 *
 * Return values:
 *  void* to host virtual memory on success
 *  NULL on error, setting errno to:
 *    EFAULT: gpa falls outside guest memory ranges
 *    EINVAL: requested len extends beyond memory range
 */
void *
hvaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_mem_range *vmr;
	size_t off;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
	if (vmr == NULL) {
		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
		errno = EFAULT;
		return (NULL);
	}

	off = gpa - vmr->vmr_gpa;
	if (len > (vmr->vmr_size - off)) {
		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
		    "len=%zu", __func__, gpa, len);
		errno = EINVAL;
		return (NULL);
	}

	return ((char *)vmr->vmr_va + off);
}

/*
 * vcpu_assert_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_assert_irq(irq);

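	/*
	 * Only raise INTR and wake the vcpu if the PIC reports an interrupt
	 * pending after this assertion.
	 */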
	if (i8259_is_pending()) {
		if (vcpu_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		vcpu_unhalt(vcpu_id);
		vcpu_signal_run(vcpu_id);
	}
}

/*
 * vcpu_deassert_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs.
 * This function ensures that the data is copied to the vei->vei.vei_data
 * field with the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}
}

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * XXX ensure translate_gva updates the A bit in the PTE
 * XXX ensure translate_gva respects segment base and limits in i386 mode
 * XXX ensure translate_gva respects segment wraparound in i8086 mode
 * XXX ensure translate_gva updates the A bit in the segment selector
 * XXX ensure translate_gva respects CR4.LMSLE if available
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2  */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

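	/*
	 * Build the GPA from the final (possibly large-page) PTE: keep the
	 * PTE address bits above the remaining page-offset bits, mask off
	 * the PTE's top bit (NX in PAE/long-mode formats), and OR in the
	 * untranslated low bits of the VA.
	 */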
	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}

int
intr_pending(struct vmd_vm *vm)
{
	/* XXX select active interrupt controller */
	return i8259_is_pending();
}

int
intr_ack(struct vmd_vm *vm)
{
	/* XXX select active interrupt controller */
	return i8259_ack();
}

void
intr_toggle_el(struct vmd_vm *vm, int irq, int val)
{
	/* XXX select active interrupt controller */
	pic_set_elcr(irq, val);
}

int
vmd_check_vmh(struct vm_dump_header *vmh)
{
	int i;
	unsigned int code, leaf;
	unsigned int a, b, c, d;

	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE,
	    strlen(VM_DUMP_SIGNATURE)) != 0) {
		log_warnx("%s: incompatible dump signature", __func__);
		return (-1);
	}

	if (vmh->vmh_version != VM_DUMP_VERSION) {
		log_warnx("%s: incompatible dump version", __func__);
		return (-1);
	}

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		code = vmh->vmh_cpuids[i].code;
		leaf = vmh->vmh_cpuids[i].leaf;
		if (leaf != 0x00) {
			log_debug("%s: invalid leaf 0x%x for code 0x%x",
			    __func__, leaf, code);
			return (-1);
		}

		switch (code) {
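		/* Leaf 0x00: maximum basic leaf and CPU vendor string. */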
		case 0x00:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if (vmh->vmh_cpuids[i].a > a) {
				log_debug("%s: incompatible cpuid level",
				    __func__);
				return (-1);
			}
			if (!(vmh->vmh_cpuids[i].b == b &&
			    vmh->vmh_cpuids[i].c == c &&
			    vmh->vmh_cpuids[i].d == d)) {
				log_debug("%s: incompatible cpu brand",
				    __func__);
				return (-1);
			}
			break;

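		/*
		 * Leaf 0x01: base feature flags, compared under the
		 * VMM_CPUIDECX_MASK/VMM_CPUIDEDX_MASK feature masks.
		 */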
		case 0x01:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
				    code, leaf);
				return (-1);
			}
			break;

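		/*
		 * Leaf 0x07: structured extended feature flags, compared
		 * under the VMM_SEFF0EBX_MASK/VMM_SEFF0ECX_MASK masks.
		 */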
		case 0x07:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			break;

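		/*
		 * Leaf 0x0d: XSAVE save area sizes; the dumped VM must not
		 * need more save area than this host provides.
		 */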
		case 0x0d:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if (vmh->vmh_cpuids[i].b > b) {
				log_debug("%s: incompatible cpu: insufficient "
				    "max save area for enabled XCR0 features",
				    __func__);
				return (-1);
			}
			if (vmh->vmh_cpuids[i].c > c) {
				log_debug("%s: incompatible cpu: insufficient "
				    "max save area for supported XCR0 features",
				    __func__);
				return (-1);
			}
			break;

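		/*
		 * Leaf 0x80000001: extended feature flags; every feature
		 * recorded in the dump must be present on this host.
		 */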
		case 0x80000001:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].a & a) !=
			    vmh->vmh_cpuids[i].a) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].c & c) !=
			    vmh->vmh_cpuids[i].c) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].d & d) !=
			    vmh->vmh_cpuids[i].d) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
				    code, leaf);
				return (-1);
			}
			break;

		default:
			log_debug("%s: unknown code 0x%x", __func__, code);
			return (-1);
		}
	}

	return (0);
}
