1/*	$OpenBSD: vm.c,v 1.100 2024/04/29 14:47:06 dv Exp $	*/
2
3/*
4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
20#include <sys/types.h>
21#include <sys/ioctl.h>
22#include <sys/queue.h>
23#include <sys/wait.h>
24#include <sys/uio.h>
25#include <sys/stat.h>
26#include <sys/socket.h>
27#include <sys/time.h>
28#include <sys/mman.h>
29#include <sys/resource.h>
30
31#include <dev/ic/i8253reg.h>
32#include <dev/isa/isareg.h>
33#include <dev/pci/pcireg.h>
34
35#include <machine/psl.h>
36#include <machine/pte.h>
37#include <machine/specialreg.h>
38#include <machine/vmmvar.h>
39
40#include <net/if.h>
41
42#include <errno.h>
43#include <event.h>
44#include <fcntl.h>
45#include <imsg.h>
46#include <limits.h>
47#include <poll.h>
48#include <pthread.h>
49#include <pthread_np.h>
50#include <stddef.h>
51#include <stdio.h>
52#include <stdlib.h>
53#include <string.h>
54#include <unistd.h>
55#include <util.h>
56
57#include "atomicio.h"
58#include "fw_cfg.h"
59#include "i8253.h"
60#include "i8259.h"
61#include "loadfile.h"
62#include "mc146818.h"
63#include "mmio.h"
64#include "ns8250.h"
65#include "pci.h"
66#include "virtio.h"
67#include "vmd.h"
68#include "vmm.h"
69
70#define MB(x)	(x * 1024UL * 1024UL)
71#define GB(x)	(x * 1024UL * 1024UL * 1024UL)
72
73#define MMIO_NOTYET 0
74
75io_fn_t ioports_map[MAX_PORTS];
76
77static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
78void vm_dispatch_vmm(int, short, void *);
79void *event_thread(void *);
80void *vcpu_run_loop(void *);
81int vcpu_exit(struct vm_run_params *);
82int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
83void create_memory_map(struct vm_create_params *);
84static int vmm_create_vm(struct vmd_vm *);
85int alloc_guest_mem(struct vmd_vm *);
86void init_emulated_hw(struct vmop_create_params *, int,
87    int[][VM_MAX_BASE_PER_DISK], int *);
88void restore_emulated_hw(struct vm_create_params *, int, int *,
89    int[][VM_MAX_BASE_PER_DISK],int);
90void vcpu_exit_inout(struct vm_run_params *);
91int vcpu_exit_eptviolation(struct vm_run_params *);
92uint8_t vcpu_exit_pci(struct vm_run_params *);
93int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
94int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
95static int send_vm(int, struct vmd_vm *);
96int dump_send_header(int);
97static int dump_vmr(int , struct vm_mem_range *);
98static int dump_mem(int, struct vmd_vm *);
99void restore_vmr(int, struct vm_mem_range *);
100void restore_mem(int, struct vm_create_params *);
101int restore_vm_params(int, struct vm_create_params *);
102static void pause_vm(struct vmd_vm *);
103static void unpause_vm(struct vmd_vm *);
104
105int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
106
107static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
108    size_t);
109
110int con_fd;
111struct vmd_vm *current_vm;
112
113extern struct vmd *env;
114
115extern char *__progname;
116
117pthread_mutex_t threadmutex;
118pthread_cond_t threadcond;
119
120pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
121pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
122pthread_barrier_t vm_pause_barrier;
123pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
124pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
125uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
126uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
127
128/*
129 * Represents a standard register set for an OS to be booted
130 * as a flat 64 bit address space.
131 *
132 * NOT set here are:
133 *  RIP
134 *  RSP
135 *  GDTR BASE
136 *
137 * Specific bootloaders should clone this structure and override
138 * those fields as needed.
139 *
140 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
141 *        features of the CPU in use.
142 */
143static const struct vcpu_reg_state vcpu_init_flat64 = {
144	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
145	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
146	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
147	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
148	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
149	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
150	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
151	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
152	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
153	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
154	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
155	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
156	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
157	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
158	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
159	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
160	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
161	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
162	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
163	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
164	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
165	.vrs_drs[VCPU_REGS_DR0] = 0x0,
166	.vrs_drs[VCPU_REGS_DR1] = 0x0,
167	.vrs_drs[VCPU_REGS_DR2] = 0x0,
168	.vrs_drs[VCPU_REGS_DR3] = 0x0,
169	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
170	.vrs_drs[VCPU_REGS_DR7] = 0x400,
171	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
172	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
173	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
174	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
175	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
176	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
177	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
178};
179
180/*
181 * Represents a standard register set for an BIOS to be booted
182 * as a flat 16 bit address space.
183 */
184static const struct vcpu_reg_state vcpu_init_flat16 = {
185	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
186	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
187	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
188	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
189	.vrs_crs[VCPU_REGS_CR3] = 0,
190	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
191	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
192	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
193	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
194	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
195	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
196	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
197	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
198	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
199	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
200	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
201	.vrs_drs[VCPU_REGS_DR0] = 0x0,
202	.vrs_drs[VCPU_REGS_DR1] = 0x0,
203	.vrs_drs[VCPU_REGS_DR2] = 0x0,
204	.vrs_drs[VCPU_REGS_DR3] = 0x0,
205	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
206	.vrs_drs[VCPU_REGS_DR7] = 0x400,
207	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
208	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
209	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
210	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
211	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
212	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
213};
214
215/*
216 * vm_main
217 *
218 * Primary entrypoint for launching a vm. Does not return.
219 *
220 * fd: file descriptor for communicating with vmm process.
221 * fd_vmm: file descriptor for communicating with vmm(4) device
222 */
223void
224vm_main(int fd, int fd_vmm)
225{
226	struct vm_create_params	*vcp = NULL;
227	struct vmd_vm		 vm;
228	size_t			 sz = 0;
229	int			 ret = 0;
230
231	/*
232	 * The vm process relies on global state. Set the fd for /dev/vmm.
233	 */
234	env->vmd_fd = fd_vmm;
235
236	/*
237	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
238	 */
239	if (unveil(env->argv0, "x") == -1)
240		fatal("unveil %s", env->argv0);
241	if (unveil(NULL, NULL) == -1)
242		fatal("unveil lock");
243
244	/*
245	 * pledge in the vm processes:
246	 * stdio - for malloc and basic I/O including events.
247	 * vmm - for the vmm ioctls and operations.
248	 * proc exec - fork/exec for launching devices.
249	 * recvfd - for vm send/recv and sending fd to devices.
250	 */
251	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
252		fatal("pledge");
253
254	/* Receive our vm configuration. */
255	memset(&vm, 0, sizeof(vm));
256	sz = atomicio(read, fd, &vm, sizeof(vm));
257	if (sz != sizeof(vm)) {
258		log_warnx("failed to receive start message");
259		_exit(EIO);
260	}
261
262	/* Update process with the vm name. */
263	vcp = &vm.vm_params.vmc_params;
264	setproctitle("%s", vcp->vcp_name);
265	log_procinit("vm/%s", vcp->vcp_name);
266
267	/* Receive the local prefix settings. */
268	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
269	    sizeof(env->vmd_cfg.cfg_localprefix));
270	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
271		log_warnx("failed to receive local prefix");
272		_exit(EIO);
273	}
274
275	/*
276	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
277	 * kernel or a BIOS image.
278	 */
279	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
280		if (vm.vm_kernel == -1) {
281			log_warnx("%s: failed to receive boot fd",
282			    vcp->vcp_name);
283			_exit(EINVAL);
284		}
285	}
286
287	ret = start_vm(&vm, fd);
288	_exit(ret);
289}
290
291/*
292 * loadfile_bios
293 *
294 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image
295 * directly into memory.
296 *
297 * Parameters:
298 *  fp: file of a kernel file to load
299 *  size: uncompressed size of the image
300 *  (out) vrs: register state to set on init for this kernel
301 *
302 * Return values:
303 *  0 if successful
304 *  various error codes returned from read(2) or loadelf functions
305 */
306int
307loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
308{
309	off_t	 off;
310
311	/* Set up a "flat 16 bit" register state for BIOS */
312	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
313
314	/* Seek to the beginning of the BIOS image */
315	if (gzseek(fp, 0, SEEK_SET) == -1)
316		return (-1);
317
318	/* The BIOS image must end at 1MB */
319	if ((off = MB(1) - size) < 0)
320		return (-1);
321
322	/* Read BIOS image into memory */
323	if (mread(fp, off, size) != (size_t)size) {
324		errno = EIO;
325		return (-1);
326	}
327
328	if (gzseek(fp, 0, SEEK_SET) == -1)
329		return (-1);
330
331	/* Read a second BIOS copy into memory ending at 4GB */
332	off = GB(4) - size;
333	if (mread(fp, off, size) != (size_t)size) {
334		errno = EIO;
335		return (-1);
336	}
337
338	log_debug("%s: loaded BIOS image", __func__);
339
340	return (0);
341}
342
343/*
344 * start_vm
345 *
346 * After forking a new VM process, starts the new VM with the creation
347 * parameters supplied (in the incoming vm->vm_params field). This
348 * function performs a basic sanity check on the incoming parameters
349 * and then performs the following steps to complete the creation of the VM:
350 *
351 * 1. validates and create the new VM
352 * 2. opens the imsg control channel to the parent and drops more privilege
353 * 3. drops additional privileges by calling pledge(2)
354 * 4. loads the kernel from the disk image or file descriptor
355 * 5. runs the VM's VCPU loops.
356 *
357 * Parameters:
358 *  vm: The VM data structure that is including the VM create parameters.
359 *  fd: The imsg socket that is connected to the parent process.
360 *
361 * Return values:
362 *  0: success
363 *  !0 : failure - typically an errno indicating the source of the failure
364 */
365int
366start_vm(struct vmd_vm *vm, int fd)
367{
368	struct vmop_create_params *vmc = &vm->vm_params;
369	struct vm_create_params	*vcp = &vmc->vmc_params;
370	struct vcpu_reg_state	 vrs;
371	int			 nicfds[VM_MAX_NICS_PER_VM];
372	int			 ret;
373	gzFile			 fp;
374	size_t			 i;
375	struct vm_rwregs_params  vrp;
376	struct stat		 sb;
377
378	/*
379	 * We first try to initialize and allocate memory before bothering
380	 * vmm(4) with a request to create a new vm.
381	 */
382	if (!(vm->vm_state & VM_STATE_RECEIVED))
383		create_memory_map(vcp);
384
385	ret = alloc_guest_mem(vm);
386	if (ret) {
387		struct rlimit lim;
388		char buf[FMT_SCALED_STRSIZE];
389		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
390			if (fmt_scaled(lim.rlim_cur, buf) == 0)
391				fatalx("could not allocate guest memory (data "
392				    "limit is %s)", buf);
393		}
394		errno = ret;
395		log_warn("could not allocate guest memory");
396		return (ret);
397	}
398
399	/* We've allocated guest memory, so now create the vm in vmm(4). */
400	ret = vmm_create_vm(vm);
401	if (ret) {
402		/* Let the vmm process know we failed by sending a 0 vm id. */
403		vcp->vcp_id = 0;
404		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
405		return (ret);
406	}
407
408	/*
409	 * Some of vmd currently relies on global state (current_vm, con_fd).
410	 */
411	current_vm = vm;
412	con_fd = vm->vm_tty;
413	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
414		log_warn("failed to set nonblocking mode on console");
415		return (1);
416	}
417
418	/*
419	 * We now let the vmm process know we were successful by sending it our
420	 * vmm(4) assigned vm id.
421	 */
422	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
423	    sizeof(vcp->vcp_id)) {
424		log_warn("failed to send created vm id to vmm process");
425		return (1);
426	}
427
428	/* Prepare either our boot image or receive an existing vm to launch. */
429	if (vm->vm_state & VM_STATE_RECEIVED) {
430		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
431		if (ret != sizeof(vrp))
432			fatal("received incomplete vrp - exiting");
433		vrs = vrp.vrwp_regs;
434	} else {
435		/*
436		 * Set up default "flat 64 bit" register state - RIP,
437		 * RSP, and GDT info will be set in bootloader
438		 */
439		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
440
441		/* Find and open kernel image */
442		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
443			fatalx("failed to open kernel - exiting");
444
445		/* Load kernel image */
446		ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice);
447
448		/*
449		 * Try BIOS as a fallback (only if it was provided as an image
450		 * with vm->vm_kernel and the file is not compressed)
451		 */
452		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
453		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
454			ret = loadfile_bios(fp, sb.st_size, &vrs);
455
456		if (ret)
457			fatal("failed to load kernel or BIOS - exiting");
458
459		gzclose(fp);
460	}
461
462	if (vm->vm_kernel != -1)
463		close_fd(vm->vm_kernel);
464
465	/* Initialize our mutexes. */
466	ret = pthread_mutex_init(&threadmutex, NULL);
467	if (ret) {
468		log_warn("%s: could not initialize thread state mutex",
469		    __func__);
470		return (ret);
471	}
472	ret = pthread_cond_init(&threadcond, NULL);
473	if (ret) {
474		log_warn("%s: could not initialize thread state "
475		    "condition variable", __func__);
476		return (ret);
477	}
478	mutex_lock(&threadmutex);
479
480
481	/*
482	 * Finalize our communication socket with the vmm process. From here
483	 * onwards, communication with the vmm process is event-based.
484	 */
485	event_init();
486	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
487		fatal("setup vm pipe");
488
489	/*
490	 * Initialize or restore our emulated hardware.
491	 */
492	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
493		nicfds[i] = vm->vm_ifs[i].vif_fd;
494
495	if (vm->vm_state & VM_STATE_RECEIVED) {
496		restore_mem(vm->vm_receive_fd, vcp);
497		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
498		    vm->vm_disks, vm->vm_cdrom);
499		if (restore_vm_params(vm->vm_receive_fd, vcp))
500			fatal("restore vm params failed");
501		unpause_vm(vm);
502	} else
503		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
504
505	/* Drop privleges further before starting the vcpu run loop(s). */
506	if (pledge("stdio vmm recvfd", NULL) == -1)
507		fatal("pledge");
508
509	/*
510	 * Execute the vcpu run loop(s) for this VM.
511	 */
512	ret = run_vm(&vm->vm_params, &vrs);
513
514	/* Ensure that any in-flight data is written back */
515	virtio_shutdown(vm);
516
517	return (ret);
518}
519
520/*
521 * vm_dispatch_vmm
522 *
523 * imsg callback for messages that are received from the vmm parent process.
524 */
525void
526vm_dispatch_vmm(int fd, short event, void *arg)
527{
528	struct vmd_vm		*vm = arg;
529	struct vmop_result	 vmr;
530	struct vmop_addr_result	 var;
531	struct imsgev		*iev = &vm->vm_iev;
532	struct imsgbuf		*ibuf = &iev->ibuf;
533	struct imsg		 imsg;
534	ssize_t			 n;
535	int			 verbose;
536
537	if (event & EV_READ) {
538		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
539			fatal("%s: imsg_read", __func__);
540		if (n == 0)
541			_exit(0);
542	}
543
544	if (event & EV_WRITE) {
545		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
546			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
547		if (n == 0)
548			_exit(0);
549	}
550
551	for (;;) {
552		if ((n = imsg_get(ibuf, &imsg)) == -1)
553			fatal("%s: imsg_get", __func__);
554		if (n == 0)
555			break;
556
557#if DEBUG > 1
558		log_debug("%s: got imsg %d from %s",
559		    __func__, imsg.hdr.type,
560		    vm->vm_params.vmc_params.vcp_name);
561#endif
562
563		switch (imsg.hdr.type) {
564		case IMSG_CTL_VERBOSE:
565			IMSG_SIZE_CHECK(&imsg, &verbose);
566			memcpy(&verbose, imsg.data, sizeof(verbose));
567			log_setverbose(verbose);
568			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
569			    sizeof(verbose));
570			break;
571		case IMSG_VMDOP_VM_SHUTDOWN:
572			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
573				_exit(0);
574			break;
575		case IMSG_VMDOP_VM_REBOOT:
576			if (vmmci_ctl(VMMCI_REBOOT) == -1)
577				_exit(0);
578			break;
579		case IMSG_VMDOP_PAUSE_VM:
580			vmr.vmr_result = 0;
581			vmr.vmr_id = vm->vm_vmid;
582			pause_vm(vm);
583			imsg_compose_event(&vm->vm_iev,
584			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
585			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
586			    sizeof(vmr));
587			break;
588		case IMSG_VMDOP_UNPAUSE_VM:
589			vmr.vmr_result = 0;
590			vmr.vmr_id = vm->vm_vmid;
591			unpause_vm(vm);
592			imsg_compose_event(&vm->vm_iev,
593			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
594			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
595			    sizeof(vmr));
596			break;
597		case IMSG_VMDOP_SEND_VM_REQUEST:
598			vmr.vmr_id = vm->vm_vmid;
599			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
600			imsg_compose_event(&vm->vm_iev,
601			    IMSG_VMDOP_SEND_VM_RESPONSE,
602			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
603			    sizeof(vmr));
604			if (!vmr.vmr_result) {
605				imsg_flush(&current_vm->vm_iev.ibuf);
606				_exit(0);
607			}
608			break;
609		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
610			IMSG_SIZE_CHECK(&imsg, &var);
611			memcpy(&var, imsg.data, sizeof(var));
612
613			log_debug("%s: received tap addr %s for nic %d",
614			    vm->vm_params.vmc_params.vcp_name,
615			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
616
617			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
618			break;
619		default:
620			fatalx("%s: got invalid imsg %d from %s",
621			    __func__, imsg.hdr.type,
622			    vm->vm_params.vmc_params.vcp_name);
623		}
624		imsg_free(&imsg);
625	}
626	imsg_event_add(iev);
627}
628
629/*
630 * vm_shutdown
631 *
632 * Tell the vmm parent process to shutdown or reboot the VM and exit.
633 */
634__dead void
635vm_shutdown(unsigned int cmd)
636{
637	switch (cmd) {
638	case VMMCI_NONE:
639	case VMMCI_SHUTDOWN:
640		(void)imsg_compose_event(&current_vm->vm_iev,
641		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
642		break;
643	case VMMCI_REBOOT:
644		(void)imsg_compose_event(&current_vm->vm_iev,
645		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
646		break;
647	default:
648		fatalx("invalid vm ctl command: %d", cmd);
649	}
650	imsg_flush(&current_vm->vm_iev.ibuf);
651
652	_exit(0);
653}
654
655int
656send_vm(int fd, struct vmd_vm *vm)
657{
658	struct vm_rwregs_params	   vrp;
659	struct vm_rwvmparams_params vpp;
660	struct vmop_create_params *vmc;
661	struct vm_terminate_params vtp;
662	unsigned int		   flags = 0;
663	unsigned int		   i;
664	int			   ret = 0;
665	size_t			   sz;
666
667	if (dump_send_header(fd)) {
668		log_warnx("%s: failed to send vm dump header", __func__);
669		goto err;
670	}
671
672	pause_vm(vm);
673
674	vmc = calloc(1, sizeof(struct vmop_create_params));
675	if (vmc == NULL) {
676		log_warn("%s: calloc error getting vmc", __func__);
677		ret = -1;
678		goto err;
679	}
680
681	flags |= VMOP_CREATE_MEMORY;
682	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
683	    vmop_create_params));
684	vmc->vmc_flags = flags;
685	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
686	vrp.vrwp_mask = VM_RWREGS_ALL;
687	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
688	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
689
690	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
691	if (sz != sizeof(struct vmop_create_params)) {
692		ret = -1;
693		goto err;
694	}
695
696	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
697		vrp.vrwp_vcpu_id = i;
698		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
699			log_warn("%s: readregs failed", __func__);
700			goto err;
701		}
702
703		sz = atomicio(vwrite, fd, &vrp,
704		    sizeof(struct vm_rwregs_params));
705		if (sz != sizeof(struct vm_rwregs_params)) {
706			log_warn("%s: dumping registers failed", __func__);
707			ret = -1;
708			goto err;
709		}
710	}
711
712	/* Dump memory before devices to aid in restoration. */
713	if ((ret = dump_mem(fd, vm)))
714		goto err;
715	if ((ret = i8253_dump(fd)))
716		goto err;
717	if ((ret = i8259_dump(fd)))
718		goto err;
719	if ((ret = ns8250_dump(fd)))
720		goto err;
721	if ((ret = mc146818_dump(fd)))
722		goto err;
723	if ((ret = fw_cfg_dump(fd)))
724		goto err;
725	if ((ret = pci_dump(fd)))
726		goto err;
727	if ((ret = virtio_dump(fd)))
728		goto err;
729
730	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
731		vpp.vpp_vcpu_id = i;
732		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
733			log_warn("%s: readvmparams failed", __func__);
734			goto err;
735		}
736
737		sz = atomicio(vwrite, fd, &vpp,
738		    sizeof(struct vm_rwvmparams_params));
739		if (sz != sizeof(struct vm_rwvmparams_params)) {
740			log_warn("%s: dumping vm params failed", __func__);
741			ret = -1;
742			goto err;
743		}
744	}
745
746	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
747	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
748		log_warnx("%s: term IOC error: %d, %d", __func__,
749		    errno, ENOENT);
750	}
751err:
752	close(fd);
753	if (ret)
754		unpause_vm(vm);
755	return ret;
756}
757
758int
759dump_send_header(int fd) {
760	struct vm_dump_header	   vmh;
761	int			   i;
762
763	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
764	    sizeof(vmh.vmh_signature));
765
766	vmh.vmh_cpuids[0].code = 0x00;
767	vmh.vmh_cpuids[0].leaf = 0x00;
768
769	vmh.vmh_cpuids[1].code = 0x01;
770	vmh.vmh_cpuids[1].leaf = 0x00;
771
772	vmh.vmh_cpuids[2].code = 0x07;
773	vmh.vmh_cpuids[2].leaf = 0x00;
774
775	vmh.vmh_cpuids[3].code = 0x0d;
776	vmh.vmh_cpuids[3].leaf = 0x00;
777
778	vmh.vmh_cpuids[4].code = 0x80000001;
779	vmh.vmh_cpuids[4].leaf = 0x00;
780
781	vmh.vmh_version = VM_DUMP_VERSION;
782
783	for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
784		CPUID_LEAF(vmh.vmh_cpuids[i].code,
785		    vmh.vmh_cpuids[i].leaf,
786		    vmh.vmh_cpuids[i].a,
787		    vmh.vmh_cpuids[i].b,
788		    vmh.vmh_cpuids[i].c,
789		    vmh.vmh_cpuids[i].d);
790	}
791
792	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
793		return (-1);
794
795	return (0);
796}
797
798int
799dump_mem(int fd, struct vmd_vm *vm)
800{
801	unsigned int	i;
802	int		ret;
803	struct		vm_mem_range *vmr;
804
805	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
806		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
807		ret = dump_vmr(fd, vmr);
808		if (ret)
809			return ret;
810	}
811	return (0);
812}
813
814int
815restore_vm_params(int fd, struct vm_create_params *vcp) {
816	unsigned int			i;
817	struct vm_rwvmparams_params    vpp;
818
819	for (i = 0; i < vcp->vcp_ncpus; i++) {
820		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
821			log_warn("%s: error restoring vm params", __func__);
822			return (-1);
823		}
824		vpp.vpp_vm_id = vcp->vcp_id;
825		vpp.vpp_vcpu_id = i;
826		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
827			log_debug("%s: writing vm params failed", __func__);
828			return (-1);
829		}
830	}
831	return (0);
832}
833
834void
835restore_mem(int fd, struct vm_create_params *vcp)
836{
837	unsigned int	     i;
838	struct vm_mem_range *vmr;
839
840	for (i = 0; i < vcp->vcp_nmemranges; i++) {
841		vmr = &vcp->vcp_memranges[i];
842		restore_vmr(fd, vmr);
843	}
844}
845
846int
847dump_vmr(int fd, struct vm_mem_range *vmr)
848{
849	size_t	rem = vmr->vmr_size, read=0;
850	char	buf[PAGE_SIZE];
851
852	while (rem > 0) {
853		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
854			log_warn("failed to read vmr");
855			return (-1);
856		}
857		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
858			log_warn("failed to dump vmr");
859			return (-1);
860		}
861		rem = rem - PAGE_SIZE;
862		read = read + PAGE_SIZE;
863	}
864	return (0);
865}
866
867void
868restore_vmr(int fd, struct vm_mem_range *vmr)
869{
870	size_t	rem = vmr->vmr_size, wrote=0;
871	char	buf[PAGE_SIZE];
872
873	while (rem > 0) {
874		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
875			fatal("failed to restore vmr");
876		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
877			fatal("failed to write vmr");
878		rem = rem - PAGE_SIZE;
879		wrote = wrote + PAGE_SIZE;
880	}
881}
882
883static void
884pause_vm(struct vmd_vm *vm)
885{
886	unsigned int n;
887	int ret;
888	if (vm->vm_state & VM_STATE_PAUSED)
889		return;
890
891	current_vm->vm_state |= VM_STATE_PAUSED;
892
893	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
894	    vm->vm_params.vmc_params.vcp_ncpus + 1);
895	if (ret) {
896		log_warnx("%s: cannot initialize pause barrier (%d)",
897		    __progname, ret);
898		return;
899	}
900
901	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
902		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
903		if (ret) {
904			log_warnx("%s: can't broadcast vcpu run cond (%d)",
905			    __func__, (int)ret);
906			return;
907		}
908	}
909	ret = pthread_barrier_wait(&vm_pause_barrier);
910	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
911		log_warnx("%s: could not wait on pause barrier (%d)",
912		    __func__, (int)ret);
913		return;
914	}
915
916	ret = pthread_barrier_destroy(&vm_pause_barrier);
917	if (ret) {
918		log_warnx("%s: could not destroy pause barrier (%d)",
919		    __progname, ret);
920		return;
921	}
922
923	i8253_stop();
924	mc146818_stop();
925	ns8250_stop();
926	virtio_stop(vm);
927}
928
929static void
930unpause_vm(struct vmd_vm *vm)
931{
932	unsigned int n;
933	int ret;
934	if (!(vm->vm_state & VM_STATE_PAUSED))
935		return;
936
937	current_vm->vm_state &= ~VM_STATE_PAUSED;
938	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
939		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
940		if (ret) {
941			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
942			    __func__, (int)ret);
943			return;
944		}
945	}
946
947	i8253_start();
948	mc146818_start();
949	ns8250_start();
950	virtio_start(vm);
951}
952
953/*
954 * vcpu_reset
955 *
956 * Requests vmm(4) to reset the VCPUs in the indicated VM to
957 * the register state provided
958 *
959 * Parameters
960 *  vmid: VM ID to reset
961 *  vcpu_id: VCPU ID to reset
962 *  vrs: the register state to initialize
963 *
964 * Return values:
965 *  0: success
966 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
967 *      valid)
968 */
969int
970vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
971{
972	struct vm_resetcpu_params vrp;
973
974	memset(&vrp, 0, sizeof(vrp));
975	vrp.vrp_vm_id = vmid;
976	vrp.vrp_vcpu_id = vcpu_id;
977	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
978
979	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
980
981	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
982		return (errno);
983
984	return (0);
985}
986
987/*
988 * create_memory_map
989 *
990 * Sets up the guest physical memory ranges that the VM can access.
991 *
992 * Parameters:
993 *  vcp: VM create parameters describing the VM whose memory map
994 *       is being created
995 *
996 * Return values:
997 *  nothing
998 */
999void
1000create_memory_map(struct vm_create_params *vcp)
1001{
1002	size_t len, mem_bytes;
1003	size_t above_1m = 0, above_4g = 0;
1004
1005	mem_bytes = vcp->vcp_memranges[0].vmr_size;
1006	vcp->vcp_nmemranges = 0;
1007	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
1008		return;
1009
1010	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
1011	len = LOWMEM_KB * 1024;
1012	vcp->vcp_memranges[0].vmr_gpa = 0x0;
1013	vcp->vcp_memranges[0].vmr_size = len;
1014	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
1015	mem_bytes -= len;
1016
1017	/*
1018	 * Second memory region: LOWMEM_KB - 1MB.
1019	 *
1020	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
1021	 * We have to add this region, because some systems
1022	 * unconditionally write to 0xb8000 (VGA RAM), and
1023	 * we need to make sure that vmm(4) permits accesses
1024	 * to it. So allocate guest memory for it.
1025	 */
1026	len = MB(1) - (LOWMEM_KB * 1024);
1027	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
1028	vcp->vcp_memranges[1].vmr_size = len;
1029	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
1030	mem_bytes -= len;
1031
1032	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
1033	if (mem_bytes <= MB(2)) {
1034		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
1035		vcp->vcp_memranges[2].vmr_size = MB(2);
1036		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
1037		vcp->vcp_nmemranges = 3;
1038		return;
1039	}
1040
1041	/*
1042	 * Calculate the how to split any remaining memory across the 4GB
1043	 * boundary while making sure we do not place physical memory into
1044	 * MMIO ranges.
1045	 */
1046	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
1047		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
1048		above_4g = mem_bytes - above_1m;
1049	} else {
1050		above_1m = mem_bytes;
1051		above_4g = 0;
1052	}
1053
1054	/* Third memory region: area above 1MB to MMIO region */
1055	vcp->vcp_memranges[2].vmr_gpa = MB(1);
1056	vcp->vcp_memranges[2].vmr_size = above_1m;
1057	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
1058
1059	/* Fourth region: PCI MMIO range */
1060	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
1061	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
1062	    VMM_PCI_MMIO_BAR_BASE + 1;
1063	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
1064
1065	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
1066	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
1067	vcp->vcp_memranges[4].vmr_size = MB(2);
1068	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
1069
1070	/* Sixth region: any remainder above 4GB */
1071	if (above_4g > 0) {
1072		vcp->vcp_memranges[5].vmr_gpa = GB(4);
1073		vcp->vcp_memranges[5].vmr_size = above_4g;
1074		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
1075		vcp->vcp_nmemranges = 6;
1076	} else
1077		vcp->vcp_nmemranges = 5;
1078}
1079
1080/*
1081 * alloc_guest_mem
1082 *
1083 * Allocates memory for the guest.
1084 * Instead of doing a single allocation with one mmap(), we allocate memory
1085 * separately for every range for the following reasons:
1086 * - ASLR for the individual ranges
1087 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
1088 *   map the single mmap'd userspace memory to the individual guest physical
1089 *   memory ranges, the underlying amap of the single mmap'd range would have
1090 *   to allocate per-page reference counters. The reason is that the
1091 *   individual guest physical ranges would reference the single mmap'd region
1092 *   only partially. However, if every guest physical range has its own
1093 *   corresponding mmap'd userspace allocation, there are no partial
1094 *   references: every guest physical range fully references an mmap'd
1095 *   range => no per-page reference counters have to be allocated.
1096 *
1097 * Return values:
1098 *  0: success
1099 *  !0: failure - errno indicating the source of the failure
1100 */
1101int
1102alloc_guest_mem(struct vmd_vm *vm)
1103{
1104	void *p;
1105	int ret = 0;
1106	size_t i, j;
1107	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1108	struct vm_mem_range *vmr;
1109
1110	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1111		vmr = &vcp->vcp_memranges[i];
1112
1113		/*
1114		 * We only need R/W as userland. vmm(4) will use R/W/X in its
1115		 * mapping.
1116		 *
1117		 * We must use MAP_SHARED so emulated devices will be able
1118		 * to generate shared mappings.
1119		 */
1120		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
1121		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
1122		if (p == MAP_FAILED) {
1123			ret = errno;
1124			for (j = 0; j < i; j++) {
1125				vmr = &vcp->vcp_memranges[j];
1126				munmap((void *)vmr->vmr_va, vmr->vmr_size);
1127			}
1128			return (ret);
1129		}
1130		vmr->vmr_va = (vaddr_t)p;
1131	}
1132
1133	return (ret);
1134}
1135
1136/*
1137 * vmm_create_vm
1138 *
1139 * Requests vmm(4) to create a new VM using the supplied creation
1140 * parameters. This operation results in the creation of the in-kernel
1141 * structures for the VM, but does not start the VM's vcpu(s).
1142 *
1143 * Parameters:
1144 *  vm: pointer to the vm object
1145 *
1146 * Return values:
1147 *  0: success
1148 *  !0 : ioctl to vmm(4) failed
1149 */
1150static int
1151vmm_create_vm(struct vmd_vm *vm)
1152{
1153	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1154
1155	/* Sanity check arguments */
1156	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1157		return (EINVAL);
1158
1159	if (vcp->vcp_nmemranges == 0 ||
1160	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1161		return (EINVAL);
1162
1163	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
1164		return (EINVAL);
1165
1166	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
1167		return (EINVAL);
1168
1169	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1170		return (errno);
1171
1172	return (0);
1173}
1174
1175/*
1176 * init_emulated_hw
1177 *
1178 * Initializes the userspace hardware emulation
1179 */
1180void
1181init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1182    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1183{
1184	struct vm_create_params *vcp = &vmc->vmc_params;
1185	size_t i;
1186	uint64_t memlo, memhi;
1187
1188	/* Calculate memory size for NVRAM registers */
1189	memlo = memhi = 0;
1190	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1191		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
1192		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
1193			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
1194		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
1195			memhi = vcp->vcp_memranges[i].vmr_size;
1196	}
1197
1198	/* Reset the IO port map */
1199	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1200
1201	/* Init i8253 PIT */
1202	i8253_init(vcp->vcp_id);
1203	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1204	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1205	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1206	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1207	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1208
1209	/* Init mc146818 RTC */
1210	mc146818_init(vcp->vcp_id, memlo, memhi);
1211	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1212	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1213
1214	/* Init master and slave PICs */
1215	i8259_init();
1216	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1217	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1218	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1219	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1220	ioports_map[ELCR0] = vcpu_exit_elcr;
1221	ioports_map[ELCR1] = vcpu_exit_elcr;
1222
1223	/* Init ns8250 UART */
1224	ns8250_init(con_fd, vcp->vcp_id);
1225	for (i = COM1_DATA; i <= COM1_SCR; i++)
1226		ioports_map[i] = vcpu_exit_com;
1227
1228	/* Initialize PCI */
1229	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1230		ioports_map[i] = vcpu_exit_pci;
1231
1232	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1233	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1234	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1235	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1236	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1237	pci_init();
1238
1239	/* Initialize virtio devices */
1240	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1241
1242	/*
1243	 * Init QEMU fw_cfg interface. Must be done last for pci hardware
1244	 * detection.
1245	 */
1246	fw_cfg_init(vmc);
1247	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1248	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1249	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1250	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1251}
1252
1253/*
1254 * restore_emulated_hw
1255 *
1256 * Restores the userspace hardware emulation from fd
1257 */
1258void
1259restore_emulated_hw(struct vm_create_params *vcp, int fd,
1260    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1261{
1262	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1263	int i;
1264	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1265
1266	/* Init i8253 PIT */
1267	i8253_restore(fd, vcp->vcp_id);
1268	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1269	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1270	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1271	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1272
1273	/* Init master and slave PICs */
1274	i8259_restore(fd);
1275	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1276	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1277	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1278	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1279
1280	/* Init ns8250 UART */
1281	ns8250_restore(fd, con_fd, vcp->vcp_id);
1282	for (i = COM1_DATA; i <= COM1_SCR; i++)
1283		ioports_map[i] = vcpu_exit_com;
1284
1285	/* Init mc146818 RTC */
1286	mc146818_restore(fd, vcp->vcp_id);
1287	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1288	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1289
1290	/* Init QEMU fw_cfg interface */
1291	fw_cfg_restore(fd);
1292	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1293	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1294	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1295	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1296
1297	/* Initialize PCI */
1298	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1299		ioports_map[i] = vcpu_exit_pci;
1300
1301	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1302	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1303	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1304	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1305	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1306	pci_restore(fd);
1307	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1308}
1309
1310/*
1311 * run_vm
1312 *
1313 * Runs the VM whose creation parameters are specified in vcp
1314 *
1315 * Parameters:
1316 *  child_cdrom: previously-opened child ISO disk file descriptor
1317 *  child_disks: previously-opened child VM disk file file descriptors
1318 *  child_taps: previously-opened child tap file descriptors
1319 *  vmc: vmop_create_params struct containing the VM's desired creation
1320 *      configuration
1321 *  vrs: VCPU register state to initialize
1322 *
1323 * Return values:
1324 *  0: the VM exited normally
1325 *  !0 : the VM exited abnormally or failed to start
1326 */
1327static int
1328run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
1329{
1330	struct vm_create_params *vcp = &vmc->vmc_params;
1331	struct vm_rwregs_params vregsp;
1332	uint8_t evdone = 0;
1333	size_t i;
1334	int ret;
1335	pthread_t *tid, evtid;
1336	char tname[MAXCOMLEN + 1];
1337	struct vm_run_params **vrp;
1338	void *exit_status;
1339
1340	if (vcp == NULL)
1341		return (EINVAL);
1342
1343	if (vcp->vcp_nmemranges == 0 ||
1344	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1345		return (EINVAL);
1346
1347	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1348	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1349	if (tid == NULL || vrp == NULL) {
1350		log_warn("%s: memory allocation error - exiting.",
1351		    __progname);
1352		return (ENOMEM);
1353	}
1354
1355	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
1356	    vcp->vcp_ncpus, vcp->vcp_name);
1357
1358	/*
1359	 * Create and launch one thread for each VCPU. These threads may
1360	 * migrate between PCPUs over time; the need to reload CPU state
1361	 * in such situations is detected and performed by vmm(4) in the
1362	 * kernel.
1363	 */
1364	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1365		vrp[i] = malloc(sizeof(struct vm_run_params));
1366		if (vrp[i] == NULL) {
1367			log_warn("%s: memory allocation error - "
1368			    "exiting.", __progname);
1369			/* caller will exit, so skip freeing */
1370			return (ENOMEM);
1371		}
1372		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1373		if (vrp[i]->vrp_exit == NULL) {
1374			log_warn("%s: memory allocation error - "
1375			    "exiting.", __progname);
1376			/* caller will exit, so skip freeing */
1377			return (ENOMEM);
1378		}
1379		vrp[i]->vrp_vm_id = vcp->vcp_id;
1380		vrp[i]->vrp_vcpu_id = i;
1381
1382		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1383			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1384			    __progname, i);
1385			return (EIO);
1386		}
1387
1388		/* once more because reset_cpu changes regs */
1389		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1390			vregsp.vrwp_vm_id = vcp->vcp_id;
1391			vregsp.vrwp_vcpu_id = i;
1392			vregsp.vrwp_regs = *vrs;
1393			vregsp.vrwp_mask = VM_RWREGS_ALL;
1394			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1395			    &vregsp)) == -1) {
1396				log_warn("%s: writeregs failed", __func__);
1397				return (ret);
1398			}
1399		}
1400
1401		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1402		if (ret) {
1403			log_warnx("%s: cannot initialize cond var (%d)",
1404			    __progname, ret);
1405			return (ret);
1406		}
1407
1408		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1409		if (ret) {
1410			log_warnx("%s: cannot initialize mtx (%d)",
1411			    __progname, ret);
1412			return (ret);
1413		}
1414
1415		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1416		if (ret) {
1417			log_warnx("%s: cannot initialize unpause var (%d)",
1418			    __progname, ret);
1419			return (ret);
1420		}
1421
1422		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1423		if (ret) {
1424			log_warnx("%s: cannot initialize unpause mtx (%d)",
1425			    __progname, ret);
1426			return (ret);
1427		}
1428
1429		vcpu_hlt[i] = 0;
1430
1431		/* Start each VCPU run thread at vcpu_run_loop */
1432		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1433		if (ret) {
1434			/* caller will _exit after this return */
1435			ret = errno;
1436			log_warn("%s: could not create vcpu thread %zu",
1437			    __func__, i);
1438			return (ret);
1439		}
1440
1441		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1442		pthread_set_name_np(tid[i], tname);
1443	}
1444
1445	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1446	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1447	if (ret) {
1448		errno = ret;
1449		log_warn("%s: could not create event thread", __func__);
1450		return (ret);
1451	}
1452	pthread_set_name_np(evtid, "event");
1453
1454	for (;;) {
1455		ret = pthread_cond_wait(&threadcond, &threadmutex);
1456		if (ret) {
1457			log_warn("%s: waiting on thread state condition "
1458			    "variable failed", __func__);
1459			return (ret);
1460		}
1461
1462		/*
1463		 * Did a VCPU thread exit with an error? => return the first one
1464		 */
1465		for (i = 0; i < vcp->vcp_ncpus; i++) {
1466			if (vcpu_done[i] == 0)
1467				continue;
1468
1469			if (pthread_join(tid[i], &exit_status)) {
1470				log_warn("%s: failed to join thread %zd - "
1471				    "exiting", __progname, i);
1472				return (EIO);
1473			}
1474
1475			ret = (intptr_t)exit_status;
1476		}
1477
1478		/* Did the event thread exit? => return with an error */
1479		if (evdone) {
1480			if (pthread_join(evtid, &exit_status)) {
1481				log_warn("%s: failed to join event thread - "
1482				    "exiting", __progname);
1483				return (EIO);
1484			}
1485
1486			log_warnx("%s: vm %d event thread exited "
1487			    "unexpectedly", __progname, vcp->vcp_id);
1488			return (EIO);
1489		}
1490
1491		/* Did all VCPU threads exit successfully? => return */
1492		for (i = 0; i < vcp->vcp_ncpus; i++) {
1493			if (vcpu_done[i] == 0)
1494				break;
1495		}
1496		if (i == vcp->vcp_ncpus)
1497			return (ret);
1498
1499		/* Some more threads to wait for, start over */
1500	}
1501
1502	return (ret);
1503}
1504
1505void *
1506event_thread(void *arg)
1507{
1508	uint8_t *donep = arg;
1509	intptr_t ret;
1510
1511	ret = event_dispatch();
1512
1513	mutex_lock(&threadmutex);
1514	*donep = 1;
1515	pthread_cond_signal(&threadcond);
1516	mutex_unlock(&threadmutex);
1517
1518	return (void *)ret;
1519 }
1520
1521/*
1522 * vcpu_run_loop
1523 *
1524 * Runs a single VCPU until vmm(4) requires help handling an exit,
1525 * or the VM terminates.
1526 *
1527 * Parameters:
1528 *  arg: vcpu_run_params for the VCPU being run by this thread
1529 *
1530 * Return values:
1531 *  NULL: the VCPU shutdown properly
1532 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1533 */
1534void *
1535vcpu_run_loop(void *arg)
1536{
1537	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1538	intptr_t ret = 0;
1539	uint32_t n;
1540
1541	n = vrp->vrp_vcpu_id;
1542
1543	for (;;) {
1544		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1545
1546		if (ret) {
1547			log_warnx("%s: can't lock vcpu run mtx (%d)",
1548			    __func__, (int)ret);
1549			return ((void *)ret);
1550		}
1551
1552		/* If we are halted and need to pause, pause */
1553		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1554			ret = pthread_barrier_wait(&vm_pause_barrier);
1555			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1556				log_warnx("%s: could not wait on pause barrier (%d)",
1557				    __func__, (int)ret);
1558				return ((void *)ret);
1559			}
1560
1561			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1562			if (ret) {
1563				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1564				    __func__, (int)ret);
1565				return ((void *)ret);
1566			}
1567
1568			/* i8259 may be firing as we pause, release run mtx. */
1569			mutex_unlock(&vcpu_run_mtx[n]);
1570			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1571			    &vcpu_unpause_mtx[n]);
1572			if (ret) {
1573				log_warnx(
1574				    "%s: can't wait on unpause cond (%d)",
1575				    __func__, (int)ret);
1576				break;
1577			}
1578			mutex_lock(&vcpu_run_mtx[n]);
1579
1580			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1581			if (ret) {
1582				log_warnx("%s: can't unlock unpause mtx (%d)",
1583				    __func__, (int)ret);
1584				break;
1585			}
1586		}
1587
1588		/* If we are halted and not paused, wait */
1589		if (vcpu_hlt[n]) {
1590			ret = pthread_cond_wait(&vcpu_run_cond[n],
1591			    &vcpu_run_mtx[n]);
1592
1593			if (ret) {
1594				log_warnx(
1595				    "%s: can't wait on cond (%d)",
1596				    __func__, (int)ret);
1597				(void)pthread_mutex_unlock(
1598				    &vcpu_run_mtx[n]);
1599				break;
1600			}
1601		}
1602
1603		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1604
1605		if (ret) {
1606			log_warnx("%s: can't unlock mutex on cond (%d)",
1607			    __func__, (int)ret);
1608			break;
1609		}
1610
1611		if (vrp->vrp_irqready && i8259_is_pending()) {
1612			vrp->vrp_inject.vie_vector = i8259_ack();
1613			vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
1614		} else
1615			vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;
1616
1617		/* Still more interrupts pending? */
1618		vrp->vrp_intr_pending = i8259_is_pending();
1619
1620		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1621			/* If run ioctl failed, exit */
1622			ret = errno;
1623			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1624			    __func__, current_vm->vm_vmid, n);
1625			break;
1626		}
1627
1628		/* If the VM is terminating, exit normally */
1629		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1630			ret = (intptr_t)NULL;
1631			break;
1632		}
1633
1634		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1635			/*
1636			 * vmm(4) needs help handling an exit, handle in
1637			 * vcpu_exit.
1638			 */
1639			ret = vcpu_exit(vrp);
1640			if (ret)
1641				break;
1642		}
1643	}
1644
1645	mutex_lock(&threadmutex);
1646	vcpu_done[n] = 1;
1647	pthread_cond_signal(&threadcond);
1648	mutex_unlock(&threadmutex);
1649
1650	return ((void *)ret);
1651}
1652
1653int
1654vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1655{
1656	struct vm_intr_params vip;
1657
1658	memset(&vip, 0, sizeof(vip));
1659
1660	vip.vip_vm_id = vm_id;
1661	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1662	vip.vip_intr = intr;
1663
1664	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1665		return (errno);
1666
1667	return (0);
1668}
1669
1670/*
1671 * vcpu_exit_pci
1672 *
1673 * Handle all I/O to the emulated PCI subsystem.
1674 *
1675 * Parameters:
1676 *  vrp: vcpu run parameters containing guest state for this exit
1677 *
1678 * Return value:
1679 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1680 *      be injected.
1681 */
1682uint8_t
1683vcpu_exit_pci(struct vm_run_params *vrp)
1684{
1685	struct vm_exit *vei = vrp->vrp_exit;
1686	uint8_t intr;
1687
1688	intr = 0xFF;
1689
1690	switch (vei->vei.vei_port) {
1691	case PCI_MODE1_ADDRESS_REG:
1692		pci_handle_address_reg(vrp);
1693		break;
1694	case PCI_MODE1_DATA_REG:
1695	case PCI_MODE1_DATA_REG + 1:
1696	case PCI_MODE1_DATA_REG + 2:
1697	case PCI_MODE1_DATA_REG + 3:
1698		pci_handle_data_reg(vrp);
1699		break;
1700	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
1701		intr = pci_handle_io(vrp);
1702		break;
1703	default:
1704		log_warnx("%s: unknown PCI register 0x%llx",
1705		    __progname, (uint64_t)vei->vei.vei_port);
1706		break;
1707	}
1708
1709	return (intr);
1710}
1711
1712/*
1713 * vcpu_exit_inout
1714 *
1715 * Handle all I/O exits that need to be emulated in vmd. This includes the
1716 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1717 *
1718 * Parameters:
1719 *  vrp: vcpu run parameters containing guest state for this exit
1720 */
1721void
1722vcpu_exit_inout(struct vm_run_params *vrp)
1723{
1724	struct vm_exit *vei = vrp->vrp_exit;
1725	uint8_t intr = 0xFF;
1726
1727	if (vei->vei.vei_rep || vei->vei.vei_string) {
1728#ifdef MMIO_DEBUG
1729		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
1730		    __func__,
1731		    vei->vei.vei_rep == 0 ? "" : "REP ",
1732		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
1733		    vei->vei.vei_string == 0 ? "" : "S",
1734		    vei->vei.vei_size, vei->vei.vei_encoding,
1735		    vei->vei.vei_data, vei->vei.vei_port);
1736		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
1737		    __func__,
1738		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
1739		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
1740		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
1741#endif /* MMIO_DEBUG */
1742		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
1743		    __func__);
1744	}
1745
1746	if (ioports_map[vei->vei.vei_port] != NULL)
1747		intr = ioports_map[vei->vei.vei_port](vrp);
1748	else if (vei->vei.vei_dir == VEI_DIR_IN)
1749		set_return_data(vei, 0xFFFFFFFF);
1750
1751	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
1752
1753	if (intr != 0xFF)
1754		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1755}
1756
1757/*
1758 * vcpu_exit_eptviolation
1759 *
1760 * handle an EPT Violation
1761 *
1762 * Parameters:
1763 *  vrp: vcpu run parameters containing guest state for this exit
1764 *
1765 * Return values:
1766 *  0: no action required
1767 *  EFAULT: a protection fault occured, kill the vm.
1768 */
1769int
1770vcpu_exit_eptviolation(struct vm_run_params *vrp)
1771{
1772	struct vm_exit *ve = vrp->vrp_exit;
1773	int ret = 0;
1774#if MMIO_NOTYET
1775	struct x86_insn insn;
1776	uint64_t va, pa;
1777	size_t len = 15;		/* Max instruction length in x86. */
1778#endif /* MMIO_NOTYET */
1779	switch (ve->vee.vee_fault_type) {
1780	case VEE_FAULT_HANDLED:
1781		log_debug("%s: fault already handled", __func__);
1782		break;
1783
1784#if MMIO_NOTYET
1785	case VEE_FAULT_MMIO_ASSIST:
1786		/* Intel VMX might give us the length of the instruction. */
1787		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
1788			len = ve->vee.vee_insn_len;
1789
1790		if (len > 15)
1791			fatalx("%s: invalid instruction length %lu", __func__,
1792			    len);
1793
1794		/* If we weren't given instruction bytes, we need to fetch. */
1795		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
1796			memset(ve->vee.vee_insn_bytes, 0,
1797			    sizeof(ve->vee.vee_insn_bytes));
1798			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
1799
1800			/* XXX Only support instructions that fit on 1 page. */
1801			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
1802				log_warnx("%s: instruction might cross page "
1803				    "boundary", __func__);
1804				ret = EINVAL;
1805				break;
1806			}
1807
1808			ret = translate_gva(ve, va, &pa, PROT_EXEC);
1809			if (ret != 0) {
1810				log_warnx("%s: failed gva translation",
1811				    __func__);
1812				break;
1813			}
1814
1815			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
1816			if (ret != 0) {
1817				log_warnx("%s: failed to fetch instruction "
1818				    "bytes from 0x%llx", __func__, pa);
1819				break;
1820			}
1821		}
1822
1823		ret = insn_decode(ve, &insn);
1824		if (ret == 0)
1825			ret = insn_emulate(ve, &insn);
1826		break;
1827#endif /* MMIO_NOTYET */
1828
1829	case VEE_FAULT_PROTECT:
1830		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
1831		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
1832		ret = EFAULT;
1833		break;
1834
1835	default:
1836		fatalx("%s: invalid fault_type %d", __progname,
1837		    ve->vee.vee_fault_type);
1838		/* UNREACHED */
1839	}
1840
1841	return (ret);
1842}
1843
1844/*
1845 * vcpu_exit
1846 *
1847 * Handle a vcpu exit. This function is called when it is determined that
1848 * vmm(4) requires the assistance of vmd to support a particular guest
1849 * exit type (eg, accessing an I/O port or device). Guest state is contained
1850 * in 'vrp', and will be resent to vmm(4) on exit completion.
1851 *
1852 * Upon conclusion of handling the exit, the function determines if any
1853 * interrupts should be injected into the guest, and asserts the proper
1854 * IRQ line whose interrupt should be vectored.
1855 *
1856 * Parameters:
1857 *  vrp: vcpu run parameters containing guest state for this exit
1858 *
1859 * Return values:
1860 *  0: the exit was handled successfully
1861 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1862 */
1863int
1864vcpu_exit(struct vm_run_params *vrp)
1865{
1866	int ret;
1867
1868	switch (vrp->vrp_exit_reason) {
1869	case VMX_EXIT_INT_WINDOW:
1870	case SVM_VMEXIT_VINTR:
1871	case VMX_EXIT_CPUID:
1872	case VMX_EXIT_EXTINT:
1873	case SVM_VMEXIT_INTR:
1874	case SVM_VMEXIT_MSR:
1875	case SVM_VMEXIT_CPUID:
1876		/*
1877		 * We may be exiting to vmd to handle a pending interrupt but
1878		 * at the same time the last exit type may have been one of
1879		 * these. In this case, there's nothing extra to be done
1880		 * here (and falling through to the default case below results
1881		 * in more vmd log spam).
1882		 */
1883		break;
1884	case SVM_VMEXIT_NPF:
1885	case VMX_EXIT_EPT_VIOLATION:
1886		ret = vcpu_exit_eptviolation(vrp);
1887		if (ret)
1888			return (ret);
1889		break;
1890	case VMX_EXIT_IO:
1891	case SVM_VMEXIT_IOIO:
1892		vcpu_exit_inout(vrp);
1893		break;
1894	case VMX_EXIT_HLT:
1895	case SVM_VMEXIT_HLT:
1896		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1897		if (ret) {
1898			log_warnx("%s: can't lock vcpu mutex (%d)",
1899			    __func__, ret);
1900			return (ret);
1901		}
1902		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1903		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1904		if (ret) {
1905			log_warnx("%s: can't unlock vcpu mutex (%d)",
1906			    __func__, ret);
1907			return (ret);
1908		}
1909		break;
1910	case VMX_EXIT_TRIPLE_FAULT:
1911	case SVM_VMEXIT_SHUTDOWN:
1912		/* reset VM */
1913		return (EAGAIN);
1914	default:
1915		log_debug("%s: unknown exit reason 0x%x",
1916		    __progname, vrp->vrp_exit_reason);
1917	}
1918
1919	return (0);
1920}
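
/*
 * Illustrative caller sketch (not the actual run loop): the return value
 * is meant to be acted on by the vcpu run loop, roughly:
 *
 *	ret = vcpu_exit(vrp);
 *	if (ret == EAGAIN)
 *		reset_requested = 1;	(hypothetical flag)
 *	else if (ret != 0)
 *		return (ret);
 *
 * with 0 meaning the vcpu can simply be re-entered via vmm(4).
 */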
1921
1922/*
1923 * find_gpa_range
1924 *
1925 * Search for a contiguous guest physical mem range.
1926 *
1927 * Parameters:
1928 *  vcp: VM create parameters that contain the memory map to search in
1929 *  gpa: the starting guest physical address
1930 *  len: the length of the memory range
1931 *
1932 * Return values:
1933 *  NULL: if no contiguous set of memory ranges covers [gpa, gpa + len)
1934 *  Otherwise, a pointer to the vm_mem_range containing the start of the range.
1935 */
1936static struct vm_mem_range *
1937find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1938{
1939	size_t i, n;
1940	struct vm_mem_range *vmr;
1941
1942	/* Find the first vm_mem_range that contains gpa */
1943	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1944		vmr = &vcp->vcp_memranges[i];
1945		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
1946			break;
1947	}
1948
1949	/* No range found. */
1950	if (i == vcp->vcp_nmemranges)
1951		return (NULL);
1952
1953	/*
1954	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1955	 * sure that the following vm_mem_ranges are contiguous and
1956	 * cover the rest.
1957	 */
1958	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1959	if (len < n)
1960		len = 0;
1961	else
1962		len -= n;
1963	gpa = vmr->vmr_gpa + vmr->vmr_size;
1964	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1965		vmr = &vcp->vcp_memranges[i];
1966		if (gpa != vmr->vmr_gpa)
1967			return (NULL);
1968		if (len <= vmr->vmr_size)
1969			len = 0;
1970		else
1971			len -= vmr->vmr_size;
1972
1973		gpa = vmr->vmr_gpa + vmr->vmr_size;
1974	}
1975
1976	if (len != 0)
1977		return (NULL);
1978
1979	return (vmr);
1980}
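
/*
 * Illustrative example (hypothetical layout): assume two guest memory
 * ranges A = [0x0, 0x8000) and B = [0x8000, 0x10000).  A lookup such as
 *
 *	vmr = find_gpa_range(vcp, 0x7f00, 0x80);
 *
 * is satisfied entirely by A.  A request like (0x7f00, 0x200) spills
 * into B and succeeds only because B starts exactly where A ends; if the
 * remainder falls into a hole in the memory map, NULL is returned.
 */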
1981
1982/*
1983 * write_mem
1984 *
1985 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1986 *
1987 * Parameters:
1988 *  dst: the destination paddr_t in the guest VM
1989 *  buf: data to copy (or NULL to zero the data)
1990 *  len: number of bytes to copy
1991 *
1992 * Return values:
1993 *  0: success
1994 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1995 *      exist in the guest.
1996 */
1997int
1998write_mem(paddr_t dst, const void *buf, size_t len)
1999{
2000	const char *from = buf;
2001	char *to;
2002	size_t n, off;
2003	struct vm_mem_range *vmr;
2004
2005	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
2006	if (vmr == NULL) {
2007		errno = EINVAL;
2008		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
2009		    "len = 0x%zx", __func__, dst, len);
2010		return (EINVAL);
2011	}
2012
2013	off = dst - vmr->vmr_gpa;
2014	while (len != 0) {
2015		n = vmr->vmr_size - off;
2016		if (len < n)
2017			n = len;
2018
2019		to = (char *)vmr->vmr_va + off;
2020		if (buf == NULL)
2021			memset(to, 0, n);
2022		else {
2023			memcpy(to, from, n);
2024			from += n;
2025		}
2026		len -= n;
2027		off = 0;
2028		vmr++;
2029	}
2030
2031	return (0);
2032}
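
/*
 * Usage sketch (illustrative; 'gpa' and 'hdr' are placeholders): zero a
 * page of guest memory, then copy a local structure into it.  Passing a
 * NULL buffer selects the memset() path above.
 *
 *	if (write_mem(gpa, NULL, PAGE_SIZE))
 *		log_warnx("failed to zero guest page");
 *	if (write_mem(gpa, &hdr, sizeof(hdr)))
 *		log_warnx("failed to copy header into guest memory");
 */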
2033
2034/*
2035 * read_mem
2036 *
2037 * Reads memory at guest paddr 'src' into 'buf'.
2038 *
2039 * Parameters:
2040 *  src: the source paddr_t in the guest VM to read from.
2041 *  buf: destination (local) buffer
2042 *  len: number of bytes to read
2043 *
2044 * Return values:
2045 *  0: success
2046 *  EINVAL: if the guest physical memory range [src, src + len) does not
2047 *      exist in the guest.
2048 */
2049int
2050read_mem(paddr_t src, void *buf, size_t len)
2051{
2052	char *from, *to = buf;
2053	size_t n, off;
2054	struct vm_mem_range *vmr;
2055
2056	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
2057	if (vmr == NULL) {
2058		errno = EINVAL;
2059		log_warn("%s: failed - invalid memory range src = 0x%lx, "
2060		    "len = 0x%zx", __func__, src, len);
2061		return (EINVAL);
2062	}
2063
2064	off = src - vmr->vmr_gpa;
2065	while (len != 0) {
2066		n = vmr->vmr_size - off;
2067		if (len < n)
2068			n = len;
2069
2070		from = (char *)vmr->vmr_va + off;
2071		memcpy(to, from, n);
2072
2073		to += n;
2074		len -= n;
2075		off = 0;
2076		vmr++;
2077	}
2078
2079	return (0);
2080}
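
/*
 * Usage sketch: translate_gva() below uses this to pull page table
 * entries out of guest memory, e.g.
 *
 *	uint64_t pte;
 *
 *	if (read_mem(pte_paddr, &pte, sizeof(pte)))
 *		return (EFAULT);
 *
 * where 'pte_paddr' is the guest physical address of the PTE.
 */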
2081
2082/*
2083 * hvaddr_mem
2084 *
2085 * Translate a guest physical address to a host virtual address, checking the
2086 * provided memory range length to confirm it's contiguous within the same
2087 * guest memory range (vm_mem_range).
2088 *
2089 * Parameters:
2090 *  gpa: guest physical address to translate
2091 *  len: number of bytes in the intended range
2092 *
2093 * Return values:
2094 *  void* to host virtual memory on success
2095 *  NULL on error, setting errno to:
2096 *    EFAULT: gpa falls outside guest memory ranges
2097 *    EINVAL: requested len extends beyond memory range
2098 */
2099void *
2100hvaddr_mem(paddr_t gpa, size_t len)
2101{
2102	struct vm_mem_range *vmr;
2103	size_t off;
2104
2105	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
2106	if (vmr == NULL) {
2107		log_warnx("%s: failed - invalid gpa: 0x%lx", __func__, gpa);
2108		errno = EFAULT;
2109		return (NULL);
2110	}
2111
2112	off = gpa - vmr->vmr_gpa;
2113	if (len > (vmr->vmr_size - off)) {
2114		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
2115		    "len=%zu", __func__, gpa, len);
2116		errno = EINVAL;
2117		return (NULL);
2118	}
2119
2120	return ((char *)vmr->vmr_va + off);
2121}
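
/*
 * Usage sketch (illustrative; 'gpa', 'src' and 'len' are placeholders):
 * a device model that wants to touch a guest buffer directly can
 * translate and bounds-check in one step:
 *
 *	char *buf = hvaddr_mem(gpa, len);
 *	if (buf == NULL)
 *		return (errno);
 *	memcpy(buf, src, len);
 */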
2122
2123/*
2124 * vcpu_assert_pic_irq
2125 *
2126 * Injects the specified IRQ on the supplied vcpu/vm
2127 *
2128 * Parameters:
2129 *  vm_id: VM ID to inject to
2130 *  vcpu_id: VCPU ID to inject to
2131 *  irq: IRQ to inject
2132 */
2133void
2134vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2135{
2136	int ret;
2137
2138	i8259_assert_irq(irq);
2139
2140	if (i8259_is_pending()) {
2141		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
2142			fatalx("%s: can't assert INTR", __func__);
2143		mutex_lock(&vcpu_run_mtx[vcpu_id]);
2144		vcpu_hlt[vcpu_id] = 0;
2145		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
2146		if (ret)
2147			fatalx("%s: can't signal (%d)", __func__, ret);
2148		mutex_unlock(&vcpu_run_mtx[vcpu_id]);
2149	}
2150}
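
/*
 * Usage sketch (illustrative; 'dev' is a placeholder for a device's own
 * state struct): a device model raises its interrupt from its own thread
 * or event handler, e.g.
 *
 *	vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
 *
 * where 0 stands for the vcpu that receives legacy PIC interrupts.
 */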
2151
2152/*
2153 * vcpu_deassert_pic_irq
2154 *
2155 * Clears the specified IRQ on the supplied vcpu/vm
2156 *
2157 * Parameters:
2158 *  vm_id: VM ID to clear in
2159 *  vcpu_id: VCPU ID to clear in
2160 *  irq: IRQ to clear
2161 */
2162void
2163vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2164{
2165	i8259_deassert_irq(irq);
2166
2167	if (!i8259_is_pending()) {
2168		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
2169			fatalx("%s: can't deassert INTR for vm_id %d, "
2170			    "vcpu_id %d", __func__, vm_id, vcpu_id);
2171	}
2172}
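
/*
 * Usage sketch (illustrative): for a level-triggered source, each assert
 * is paired with a deassert once the condition clears, e.g.
 *
 *	vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
 *
 * with 'dev' again a placeholder for the device's own state struct.
 */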
2173
2174/*
2175 * fd_hasdata
2176 *
2177 * Determines if data can be read from a file descriptor.
2178 *
2179 * Parameters:
2180 *  fd: the fd to check
2181 *
2182 * Return values:
2183 *  1 if data can be read from an fd, or 0 otherwise.
2184 */
2185int
2186fd_hasdata(int fd)
2187{
2188	struct pollfd pfd[1];
2189	int nready, hasdata = 0;
2190
2191	pfd[0].fd = fd;
2192	pfd[0].events = POLLIN;
2193	nready = poll(pfd, 1, 0);
2194	if (nready == -1)
2195		log_warn("checking file descriptor for data failed");
2196	else if (nready == 1 && pfd[0].revents & POLLIN)
2197		hasdata = 1;
2198	return (hasdata);
2199}
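
/*
 * Usage sketch (illustrative; 'fd', 'buf' and 'n' are placeholders):
 * poll before reading so an emulation thread does not block on an idle
 * descriptor:
 *
 *	if (fd_hasdata(fd))
 *		n = read(fd, buf, sizeof(buf));
 */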
2200
2201/*
2202 * mutex_lock
2203 *
2204 * Wrapper function for pthread_mutex_lock that does error checking and that
2205 * exits on failure
2206 */
2207void
2208mutex_lock(pthread_mutex_t *m)
2209{
2210	int ret;
2211
2212	ret = pthread_mutex_lock(m);
2213	if (ret) {
2214		errno = ret;
2215		fatal("could not acquire mutex");
2216	}
2217}
2218
2219/*
2220 * mutex_unlock
2221 *
2222 * Wrapper function for pthread_mutex_unlock that does error checking and that
2223 * exits on failure
2224 */
2225void
2226mutex_unlock(pthread_mutex_t *m)
2227{
2228	int ret;
2229
2230	ret = pthread_mutex_unlock(m);
2231	if (ret) {
2232		errno = ret;
2233		fatal("could not release mutex");
2234	}
2235}
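
/*
 * Usage sketch: these wrappers condense the explicit error handling seen
 * in vcpu_exit() above into the pattern used in vcpu_assert_pic_irq():
 *
 *	mutex_lock(&vcpu_run_mtx[vcpu_id]);
 *	vcpu_hlt[vcpu_id] = 0;
 *	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
 */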
2236
2237/*
2238 * set_return_data
2239 *
2240 * Utility function for manipulating register data in vm exit info structs. This
2241 * function ensures that the data is copied to the vei->vei.vei_data field with
2242 * the proper size for the operation being performed.
2243 *
2244 * Parameters:
2245 *  vei: exit information
2246 *  data: return data
2247 */
2248void
2249set_return_data(struct vm_exit *vei, uint32_t data)
2250{
2251	switch (vei->vei.vei_size) {
2252	case 1:
2253		vei->vei.vei_data &= ~0xFF;
2254		vei->vei.vei_data |= (uint8_t)data;
2255		break;
2256	case 2:
2257		vei->vei.vei_data &= ~0xFFFF;
2258		vei->vei.vei_data |= (uint16_t)data;
2259		break;
2260	case 4:
2261		vei->vei.vei_data = data;
2262		break;
2263	}
2264}
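
/*
 * Usage sketch (illustrative): an I/O-port "in" handler stores its
 * result with the access width the guest requested, e.g.
 *
 *	if (vei->vei.vei_dir == VEI_DIR_IN)
 *		set_return_data(vei, 0xff);
 *
 * Here 0xff stands in for "no device present"; VEI_DIR_IN is assumed to
 * be the inbound direction flag used by the port handlers.
 */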
2265
2266/*
2267 * get_input_data
2268 *
2269 * Utility function for manipulating register data in vm exit info
2270 * structs. This function ensures that the data is copied from the
2271 * vei->vei.vei_data field with the proper size for the operation being
2272 * performed.
2273 *
2274 * Parameters:
2275 *  vei: exit information
2276 *  data: location to store the result
2277 */
2278void
2279get_input_data(struct vm_exit *vei, uint32_t *data)
2280{
2281	switch (vei->vei.vei_size) {
2282	case 1:
2283		*data &= 0xFFFFFF00;
2284		*data |= (uint8_t)vei->vei.vei_data;
2285		break;
2286	case 2:
2287		*data &= 0xFFFF0000;
2288		*data |= (uint16_t)vei->vei.vei_data;
2289		break;
2290	case 4:
2291		*data = vei->vei.vei_data;
2292		break;
2293	default:
2294		log_warnx("%s: invalid i/o size %d", __func__,
2295		    vei->vei.vei_size);
2296	}
2298}
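
/*
 * Usage sketch (illustrative): the matching "out" direction retrieves
 * the value the guest wrote, sized according to vei_size:
 *
 *	uint32_t data = 0;
 *
 *	if (vei->vei.vei_dir == VEI_DIR_OUT)
 *		get_input_data(vei, &data);
 */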
2299
2300/*
2301 * translate_gva
2302 *
2303 * Translates a guest virtual address to a guest physical address by walking
2304 * the currently active page table (if needed).
2305 *
2306 * XXX ensure translate_gva updates the A bit in the PTE
2307 * XXX ensure translate_gva respects segment base and limits in i386 mode
2308 * XXX ensure translate_gva respects segment wraparound in i8086 mode
2309 * XXX ensure translate_gva updates the A bit in the segment selector
2310 * XXX ensure translate_gva respects CR4.LMSLE if available
2311 *
2312 * Parameters:
2313 *  exit: The VCPU this translation should be performed for (guest MMU settings
2314 *   are gathered from this VCPU)
2315 *  va: virtual address to translate
2316 *  pa: pointer to paddr_t variable that will receive the translated physical
2317 *   address. 'pa' is unchanged on error.
2318 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2319 *   the address should be translated
2320 *
2321 * Return values:
2322 *  0: the address was successfully translated - 'pa' contains the physical
2323 *     address currently mapped by 'va'.
2324 *  EFAULT: the PTE for 'va' is not present, or a paging structure could not
2325 *     be read from guest memory (the #PF injection and %cr2 update noted in
2326 *     the XXXs above are not yet performed).
 *  EPERM: the access is not permitted by the PTE protection (R/W or U/S) bits.
 *  EINVAL: 'pa' is NULL or the guest is not running in protected mode.
 *  EIO: the accessed/dirty flags could not be written back to the PTE.
2327 */
2328int
2329translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2330{
2331	int level, shift, pdidx;
2332	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2333	uint64_t shift_width, pte_size;
2334	struct vcpu_reg_state *vrs;
2335
2336	vrs = &exit->vrs;
2337
2338	if (!pa)
2339		return (EINVAL);
2340
2341	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2342		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2343		*pa = va;
2344		return (0);
2345	}
2346
2347	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2348
2349	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2350	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2351
2352	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2353		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2354			pte_size = sizeof(uint64_t);
2355			shift_width = 9;
2356
2357			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2358				/* 4 level paging */
2359				level = 4;
2360				mask = L4_MASK;
2361				shift = L4_SHIFT;
2362			} else {
2363				/* 32 bit with PAE paging */
2364				level = 3;
2365				mask = L3_MASK;
2366				shift = L3_SHIFT;
2367			}
2368		} else {
2369			/* 32 bit paging */
2370			level = 2;
2371			shift_width = 10;
2372			mask = 0xFFC00000;
2373			shift = 22;
2374			pte_size = sizeof(uint32_t);
2375		}
2376	} else
2377		return (EINVAL);
2378
2379	/* XXX: Check for R bit in segment selector and set A bit */
2380
2381	for (;level > 0; level--) {
2382		pdidx = (va & mask) >> shift;
2383		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2384
2385		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2386		    level, pte_paddr);
2387		if (read_mem(pte_paddr, &pte, pte_size)) {
2388			log_warn("%s: failed to read pte", __func__);
2389			return (EFAULT);
2390		}
2391
2392		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2393		    pte);
2394
2395		/* XXX: Set CR2  */
2396		if (!(pte & PG_V))
2397			return (EFAULT);
2398
2399		/* XXX: Check for SMAP */
2400		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2401			return (EPERM);
2402
2403		if ((exit->cpl > 0) && !(pte & PG_u))
2404			return (EPERM);
2405
2406		pte = pte | PG_U;
2407		if (mode == PROT_WRITE)
2408			pte = pte | PG_M;
2409		if (write_mem(pte_paddr, &pte, pte_size)) {
2410			log_warn("%s: failed to write back flags to pte",
2411			    __func__);
2412			return (EIO);
2413		}
2414
2415		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
2416		if (pte & PG_PS)
2417			break;
2418
2419		if (level > 1) {
2420			pt_paddr = pte & PG_FRAME;
2421			shift -= shift_width;
2422			mask = mask >> shift_width;
2423		}
2424	}
2425
2426	low_mask = (1 << shift) - 1;
2427	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2428	*pa = (pte & high_mask) | (va & low_mask);
2429
2430	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
2431
2432	return (0);
2433}
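
/*
 * Worked example (illustrative): in 4-level long mode the loop above
 * peels one 9-bit index off the virtual address per level:
 *
 *	L4 index = (va >> 39) & 0x1ff	(PML4 entry)
 *	L3 index = (va >> 30) & 0x1ff	(PDPT entry)
 *	L2 index = (va >> 21) & 0x1ff	(page directory entry)
 *	L1 index = (va >> 12) & 0x1ff	(page table entry)
 *
 * and the final address combines the PTE's frame bits with the low 12
 * offset bits of 'va' (more offset bits if a PG_PS large page is hit).
 */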
2434
2435void
2436vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2437{
2438	vm_pipe_init2(p, cb, NULL);
2439}
2440
2441/*
2442 * vm_pipe_init2
2443 *
2444 * Initialize a vm_dev_pipe, setting up its file descriptors and its
2445 * event structure with the given callback and argument.
2446 *
2447 * Parameters:
2448 *  p: pointer to vm_dev_pipe struct to initialize
2449 *  cb: callback to use for READ events on the read end of the pipe
2450 *  arg: pointer to pass to the callback on event trigger
2451 */
2452void
2453vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg)
2454{
2455	int ret;
2456	int fds[2];
2457
2458	memset(p, 0, sizeof(struct vm_dev_pipe));
2459
2460	ret = pipe2(fds, O_CLOEXEC);
2461	if (ret)
2462		fatal("failed to create vm_dev_pipe pipe");
2463
2464	p->read = fds[0];
2465	p->write = fds[1];
2466
2467	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg);
2468}
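
/*
 * Usage sketch (illustrative; 'my_cb' and 'my_softc' are placeholders):
 * callers still have to add the read event to their event loop after
 * initialization:
 *
 *	struct vm_dev_pipe pipe;
 *
 *	vm_pipe_init2(&pipe, my_cb, my_softc);
 *	event_add(&pipe.read_ev, NULL);
 */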
2469
2470/*
2471 * vm_pipe_send
2472 *
2473 * Send a message to an emulated device via the provided vm_dev_pipe. This
2474 * relies on the fact that sizeof(msg) < PIPE_BUF to ensure atomic writes.
2475 *
2476 * Parameters:
2477 *  p: pointer to initialized vm_dev_pipe
2478 *  msg: message to send in the channel
2479 */
2480void
2481vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2482{
2483	size_t n;
2484	n = write(p->write, &msg, sizeof(msg));
2485	if (n != sizeof(msg))
2486		fatal("failed to write to device pipe");
2487}
2488
2489/*
2490 * vm_pipe_recv
2491 *
2492 * Receive a message for an emulated device via the provided vm_dev_pipe.
2493 * Returns the message value; exits fatally on a read(2) failure. This relies
2494 * on the fact that sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads.
2495 *
2496 * Parameters:
2497 *  p: pointer to initialized vm_dev_pipe
2498 *
2499 * Return values:
2500 *  a value of enum pipe_msg_type or fatal exit on read(2) error
2501 */
2502enum pipe_msg_type
2503vm_pipe_recv(struct vm_dev_pipe *p)
2504{
2505	size_t n;
2506	enum pipe_msg_type msg;
2507	n = read(p->read, &msg, sizeof(msg));
2508	if (n != sizeof(msg))
2509		fatal("failed to read from device pipe");
2510
2511	return msg;
2512}
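
/*
 * Usage sketch (illustrative; MY_MSG is a placeholder for one of the
 * enum pipe_msg_type values): one thread posts a message and the
 * callback registered on the read end consumes it:
 *
 *	vm_pipe_send(&pipe, MY_MSG);
 *
 * and, inside the read-end callback:
 *
 *	switch (vm_pipe_recv(&pipe)) {
 *	case MY_MSG:
 *		break;
 *	}
 */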
2513
2514/*
2515 * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM ioctl.
2516 *
2517 * Returns 0 on success, non-zero in the event of failure.
2518 */
2519int
2520remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
2521{
2522	struct vm_create_params	*vcp;
2523	struct vm_mem_range	*vmr;
2524	struct vm_sharemem_params vsp;
2525	size_t			 i, j;
2526	void			*p = NULL;
2527	int			 ret;
2528
2529	if (vm == NULL)
2530		return (1);
2531
2532	vcp = &vm->vm_params.vmc_params;
2533
2534	/*
2535	 * Initialize our VM shared memory request using our original
2536	 * creation parameters. We'll overwrite the va's after mmap(2).
2537	 */
2538	memset(&vsp, 0, sizeof(vsp));
2539	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
2540	vsp.vsp_vm_id = vcp->vcp_id;
2541	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
2542	    sizeof(vsp.vsp_memranges));
2543
2544	/*
2545	 * Use mmap(2) to identify virtual address space for our mappings.
2546	 */
2547	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
2548		if (i < vsp.vsp_nmemranges) {
2549			vmr = &vsp.vsp_memranges[i];
2550
2551			/* Ignore any MMIO ranges. */
2552			if (vmr->vmr_type == VM_MEM_MMIO) {
2553				vmr->vmr_va = 0;
2554				vcp->vcp_memranges[i].vmr_va = 0;
2555				continue;
2556			}
2557
2558			/* Make initial mappings for the memrange. */
2559			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
2560			    0);
2561			if (p == MAP_FAILED) {
2562				ret = errno;
2563				log_warn("%s: mmap", __func__);
2564				for (j = 0; j < i; j++) {
2565					vmr = &vcp->vcp_memranges[j];
2566					munmap((void *)vmr->vmr_va,
2567					    vmr->vmr_size);
2568				}
2569				return (ret);
2570			}
2571			vmr->vmr_va = (vaddr_t)p;
2572			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
2573		}
2574	}
2575
2576	/*
2577	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
2578	 * will use the va's and sizes to recreate the mappings for us.
2579	 */
2580	for (i = 0; i < vsp.vsp_nmemranges; i++) {
2581		vmr = &vsp.vsp_memranges[i];
2582		if (vmr->vmr_type == VM_MEM_MMIO)
2583			continue;
2584		if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
2585			fatal("%s: munmap", __func__);
2586	}
2587
2588	/*
2589	 * Ask vmm to enter the shared mappings for us. They'll point
2590	 * to the same host physical memory, but will have a randomized
2591	 * virtual address for the calling process.
2592	 */
2593	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
2594		return (errno);
2595
2596	return (0);
2597}
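
/*
 * Usage sketch (illustrative): callers treat a non-zero return as an
 * errno-style failure, e.g.
 *
 *	if ((ret = remap_guest_mem(vm, vmm_fd)) != 0)
 *		fatalx("failed to remap guest memory: %s", strerror(ret));
 *
 * where 'vmm_fd' is an open descriptor for the vmm(4) device.
 */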
2598